In [1]:
# Creates a list of documents from a directory of documents

import os 

def gather_data(filefolder):
    """ Produces List of Documents from a Directory
        filefolder (str): a path of .txt files
        returns list of strings 
    """
    
    data = []
    
    files = os.listdir(filefolder)
    
    for article in files: 
        
        path = os.path.join(filefolder, article)
                    
        if  path[-3:] == 'txt':
            with open(path, 'rb') as f:
                data.append(f.read())
    
    return data

data = gather_data('./data')

In [2]:
data[0]

b'Mobiles rack up 20 years of use\n\nMobile phones in the UK are celebrating their 20th anniversary this weekend.\n\nBritain\'s first mobile phone call was made across the Vodafone network on 1 January 1985 by veteran comedian Ernie Wise. In the 20 years since that day, mobile phones have become an integral part of modern life and now almost 90% of Britons own a handset. Mobiles have become so popular that many people use their handset as their only phone and rarely use a landline.\n\nThe first ever call over a portable phone was made in 1973 in New York but it took 10 years for the first commercial mobile service to be launched. The UK was not far behind the rest of the world in setting up networks in 1985 that let people make calls while they walked. The first call was made from St Katherine\'s dock to Vodafone\'s head office in Newbury which at the time was over a curry house. For the first nine days of 1985 Vodafone was the only firm with a mobile network in the UK. Then on 10 Janu

In [3]:
# Imports
import re
import string
from collections import Counter

import pandas as pd
import numpy as np
import spacy
from spacy.tokenizer import Tokenizer

from bs4 import BeautifulSoup

In [4]:
# Load the raw data - data files are split in two due to their size
df1 = pd.read_csv('https://raw.githubusercontent.com/JimKing100/techsearch/master/data/techsearch_p1.csv')
df1 = df1.drop(df1.columns[0], axis=1)
df2 = pd.read_csv('https://raw.githubusercontent.com/JimKing100/techsearch/master/data/techsearch_p2.csv')
df2 = df2.drop(df2.columns[0], axis=1)
df = pd.concat([df1, df2], ignore_index=True)

In [5]:
# Clean the text
def clean_text(text):
    text = text.replace('\n', ' ')                # remove newline
    text = BeautifulSoup(text, "lxml").get_text() # remove html
    text = text.replace('/', ' ')                 # remove forward slashes
    text = re.sub(r'[^a-zA-Z ^0-9]', '', text)    # letters and numbers only
    text = text.lower()                           # lower case
    text = re.sub(r'(x.[0-9])', '', text)         # remove special characters
    return text

df['description'] = df.apply(lambda x: clean_text(x['description']), axis=1)

In [6]:
# Initialize the tokenizer
nlp = spacy.load("en_core_web_lg")
tokenizer = Tokenizer(nlp.vocab)
STOP_WORDS = nlp.Defaults.stop_words.union(['year'])

In [7]:
# Tokenizer pipe removing stop words and blank words and lemmatizing
tokens = []

for doc in tokenizer.pipe(df['description'], batch_size=500):
    
    doc_tokens = []
    for token in doc:
        if (token.lemma_ not in STOP_WORDS) & (token.text != ' '):
            doc_tokens.append(token.lemma_)

    tokens.append(doc_tokens)

df['tokens'] = tokens

In [8]:
# Create a count function
def count(docs):

        word_counts = Counter()
        appears_in = Counter()
        
        total_docs = len(docs)

        for doc in docs:
            word_counts.update(doc)
            appears_in.update(set(doc))

        temp = zip(word_counts.keys(), word_counts.values())
        
        wc = pd.DataFrame(temp, columns = ['word', 'count'])

        wc['rank'] = wc['count'].rank(method='first', ascending=False)
        total = wc['count'].sum()

        wc['pct_total'] = wc['count'].apply(lambda x: x / total)
        
        wc = wc.sort_values(by='rank')
        wc['cul_pct_total'] = wc['pct_total'].cumsum()

        t2 = zip(appears_in.keys(), appears_in.values())
        ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
        wc = ac.merge(wc, on='word')

        wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)
        
        return wc.sort_values(by='rank')

In [9]:
df.head()

Unnamed: 0,job_title,company,location,description,counts,city,job,low_salary,high_salary,tokens
0,Data Scientist (All Levels) - Santa Clara,LeanTaaS,"Santa Clara, CA 95050",help build technology that saves lives were a...,1259,San Jose,data scientist,,,"[help, build, technology, save, live, fast, gr..."
1,Data Scientist (Intern) - United States,Cisco Careers,"San Jose, CA",what youll doacquire clean and structure data ...,1259,San Jose,data scientist,,,"[youll, doacquire, clean, structure, datum, mu..."
2,Data Scientist,Stanford University,"Stanford, CA",data scientist data analyst 2 job family infor...,1259,San Jose,data scientist,,,"[datum, scientist, datum, analyst, 2, job, fam..."
3,"Data Scientist in Santa Clara, CA (corp-corp c...",Advantine Technologies,"Santa Clara, CA",job description title data scientist locatio...,1259,San Jose,data scientist,,,"[job, description, title, datum, scientist, lo..."
4,Data Scientist,Palo Verde Consulting,"Campbell, CA 95008",job title data scientistlocation campbell ca 9...,1259,San Jose,data scientist,150000.0,210000.0,"[job, title, datum, scientistlocation, campbel..."


In [10]:
wc = count(df['tokens'])
wc.head(10)

Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
179,experience,7522,45476,1.0,0.018375,0.018375,0.961032
16,work,7195,33104,2.0,0.013376,0.031751,0.919254
181,design,5985,29764,3.0,0.012027,0.043778,0.764661
63,team,6737,26086,4.0,0.01054,0.054319,0.860738
38,datum,4377,25700,5.0,0.010384,0.064703,0.559218
15,development,5889,18240,6.0,0.00737,0.072073,0.752396
1,product,4720,17262,7.0,0.006975,0.079048,0.603041
246,business,4584,14252,8.0,0.005759,0.084807,0.585665
1038,application,4614,13533,9.0,0.005468,0.090275,0.589498
86,skill,5726,13453,10.0,0.005436,0.095711,0.73157


In [11]:
# Function to use spacy tokenizer
def tokenize(document):    
    doc = nlp(document)   
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and 
                                                     (token.is_punct != True) and
                                                     (token.text != ' ')]

In [12]:
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# create the transformer (instantiate)
vect = CountVectorizer(tokenizer=tokenize, stop_words=STOP_WORDS)

# tokenize and build vocab (fit)
vect.fit(df['description'])

# transform text (transform)
dtm = vect.transform(df['description'])

# Get Word Counts for each document
dtm_wc = pd.DataFrame(dtm.todense(), columns=vect.get_feature_names())
dtm_wc.head()

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,Unnamed: 1,-PRON-,0,000,00001476,00008572,00054667,0012,002,003,...,zurb,zurich,zvxphofgbz,zwave,zymer,zynga,zyngas,zyngawide,zypmedia,zyrl
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# create the transformer (instantiate)
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=STOP_WORDS)

# tokenize and build vocab (fit)
tfidf.fit(df['description'])

# transform text (transform)
dtm = tfidf.transform(df['description'])

# Get Word Counts for each document
dtm = pd.DataFrame(dtm.todense(), columns=vect.get_feature_names())
dtm.head()

Unnamed: 0,Unnamed: 1,-PRON-,0,000,00001476,00008572,00054667,0012,002,003,...,zurb,zurich,zvxphofgbz,zwave,zymer,zynga,zyngas,zyngawide,zypmedia,zyrl
0,0.02885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.027125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Instantiate
from sklearn.neighbors import NearestNeighbors

# Fit on TF-IDF Vectors
nn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=10, p=2, radius=1.0)

In [15]:
ideal_job = ["""The ideal job would include python, sql, and machine learning in the real estate field"""]

In [16]:
# Transform ideal_job and find similar jobs
new = tfidf.transform(ideal_job)
nn.kneighbors(new.todense())

(array([[1.18038236, 1.18038236, 1.20051734, 1.2319825 , 1.23339742,
         1.23549709, 1.23560133, 1.24149907, 1.24966922, 1.25091728]]),
 array([[4876, 4869, 6789, 2360, 7501, 3206, 3198,  391, 7516, 7534]]))

In [18]:
df['description'][4876]

'about the team were looking for a passionate student to join our team at zillow as an intern you will own a project from beginning to end and will gain exposure to design product management and engineers zillow has an incredible internship program during the summer where you will meet students from all over attend fun events and work in downtown seattle or san francisco  we are a multidisciplinary team bringing together ux design visual design content strategy and user research to deliver excellent user experiences for consumers real estate agents and internal operations team members our team cares deeply about our users and are continually striving to better our craft our team designs end to end experiences across multiple platforms including mobile apps mobile web and desktop  research is a critical input into the product development process at zillow group  one that helps our teams understand user needs define problems generate ideas and evaluate potential solutions you will work c