In [8]:
# def generate_indeed_data(search, num_pages = 1):
    
#     def scrape_data(tag = None, class_name = None, soup= None):
#         data = []
        
#         if class_name is None:
#             tag_list = soup.find_all(tag)
#             for num in range(len(tag_list)):
#                 data.append(tag_list[num].get_text())
#             return data
        
#         elif tag is None:
#             tag_list = soup.find_all(class_ = class_name)
#             for num in range(len(tag_list)):
#                 data.append(tag_list[num].get_text())
#             return data
        
#         else:
#             tag_list = soup.find_all(tag, class_=class_name)
#             for num in range(len(tag_list)):
#                 data.append(tag_list[num].get_text())
#             return data
    
#     urls = ["https://www.indeed.com/jobs?q="+str(search)+ "&start=" + str(num * 10) for num in range(num_pages)]
#     df = pd.DataFrame(columns= ['title', 'location', 'summary'])
    
#     for url in urls:
#         request = requests.get(url)
#         soup = BeautifulSoup(request.text, 'html.parser')
#         #grabbing titles, locations, and summaries
#         titles = scrape_data(class_name = 'title', soup= soup)
#         locations = scrape_data(class_name= 'location', soup= soup)
#         summaries = scrape_data(class_name = 'summary', soup= soup)
#         #little bit of data cleaning
#         titles = [title.strip('\n').strip() for title in titles]
#         summaries = [summary.strip('\n').strip() for summary in summaries]
#         #concat new data to end of existing dataframe
#         columns = ['title', 'location', 'summary']
#         data = [[titles[num], locations[num], summaries[num]] for num in range(len(titles))]
#         df = pd.concat([df, pd.DataFrame(data= data, columns=columns)], ignore_index= True)
    
#     return df
# # This function will allow you to quickly create a data frame with indeed jobs. 
# # You just put a search in like "data scientist" and the number of pages you want to scrape.

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)

In [60]:
from urllib.parse import urlencode
from requests_html import HTMLSession
from multiprocessing.dummy import Pool
from itertools import chain

class IndeedJobListings:
    '''
    Multi-theaded Indeed Job Listings Crawler
    Usage:
    descriptions = IndeedJobListings('Data Scientist', 'Seattle, WA').get_descriptions()
    '''
    def __init__(self, search_keyword, location, threads=12):
        self.threads = threads
        self.base_url = 'https://www.indeed.com'
        self.query_url = f'{self.base_url}/jobs?' +\
        urlencode({'q': search_keyword, 'l': location})
        self.session = HTMLSession()
    
    def _get_posting_urls(self, url):
        doc = self.session.get(url)
        posting_urls = [f'{self.base_url}{e.attrs["href"]}' for e in doc.html.find('.jobtitle.turnstileLink')]
        return posting_urls

    def _get_description_text(self, url):
        doc = self.session.get(url)
        description_text = doc.html.find('#jobDescriptionText')[0].text
        return description_text
                        
    def get_descriptions(self, pages=1):
        list_urls = [self.query_url] + [f'{self.query_url}&start={x*10}'
                                        for x in range(1, pages)]
        p = Pool(self.threads)
        post_urls = chain(*p.map(self._get_posting_urls, list_urls))
        descriptions = p.map(self._get_description_text, post_urls)
        return descriptions


listings = IndeedJobListings('Data Scientist', 'New York, NY')
data = listings.get_descriptions(pages=50)

In [61]:
len(data)

826

In [43]:
#5. TFIDF

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english', max_features=50000)
tfidf.fit(data)
# Create a vocabulary and get word counts per document
sparse = tfidf.fit_transform(data)
#dtm = tfidf.transform(new_data)

# Print word counts

# Get feature names to use as dataframe column headers

# View Feature Matrix as DataFrame
docs = pd.DataFrame(sparse.todense(), columns = tfidf.get_feature_names())
docs.head()

Unnamed: 0,00,000,03,04,07,09,10,100,1000,100m,...,yes,york,yorkers,youtube,zendesk,zoomrx,zr,ﬁeld,ﬁnd,ﬁndings
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.044386,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.10576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.042659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
#6. NearestNeighbor Model
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')

# Fit on TF-IDF Vectors
nn.fit(docs)
#nn.fit(dtm.todense())
nn.kneighbors([docs.iloc[0]])

ideal_job_posting_sample = [ """Ability to translate business needs into data science products.Backed by Sequoia Capital, Madrone Partners and Jackson Square Ventures, Strava is expanding in order to exceed the needs of our growing community of global athletes. By joining our team, you will help push Strava forward in fresh, innovative ways. You will engage in interesting and challenging work that will improve the lives of our athletes every day. And in the same way that Strava is deeply committed to unlocking the potential of our athletes, we are dedicated to providing a world-class workplace where our employees can grow and thrive. Join us! Strava is an equal opportunity employer.  In keeping with the values of Strava, we make all employment decisions including hiring, evaluation, termination, promotional and training opportunities, without regard to race, religion, color, sex, age, national origin, ancestry, sexual orientation, physical handicap, mental disability, medical condition, disability, pregnancy or pregnancy-related condition, marital status, height and or weight.Apply machine learning and other relevant techniques to business problems.ou will work on a variety of projects across product teams, including but not limited to recommendation engines, geographic information systems (GIS), and anomaly detection."""]


In [56]:
new = tfidf.transform(ideal_job_posting_sample)

nn.kneighbors(new.todense())

(array([[1.29889119, 1.29889119, 1.30158619, 1.30776593, 1.31633876]]),
 array([[ 41,  68, 100, 121,  39]], dtype=int64))

In [54]:
data[41]

'Who You Are:\nGetty Images is looking for individuals that enjoy leveraging new and traditional Machine Learning methods to help turn large-scale business data into actionable insights.\nThe mission of the Data Science team at Getty Images Inc. is to leverage internal and third-party data to inform other groups on how to interact with its customer base. We achieve this goal by 1) building automated solutions that apply best-in-class Machine Learning and Engineering practices and 2) continuous interactions with stakeholders to identify critical needs that deliver results relevant to the business.\nAs a Data Scientist - ML/AI at Getty Images, you will have end-to-end autonomy and ownership of your projects, and will work closely with other business units to build scalable and robust Machine Learning pipelines that will aim to improve workflows and improve business outcomes.\n\nYour Next Challenge:\nYou will join a growing team of highly-collaborative and curious Data Scientists and Data

In [50]:
data[100]

"The New York Times is seeking inventive and motivated data engineers at all levels of experience to join the Data Engineering group. In this role, you will build critical data infrastructure that surfaces data and insights across the company.\nAbout Us\nOur Data Engineering teams are at the intersection of business analytics, data warehousing, and software engineering. As Maxime Beauchemin wrote in “The Rise of Data Engineering”, ETL and data modeling have evolved, and the changes are about distributed systems, stream processing, and computation at scale. They’re about working with data using the same practices that guide software engineering at large. A strong data foundation is essential for The New York Times and we’re responsible for it. We use our data infrastructure to power analytics and data products and to deliver relevant experiences to our customers in real-time. We enable our company to validate strategic decisions, make smarter choices, and react to the fast changing worl

In [51]:
data[121]

"The New York Times is a technology company committed to producing the world’s most reliable and highest quality journalism. Our ability to do so relies on a talented team of expert technologists who help NYT learn from a tremendous abundance of data unique to this company. The Times seeks a Data Scientist to join the Data Science Group applying machine learning methods to meet this challenge, in close collaboration with working partners across the company.\nResponsibilities:\nReframe newsroom and business objectives as machine learning tasks that can deliver actionable insights, accurate predictions, and effective optimization.\nImplement and execute machine learning research with reliability and reproducibility.\nCommunicate results and impact to newsroom and business stakeholders.\nTurn models into data products, collaborate with engineering teams, and integrate into process throughout The Times.\nQualifications:\nTechnical:\nPhD, MS, or 3+ years experience in computer science, appl

In [39]:
data[24]

"The New York Times is a technology company committed to producing the world’s most reliable and highest quality journalism. Our ability to do so relies on a talented team of expert technologists who help NYT learn from a tremendous abundance of data unique to this company. The Times seeks a Data Scientist to join the Data Science Group applying machine learning methods to meet this challenge, in close collaboration with working partners across the company.\nResponsibilities:\nReframe newsroom and business objectives as machine learning tasks that can deliver actionable insights, accurate predictions, and effective optimization.\nImplement and execute machine learning research with reliability and reproducibility.\nCommunicate results and impact to newsroom and business stakeholders.\nTurn models into data products, collaborate with engineering teams, and integrate into process throughout The Times.\nQualifications:\nTechnical:\nPhD, MS, or 3+ years experience in computer science, appl

In [52]:
data[39]

'As a member of RISIRISA Team, you will be joining an interdisciplinary group of scientists, mathematicians, designers, and engineers that work with commercial, public and social sector clients to help them solve their most challenging problems and get the most out of their data.\nAs a Data Scientist, you will apply your mathematical, scientific, and/or economic training to analyze large volumes of data, model complex human-scale problems, and develop algorithms to serve various needs.\nYou will work in collaboration with data engineers and design technologists to come up with creative solutions to challenging client problems, most often with a clear line of sight from your work to real-world impact. You will work across sectors (e.g. healthcare, cybersecurity, global development, music, etc.) and have the opportunity to regularly experiment with new tools and techniques.\n\n\nRequirements\nAdvanced Degree(s) in mathematics and/or computer science, or at least four years of relevant ex

In [70]:
df = pd.DataFrame(data)
df.shape

(826, 1)

In [78]:
business = df.loc[((df[0]).str.contains('business')) & ((df[0]).str.contains('growing')) & ~((df[0]).str.contains('Senior'))]
business.shape

(95, 1)

In [84]:
business.sample(3)

Unnamed: 0,0
373,Overview\nDo you love numbers and finding the ...
344,Who we are\nWe are a new and rapidly growing t...
443,About our team\nWe are the Ipsos Behavioral Da...


In [83]:
df.rename(columns={0:'Description'}, inplace = True)
df.head(1)

Unnamed: 0,Description
0,"Risk Data Scientist, NYC\nPosition Summary:\nM..."


In [85]:
import numpy as np
df['Preference'] = np.where(df.index.isin(business.index), 'want', 'nah')

In [86]:
df.sample(3)

Unnamed: 0,Description,Preference
730,"Job description: - Using Tensorflow Lite, impl...",nah
285,Associate Consultant (Business Analysis Team)\...,want
653,"Job description: - Using Tensorflow Lite, impl...",nah


In [98]:
df.iloc[653]['Description']

'Job description: - Using Tensorflow Lite, implement data collection methods, processing, and storage (principally facial features, pupil size, ambient sound, and ambient light).\n- Clean data for pattern/signal detection.\n- Train data against known stimuli.\n- Create models to predict user reactions (for example, facial features and pupil size).\n- Brainstorm and create innovative solutions. (Understand that the variables/features thought to be indicative of outputs may very well be the extensions of bias, not the best indicators of the measured output/change.)\nExperience: Minimum 4+ years(Must have references.)\nExpert in:\nPythonAzure\nFamiliar with:\nOpenCVTensorFlow Lite\nKeras\nQualities:\nLearns quicklyAttention to detailCreative problem solverDelvers on time\nInterview Process\nPhone interview\nIn-person interview\nJob Type: Full-time\nSalary: $80,000.00 to $150,000.00 /year\nExperience:\nmachine learning: 2 years (Required)\nLocation:\nNew York, NY (Required)\nWork authoriza

In [91]:
# Import Statements
import spacy
nlp = spacy.load("en_core_web_md")
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier#stochastic gradient descent
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Tokenizer Pipe
tokens = []

""" Make them tokens """
for doc in tokenizer.pipe(df['Description'], batch_size=500):#similar to scikit learn pipeline
    doc_tokens = [token.text for token in doc]
    tokens.append(doc_tokens)
#     doc_tokens = []
#     for token in doc:
#         if (token.is_stop == False) & (token.is_punct == False):
#             doc_tokens.append(token.text.lower())
            
#    tokens.append(doc_tokens)
    
df['tokens'] = tokens

# Create Pipeline

vect = TfidfVectorizer(stop_words='english')#instantiate vectorizer
sgdc = SGDClassifier()#instantiate classifier

pipe = Pipeline([('vect', vect), ('clf', sgdc)])#put objects in pipe#the last item in pipeline, should have fit command
pipe.fit(df['Description'], df['Preference'])



Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [92]:
pipe.predict(['As a machine learning data scientist, you will have the opportunity to leverage our robust data and machine learning infrastructure to develop models that impact millions of users across our three audiences and tackle our most challenging business problems. You will work with data scientists, engineers, and product managers to develop and iterate on models to help us grow our business.'])

array(['nah'], dtype='<U4')

In [100]:
# Import

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=500, #take features and reduce to how many features
                   algorithm='randomized',#how svd is working, randomized is best bet, 
                   n_iter=10)#number of passes at estimating components
# LSI

lsi = Pipeline([('vect', vect), ('svd', svd)])
# Pipe

pipe = Pipeline([('lsi', lsi), ('clf', sgdc)])

params = {
    'lsi__vect__max_df': (10, 10, 10)#least indexing
}
grid_search = GridSearchCV(pipe,parameters, cv=15, n_jobs=5, verbose=1)
# Fit
pipe.fit(df['Description'], df['Preference'])
# test['category'] = pipe.predict(test['description'])
# submission = test[['id', 'category']]
# submission.head(2)
pipe.predict(['As a machine learning data scientist, you will have the opportunity to leverage our robust data and machine learning infrastructure to develop models that impact millions of users across our three audiences and tackle our most challenging business problems. You will work with data scientists, engineers, and product managers to develop and iterate on models to help us grow our business.'])

array(['nah'], dtype='<U4')

In [102]:
import spacy
nlp = spacy.load("en_core_web_md")

def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

X = get_word_vectors(df['Description'])

sgdc.fit(X, df['Preference'])




SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [105]:
sgdc.predict(get_word_vectors(['As a machine learning data scientist, you will have the opportunity to leverage our robust data and machine learning infrastructure to develop models that impact millions of users across our three audiences and tackle our most challenging business problems. You will work with data scientists, engineers, and product managers to develop and iterate on models to help us grow our business.']))

array(['want'], dtype='<U4')