In [55]:
import pandas as pd
import numpy as np
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_distances

In [50]:
# Load the postings CSV and keep only the first row for each distinct
# 'Description' value.
df = pd.read_csv('../Datasets/df_all_linkedin.csv').drop_duplicates(
    subset='Description', keep='first'
)
descriptions = df['Description'].values

# Tokenize every description with gensim's simple_preprocess.
tokenized_descriptions = [
    gensim.utils.simple_preprocess(text) for text in descriptions
]

# Wrap each token list in a TaggedDocument for Doc2Vec. The tag is the
# document's position in `descriptions` (0..n-1), not its DataFrame index label.
tokenized_descriptions_tagged = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(tokenized_descriptions)
]



In [51]:
# Train a 100-dimensional Doc2Vec model on the tagged descriptions.
model = Doc2Vec(vector_size=100, min_count=2, epochs=50)
model.build_vocab(tokenized_descriptions_tagged)
model.train(
    tokenized_descriptions_tagged,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Infer one vector per description, in the same order as tokenized_descriptions.
vectorized_docs = [model.infer_vector(tokens) for tokens in tokenized_descriptions]



In [52]:
# Cluster the inferred document vectors into 7 groups.
# - `n_jobs` is no longer passed: it was deprecated in scikit-learn 0.23 and
#   removed in 1.0, so supplying it raises TypeError on current releases.
# - `n_init=10` pins the number of restarts instead of relying on the
#   version-dependent default.
# - `random_state` makes the cluster assignment reproducible across
#   Restart & Run All.
kmeans = KMeans(n_clusters=7, n_init=10, random_state=42)
kmeans.fit(vectorized_docs)
cluster_centers = kmeans.cluster_centers_

In [53]:
# For every K-Means centroid, print the 11 postings whose document vectors are
# nearest to it in Euclidean distance.

# Column lookups do not change between clusters, so fetch them once.
# `.values` gives positional access, which lines up with the order in which
# vectorized_docs was built from df['Description'].values.
job_titles = df['Job_Title'].values
companies = df['Company'].values

for idx, cluster in enumerate(cluster_centers):
    print(f"\n Cluster #{idx}: ")
    # Result has shape (1, n_docs): one row of distances for this centroid.
    distances = euclidean_distances(cluster.reshape(1, -1), vectorized_docs)
    # Rank on the flattened row. argsort is ascending, so nearest documents come
    # first. The former `[::-1]` was applied to the 2-D (1, n) argsort result and
    # only reversed the length-1 row axis, so it never altered the ranking; it is
    # dropped here to stop the code from reading as "farthest first".
    order = np.argsort(distances.ravel())
    for o in order[:11]:
        print(f"Job Title: {job_titles[o]}, Company Name: {companies[o]}")


 Cluster #0: 
Job Title: Data Scientist II, Company Name: SMX
Job Title: Data Scientist -Tampa, Company Name: Advent Global Solutions
Job Title: Data Science Director 87614, Company Name: Ezra Penland Actuarial Recruitment
Job Title: Data Scientist, Company Name: Advent Global Solutions
Job Title: Insurance Data Scientist 84814, Company Name: Ezra Penland Actuarial Recruitment
Job Title: Data Analyst, Company Name: Daasity
Job Title: Data Scientist, Company Name: JNIT Technologies
Job Title: Insurance Data Scientist 86679, Company Name: Ezra Penland Actuarial Recruitment
Job Title: Data Scientist II -Broomfield, Company Name: Sparity
Job Title: Data Scientist II, Company Name: Sparity
Job Title: Machine Learning Engineer, Company Name: Next Level Business Services, Inc.

 Cluster #1: 
Job Title: Data Scientist -Tampa, Company Name: Advent Global Solutions
Job Title: Data Scientist, Company Name: Advent Global Solutions
Job Title: Data Scientist II, Company Name: SMX
Job Title: Machine

In [54]:
# Display the column labels of df.
df.columns

Index(['Unnamed: 0', 'Job_Title', 'Company', 'Location',
       'Number_of_Applicants', 'Description', 'Length_of_Description',
       'Senior', 'Junior', 'Senior_Junior_or_not', 'num_applicants', 'State'],
      dtype='object')

In [56]:
# For every K-Means centroid, print the 11 postings whose document vectors are
# nearest to it in cosine distance.

# Column lookups do not change between clusters, so fetch them once.
# `.values` gives positional access, which lines up with the order in which
# vectorized_docs was built from df['Description'].values.
job_titles = df['Job_Title'].values
companies = df['Company'].values

for idx, cluster in enumerate(cluster_centers):
    print(f"\n Cluster #{idx}: ")
    # Result has shape (1, n_docs): one row of distances for this centroid.
    distances = cosine_distances(cluster.reshape(1, -1), vectorized_docs)
    # Rank on the flattened row. argsort is ascending, so nearest documents come
    # first. The former `[::-1]` was applied to the 2-D (1, n) argsort result and
    # only reversed the length-1 row axis, so it never altered the ranking; it is
    # dropped here to stop the code from reading as "farthest first".
    order = np.argsort(distances.ravel())
    for o in order[:11]:
        print(f"Job Title: {job_titles[o]}, Company Name: {companies[o]}")


 Cluster #0: 
Job Title: Data Coordinator (Melanoma) (Open Rank), Company Name: University of Colorado
Job Title: Clinical Research Coordinator (Phase 1), Company Name: University of Colorado
Job Title: Data Analyst-Statistical-Castell, Company Name: Intermountain Healthcare
Job Title: Data Analyst, Company Name: University of Colorado
Job Title: Health Data Analyst/BI Developer, Company Name: University of Colorado
Job Title: Scientist-Sr., Healthcare Delivery Institute - Intermountain Medical Center - Full time, Company Name: Intermountain Healthcare
Job Title: Data Analytics & Business Intelligence Developer, Company Name: University of Colorado
Job Title: Data Scientist - Behavioral Health, Company Name: Denver Health
Job Title: Data Analyst- Staff/Sr at Primary Children's Hospital, Company Name: Intermountain Healthcare
Job Title: Data Management Analyst II, Company Name: University of Florida
Job Title: Data Analytics AVP - Clinical Analytics, Company Name: Intermountain Healthc