In [2]:
import numpy as np
import pandas as pd
import nltk

In [3]:
# Load the cleaned postings CSV (the path is relative to the notebook's working directory).
cleaned_csv_path = '../data/processed/nlp/cleaned_data.csv'
df = pd.read_csv(cleaned_csv_path)

In [4]:
# Preview the first five rows to check that the expected columns were loaded.
df.head(n=5)

Unnamed: 0,company_name,title,description,location,tokens
0,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,"Princeton, NJ","['job', 'descriptiona', 'lead', 'real', 'estat..."
1,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...","Fort Collins, CO","['aspen', 'therapy', 'wellness', 'commit', 'se..."
2,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,"Cincinnati, OH","['national', 'exemplar', 'accept', 'applicatio..."
3,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,"New Hyde Park, NY","['senior', 'associate', 'attorney', 'elder', '..."
4,,Service Technician,Looking for HVAC service tech with experience ...,"Burlington, IA","['looking', 'hvac', 'service', 'tech', 'experi..."


In [5]:
# Count the missing values in each column.
df.isna().sum()

company_name    1034
title              0
description        0
location           0
tokens             0
dtype: int64

TF-IDF

In [6]:
# Drop postings that have no tokens.
# At this point `tokens` still holds the raw CSV text (it is only parsed into
# lists in the next cell), so measuring it with len() counts characters and an
# empty list stored as "[]" has length 2 -- such rows were never removed.
# Comparing the textual form against "[]" works both on the raw strings and on
# already-parsed lists, so the cell stays safe to re-run.
raw_tokens = df["tokens"]
has_tokens = raw_tokens.notna() & raw_tokens.astype(str).str.strip().ne("[]")
df = df[has_tokens]


In [7]:
import ast

def parse_tokens(x):
    """Convert one raw ``tokens`` cell into a list of tokens.

    Parameters
    ----------
    x : str | list | tuple | object
        Usually the string form of a Python list (e.g. ``"['job', 'lead']"``)
        as read back from CSV. A value that is already a list/tuple is passed
        through, so re-running the parsing cell does not wipe the column.

    Returns
    -------
    list
        The parsed tokens, or ``[]`` when the value cannot be parsed or does
        not evaluate to a list/tuple.
    """
    # Already parsed (e.g. the cell was executed twice): keep the tokens.
    if isinstance(x, (list, tuple)):
        return list(x)

    try:
        parsed = ast.literal_eval(x)
    # Only the errors literal_eval documents for malformed input; a bare
    # `except:` would also swallow KeyboardInterrupt and hide real bugs.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

    # A literal that is not a sequence of tokens (number, dict, plain string)
    # would break the later " ".join(tokens), so treat it as "no tokens".
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []


# Replace the stringified token lists with real Python lists.
parsed_tokens = df["tokens"].apply(parse_tokens)
df["tokens"] = parsed_tokens


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Notebook-wide NumPy display settings: show up to 30 edge items per axis,
# avoid line wrapping, and print floats with three significant digits.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": lambda value: "%.3g" % value},
)

# Re-join every token list into one space-separated string, which is the
# input format the vectorizer consumes.
df["text"] = df["tokens"].apply(" ".join)

# Unigrams and bigrams, capped at 10,000 features; terms found in fewer than
# 2 documents or in more than 98% of documents are dropped.
tfidf_params = dict(
    max_features=10000,
    min_df=2,
    max_df=0.98,
    ngram_range=(1, 2),
    stop_words=None,
)
vectorizer = TfidfVectorizer(**tfidf_params)

X_tfidf = vectorizer.fit_transform(df["text"])

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of postings. The result is a dense
# square array, so its memory footprint grows with the square of the row count.
similarity_matrix = cosine_similarity(X_tfidf, dense_output=True)


In [10]:
# Show the pairwise similarity matrix; NumPy abbreviates it according to the
# print options configured with np.set_printoptions earlier in the notebook.
similarity_matrix

array([[1, 0.0414, 0.0256, 0.0908, 0.00284, 0.0886, 0.059, 0.0522, 0.0259, 0.0602, 0.0642, 0.0775, 0.0782, 0.0555, 0.2, 0.0632, 0.0915, 0.0466, 0.0616, 0.144, 0.04, 0.0568, 0.013, 0.0258, 0.0493, 0.0652, 0.0514, 0.0588, 0.0969, 0.0146, ..., 0.039, 0.039, 0.0289, 0.041, 0.0532, 0.0524, 0.0378, 0.0498, 0.0498, 0.0423, 0.105, 0.0647, 0.0217, 0.0751, 0.108, 0.0511, 0.0599, 0.0288, 0.0381, 0.0131, 0.066, 0.0226, 0.0535, 0.0978, 0.0215, 0.041, 0.0447, 0.0689, 0.0666, 0.0359],
       [0.0414, 1, 0.0702, 0.0464, 0.024, 0.0801, 0.0237, 0.0825, 0.143, 0.0658, 0.0512, 0.0571, 0.0817, 0.0289, 0.0522, 0.162, 0.0661, 0.0531, 0.0425, 0.0209, 0.0485, 0.165, 0.0162, 0.00877, 0.121, 0.0608, 0.0639, 0.0396, 0.0713, 0.0506, ..., 0.0723, 0.0723, 0.0792, 0.0678, 0.0882, 0.069, 0.0743, 0.0942, 0.0942, 0.0505, 0.0789, 0.0556, 0.0359, 0.118, 0.0554, 0.0198, 0.0579, 0.0644, 0.0323, 0.0315, 0.0891, 0.0324, 0.0547, 0.0854, 0.0215, 0.0482, 0.0445, 0.0487, 0.0513, 0.0183],
       [0.0256, 0.0702, 1, 0.00873, 0.0099

In [14]:
# Inspect the first posting's TF-IDF row: print up to ten of its non-zero
# terms with their weights. Entries appear in the sparse row's stored order,
# not ranked by weight.
row = X_tfidf[0]
feature_names = vectorizer.get_feature_names_out()

for term_idx, weight in zip(row.indices[:10], row.data[:10]):
    print(feature_names[term_idx], weight)


job 0.021685729460679242
lead 0.029276973879254323
real 0.10946617307188378
estate 0.1335358393722898
firm 0.049890214292548536
new 0.026023228799450532
jersey 0.07946226389360266
seek 0.03322078371937916
administrative 0.051497700351800825
marketing 0.3541977535793666


In [20]:
def get_job_profile(index, top_n=10):
    """Return the highest-weighted TF-IDF terms of one posting.

    Parameters
    ----------
    index : int
        Row position of the posting inside ``X_tfidf``.
    top_n : int, default 10
        Number of leading terms to keep after sorting by weight.

    Returns
    -------
    list[tuple]
        ``(term, weight)`` pairs ordered from the largest weight down.

    Raises
    ------
    ValueError
        If ``index`` lies outside the rows of ``X_tfidf``.
    """
    n_jobs = X_tfidf.shape[0]
    if index >= n_jobs or index < 0:
        raise ValueError("Invalid job index")

    vocab = vectorizer.get_feature_names_out()
    job_row = X_tfidf[index]

    # Pair each stored column index with its weight, then rank by weight
    # (descending); sorted() is stable, so tied weights keep their stored order.
    ranked = sorted(
        zip(job_row.indices, job_row.data),
        key=lambda entry: entry[1],
        reverse=True,
    )
    return [(vocab[col], weight) for col, weight in ranked[:top_n]]

In [22]:
# Ten highest-weighted terms for the posting at row position 7.
get_job_profile(7, top_n=10)

[('club', np.float64(0.3281341427961785)),
 ('building', np.float64(0.23661700275131994)),
 ('emergency response', np.float64(0.2123660638634041)),
 ('maintenance', np.float64(0.1744835797302859)),
 ('emergency', np.float64(0.16010014753959292)),
 ('premium', np.float64(0.1516014292287037)),
 ('desire', np.float64(0.14608740829343286)),
 ('response', np.float64(0.13023925713190176)),
 ('system', np.float64(0.11879090897568315)),
 ('paid', np.float64(0.11611601888368796))]

In [23]:
from sklearn.cluster import MiniBatchKMeans

# Partition the postings into 20 clusters on their TF-IDF vectors; the fixed
# random_state keeps the assignment repeatable between runs.
kmeans = MiniBatchKMeans(random_state=42, n_clusters=20)
cluster_labels = kmeans.fit_predict(X_tfidf)
df["cluster"] = cluster_labels

# First five titles of every cluster (rows are shown in their original order,
# not grouped by cluster).
df.groupby("cluster")["title"].head(5)


0                                    Marketing Coordinator
1                        Mental Health Therapist/Counselor
2                              Assitant Restaurant Manager
3        Senior Elder Law / Trusts and Estates Associat...
4                                       Service Technician
                               ...                        
7058                            Oracle EPM Project Manager
7059                               Data Programmer Analyst
7060     System Administrator (Direct Hire, On-Site, W2...
7061                Loan Officer - Signing Bonus Offered!!
36757       2nd Shift Machining Production Supervisor (AZ)
Name: title, Length: 96, dtype: str

In [24]:
def search_jobs(query, top_n=10):
    """Return the postings whose TF-IDF vectors are most similar to a query.

    The query is vectorized with the already-fitted ``vectorizer`` and compared
    against every row of ``X_tfidf`` by cosine similarity.

    Parameters
    ----------
    query : str
        Free-text search string.
    top_n : int, default 10
        Maximum number of postings to return; ``0`` yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        The ``title``, ``company_name`` and ``location`` columns of the best
        matches, ordered from most to least similar.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # NOTE(review): the corpus text was built from pre-cleaned tokens, while the
    # raw query is not put through that cleaning here -- presumably callers pass
    # already-normalized terms; confirm.
    q_vec = vectorizer.transform([query])
    scores = cosine_similarity(q_vec, X_tfidf)[0]

    # Reverse first, then truncate. Slicing with `[-top_n:]` before reversing
    # degenerates to the whole array when top_n == 0 and returned every row.
    top = scores.argsort()[::-1][:top_n]

    return df.iloc[top][["title", "company_name", "location"]]


In [32]:
# Example queries. A notebook cell renders only the value of its last
# expression, so the first two result tables are computed but not displayed;
# the table below this cell belongs to the final query.
search_jobs("marketing real estate coordinator")
search_jobs("software engineer python")
search_jobs("data scientist machine learning")


Unnamed: 0,title,company_name,location
32892,Machine Learning Engineer,Searchability®,"New York, NY"
55986,AI /Machine Learning Engineer / Architect,Lorven Technologies Inc.,"Nashville, TN"
56748,"Director, Deep Learning & AI",Birdseye Partners,"Massachusetts, United States"
38454,Data Scientist with GenAI,TOPSYS IT,"Plano, TX"
59271,Machine Learning Engineer,eduPhoria.ai,"California, United States"
52024,Machine Learning Engineer- 10+year exp*,"Q1 Technologies, Inc.","Sunnyvale, CA"
28346,Senior Machine Learning Engineer,Stefanini North America and APAC,"Dearborn, MI"
50933,Data Scientist with BERT and Genomics Experience,Stellite Works LLC,United States
37212,Senior Developer Relationship Manager – Physic...,NVIDIA,"Santa Clara, CA"
42693,Principal Data Scientist,"Denken Solutions, Inc.","Round Rock, TX"


In [27]:
# Pool the ten highest-weighted terms of each of the first 1000 postings into
# one de-duplicated vocabulary.
skill_vocab = {
    term
    for job_idx in range(1000)
    for term, _weight in get_job_profile(job_idx, 10)
}

len(skill_vocab)


3666

In [None]:
# Turn the vocabulary set into an alphabetically ordered list.
# NOTE(review): the output saved under this cell is a term -> index dict, which
# this code does not produce -- presumably the cell was edited after its last
# run; confirm and re-execute.
skill_vocab = list(skill_vocab)
skill_vocab.sort()

{'ab': 0,
 'abb': 1,
 'ability': 2,
 'ability deliver': 3,
 'ability effectively': 4,
 'ability interact': 5,
 'ability thrive': 6,
 'ability understand': 7,
 'ability willingness': 8,
 'able': 9,
 'able demonstrate': 10,
 'able effectively': 11,
 'able work': 12,
 'abuse': 13,
 'academic': 14,
 'academy': 15,
 'accept application': 16,
 'acceptance': 17,
 'access': 18,
 'access compensation': 19,
 'accessible': 20,
 'accessory': 21,
 'accident': 22,
 'accommodation': 23,
 'accord': 24,
 'accord company': 25,
 'account': 26,
 'account executive': 27,
 'account management': 28,
 'account manager': 29,
 'account payable': 30,
 'account receivable': 31,
 'accountability': 32,
 'accountant': 33,
 'accounting': 34,
 'accounting experience': 35,
 'accounting finance': 36,
 'accounting principle': 37,
 'accounting software': 38,
 'accounting team': 39,
 'accounts': 40,
 'accounts payable': 41,
 'accounts receivable': 42,
 'accreditation': 43,
 'accuracy': 44,
 'accurate': 45,
 'accurately': 4