In [None]:
import numpy as np
import pandas as pd
import nltk

In [None]:
# Load the cleaned dataset from the processed NLP data directory.
df = pd.read_csv('../data/processed/nlp/cleaned_data.csv')

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Count missing values in each column.
df.isnull().sum(axis=0)

TF-IDF

In [None]:
def _has_tokens(value):
    """Return True when ``value`` represents a non-empty token list.

    Handles both forms the ``tokens`` column takes in this notebook: the
    string read from the CSV (e.g. ``"['a', 'b']"``) and an already-parsed
    list. Missing values (NaN/None) count as empty.
    """
    if isinstance(value, str):
        # ``len`` of the serialized text counts characters, so "[]" would look
        # non-empty; strip the brackets and check whether anything is left.
        return bool(value.strip().strip("[]").strip())
    try:
        return len(value) > 0
    except TypeError:
        # NaN / None have no length.
        return False


# Drop postings without any tokens.  ``astype(bool)`` keeps the mask boolean
# even for an empty frame, and ``.copy()`` makes the filtered frame independent
# of the original so later column assignments do not raise
# SettingWithCopyWarning.
df = df[df["tokens"].apply(_has_tokens).astype(bool)].copy()


In [None]:
import ast

def parse_tokens(x):
    """Convert a serialized token list back into a Python list.

    Parameters
    ----------
    x : str or list
        Usually the string form of a list (e.g. ``"['data', 'analyst']"``).
        A value that is already a list is returned unchanged, so re-running
        the parsing cell does not wipe out previously parsed tokens.

    Returns
    -------
    list
        The parsed tokens, or ``[]`` when ``x`` is missing, malformed, or does
        not evaluate to a list/tuple.
    """
    if isinstance(x, list):
        return x
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best-effort parsing: unparseable cells (NaN, truncated text, ...)
        # become empty lists instead of aborting the whole column.
        return []
    # Anything other than a sequence of tokens would break the later join.
    return list(parsed) if isinstance(parsed, (list, tuple)) else []


# Replace every entry of the "tokens" column with the result of parse_tokens.
df["tokens"] = df["tokens"].apply(func=parse_tokens)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Rebuild one space-delimited document per posting from its token list.
df["text"] = df["tokens"].apply(" ".join)

# Unigram + bigram TF-IDF with vocabulary limits (max_features / min_df / max_df).
# NOTE(review): the joined text is tokenized again by the vectorizer's own
# defaults — confirm that is acceptable for text that was already tokenized.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.98,
    max_features=10000,
    stop_words=None,
)

# Learn the vocabulary and produce the document-term matrix in one pass.
X_tfidf = vectorizer.fit_transform(df["text"])

# Global NumPy print settings: more edge items, very wide lines, and
# three-significant-digit floats.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": lambda value: "%.3g" % value},
)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows in X_tfidf.
# NOTE(review): the result is a dense (n_rows x n_rows) array — confirm it
# fits in memory for the full dataset.
similarity_matrix = cosine_similarity(X_tfidf, dense_output=True)


In [None]:
# Inspect the pairwise similarity matrix.
similarity_matrix

In [None]:
# Peek at the first ten stored (term, weight) entries of row 0.  These are
# taken in storage order, not ranked by weight.
row = X_tfidf[0]
feature_names = vectorizer.get_feature_names_out()

for term_idx, weight in zip(row.indices[:10], row.data[:10]):
    print(feature_names[term_idx], weight)


In [None]:
def get_job_profile(index, top_n=10):
    """List the highest-weighted TF-IDF terms of a single row of ``X_tfidf``.

    Args:
        index: Row position in ``X_tfidf``.
        top_n: Maximum number of terms to return.

    Returns:
        A list of ``(term, weight)`` tuples ordered from the largest weight
        to the smallest.

    Raises:
        ValueError: If ``index`` is outside the rows of ``X_tfidf``.
    """
    n_rows = X_tfidf.shape[0]
    if index < 0 or n_rows <= index:
        raise ValueError("Invalid job index")

    job_vector = X_tfidf[index]
    vocabulary = vectorizer.get_feature_names_out()

    # Pair each stored column id with its weight, then rank heaviest first.
    ranked = sorted(
        zip(job_vector.indices, job_vector.data),
        key=lambda entry: entry[1],
        reverse=True,
    )
    return [(vocabulary[col], weight) for col, weight in ranked[:top_n]]

In [None]:
# Example: the ten highest-weighted terms for row 7.
get_job_profile(7, top_n=10)

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Partition the TF-IDF rows into 20 clusters with a fixed random seed.
kmeans = MiniBatchKMeans(random_state=42, n_clusters=20)

# Store each row's cluster id next to the posting.
df["cluster"] = kmeans.fit_predict(X_tfidf)

# Show up to five titles from every cluster.  Only "title" is selected, so
# the displayed output does not include the cluster label itself.
df.groupby("cluster")["title"].head(n=5)


In [None]:
def search_jobs(query, top_n=10):
    """Return the postings most similar to a free-text query.

    The query is vectorized with the fitted ``vectorizer`` and compared with
    every row of ``X_tfidf`` by cosine similarity.

    Parameters
    ----------
    query : str
        Search text.
    top_n : int, default 10
        Maximum number of results. Values <= 0 yield an empty result.

    Returns
    -------
    pandas.DataFrame
        The ``title``, ``company_name`` and ``location`` columns of the best
        matches, highest similarity first.
    """
    q_vec = vectorizer.transform([query])
    scores = cosine_similarity(q_vec, X_tfidf)[0]

    # Rank best-first, then truncate from the front.  The previous
    # ``[-top_n:]`` slice selected every row for ``top_n=0`` (because
    # ``-0 == 0``) and an arbitrary slice for negative values.
    limit = max(top_n, 0)
    top = scores.argsort()[::-1][:limit]

    return df.iloc[top][["title", "company_name", "location"]]


In [None]:
# Only a cell's final expression is rendered automatically, so each result is
# passed to ``display`` (provided by the Jupyter/IPython kernel) explicitly;
# otherwise the first two searches would be computed and discarded.
for query in (
    "marketing real estate coordinator",
    "software engineer python",
    "data scientist machine learning",
):
    display(search_jobs(query))


In [None]:
# Collect the top-10 TF-IDF terms of the first (up to) 1000 rows into a set.
skill_vocab = set()

# Cap the sample at the number of available rows: get_job_profile raises
# ValueError for an index past the end of X_tfidf.
n_sample = min(1000, X_tfidf.shape[0])
for i in range(n_sample):
    skill_vocab.update(term for term, _weight in get_job_profile(i, 10))

len(skill_vocab)


In [None]:
# Turn the collected terms into a sorted list.
skill_vocab = list(skill_vocab)
skill_vocab.sort()

In [None]:
# Display the sorted vocabulary.
skill_vocab