In [1]:
#Importing libraries
import ast

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from gensim.models import FastText
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [None]:
# Load the job ads data
job_ads = pd.read_csv('processed_job_ads.csv')

# Load the vocabulary from vocab.txt
# The file is read inside a `with` block so the handle is closed deterministically.
with open("vocab.txt", "r") as vocab_file:
    vocab = vocab_file.read().splitlines()

# Keep only the text before the first ":" on each line as the word and map it to
# the line's position in the file (presumably lines look like "word:index" -- verify
# against the file that produced vocab.txt).
word_to_index_cleaned = {word.split(":")[0]: idx for idx, word in enumerate(vocab)}

In [3]:
# Count Vector Representation
# Term counts for every description, restricted to the fixed vocabulary loaded above.
# No fitting is needed because the vocabulary is supplied explicitly.
description_texts = job_ads['Tokenized Description'].astype(str)
vectorizer = CountVectorizer(vocabulary=word_to_index_cleaned)
count_vectors = vectorizer.transform(description_texts)

In [4]:
# Function to save count vectors
def save_count_vectors_to_file(count_vectors, job_ads, filename="count_vectors.txt"):
    """Write one line per document in the form ``#<Webindex>,<idx>:<count>,...``.

    Args:
        count_vectors: scipy sparse matrix with one row per document; only the
            stored (non-zero) entries of each row are written.
        job_ads: DataFrame with a ``Webindex`` column, aligned row-for-row with
            ``count_vectors``.
        filename: path of the text file to (over)write.

    Raises:
        ValueError: if ``job_ads`` and ``count_vectors`` have different row counts.
    """
    csr = count_vectors.tocsr()

    # Read the id column once. Going through ``job_ads.iloc[idx]`` per row builds a
    # whole-row Series each time, and in an all-numeric frame with a float column it
    # upcasts integer ids to float (written as "123.0").
    web_indices = job_ads['Webindex'].tolist()
    if len(web_indices) != csr.shape[0]:
        raise ValueError(
            f"job_ads has {len(web_indices)} rows but count_vectors has {csr.shape[0]}"
        )

    indptr, indices, data = csr.indptr, csr.indices, csr.data
    with open(filename, "w") as f:
        for row_idx, web_index in enumerate(web_indices):
            # Row ``row_idx`` owns the slice [indptr[row_idx], indptr[row_idx + 1]) of the CSR buffers.
            start, end = indptr[row_idx], indptr[row_idx + 1]
            vector_str = ",".join(
                f"{feature_idx}:{int(freq)}"
                for feature_idx, freq in zip(indices[start:end], data[start:end])
            )
            f.write(f"#{web_index},{vector_str}\n")

# Persist the description count vectors under the function's default output name.
save_count_vectors_to_file(count_vectors, job_ads, filename="count_vectors.txt")

In [5]:
# Unweighted Embeddings using FastText
def get_unweighted_embedding(tokens, model):
    """Return the plain average of the word vectors of ``tokens``.

    Args:
        tokens: iterable of token strings for one document.
        model: trained embedding model exposing ``wv`` (keyed vectors) and
            ``vector_size``.

    Returns:
        1-D array of length ``model.vector_size``: the mean of the vectors of all
        tokens present in the model vocabulary, or a zero vector when none are.
    """
    # ``key_to_index`` is a dict, so membership is a hash lookup; ``index_to_key`` is
    # a list and would be scanned linearly for every token.
    known_tokens = model.wv.key_to_index
    embeddings = [model.wv[token] for token in tokens if token in known_tokens]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.vector_size)

# Each cell of 'Tokenized Description' is parsed back into a Python object (expected
# to be a list of tokens). ast.literal_eval only accepts Python literals, so unlike
# eval() it cannot execute arbitrary code embedded in the CSV.
tokenized_descriptions = job_ads['Tokenized Description'].apply(ast.literal_eval).tolist()

# Train FastText on the descriptions themselves.
# NOTE(review): with workers > 1 gensim training is not exactly reproducible from run
# to run, so downstream accuracies may vary slightly -- confirm whether that matters.
fasttext_model = FastText(sentences=tokenized_descriptions, vector_size=100, window=5, min_count=1, workers=4)

# One averaged embedding per job ad.
unweighted_embeddings = np.array([get_unweighted_embedding(tokens, fasttext_model) for tokens in tokenized_descriptions])

In [None]:
# TF-IDF Weighted Embeddings
# TF-IDF weights are learned over the same fixed vocabulary as the count vectors, so
# column indices line up with word_to_index_cleaned.
tfidf_input_texts = job_ads['Tokenized Description'].astype(str)
tfidf_vectorizer = TfidfVectorizer(vocabulary=word_to_index_cleaned)
tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_input_texts)

def get_tfidf_weighted_embedding(tokens, model, tfidf_vector, vocabulary=None):
    """Return the TF-IDF weighted sum of the word vectors of ``tokens``.

    Args:
        tokens: iterable of token strings for one document.
        model: trained embedding model exposing ``wv`` (keyed vectors) and
            ``vector_size``.
        tfidf_vector: 1-D indexable of TF-IDF weights for this document, indexed by
            vocabulary column.
        vocabulary: optional mapping ``token -> column index`` into ``tfidf_vector``.
            Defaults to the notebook-level ``tfidf_vectorizer.vocabulary_``.

    Returns:
        1-D array of length ``model.vector_size``. Every token occurrence that is in
        both the model vocabulary and ``vocabulary`` contributes
        ``vector * weight``; the contributions are summed (not averaged). A zero
        vector is returned when no token qualifies.
    """
    if vocabulary is None:
        # Fall back to the vectorizer fitted earlier in the notebook.
        vocabulary = tfidf_vectorizer.vocabulary_

    # Dict membership instead of scanning the ``index_to_key`` list for every token.
    known_tokens = model.wv.key_to_index
    embeddings = [model.wv[token] * tfidf_vector[vocabulary[token]]
                  for token in tokens if token in known_tokens and token in vocabulary]
    return np.sum(embeddings, axis=0) if embeddings else np.zeros(model.vector_size)

# Build one TF-IDF weighted embedding per job ad. Each row of the sparse TF-IDF
# matrix is densified to a 1-D weight array before being handed to the helper.
tfidf_weighted_rows = []
for tokens, tfidf_vector in zip(tokenized_descriptions, tfidf_matrix):
    dense_weights = tfidf_vector.toarray()[0]
    tfidf_weighted_rows.append(
        get_tfidf_weighted_embedding(tokens, fasttext_model, dense_weights)
    )
tfidf_weighted_embeddings = np.array(tfidf_weighted_rows)

In [None]:
# Classification and Evaluation
# The same logistic-regression configuration is scored with 5-fold cross-validation
# on each document representation. `logreg` is reused by the Q2 cell below.
logreg = LogisticRegression(max_iter=1000, random_state=42)

# For count vectors
count_vector_scores = cross_val_score(logreg, count_vectors, job_ads['Category'], cv=5)

# For unweighted vectors
unweighted_scores = cross_val_score(logreg, unweighted_embeddings, job_ads['Category'], cv=5)

# For tfidf weighted vectors
tfidf_weighted_scores = cross_val_score(logreg, tfidf_weighted_embeddings, job_ads['Category'], cv=5)

# Mean accuracy per representation, computed once and shared by the printout and the chart.
q1_mean_accuracy = {
    'Count Vectors': np.mean(count_vector_scores),
    'Unweighted Embeddings': np.mean(unweighted_scores),
    'TF-IDF Weighted Embeddings': np.mean(tfidf_weighted_scores),
}

# Print Q1 Results
print("\nQ1: Language Model Comparisons")
for method, accuracy in q1_mean_accuracy.items():
    print(f"{method}: {accuracy:.2%}")

# Create bar chart for Q1
# The scores are fractions in [0, 1]; scale by 100 so the plotted values actually
# match the "Accuracy (%)" axis title.
fig1 = go.Figure([go.Bar(y=list(q1_mean_accuracy),
                         x=[accuracy * 100 for accuracy in q1_mean_accuracy.values()],
                         orientation='h')])
fig1.update_layout(title='Q1: Classification Accuracies for Different Representations',
                   yaxis_title='Method',
                   xaxis_title='Accuracy (%)')
fig1.show()


Q1: Language Model Comparisons
Count Vectors: 88.14%
Unweighted Embeddings: 41.76%
TF-IDF Weighted Embeddings: 69.20%


In [8]:
# Generate Count Vectors for Titles
# Titles are counted over the same fixed vocabulary as the descriptions.
title_vectorizer = CountVectorizer(vocabulary=word_to_index_cleaned)
title_vectors = title_vectorizer.transform(job_ads['Title'].astype(str))

# Combine Title and Description Vectors
# Title counts and description counts are kept as separate feature columns, side by side.
combined_data = np.hstack([title_vectors.toarray(), count_vectors.toarray()])

# Evaluate models for Q2
title_scores = cross_val_score(logreg, title_vectors, job_ads['Category'], cv=5)
description_scores = cross_val_score(logreg, count_vectors, job_ads['Category'], cv=5)
combined_scores = cross_val_score(logreg, combined_data, job_ads['Category'], cv=5)

# Mean accuracy per input type, computed once and shared by the printout and the chart.
q2_mean_accuracy = {
    'Title Only': np.mean(title_scores),
    'Description Only': np.mean(description_scores),
    'Title & Description': np.mean(combined_scores),
}

# Print and Plot Q2 Results
print("\nQ2: Does more information provide higher accuracy?")
for information_type, accuracy in q2_mean_accuracy.items():
    print(f"{information_type}: {accuracy:.2%}")

# The scores are fractions in [0, 1]; scale by 100 so the plotted values actually
# match the "Accuracy (%)" axis title.
fig2 = go.Figure([go.Bar(y=list(q2_mean_accuracy),
                         x=[accuracy * 100 for accuracy in q2_mean_accuracy.values()],
                         orientation='h')])
fig2.update_layout(title='Q2: Impact of Additional Information',
                   yaxis_title='Information Type',
                   xaxis_title='Accuracy (%)')
fig2.show()


Q2: Does more information provide higher accuracy?
Title Only: 80.04%
Description Only: 88.14%
Title & Description: 88.27%
