In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans
import random
from datetime import datetime
import subprocess
import os
from resume_parser import parse_resume


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Function to calculate "Match" score using TF-IDF (can replace this with the gemini-api function)
def calculate_match_TFIDF_features(cv_text, job_desc):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([cv_text, job_desc])
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]



def get_utility_matrix(users, jobs=pd.read_csv("./jobs_dataset/cleaned_dataset/cleaned_jobs_data.csv"), K=1000, new_user_flag=False):
    # Prepare random samples (K = 1000 for example)
    # K = 1000
    N = len(users)
    M = len(jobs)

    random_samples = []

    random.seed(datetime.now().timestamp())
    user_idxs = np.random.randint(0, N, size=K)
    job_idxs = np.random.randint(0, M, size=K)

    for i in range(K):
        user_idx = user_idxs[i]
        job_idx = job_idxs[i]
        user = users[user_idx]
        job = jobs.iloc[job_idx]

        match_score = calculate_match_TFIDF_features(user["CV"], job["Job_Desc"])
        pay = (job["Min_Salary"] + job["Max_Salary"]) / 2 if job["Min_Salary"] != -1 else np.nan
        rating = job["Rating"]

        # Example Black-Box Prediction
        predicted_rating = (
            0.4 * match_score +
            0.2 * (user["Preferred_Location"] == job["City"]) +
            0.3 * (pay / user["Pay_Expectation"] if not np.isnan(pay) else 0) +
            0.1 * rating
        )
        random_samples.append((user_idx, job_idx, predicted_rating))
    

    # adding extra pre-matrix-factorization initialization for new user based on TIF score of resume and Job-description
    if new_user_flag:
        user_idx = N-1
        job_indices = np.random.randint(0, M, size=int(0.1*M))      
        user = users[user_idx]
        for job_idx in job_indices:
            job = jobs.iloc[int(job_idx)]

            match_score = calculate_match_TFIDF_features(user["CV"], job["Job_Desc"])
            pay = (job["Min_Salary"] + job["Max_Salary"]) / 2 if job["Min_Salary"] != -1 else np.nan
            rating = job["Rating"]

            # Example Black-Box Prediction
            predicted_rating = (
                0.4 * match_score +
                0.2 * (user["Preferred_Location"] == job["City"]) +
                0.3 * (pay / user["Pay_Expectation"] if not np.isnan(pay) else 0) +
                0.1 * rating
            )
            random_samples.append((user_idx, job_idx, predicted_rating))



    N = len(users)
    M = len(jobs)
    P = np.zeros((N,M))
    for tup in random_samples:
        i,j,r = tup
        P[i,j] = r

    samples_df = pd.DataFrame(random_samples, columns=["User", "Job", "Rating"])

    return P, samples_df




In [3]:
# Black-box function for extracting location and pay
def extract_location_and_pay(resume_str):
    """
    A black-box function that processes the resume string to return:
    - Preferred_Location: str
    - Pay_Expectation: int
    """
    # Dummy implementation (replace with actual logic)
    if "New York" in resume_str:
        location = "NY"
    elif "California" in resume_str:
        location = "CA"
    else:
        location = "Other"

    if "high salary" in resume_str:
        pay = 100000
    elif "entry-level" in resume_str:
        pay = 50000
    else:
        pay = 75000

    return location, pay

def extract_users(resume_df, categories):
    df = resume_df
    selected_categories = categories
    filtered_df = df[df["Category"].isin(selected_categories)]
    
    users = []
    for _, row in filtered_df.iterrows():
        cv = row["Resume_str"]
        location, pay = extract_location_and_pay(cv)  # Extract location and pay
        user_dict = {
            "CV": cv,
            "Preferred_Location": location,
            "Pay_Expectation": pay
        }
        users.append(user_dict)
    return users

In [4]:
resume_file = "./user_dataset/Resume/Resume.csv"
df = pd.read_csv(resume_file)
selected_categories = ["INFORMATION-TECHNOLOGY"]
users = extract_users(df,selected_categories)
print(len(users), "users extracted from selected categories ",selected_categories)


120 users extracted from selected categories  ['INFORMATION-TECHNOLOGY']


In [5]:
jobs = pd.read_csv("./jobs_dataset/cleaned_dataset/cleaned_jobs_data.csv")
# users = [
#         {"CV": "Machine learning and Python experience", "Preferred_Location": "NY", "Pay_Expectation": 100000},
#         {"CV": "Sales and Marketing Experience", "Preferred_Location": "CA", "Pay_Expectation": 50000},
#         {"CV": "Management Experience", "Preferred_Location": "NY", "Pay_Expectation": 200000}
#          ]
print(len(jobs), "jobs extracted")

2849 jobs extracted


In [6]:
P,samples_df = get_utility_matrix(users, jobs, K=10000)
N = len(users)
M = len(jobs)
print("P = ",P)
print("shape of P : ",P.shape)
print(np.count_nonzero(P)/(N*M),"fraction of non-0 entries")

P =  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
shape of P :  (120, 2849)
0.028793728793728792 fraction of non-0 entries


In [7]:
def matrix_factorization_1(samples_df):
    # Convert the utility matrix to a format compatible with Surprise
    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(samples_df, reader)

    # Split into training and testing
    trainset, testset = train_test_split(data, test_size=0.2)

    # Train SVD
    model = SVD()
    model.fit(trainset)

    # Predict missing ratings
    predictions = model.test(testset)

    # Evaluate the model
    print("RMSE:", rmse(predictions))
    print(P)
    utility_matrix = P
    for i in range(N):
        for j in range(M):
            if utility_matrix[i, j] == 0:  # Missing entry
                utility_matrix[i, j] = model.predict(i, j).est
    print(utility_matrix)

    return utility_matrix


In [8]:
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    '''
    R: rating matrix
    P: |U| * K (User features matrix)
    Q: |D| * K (Item features matrix)
    K: latent features
    steps: iterations
    alpha: learning rate
    beta: regularization parameter'''
    Q = Q.T

    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    # calculate error
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        # calculate gradient with a and beta parameter
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0

        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        # 0.001: local minimum
        if e < 0.001:
            break

    return P, Q.T


In [9]:
new_P = matrix_factorization_1(samples_df)
print("New_P = \n",new_P)
print("New_P shape = \n",new_P.shape)

RMSE: 0.2069
RMSE: 0.20688334437942163
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0.68374854 0.65893502 1.03838854 ... 0.89626532 0.83871323 0.88791164]
 [0.8688744  0.5499361  0.92822238 ... 0.73193809 0.8275216  0.78564763]
 [0.71564859 0.69138677 0.84555713 ... 0.76747995 0.82466968 0.83230852]
 ...
 [0.89564076 0.69045007 0.93455258 ... 0.66094277 0.98444497 0.83282262]
 [0.8167667  0.69134954 0.85558509 ... 0.65004102 0.9274587  0.83220007]
 [0.806967   0.61296818 0.90607237 ... 0.99597632 0.97329295 0.82105959]]
New_P = 
 [[0.68374854 0.65893502 1.03838854 ... 0.89626532 0.83871323 0.88791164]
 [0.8688744  0.5499361  0.92822238 ... 0.73193809 0.8275216  0.78564763]
 [0.71564859 0.69138677 0.84555713 ... 0.76747995 0.82466968 0.83230852]
 ...
 [0.89564076 0.69045007 0.93455258 ... 0.66094277 0.98444497 0.83282262]
 [0.8167667  0.69134954 0.85558509 ... 0.65004102 0.92

In [10]:

def add_new_user_to_utility_matrix(cv_text, preferred_location, pay_expectation):
    """
    Adds a new user to the utility matrix and calculates predicted ratings using collaborative filtering.
    
    Parameters:
    - utility_matrix: np.ndarray of shape (num_users, num_jobs), existing matrix with user-job ratings.
    - jobs_df: pd.DataFrame containing job information.
    - city: str, current city of the user.
    - preferred_location: str, user's preferred location for jobs.
    - pay_expectation: float, user's expected salary.
    
    Returns:
    - new_P: np.ndarray, utility matrix with the new user added.
    - recommendations: list, predicted ratings for the new user for all jobs.
    """
    cv = cv_text
    location, pay = preferred_location, pay_expectation
    user_dict = {
        "CV": cv,
        "Preferred_Location": location,
        "Pay_Expectation": pay
    }
    users.append(user_dict)
    P, samples_df = get_utility_matrix(users, jobs=pd.read_csv("./jobs_dataset/cleaned_dataset/cleaned_jobs_data.csv"), K=1000, new_user_flag=True)
    new_P = matrix_factorization_1(samples_df)
    recommendations = new_P[len(new_P)-1]
    
    return new_P, recommendations
    


In [11]:
file_path = "./user_resume/sample_resume.pdf"
cv_text = parse_resume(file_path,"text")
loc, pay = extract_location_and_pay(cv_text)

new_P, recommendations = add_new_user_to_utility_matrix(cv_text=cv_text, preferred_location=loc, pay_expectation=pay)

RMSE: 0.2328
RMSE: 0.2327844644055725
[[0.68374854 0.65893502 1.03838854 ... 0.89626532 0.83871323 0.88791164]
 [0.8688744  0.5499361  0.92822238 ... 0.73193809 0.8275216  0.78564763]
 [0.71564859 0.69138677 0.84555713 ... 0.76747995 0.82466968 0.83230852]
 ...
 [0.89564076 0.69045007 0.93455258 ... 0.66094277 0.98444497 0.83282262]
 [0.8167667  0.69134954 0.85558509 ... 0.65004102 0.9274587  0.83220007]
 [0.806967   0.61296818 0.90607237 ... 0.99597632 0.97329295 0.82105959]]
[[0.68374854 0.65893502 1.03838854 ... 0.89626532 0.83871323 0.88791164]
 [0.8688744  0.5499361  0.92822238 ... 0.73193809 0.8275216  0.78564763]
 [0.71564859 0.69138677 0.84555713 ... 0.76747995 0.82466968 0.83230852]
 ...
 [0.89564076 0.69045007 0.93455258 ... 0.66094277 0.98444497 0.83282262]
 [0.8167667  0.69134954 0.85558509 ... 0.65004102 0.9274587  0.83220007]
 [0.806967   0.61296818 0.90607237 ... 0.99597632 0.97329295 0.82105959]]


In [12]:
print(recommendations)

[0.806967   0.61296818 0.90607237 ... 0.99597632 0.97329295 0.82105959]


In [13]:
sorted_indices = sorted(range(len(recommendations)), key=lambda x: recommendations[x], reverse=True)
T = 10
top_T_jobs = sorted_indices[0:T]
print(top_T_jobs)

[1152, 906, 1188, 779, 761, 2436, 1318, 881, 731, 1146]


In [None]:
for job_idx in top_T_jobs:
    job = jobs.iloc[job_idx]

In [22]:
from jinja2 import Environment, FileSystemLoader


# Create a Jinja2 environment
env = Environment(loader=FileSystemLoader('.'))

# Define a template for the HTML page
template = env.get_template('./web/job_template.html')

# Prepare the data for the template
job_data = []
for job_idx in top_T_jobs:
    job = jobs.iloc[job_idx]
    job_data.append({
        'title': job['Job_title'],
        'company': job['Company'],
        'state': job['State'],
        'city': job['City'],
        'min_salary': job['Min_Salary'],
        'max_salary': job['Max_Salary'],
        'job_desc': job['Job_Desc'],
        'industry': job['Industry'],
        'rating': job['Rating'],
        'date_posted': job['Date_Posted'],
        'valid_until': job['Valid_until'],
        'job_type': job['Job_Type']
    })

# Render the template with the job data
html_content = template.render(jobs=job_data)

with open('./web/top_jobs.html', 'w', encoding='utf-8') as f:
  f.write(html_content)