In [19]:
# Open Colab's file-upload dialog; `uploaded` holds whatever files.upload() returns.
# NOTE(review): the recorded output of this cell shows the file being saved as
# "updated_job_matching_platform_dataset (1).xlsx", while later cells read the
# un-suffixed name -- confirm they are loading the intended copy.
from google.colab import files
uploaded = files.upload()  # Choose the updated_job_matching_platform_dataset.xlsx file

Saving updated_job_matching_platform_dataset.xlsx to updated_job_matching_platform_dataset (1).xlsx


In [20]:
!pip install pandas numpy scikit-learn openpyxl



In [21]:
import pandas as pd

# Workbook that holds the three sheets used throughout the notebook.
file_path = "updated_job_matching_platform_dataset.xlsx"  # Replace with your uploaded file name

# Passing a list of sheet names makes read_excel return a dict keyed by sheet
# name, so the workbook is opened once instead of three times.
workbook_sheets = pd.read_excel(
    file_path,
    sheet_name=["user_data", "job_data", "interaction_data"],
)
user_data = workbook_sheets["user_data"]
job_data = workbook_sheets["job_data"]
interaction_data = workbook_sheets["interaction_data"]

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the interaction rows for evaluation; the fixed random_state
# keeps the split repeatable between runs.
train_data, test_data = train_test_split(
    interaction_data,
    test_size=0.2,
    random_state=42,
)

# User x Job matrix built from the training rows only; user/job pairs without
# a recorded interaction are filled with 0.
pivot_spec = dict(
    index='User ID',
    columns='Job ID',
    values='Interaction Value',
    fill_value=0,
)
train_interaction_matrix = train_data.pivot_table(**pivot_spec)

# Row and column labels of the training matrix, kept for later reference
train_job_ids = train_interaction_matrix.columns.tolist()
train_user_ids = train_interaction_matrix.index.tolist()

# test_data is intentionally left in its raw (long) form so that predicted
# values can be compared with the actual interaction values later on.

In [23]:
# Content-Based Filtering Implementation

# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
# NOTE(review): the workbook name is hard-coded here rather than taken from
# `file_path` -- confirm both cells are meant to read the same file.
dataset_path = "updated_job_matching_platform_dataset.xlsx"
user_data = pd.read_excel(dataset_path, sheet_name="user_data")
job_data = pd.read_excel(dataset_path, sheet_name="job_data")


def _join_text_columns(frame, columns):
    """Concatenate `columns` of `frame` row-wise with single spaces; missing values become ''."""
    joined = frame[columns[0]].fillna('')
    for column in columns[1:]:
        joined = joined + ' ' + frame[column].fillna('')
    return joined


# Step 1: Combine textual features into a single field
user_data['Profile'] = _join_text_columns(
    user_data,
    ['Skills', 'Interests', 'Previous Jobs', 'Looking Jobs', 'Description'],
)
job_data['Details'] = _join_text_columns(
    job_data,
    ['Job Title', 'Skills Required', 'Experience Required', 'Job Description'],
)

# Step 2: Vectorize text data using a unified vocabulary
# Users and jobs are stacked (users first) so both share one TF-IDF vocabulary.
combined_text = pd.concat([user_data['Profile'], job_data['Details']], axis=0)

tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(combined_text)

# The first len(user_data) rows belong to users, the remainder to jobs
n_users = len(user_data)
user_tfidf = tfidf_matrix[:n_users]
job_tfidf = tfidf_matrix[n_users:]

# Step 3: Compute cosine similarity between users and jobs
# Result is indexed as [user position in user_data, job position in job_data].
similarity_matrix = cosine_similarity(user_tfidf, job_tfidf)

# Step 4: Generate recommendations
recommendations = []

for user_idx, user_id in enumerate(user_data['User ID']):
    # Pair every job position with its score and order best-first
    ranked_jobs = sorted(
        enumerate(similarity_matrix[user_idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Translate the five best job positions into Job IDs
    top_jobs = [job_data.iloc[job_idx]['Job ID'] for job_idx, _ in ranked_jobs[:5]]

    recommendations.append({"User ID": user_id, "Recommended Jobs": top_jobs})

# One row per user: the user's ID and the list of recommended Job IDs
recommendations_df = pd.DataFrame(recommendations)

# Display recommendations
print(recommendations_df)

   User ID                     Recommended Jobs
0    US001  [JB016, JB025, JB003, JB030, JB023]
1    US002  [JB030, JB028, JB015, JB029, JB013]
2    US003  [JB010, JB011, JB012, JB024, JB022]
3    US004  [JB029, JB028, JB030, JB013, JB008]
4    US005  [JB020, JB019, JB021, JB018, JB016]
5    US006  [JB028, JB030, JB013, JB029, JB015]
6    US007  [JB018, JB026, JB016, JB017, JB025]
7    US008  [JB010, JB012, JB011, JB015, JB019]
8    US009  [JB018, JB016, JB026, JB017, JB027]
9    US010  [JB013, JB030, JB028, JB014, JB015]
10   US011  [JB018, JB016, JB017, JB026, JB025]
11   US012  [JB016, JB018, JB017, JB026, JB025]
12   US013  [JB018, JB016, JB026, JB017, JB027]
13   US014  [JB030, JB015, JB028, JB010, JB013]
14   US015  [JB030, JB013, JB028, JB029, JB014]
15   US016  [JB030, JB015, JB028, JB013, JB014]
16   US017  [JB004, JB006, JB005, JB018, JB017]
17   US018  [JB004, JB006, JB005, JB008, JB019]
18   US019  [JB004, JB005, JB006, JB022, JB023]
19   US020  [JB010, JB012, JB011, JB015,

In [24]:
# Save recommendations to an Excel file
# index=False omits the DataFrame's row index from the written sheet.
recommendations_df.to_excel("content_based_recommendations.xlsx", index=False)
print("Recommendations saved to content_based_recommendations.xlsx")

Recommendations saved to content_based_recommendations.xlsx


In [25]:
# Hand the content-based recommendations workbook to Colab's download helper
from google.colab import files
files.download("content_based_recommendations.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
# Collaborative Filtering Implementation

# Import necessary libraries
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Step 1: Prepare the interaction matrix
# Rows are users, columns are jobs. Repeated (user, job) rows are combined by
# pivot_table's default aggregation; pairs with no interaction are filled with 0.
interaction_matrix = interaction_data.pivot_table(
    index='User ID', columns='Job ID', values='Interaction Value', fill_value=0
)

# Save the user and job IDs for reference (row / column labels of the matrix)
user_ids = interaction_matrix.index.tolist()
job_ids = interaction_matrix.columns.tolist()

# Step 2: Apply SVD for Matrix Factorization
# Never request more components than there are job columns.
# NOTE(review): when n_components is not smaller than the rank of
# interaction_matrix, the reconstruction below is expected to reproduce the
# input almost exactly, so the "predictions" would simply echo the observed
# interactions -- confirm the intended rank.
n_components = min(33, interaction_matrix.shape[1])  # Adjust to the number of jobs

# Decompose the interaction matrix
svd = TruncatedSVD(n_components=n_components, random_state=42)
latent_matrix = svd.fit_transform(interaction_matrix)

# Reconstruct the predicted interaction matrix (users x jobs)
predicted_matrix = np.dot(latent_matrix, svd.components_)

# Step 3: Generate Recommendations
# predicted_matrix was derived from interaction_matrix, so it must carry that
# matrix's own labels. Labelling it with the training-split IDs fails with a
# shape error -- or silently mislabels rows/columns -- whenever the training
# split is missing a user or a job.
predicted_df = pd.DataFrame(predicted_matrix, index=user_ids, columns=job_ids)

# Generate top recommendations for each user
collab_recommendations = []

for user_id in user_ids:
    # Predicted scores for this user, best first
    user_scores = predicted_df.loc[user_id].sort_values(ascending=False)

    # Get the top 5 recommended jobs
    top_jobs = user_scores.head(5).index.tolist()

    # Store recommendations
    collab_recommendations.append({"User ID": user_id, "Recommended Jobs": top_jobs})

# Convert recommendations to a DataFrame
collab_recommendations_df = pd.DataFrame(collab_recommendations)

# Display recommendations
print(collab_recommendations_df)

   User ID                       Recommended Jobs
0    US001    [JB016, JB018, JB017, JB020, JB003]
1    US002    [JB028, JB030, JB029, JB011, JB010]
2    US003    [JB010, JB023, JB012, JB011, JB022]
3    US004    [JB029, JB030, JB028, JB007, JB009]
4    US005  [JB020, JB0019, JB021, JB0017, JB018]
5    US006    [JB028, JB030, JB029, JB018, JB001]
6    US007  [JB025, JB026, JB0017, JB0016, JB027]
7    US008    [JB012, JB011, JB010, JB024, JB023]
8    US009    [JB018, JB017, JB016, JB021, JB020]
9    US010    [JB008, JB006, JB009, JB005, JB007]
10   US011    [JB026, JB017, JB019, JB021, JB027]
11   US012    [JB026, JB016, JB021, JB020, JB027]
12   US013    [JB019, JB025, JB006, JB005, JB026]
13   US014    [JB013, JB011, JB014, JB010, JB012]
14   US015    [JB030, JB013, JB014, JB015, JB029]
15   US016    [JB014, JB029, JB030, JB028, JB013]
16   US017    [JB004, JB016, JB005, JB006, JB018]
17   US018    [JB005, JB008, JB009, JB007, JB006]
18   US019    [JB007, JB006, JB008, JB004, JB023]


In [28]:
# Write the collaborative-filtering recommendations to an Excel file
# (index=False omits the DataFrame's row index from the sheet).
collab_recommendations_df.to_excel("collaborative_recommendations.xlsx", index=False)
print("Collaborative recommendations saved to collaborative_recommendations.xlsx")

Collaborative recommendations saved to collaborative_recommendations.xlsx


In [29]:
# Hand the collaborative recommendations workbook to Colab's download helper
from google.colab import files
files.download("collaborative_recommendations.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
# Hybrid Model

# Step 1: Normalize Scores
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler rescales each column (i.e. each job) independently to [0, 1].
# fit_transform refits the scaler on every call, so one instance can be reused.
scaler = MinMaxScaler()

# Normalize content-based scores (rows: user_data order, columns: job_data order)
content_scores = scaler.fit_transform(similarity_matrix)

# Normalize collaborative filtering scores (rows: user_ids, columns: job_ids)
collab_scores = scaler.fit_transform(predicted_matrix)

# Step 2: Identify Common Job IDs and User IDs
# The two score matrices come from different tables, so BOTH axes have to be
# aligned by ID before they can be blended. Aligning only the job axis would
# combine row i of one matrix with row i of the other regardless of which user
# each row actually describes.
content_job_pos = {}
for pos, job_id in enumerate(job_data['Job ID']):
    content_job_pos.setdefault(job_id, pos)  # first occurrence wins, like list.index

content_user_pos = {}
for pos, uid in enumerate(user_data['User ID']):
    content_user_pos.setdefault(uid, pos)

# Keep the order of job_ids / user_ids so results are deterministic
# (iterating a set of strings is not stable across interpreter runs).
common_job_ids = [job_id for job_id in job_ids if job_id in content_job_pos]
common_user_ids = [uid for uid in user_ids if uid in content_user_pos]

# Positions of the common jobs / users inside each matrix
job_indices_content = [content_job_pos[job_id] for job_id in common_job_ids]
job_indices_collab = [pos for pos, job_id in enumerate(job_ids) if job_id in content_job_pos]
user_indices_content = [content_user_pos[uid] for uid in common_user_ids]
user_indices_collab = [pos for pos, uid in enumerate(user_ids) if uid in content_user_pos]

# Step 3: Filter Scores Matrices to Include Only Common Users and Jobs
content_scores_filtered = content_scores[user_indices_content][:, job_indices_content]
collab_scores_filtered = collab_scores[user_indices_collab][:, job_indices_collab]

# Ensure both matrices are aligned
assert content_scores_filtered.shape == collab_scores_filtered.shape, "Matrices do not align!"

# Step 4: Compute Hybrid Scores
# Define weights for content-based and collaborative filtering
content_weight = 0.5
collab_weight = 0.5

# Calculate the hybrid score (row i <-> common_user_ids[i], column j <-> common_job_ids[j])
hybrid_scores = (content_weight * content_scores_filtered) + (collab_weight * collab_scores_filtered)

# Step 5: Generate Recommendations
hybrid_recommendations = []

for user_idx, user_id in enumerate(common_user_ids):
    # Get hybrid scores for this user
    user_hybrid_scores = hybrid_scores[user_idx]

    # Rank jobs by hybrid scores (ties keep common_job_ids order)
    sorted_jobs = sorted(
        enumerate(user_hybrid_scores),
        key=lambda x: x[1],
        reverse=True
    )

    # Get the top 5 recommended jobs
    top_jobs = [common_job_ids[job_idx] for job_idx, score in sorted_jobs[:5]]

    # Store recommendations
    hybrid_recommendations.append({"User ID": user_id, "Recommended Jobs": top_jobs})

# Convert recommendations to a DataFrame
hybrid_recommendations_df = pd.DataFrame(hybrid_recommendations)

# Step 6: Save Recommendations
hybrid_recommendations_df.to_excel("hybrid_recommendations.xlsx", index=False)
print("Hybrid recommendations saved to hybrid_recommendations.xlsx")

Hybrid recommendations saved to hybrid_recommendations.xlsx


In [31]:
# Hand the hybrid recommendations workbook to Colab's download helper
from google.colab import files
files.download("hybrid_recommendations.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [36]:
# Testing and Evaluating the Model

# Step 1: Prepare the Testing Interaction Matrix
# Pivot the held-out rows into a User x Job matrix (0 = no held-out interaction)
test_interactions = test_data.pivot_table(
    index='User ID', columns='Job ID', values='Interaction Value', fill_value=0
)

# NOTE(review): predicted_df is reconstructed from a matrix built on the full
# interaction_data, which includes the held-out rows evaluated here -- the
# metrics below are therefore likely optimistic. Confirm whether the model
# should be fitted on the training split only.

# Step 2: Align Predicted and Testing Data
# Keep users/jobs present in both frames, in predicted_df's own order. Going
# through set() would make the row/column order (and therefore tie-breaking in
# the top-k ranking) depend on string hashing, which varies between runs.
common_users = [user for user in predicted_df.index if user in test_interactions.index]
common_jobs = [job for job in predicted_df.columns if job in test_interactions.columns]

# Filter predicted_df and test_interactions to include only common users and jobs
predicted_df_filtered = predicted_df.loc[common_users, common_jobs]
test_interactions_filtered = test_interactions.loc[common_users, common_jobs]

# Step 3: Define Precision@K and Recall@K Functions
def precision_at_k(predictions, actual, k=5):
    """Average, over users, of the share of the top-k predicted jobs that are relevant.

    Args:
        predictions: DataFrame of predicted scores (rows = users, columns = jobs).
        actual: DataFrame of observed interaction values with the same layout;
            a job counts as relevant for a user when its value is > 0.
        k: number of highest-scored jobs considered per user.

    Returns:
        Mean of hits / k across the users found in both frames, or 0 when no
        user of `predictions` appears in `actual`.
    """
    per_user = []
    for user in predictions.index:
        if user in actual.index:
            # k best-scored jobs for this user
            ranked = predictions.loc[user].sort_values(ascending=False)
            recommended = set(ranked.head(k).index)
            # jobs with a positive observed interaction
            observed = actual.loc[user]
            relevant = set(observed[observed > 0].index)
            per_user.append(len(recommended & relevant) / k)
    if not per_user:
        return 0
    return sum(per_user) / len(per_user)

def recall_at_k(predictions, actual, k=5):
    """Average, over users, of the share of relevant jobs found in the top-k predictions.

    Args:
        predictions: DataFrame of predicted scores (rows = users, columns = jobs).
        actual: DataFrame of observed interaction values with the same layout;
            a job counts as relevant for a user when its value is > 0.
        k: number of highest-scored jobs considered per user.

    Returns:
        Mean of hits / (number of relevant jobs) across the users found in both
        frames. A user with no relevant job contributes 0. Returns 0 when no
        user of `predictions` appears in `actual`.
    """
    per_user = []
    for user in predictions.index:
        if user in actual.index:
            # k best-scored jobs for this user
            ranked = predictions.loc[user].sort_values(ascending=False)
            recommended = set(ranked.head(k).index)
            # jobs with a positive observed interaction
            observed = actual.loc[user]
            relevant = observed[observed > 0].index
            found = len(recommended.intersection(set(relevant)))
            if len(relevant) > 0:
                per_user.append(found / len(relevant))
            else:
                per_user.append(0)
    if not per_user:
        return 0
    return sum(per_user) / len(per_user)

# Step 4: Evaluate Precision@K and Recall@K
# Both metrics are computed on the aligned frames with the same cut-off (k=5).
precision, recall = (
    metric(predicted_df_filtered, test_interactions_filtered, k=5)
    for metric in (precision_at_k, recall_at_k)
)

# Report both metrics with four decimals
print(f"Precision@5: {precision:.4f}")
print(f"Recall@5: {recall:.4f}")

Precision@5: 0.2800
Recall@5: 0.8000


In [37]:
# Making Predictions with New Data

# Step 1: Add a New User Profile (Example)
new_user_profile = dict([
    ("Skills", "Cooking, Cleaning, Teamwork"),
    ("Interests", "Hospitality, Organization"),
    ("Previous Jobs", "Housemaid"),
    ("Looking Jobs", "Waiter, Babysitter"),
    ("Description", "Experienced housemaid looking for new opportunities."),
])

# Join the profile fields with single spaces, in a fixed field order
profile_fields = ['Skills', 'Interests', 'Previous Jobs', 'Looking Jobs', 'Description']
new_user_text = ' '.join(new_user_profile[field] for field in profile_fields)

# Step 2: Compute Similarity with Existing Jobs (Content-Based)
# The already-fitted vectorizer is reused (transform, not fit) so the new
# profile is expressed in the same vocabulary as job_tfidf.
new_user_vector = tfidf.transform([new_user_text])
new_user_similarities = cosine_similarity(new_user_vector, job_tfidf).flatten()

# Step 3: Recommend Top 5 Jobs
# argsort is ascending, so reverse it and keep the first five positions
best_positions = new_user_similarities.argsort()[::-1][:5]
top_jobs_for_new_user = [job_data.iloc[position]['Job ID'] for position in best_positions]

print("Top recommended jobs for the new user:")
print(top_jobs_for_new_user)

Top recommended jobs for the new user:
['JB018', 'JB016', 'JB017', 'JB002', 'JB020']


In [38]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Turn the two aligned frames into 1-D arrays with matching element order
predicted_values = predicted_df_filtered.values.flatten()
actual_values = test_interactions_filtered.values.flatten()

# Restrict the comparison to pairs with a positive held-out interaction value
non_zero_indices = actual_values > 0
actual_values_non_zero, predicted_values_non_zero = (
    values[non_zero_indices] for values in (actual_values, predicted_values)
)

# RMSE = square root of the mean squared difference over the kept pairs
mse = mean_squared_error(actual_values_non_zero, predicted_values_non_zero)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Root Mean Squared Error (RMSE): 0.0000


In [41]:
# Cut-off at or above which a value is treated as "relevant"
threshold = 0.5

# 1-D views of the aligned predicted and held-out matrices
predicted_values = predicted_df_filtered.values.flatten()
actual_values = test_interactions_filtered.values.flatten()

# Keep only the pairs whose held-out interaction value is positive
# NOTE(review): pairs with no held-out interaction are never scored, so this
# figure only measures how many observed interactions clear the threshold --
# confirm that "overall accuracy" is the intended reading.
non_zero_indices = actual_values > 0
actual_values_non_zero, predicted_values_non_zero = (
    values[non_zero_indices] for values in (actual_values, predicted_values)
)

# Binarise both sides with the same cut-off
actual_classes = (actual_values_non_zero >= threshold).astype(int)
predicted_classes = (predicted_values_non_zero >= threshold).astype(int)

# Share of kept pairs whose binarised prediction matches the binarised actual value
total_predictions = len(actual_classes)
correct_predictions = (predicted_classes == actual_classes).sum()
accuracy = (correct_predictions / total_predictions) * 100

print(f"Overall Model Accuracy: {accuracy:.2f}%")

Overall Model Accuracy: 100.00%


In [42]:
# Sweep the cut-off applied to the predictions. Loop-local names are used so the
# sweep does not overwrite `threshold`, `predicted_classes` and `accuracy` left
# behind by the previous cell.
# NOTE(review): `actual_classes` is reused as computed by the previous cell, so
# only the prediction side is re-thresholded here -- confirm that is intended.
for sweep_threshold in [0.4, 0.5, 0.6, 0.7]:
    sweep_predicted_classes = (predicted_values_non_zero >= sweep_threshold).astype(int)
    sweep_accuracy = (sweep_predicted_classes == actual_classes).mean() * 100
    print(f"Accuracy at threshold {sweep_threshold}: {sweep_accuracy:.2f}%")

Accuracy at threshold 0.4: 100.00%
Accuracy at threshold 0.5: 100.00%
Accuracy at threshold 0.6: 100.00%
Accuracy at threshold 0.7: 100.00%


In [43]:
import pickle

# Serialise the fitted TF-IDF vectorizer and the fitted SVD model, one file each
for artifact_path, artifact in (
    ("tfidf_vectorizer.pkl", tfidf),
    ("svd_model.pkl", svd),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

# Save the predicted matrix (optional, for quick lookup in the web app)
# NOTE(review): this writes predicted_df_filtered, i.e. only the users/jobs that
# also occur in the test split -- confirm the full predicted_df is not what the
# web app needs.
predicted_df_filtered.to_pickle("predicted_matrix.pkl")

print("Model components have been saved!")

Model components have been saved!


In [44]:
from google.colab import files

# Download the saved files, in the order they were written
for artifact_name in ("tfidf_vectorizer.pkl", "svd_model.pkl", "predicted_matrix.pkl"):
    files.download(artifact_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>