In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from flask import Flask, request, jsonify
from flask_cors import CORS


In [3]:
# Load the dataset
jobs_df = pd.read_csv('upwork-jobs.csv')

# Display the first few rows
jobs_df.head()


Unnamed: 0,title,link,description,published_date,is_hourly,hourly_low,hourly_high,budget,country
0,Experienced Media Buyer For Solar Pannel and R...,https://www.upwork.com/jobs/Experienced-Media-...,We’re looking for a talented and hardworking a...,2024-02-17 09:09:54+00:00,False,,,500.0,
1,Full Stack Developer,https://www.upwork.com/jobs/Full-Stack-Develop...,Job Title: Full Stack DeveloperWe are seeking ...,2024-02-17 09:09:17+00:00,False,,,1100.0,United States
2,SMMA Bubble App,https://www.upwork.com/jobs/SMMA-Bubble-App_%7...,I need someone to redesign my bubble.io site t...,2024-02-17 09:08:46+00:00,True,10.0,30.0,,United States
3,Talent Hunter Specialized in Marketing,https://www.upwork.com/jobs/Talent-Hunter-Spec...,Join Our Growing Team!We are an innovative com...,2024-02-17 09:08:08+00:00,,,,,United States
4,Data Engineer,https://www.upwork.com/jobs/Data-Engineer_%7E0...,We are looking for a resource who can work par...,2024-02-17 09:07:42+00:00,False,,,650.0,India


In [None]:
# 📌 Code for Data Cleaning

# Check for missing values
print(jobs_df.isnull().sum())




In [7]:

# Drop rows with too many missing values (if needed)
jobs_df.dropna(subset=['title', 'description'], inplace=True)

# Fill missing values in numerical columns using .loc
jobs_df.loc[:, 'hourly_low'] = jobs_df['hourly_low'].fillna(jobs_df['hourly_low'].median())
jobs_df.loc[:, 'hourly_high'] = jobs_df['hourly_high'].fillna(jobs_df['hourly_high'].median())
jobs_df.loc[:, 'budget'] = jobs_df['budget'].fillna(jobs_df['budget'].median())




In [9]:
# Convert text to lowercase
jobs_df['title'] = jobs_df['title'].str.lower()
jobs_df['description'] = jobs_df['description'].str.lower()



In [12]:

def clean_text(text):
    tokens = text.split()  # Tokenize text
    tokens = [word for word in tokens if word.isalpha()]  # Remove non-alphabetic tokens
    return " ".join(tokens)

jobs_df['description'] = jobs_df['description'].apply(clean_text)

# Display cleaned dataset
jobs_df.head()

Unnamed: 0,title,link,description,published_date,is_hourly,hourly_low,hourly_high,budget,country
0,experienced media buyer for solar pannel and r...,https://www.upwork.com/jobs/Experienced-Media-...,looking for a talented and hardworking ads man...,2024-02-17 09:09:54+00:00,False,15.0,30.0,500.0,
1,full stack developer,https://www.upwork.com/jobs/Full-Stack-Develop...,job full stack developerwe are seeking a talen...,2024-02-17 09:09:17+00:00,False,15.0,30.0,1100.0,United States
2,smma bubble app,https://www.upwork.com/jobs/SMMA-Bubble-App_%7...,i need someone to redesign my site to optimize...,2024-02-17 09:08:46+00:00,True,10.0,30.0,100.0,United States
3,talent hunter specialized in marketing,https://www.upwork.com/jobs/Talent-Hunter-Spec...,join our growing are an innovative company exp...,2024-02-17 09:08:08+00:00,,15.0,30.0,100.0,United States
4,data engineer,https://www.upwork.com/jobs/Data-Engineer_%7E0...,we are looking for a resource who can work for...,2024-02-17 09:07:42+00:00,False,15.0,30.0,650.0,India


In [13]:
# Vectorizing Job Descriptions


# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to 5000 most important words

# Fit and transform job descriptions into vectors
job_vectors = tfidf_vectorizer.fit_transform(jobs_df['description'])

# Display vectorized shape
print(f"TF-IDF matrix shape: {job_vectors.shape}")


TF-IDF matrix shape: (53058, 5000)


In [18]:
# 📌 Save the Model

import joblib

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

# Save the job vectors
joblib.dump(job_vectors, "job_vectors.pkl")

# Save the processed job dataset
jobs_df.to_csv("cleaned_upwork_jobs.csv", index=False)

print("Model and data saved successfully!")


Model and data saved successfully!


In [19]:
# 📌 Load the Model Later
# Load the saved models
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
job_vectors = joblib.load("job_vectors.pkl")

# Load the cleaned job dataset
jobs_df = pd.read_csv("cleaned_upwork_jobs.csv")

print("Model and data loaded successfully!")


Model and data loaded successfully!


🛠 Step 3: Implementing Job Recommendations

We will:
✅ Allow users to input their skills and experience
✅ Convert user input into a TF-IDF vector
✅ Compare it with job descriptions using Cosine Similarity
✅ Recommend jobs with the highest match percentage


In [20]:
def recommend_jobs(user_input, top_n=5):
    """
    Recommend jobs based on user input.
    Args:
        user_input (str): User skills and experience.
        top_n (int): Number of job recommendations to return.
    Returns:
        DataFrame: Top recommended jobs with match percentage.
    """
    
    # Convert user input into TF-IDF vector
    user_vector = tfidf_vectorizer.transform([user_input])

    # Compute cosine similarity between user input and job descriptions
    similarity_scores = cosine_similarity(user_vector, job_vectors)

    # Get top N job indices
    job_indices = similarity_scores.argsort()[0][-top_n:][::-1]  # Get highest similarity scores

    # Extract top job recommendations
    recommended_jobs = jobs_df.iloc[job_indices].copy()

    # Add match percentage
    recommended_jobs['match_percentage'] = similarity_scores[0][job_indices] * 100

    return recommended_jobs[['title', 'link', 'match_percentage']]


In [21]:
user_input = "Python developer with experience in machine learning and data analysis"
recommendations = recommend_jobs(user_input)
print(recommendations)


                                                   title  \
24454                               python data analysis   
35226                machine learning trainer/instructor   
48265                    python/tensorflow expert needed   
32669                        machine learning researcher   
52827  coding of a python function for image segmenta...   

                                                    link  match_percentage  
24454  https://www.upwork.com/jobs/Python-Data-analys...         63.922103  
35226  https://www.upwork.com/jobs/Machine-Learning-T...         63.162852  
48265  https://www.upwork.com/jobs/Python-TensorFlow-...         61.991082  
32669  https://www.upwork.com/jobs/Machine-Learning-R...         59.227719  
52827  https://www.upwork.com/jobs/Coding-Python-func...         58.445111  


In [22]:
from IPython.display import display

# Sample user input
user_input = "Python developer with experience in machine learning and data analysis"

# Get job recommendations
recommendations = recommend_jobs(user_input)

# Display as a table
display(recommendations.style.format({'match_percentage': '{:.2f}%'}))


Unnamed: 0,title,link,match_percentage
24454,python data analysis,https://www.upwork.com/jobs/Python-Data-analysis_%7E01d768b2d453fed809?source=rss,63.92%
35226,machine learning trainer/instructor,https://www.upwork.com/jobs/Machine-Learning-Trainer-Instructor_%7E01476faa2e44fae29b?source=rss,63.16%
48265,python/tensorflow expert needed,https://www.upwork.com/jobs/Python-TensorFlow-Expert-Needed_%7E01584564aa599e759e?source=rss,61.99%
32669,machine learning researcher,https://www.upwork.com/jobs/Machine-Learning-Researcher_%7E01d62094fe872e9601?source=rss,59.23%
52827,coding of a python function for image segmentation in indoor environment,https://www.upwork.com/jobs/Coding-Python-function-for-image-segmentation-indoor-environment_%7E01efa542f1f8b0cd51?source=rss,58.45%


In [17]:
# Step 4: Interactive User Input Form

In [23]:
# 📌 Install & Import ipywidgets
!pip install ipywidgets




In [24]:
import ipywidgets as widgets
from IPython.display import display


In [28]:
# 📌 Create Interactive User Form

# Create input widget
user_input_widget = widgets.Textarea(
    placeholder="Enter your skills, experience, and job preferences...",
    layout=widgets.Layout(width="100%", height="100px")
)

# Create a button
recommend_button = widgets.Button(description="Find Jobs")

# Output display area
output_area = widgets.Output()

# Function to handle recommendations
def on_recommend_button_clicked(b):
    with output_area:
        output_area.clear_output()  # Clear previous results
        user_input = user_input_widget.value  # Get input text
        recommendations = recommend_jobs(user_input)  # Get recommendations
        display(recommendations.style.format({'match_percentage': '{:.2f}%'}))  # Show table

# Link button click to function
recommend_button.on_click(on_recommend_button_clicked)

# Display the UI
display(user_input_widget, recommend_button, output_area)

    

Textarea(value='', layout=Layout(height='100px', width='100%'), placeholder='Enter your skills, experience, an…

Button(description='Find Jobs', style=ButtonStyle())

Output()

In [34]:
user_input_widget.value = "Frontend developer skilled in JavaScript, React, and CSS. Looking for UI/UX design projects."


Step 5.1: Show a Bar Chart of Job Match Percentages

We will:
✅ Use Matplotlib & Seaborn to visualize top job matches.
✅ Show a bar chart with job titles and their match percentages.


In [35]:
# 📌 Install & Import Matplotlib/Seaborn
!pip install matplotlib seaborn




In [36]:
import matplotlib.pyplot as plt
import seaborn as sns


In [38]:
# 📌 Create Interactive User Form

# Create input widget
user_input_widget = widgets.Textarea(
    placeholder="Enter your skills, experience, and job preferences...",
    layout=widgets.Layout(width="100%", height="100px")
)

# Create a button
recommend_button = widgets.Button(description="Find Jobs")

# Output display area
output_area = widgets.Output()

# Function to handle recommendations
def on_recommend_button_clicked(b):
    with output_area:
        output_area.clear_output()  # Clear previous results
        user_input = user_input_widget.value  # Get input text
        recommendations = recommend_jobs(user_input)  # Get recommendations
        display(recommendations.style.format({'match_percentage': '{:.2f}%'}))  # Show table

# Link button click to function
recommend_button.on_click(on_recommend_button_clicked)

# Display the UI
display(user_input_widget, recommend_button, output_area)

    

Textarea(value='', layout=Layout(height='100px', width='100%'), placeholder='Enter your skills, experience, an…

Button(description='Find Jobs', style=ButtonStyle())

Output()

In [39]:
def recommend_jobs(user_input, top_n=5, show_chart=True):
    """
    Recommend jobs based on user input and visualize results.
    """
    # Convert user input into TF-IDF vector
    user_vector = tfidf_vectorizer.transform([user_input])

    # Compute cosine similarity
    similarity_scores = cosine_similarity(user_vector, job_vectors)

    # Get top job indices
    job_indices = similarity_scores.argsort()[0][-top_n:][::-1]

    # Extract top job recommendations
    recommended_jobs = jobs_df.iloc[job_indices].copy()
    recommended_jobs['match_percentage'] = similarity_scores[0][job_indices] * 100

    # Display Table
    display(recommended_jobs[['title', 'link', 'match_percentage']].style.format({'match_percentage': '{:.2f}%'}))
    
    # Show Bar Chart
    if show_chart:
        plt.figure(figsize=(8, 4))
        sns.barplot(y=recommended_jobs['title'], x=recommended_jobs['match_percentage'], palette='Blues_r')
        plt.xlabel("Match Percentage")
        plt.ylabel("Job Title")
        plt.title("Top Job Recommendations")
        plt.xlim(0, 100)
        plt.show()

    return recommended_jobs


 Step 5.2: Add Budget & Country Filters
 

In [42]:
def recommend_jobs(user_input, top_n=5, min_budget=0, country=None, show_chart=True):
    """
    Recommend jobs based on user input with budget and country filters.
    """
    # Convert user input into TF-IDF vector
    user_vector = tfidf_vectorizer.transform([user_input])

    # Compute cosine similarity
    similarity_scores = cosine_similarity(user_vector, job_vectors)

    # Get top job indices
    job_indices = similarity_scores.argsort()[0][-top_n * 3:][::-1]  # Get more jobs to filter later

    # Extract top job recommendations
    recommended_jobs = jobs_df.iloc[job_indices].copy()
    recommended_jobs['match_percentage'] = similarity_scores[0][job_indices] * 100

    # Apply budget filter
    recommended_jobs = recommended_jobs[recommended_jobs['budget'] >= min_budget]

    # Apply country filter (if provided)
    if country:
        recommended_jobs = recommended_jobs[recommended_jobs['country'].str.lower() == country.lower()]

    # Get the final top N results
    recommended_jobs = recommended_jobs.head(top_n)

    # Display Table
    display(recommended_jobs[['title', 'link', 'budget', 'country', 'match_percentage']].style.format({'match_percentage': '{:.2f}%', 'budget': '${:,.0f}'}))

    # Show Bar Chart
    if show_chart and not recommended_jobs.empty:
        plt.figure(figsize=(8, 4))
        sns.barplot(y=recommended_jobs['title'], x=recommended_jobs['match_percentage'], palette='Blues_r')
        plt.xlabel("Match Percentage")
        plt.ylabel("Job Title")
        plt.title("Top Job Recommendations")
        plt.xlim(0, 100)
        plt.show()

    return recommended_jobs


In [43]:
# 📌 Create Interactive User Form

# Create input widget
user_input_widget = widgets.Textarea(
    placeholder="Enter your skills, experience, and job preferences...",
    layout=widgets.Layout(width="100%", height="100px")
)

# Create a button
recommend_button = widgets.Button(description="Find Jobs")

# Output display area
output_area = widgets.Output()

# Function to handle recommendations
def on_recommend_button_clicked(b):
    with output_area:
        output_area.clear_output()  # Clear previous results
        user_input = user_input_widget.value  # Get input text
        recommendations = recommend_jobs(user_input)  # Get recommendations
        display(recommendations.style.format({'match_percentage': '{:.2f}%'}))  # Show table

# Link button click to function
recommend_button.on_click(on_recommend_button_clicked)

# Display the UI
display(user_input_widget, recommend_button, output_area)

    

Textarea(value='', layout=Layout(height='100px', width='100%'), placeholder='Enter your skills, experience, an…

Button(description='Find Jobs', style=ButtonStyle())

Output()

In [45]:
# # 🧠 Step 5.3: Improve Matching with Word Embeddings
# ✅ TF-IDF only considers word frequency—it doesn’t capture meaning.
# ✅ BERT/Word2Vec understands context & semantics, making recommendations more accurate.
# ✅ We’ll replace TF-IDF + Cosine Similarity with Sentence Transformers (BERT).

In [46]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.29.0-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading fsspec-2025.2.0-py3-none-any.whl.metadata (11 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting sympy==1.13.1 (from torch>=1.11.0->sentence-transformers)
  Using cached

In [47]:
from sentence_transformers import SentenceTransformer

# Load BERT model
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert job descriptions into embeddings
job_embeddings = bert_model.encode(jobs_df['description'].astype(str), show_progress_bar=True)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1659 [00:00<?, ?it/s]


KeyboardInterrupt



In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend_jobs(user_input, top_n=5, min_budget=0, country=None, show_chart=True):
    """
    Recommend jobs based on user input using BERT embeddings.
    """
    # Convert user input into a BERT embedding
    user_embedding = bert_model.encode([user_input])

    # Compute cosine similarity
    similarity_scores = cosine_similarity(user_embedding, job_embeddings)

    # Get top job indices
    job_indices = similarity_scores.argsort()[0][-top_n * 3:][::-1]  # Get more jobs to filter later

    # Extract top job recommendations
    recommended_jobs = jobs_df.iloc[job_indices].copy()
    recommended_jobs['match_percentage'] = similarity_scores[0][job_indices] * 100

    # Apply budget filter
    recommended_jobs = recommended_jobs[recommended_jobs['budget'] >= min_budget]

    # Apply country filter (if provided)
    if country:
        recommended_jobs = recommended_jobs[recommended_jobs['country'].str.lower() == country.lower()]

    # Get the final top N results
    recommended_jobs = recommended_jobs.head(top_n)

    # Display Table
    display(recommended_jobs[['title', 'link', 'budget', 'country', 'match_percentage']].style.format({'match_percentage': '{:.2f}%', 'budget': '${:,.0f}'}))

    # Show Bar Chart
    if show_chart and not recommended_jobs.empty:
        plt.figure(figsize=(8, 4))
        sns.barplot(y=recommended_jobs['title'], x=recommended_jobs['match_percentage'], palette='Blues_r')
        plt.xlabel("Match Percentage")
        plt.ylabel("Job Title")
        plt.title("Top Job Recommendations")
        plt.xlim(0, 100)
        plt.show()

    return recommended_jobs
