In [1]:
import pandas as pd 
job = pd.read_csv('/home/jovyan/MLProjects-1/job_descriptions.csv')
job.shape[0]

1615940

In [2]:
job.shape

(1615940, 23)

In [3]:
job.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
#Exploratory data analysis - preprocessing of data

#check for missing values

print(job.isnull().sum())


Job Id                 0
Experience             0
Qualifications         0
Salary Range           0
location               0
Country                0
latitude               0
longitude              0
Work Type              0
Company Size           0
Job Posting Date       0
Preference             0
Contact Person         0
Contact                0
Job Title              0
Role                   0
Job Portal             0
Job Description        0
Benefits               0
skills                 0
Responsibilities       0
Company                0
Company Profile     5478
dtype: int64


In [6]:
#dropping records if critical columns are missing, in this case we see the only column where records are missing is Company Profile, still I would run the below code for a good practise

job = job.dropna(subset=['Job Title', 'skills'])
job['skills'] = job['skills'].fillna('')
job['Job Description'] = job['Job Description'].fillna('')


In [7]:
#checking column distributions

print(job['skills'].value_counts().head(10))
print(job['location'].value_counts().head(10))


skills
Interaction design principles User behavior and psychology Wireframing and prototyping tools Animation and micro-interaction design Collaborative design processes                                                                                   20580
Network management Troubleshooting Network security IT certifications (e.g., CCNA)                                                                                                                                                                   17470
UI design principles and best practices Graphic design tools (e.g., Adobe Photoshop, Illustrator) Typography and color theory Visual design and layout Responsive design                                                                             14036
Social media platforms (e.g., Facebook, Twitter, Instagram) Content creation and scheduling Social media analytics and insights Community engagement Paid social advertising                                                                    

In [8]:
#cleaning text columns

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Lowercase
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

job['skills'] = job['skills'].apply(clean_text)
job['Job Description'] = job['Job Description'].apply(clean_text)
job['Job Description']


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0          social media managers oversee organizations so...
1          frontend web developers design implement user ...
2          quality control managers establish enforce qua...
3          wireless network engineers design implement ma...
4          conference manager coordinates manages confere...
                                 ...                        
1615935    mechanical design engineers create develop mec...
1615936    director oversees organizations department tec...
1615937    mechanical design engineers create develop mec...
1615938    training coordinators design implement employe...
1615939    wedding planners specialize organizing wedding...
Name: Job Description, Length: 1615940, dtype: object

In [9]:
job['Job Description']

0          social media managers oversee organizations so...
1          frontend web developers design implement user ...
2          quality control managers establish enforce qua...
3          wireless network engineers design implement ma...
4          conference manager coordinates manages confere...
                                 ...                        
1615935    mechanical design engineers create develop mec...
1615936    director oversees organizations department tec...
1615937    mechanical design engineers create develop mec...
1615938    training coordinators design implement employe...
1615939    wedding planners specialize organizing wedding...
Name: Job Description, Length: 1615940, dtype: object

In [10]:
#checking for duplicates

job = job.drop_duplicates(subset=['Job Title', 'skills', 'Job Description'])


In [11]:
#steps to get the combined matrix for vectorization
#initial checks to confirm if we're good to build the combined matrix
#1
print("DataFrame shape:", job.shape)
print("Columns in the DataFrame:", job.columns)


DataFrame shape: (376, 23)
Columns in the DataFrame: Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')


In [12]:
#2
print("Missing values in 'skills':", job['skills'].isnull().sum())
print("Missing values in 'Job Description':", job['Job Description'].isnull().sum())

print("Empty strings in 'skills':", (job['skills'].str.strip() == '').sum())
print("Empty strings in 'Job Description':", (job['Job Description'].str.strip() == '').sum())


Missing values in 'skills': 0
Missing values in 'Job Description': 0
Empty strings in 'skills': 0
Empty strings in 'Job Description': 0


In [13]:
#3
print("Sample 'skills' column:")
print(job['skills'].head())

print("\nSample 'Job Description' column:")
print(job['Job Description'].head())


Sample 'skills' column:
0    social media platforms eg facebook twitter ins...
1    html css javascript frontend frameworks eg rea...
2    quality control processes methodologies statis...
3    wireless network design architecture wifi stan...
4    event planning conference logistics budget man...
Name: skills, dtype: object

Sample 'Job Description' column:
0    social media managers oversee organizations so...
1    frontend web developers design implement user ...
2    quality control managers establish enforce qua...
3    wireless network engineers design implement ma...
4    conference manager coordinates manages confere...
Name: Job Description, dtype: object


In [14]:
#cleaning was done above but re-running before we make the combined matrix
import re

# Define a cleaning function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Apply the cleaning function
job['skills'] = job['skills'].apply(clean_text)
job['Job Description'] = job['Job Description'].apply(clean_text)

# Preview cleaned data
print("Cleaned 'skills' column:")
print(job['skills'].head())

print("\nCleaned 'Job Description' column:")
print(job['Job Description'].head())


Cleaned 'skills' column:
0    social media platforms eg facebook twitter ins...
1    html css javascript frontend frameworks eg rea...
2    quality control processes methodologies statis...
3    wireless network design architecture wifi stan...
4    event planning conference logistics budget man...
Name: skills, dtype: object

Cleaned 'Job Description' column:
0    social media managers oversee organizations so...
1    frontend web developers design implement user ...
2    quality control managers establish enforce qua...
3    wireless network engineers design implement ma...
4    conference manager coordinates manages confere...
Name: Job Description, dtype: object


In [15]:
#vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizers
tfidf_skills = TfidfVectorizer(stop_words='english')
tfidf_desc = TfidfVectorizer(stop_words='english')

# Fit and transform both columns
skills_matrix = tfidf_skills.fit_transform(job['skills'])
desc_matrix = tfidf_desc.fit_transform(job['Job Description'])

# Check the shapes of the matrices
print("Skills matrix shape:", skills_matrix.shape)
print("Description matrix shape:", desc_matrix.shape)

Skills matrix shape: (376, 1027)
Description matrix shape: (376, 1434)


In [16]:
#combined matrix
# Combine 'skills' and 'Job Description' into a single column
job['combined'] = job['skills'] + ' ' + job['Job Description']

# Re-run TF-IDF on the combined column
tfidf_combined = TfidfVectorizer(stop_words='english')
combined_matrix = tfidf_combined.fit_transform(job['combined'])

# Check the shape of the new combined matrix
print("Combined matrix shape:", combined_matrix.shape)



Combined matrix shape: (376, 1932)


In [17]:
#recommender system: recommending top 5 jobs based on user input
#calculating cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

# Example input from the user
user_input = "python data analysis machine learning"  # Replace with actual input

# Transform the user input into the same TF-IDF space
user_vector = tfidf_combined.transform([user_input])

# Calculate cosine similarity
similarity_scores = cosine_similarity(user_vector, combined_matrix)

# Get the top N job recommendations
top_n = 5  # Number of recommendations
top_indices = similarity_scores[0].argsort()[-top_n:][::-1]  # Indices of top N scores

# Display the top recommendations
recommended_jobs = job.iloc[top_indices][['Job Title', 'skills', 'Job Description']]
print("Top Job Recommendations:")
print(recommended_jobs)


Top Job Recommendations:
             Job Title                                             skills  \
328     Data Scientist  machine learning algorithms python programming...   
157       Data Analyst  machine learning algorithms libraries eg sciki...   
258  Marketing Analyst  data analysis tools eg sql python data visuali...   
986   Research Analyst  data analysis techniques research methodologie...   
211       Data Analyst  data quality assessment improvement data profi...   

                                       Job Description  
328  machine learning engineers develop machine lea...  
157  data scientists use expertise data analysis ma...  
258  analyze data sets generate insights provide da...  
986  data analyst researcher conducts research anal...  
211  data quality analysts ensure accuracy complete...  


In [18]:
# Include similarity scores in the output: The similarity score is a numerical value that quantifies how closely two pieces of text (e.g., a resume 
# #and a job description) match in terms of content. It is calculated using cosine similarity, which is commonly used in text analysis.
recommended_jobs['Similarity Score'] = similarity_scores[0][top_indices]
print(recommended_jobs)


             Job Title                                             skills  \
328     Data Scientist  machine learning algorithms python programming...   
157       Data Analyst  machine learning algorithms libraries eg sciki...   
258  Marketing Analyst  data analysis tools eg sql python data visuali...   
986   Research Analyst  data analysis techniques research methodologie...   
211       Data Analyst  data quality assessment improvement data profi...   

                                       Job Description  Similarity Score  
328  machine learning engineers develop machine lea...          0.642625  
157  data scientists use expertise data analysis ma...          0.522936  
258  analyze data sets generate insights provide da...          0.255087  
986  data analyst researcher conducts research anal...          0.234594  
211  data quality analysts ensure accuracy complete...          0.208804  


In [19]:
###### building a recommender system based on resume

!pip install PyPDF2 python-docx




In [None]:
#upload and extract text from resume

import PyPDF2
import docx

def extract_text_from_file(file_path):
    if file_path.endswith('.pdf'):
        # Extract text from PDF
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            text = " ".join(page.extract_text() for page in reader.pages)
    elif file_path.endswith('.docx'):
        # Extract text from Word document
        doc = docx.Document(file_path)
        text = " ".join([p.text for p in doc.paragraphs])
    elif file_path.endswith('.txt'):
        # Extract text from plain text file
        with open(file_path, 'r') as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file type. Please upload a PDF, Word, or text file.")
    
    return text

# Example usage
file_path = '/home/jovyan/MLProjects-1/Resume sample for WB testing.pdf'  # Replace with the path to the uploaded resume
resume_text = extract_text_from_file(file_path)
print("Extracted Resume Text:", resume_text[:500])  # Display the first 500 characters


Extracted Resume Text: Managed
hardware
installation,
support,
and
root
cause
analysis
for
company
IT
department.
Provided
remote
and
on-site
assistance
while
ensuring
all
computer
systems
and
programs
were
up-to-date.
●
Maintained
employee
relationships
through
consistent
communications
and
technical
support.
●
Researched
and
procured
relevant
hardware
and
directed
installation
tasks
for
company
desktops,
laptops,
and
servers.
●
Designed
and
installed
hardware
products
and
software
systems
including
circuit
boards
an


In [21]:
#preprocess resume text

resume_text_cleaned = clean_text(resume_text)  # Use the previously defined clean_text function

# Transform resume text into the same TF-IDF space
resume_vector = tfidf_combined.transform([resume_text_cleaned])
resume_vector


<1x1932 sparse matrix of type '<class 'numpy.float64'>'
	with 35 stored elements in Compressed Sparse Row format>

In [22]:
#calculate similarity and recommend job

# Compute cosine similarity
similarity_scores = cosine_similarity(resume_vector, combined_matrix)

# Get the top N job recommendations
top_n = 5  # Number of recommendations
top_indices = similarity_scores[0].argsort()[-top_n:][::-1]

# Display the top recommended jobs
recommended_jobs = job.iloc[top_indices][['Job Title', 'skills', 'Job Description']]
print("Top Job Recommendations Based on Resume:")
print(recommended_jobs)


Top Job Recommendations Based on Resume:
                       Job Title  \
59         IT Support Specialist   
85           Electrical Engineer   
41         Systems Administrator   
142        Network Administrator   
479  Customer Support Specialist   

                                                skills  \
59   desktop hardware software troubleshooting oper...   
85   electronics design pcb layout embedded systems...   
41   technical troubleshooting hardware software su...   
142  system administration server maintenance activ...   
479  help desk support ticket resolution troublesho...   

                                       Job Description  
59   desktop support technicians troubleshoot maint...  
85   electronics hardware engineers develop design ...  
41   support specialists provide technical assistan...  
142  manage maintain organizations infrastructure i...  
479  help desk analysts provide technical support a...  


In [23]:
###measuring the accuracy

test_data = [
    {
        "resume": "python machine learning data analysis",
        "relevant_jobs": ["Data Scientist", "Machine Learning Engineer"]
    },
    {
        "resume": "digital marketing social media campaigns",
        "relevant_jobs": ["Social Media Manager", "Marketing Specialist"]
    },
    # Add more test cases
]


In [24]:
def recommend_jobs(resume_text, top_n=5):
    # Preprocess the resume text
    resume_text_cleaned = clean_text(resume_text)
    resume_vector = tfidf_combined.transform([resume_text_cleaned])

    # Compute cosine similarity
    similarity_scores = cosine_similarity(resume_vector, combined_matrix)

    # Get top recommendations
    top_indices = similarity_scores[0].argsort()[-top_n:][::-1]
    return job.iloc[top_indices]['Job Title'].tolist()  # Return job titles


In [25]:
def precision_at_k(recommended_jobs, relevant_jobs, k):
    recommended_top_k = recommended_jobs[:k]
    relevant_count = sum(job in relevant_jobs for job in recommended_top_k)
    return relevant_count / k

# Example usage
for test_case in test_data:
    recommendations = recommend_jobs(test_case['resume'], top_n=5)
    precision = precision_at_k(recommendations, test_case['relevant_jobs'], k=5)
    print(f"Precision@5 for resume: {precision}")


Precision@5 for resume: 0.2
Precision@5 for resume: 0.4


In [26]:
def recall_at_k(recommended_jobs, relevant_jobs, k):
    recommended_top_k = recommended_jobs[:k]
    relevant_count = sum(job in relevant_jobs for job in recommended_top_k)
    return relevant_count / len(relevant_jobs)

# Example usage
for test_case in test_data:
    recommendations = recommend_jobs(test_case['resume'], top_n=5)
    recall = recall_at_k(recommendations, test_case['relevant_jobs'], k=5)
    print(f"Recall@5 for resume: {recall}")


Recall@5 for resume: 0.5
Recall@5 for resume: 1.0


In [27]:
def mean_reciprocal_rank(recommended_jobs, relevant_jobs):
    for i, job in enumerate(recommended_jobs):
        if job in relevant_jobs:
            return 1 / (i + 1)  # Reciprocal rank
    return 0  # No relevant job found

# Example usage
for test_case in test_data:
    recommendations = recommend_jobs(test_case['resume'], top_n=10)
    mrr = mean_reciprocal_rank(recommendations, test_case['relevant_jobs'])
    print(f"MRR for resume: {mrr}")


MRR for resume: 1.0
MRR for resume: 0.5


In [28]:
#######enhancing the model as the above accuracy is not good enough

from gensim.models import Word2Vec

# Prepare data for Word2Vec (split sentences into tokens)
corpus = [text.split() for text in job['combined']]

# Train Word2Vec
model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)

# Example: Generate vectors for a sentence
def get_sentence_vector(sentence, model):
    tokens = sentence.split()
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return sum(vectors) / len(vectors)
    else:
        return np.zeros(100)  # Default zero vector if no tokens found

job['vector'] = job['combined'].apply(lambda x: get_sentence_vector(x, model))


In [29]:
!pip install sentence-transformers




In [30]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for job descriptions and resumes
job['embedding'] = job['combined'].apply(lambda x: model.encode(x))
resume_embedding = model.encode(resume_text)


In [31]:
job['weighted_combined'] = 2 * job['skills'] + job['Job Description']


In [32]:
!pip install surprise




In [33]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split

# Prepare interaction data (user, job, rating)
interaction_data = pd.DataFrame({
    'user': ['user1', 'user2', 'user1', 'user3'],
    'job': ['job1', 'job2', 'job3', 'job1'],
    'rating': [5, 4, 3, 5]
})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(interaction_data, reader)

# Train collaborative filtering model
trainset, testset = train_test_split(data, test_size=0.2)
algo = SVD()
algo.fit(trainset)

# Predict ratings for a user
predictions = algo.test(testset)


In [34]:
from sklearn.metrics import ndcg_score

# Simulate relevance scores (1 for relevant, 0 for irrelevant)
true_relevance = [[1, 0, 1, 0, 0]]  # True relevance for top 5 jobs
predicted_scores = [[0.8, 0.5, 0.7, 0.3, 0.1]]  # Predicted similarity scores

ndcg = ndcg_score(true_relevance, predicted_scores, k=5)
print("NDCG Score:", ndcg)


NDCG Score: 1.0


In [35]:
###re-measuring the accuracy



test_data = [
    {
        "resume": "python machine learning data analysis",
        "relevant_jobs": ["Data Scientist", "Machine Learning Engineer"]
    },
    {
        "resume": "digital marketing social media campaigns",
        "relevant_jobs": ["Social Media Manager", "Marketing Specialist"]
    },
    # Add more test cases
]


In [36]:
def recommend_jobs(resume_text, top_n=5):
    # Preprocess the resume text
    resume_text_cleaned = clean_text(resume_text)
    resume_vector = tfidf_combined.transform([resume_text_cleaned])

    # Compute cosine similarity
    similarity_scores = cosine_similarity(resume_vector, combined_matrix)

    # Get top recommendations
    top_indices = similarity_scores[0].argsort()[-top_n:][::-1]
    return job.iloc[top_indices]['Job Title'].tolist()  # Return job titles

In [37]:
def precision_at_k(recommended_jobs, relevant_jobs, k):
    recommended_top_k = recommended_jobs[:k]
    relevant_count = sum(job in relevant_jobs for job in recommended_top_k)
    return relevant_count / k

# Example usage
for test_case in test_data:
    recommendations = recommend_jobs(test_case['resume'], top_n=5)
    precision = precision_at_k(recommendations, test_case['relevant_jobs'], k=5)
    print(f"Precision@5 for resume: {precision}")

Precision@5 for resume: 0.2
Precision@5 for resume: 0.4


In [38]:
def recall_at_k(recommended_jobs, relevant_jobs, k):
    recommended_top_k = recommended_jobs[:k]
    relevant_count = sum(job in relevant_jobs for job in recommended_top_k)
    return relevant_count / len(relevant_jobs)

# Example usage
for test_case in test_data:
    recommendations = recommend_jobs(test_case['resume'], top_n=5)
    recall = recall_at_k(recommendations, test_case['relevant_jobs'], k=5)
    print(f"Recall@5 for resume: {recall}")

Recall@5 for resume: 0.5
Recall@5 for resume: 1.0


In [39]:
def mean_reciprocal_rank(recommended_jobs, relevant_jobs):
    for i, job in enumerate(recommended_jobs):
        if job in relevant_jobs:
            return 1 / (i + 1)  # Reciprocal rank
    return 0  # No relevant job found

# Example usage
for test_case in test_data:
    recommendations = recommend_jobs(test_case['resume'], top_n=10)
    mrr = mean_reciprocal_rank(recommendations, test_case['relevant_jobs'])
    print(f"MRR for resume: {mrr}")

MRR for resume: 1.0
MRR for resume: 0.5


In [40]:
###another attempt for improvement

# Create a weighted combined text column
job['weighted_combined'] = 2 * job['skills'] + ' ' + job['Job Description']

# Vectorize the weighted combined text
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_combined = TfidfVectorizer(stop_words='english')
combined_matrix = tfidf_combined.fit_transform(job['weighted_combined'])

print("Weighted Combined Matrix Shape:", combined_matrix.shape)


Weighted Combined Matrix Shape: (376, 2279)


In [41]:
def recommend_jobs(resume_text, top_n=5):
    resume_text_cleaned = clean_text(resume_text)
    resume_vector = tfidf_combined.transform([resume_text_cleaned])
    similarity_scores = cosine_similarity(resume_vector, combined_matrix)
    top_indices = similarity_scores[0].argsort()[-top_n:][::-1]
    return job.iloc[top_indices][['Job Title', 'skills', 'Job Description']]


In [42]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for job postings
job['embedding'] = job['weighted_combined'].apply(lambda x: model.encode(x))

# Generate embedding for resume
resume_text_cleaned = clean_text(resume_text)
resume_embedding = model.encode(resume_text_cleaned)

# Compute similarity
import numpy as np
job['similarity'] = job['embedding'].apply(lambda x: np.dot(resume_embedding, x) / 
                                           (np.linalg.norm(resume_embedding) * np.linalg.norm(x)))

# Get top recommendations
job = job.sort_values(by='similarity', ascending=False)
top_jobs = job[['Job Title', 'skills', 'Job Description']].head(5)
print(top_jobs)


                       Job Title  \
41         Systems Administrator   
59         IT Support Specialist   
616             IT Administrator   
612           Network Technician   
355  Customer Support Specialist   

                                                skills  \
41   technical troubleshooting hardware software su...   
59   desktop hardware software troubleshooting oper...   
616  system administration network administration t...   
612  network troubleshooting support network config...   
355  technical troubleshooting customer support too...   

                                       Job Description  
41   support specialists provide technical assistan...  
59   desktop support technicians troubleshoot maint...  
616  system administrators manage maintain computer...  
612  network support specialists provide technical ...  
355  technical support specialists assist customers...  


In [43]:
def evaluate_precision_recall(test_data, k=10):
    total_precision, total_recall = 0, 0
    for test_case in test_data:
        recommendations = recommend_jobs(test_case['resume'], top_n=k)
        total_precision += precision_at_k(recommendations, test_case['relevant_jobs'], k)
        total_recall += recall_at_k(recommendations, test_case['relevant_jobs'], k)
    
    avg_precision = total_precision / len(test_data)
    avg_recall = total_recall / len(test_data)
    return avg_precision, avg_recall

# Example usage
avg_precision, avg_recall = evaluate_precision_recall(test_data, k=10)
print(f"Precision@10: {avg_precision}")
print(f"Recall@10: {avg_recall}")


Precision@10: 0.0
Recall@10: 0.0
