## Job Recommendation

Dataset: https://www.kaggle.com/datasets/ravindrasinghrana/job-description-dataset

In [1]:
import pandas as pd

# Load the Kaggle job-description dataset; the relative path means the CSV
# must sit in the notebook's working directory.
df = pd.read_csv('job_descriptions.csv')

In [2]:
# Remove the 'Contact' and 'Job Id' columns in a single pass
df = df.drop(columns=['Contact', 'Job Id'])

In [3]:
# Show the columns that remain after the drops above
df.columns

Index(['Experience', 'Qualifications', 'Salary Range', 'location', 'Country',
       'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Job Title', 'Role',
       'Job Portal', 'Job Description', 'Benefits', 'skills',
       'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [4]:
# Count missing values per column
df.isna().sum()

Experience             0
Qualifications         0
Salary Range           0
location               0
Country                0
latitude               0
longitude              0
Work Type              0
Company Size           0
Job Posting Date       0
Preference             0
Contact Person         0
Job Title              0
Role                   0
Job Portal             0
Job Description        0
Benefits               0
skills                 0
Responsibilities       0
Company                0
Company Profile     5478
dtype: int64

In [5]:
import polars as pl

# Preprocess with a Polars DataFrame, expressed as one chained pipeline:
#   1. strip '$' and 'K' from 'Salary Range' and split it on '-'
#   2. turn the two parts into numeric 'Min Salary' / 'Max Salary' (K -> x1000)
#   3. lowercase the free-text columns, treating nulls as empty strings
df_polars = (
    pl.DataFrame(df)
    .with_columns([
        pl.col("Salary Range")
        .str.replace_all(r"[\$K]", "")
        .str.split("-")
        .alias("Salary Range Split")
    ])
    .with_columns([
        (pl.col("Salary Range Split").list.get(0).cast(pl.Float64) * 1000).alias("Min Salary"),
        (pl.col("Salary Range Split").list.get(1).cast(pl.Float64) * 1000).alias("Max Salary"),
    ])
    .with_columns([
        pl.col(text_column).fill_null('').str.to_lowercase()
        for text_column in ('skills', 'Qualifications', 'Job Description')
    ])
)

In [6]:
# Convert the preprocessed Polars frame back to pandas. This rebinds `df`, so
# from here on it carries the added salary columns and lowercased text.
df = df_polars.to_pandas()

In [7]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit a TF-IDF vectorizer on the 'skills' column (English stop words removed).
# `tfidf` and `tfidf_matrix` are module-level state reused by the functions below.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['skills'])

# Function to recommend jobs based on skills
def recommend_jobs(job_seeker_skills, top_n=5):
    """Rank job postings by TF-IDF cosine similarity to a skills string.

    Relies on the module-level `tfidf`, `tfidf_matrix` and `df` built in the
    preceding cells.

    Args:
        job_seeker_skills: Free-text skills, e.g. "python, machine learning".
        top_n: Maximum number of postings to return. Values <= 0 give an
            empty result.

    Returns:
        Rows of `df` restricted to 'Job Title', 'Company', 'skills' and
        'Salary Range', ordered from most to least similar.
    """
    columns = ['Job Title', 'Company', 'skills', 'Salary Range']

    # `[-0:]` selects the whole array and a negative top_n slices from the
    # wrong end, so non-positive requests are answered with an empty frame.
    if top_n <= 0:
        return df.iloc[0:0][columns]

    # Convert job seeker's skills to a vector in the fitted TF-IDF space
    seeker_skills_vector = tfidf.transform([job_seeker_skills.lower()])

    # Compute cosine similarity between job seeker's skills and all job postings
    cosine_sim = cosine_similarity(seeker_skills_vector, tfidf_matrix).flatten()

    # argsort is ascending: take the last top_n positions, then reverse them
    top_jobs_indices = cosine_sim.argsort()[-top_n:][::-1]

    return df.iloc[top_jobs_indices][columns]

In [8]:
# Example usage: recommend jobs based on a job seeker's skills.
# NOTE: a later cell redefines `recommend_jobs` with a (resume_text, dataframe)
# signature, so this call only works before that cell has been executed.
job_seeker_skills = "python, machine learning, data analysis"
recommended_jobs = recommend_jobs(job_seeker_skills)
print(recommended_jobs)

              Job Title                   Company  \
630408   Data Scientist      Bristol-Myers Squibb   
137643   Data Scientist                      Olin   
655749   Data Scientist  TravelCenters of America   
1069428  Data Scientist     L3Harris Technologies   
857581   Data Scientist              W.R. Berkley   

                                                    skills Salary Range  
630408   machine learning algorithms python programming...   $58K-$125K  
137643   machine learning algorithms python programming...   $58K-$112K  
655749   machine learning algorithms python programming...   $60K-$114K  
1069428  machine learning algorithms python programming...    $58K-$85K  
857581   machine learning algorithms python programming...   $64K-$120K  


In [9]:
# Combine skill similarity with a salary-range filter
def recommend_jobs_combined(job_seeker_skills, desired_min_salary, desired_max_salary, top_n=5):
    """Recommend postings inside a salary band, ranked by skill similarity.

    Relies on the module-level `tfidf`, `tfidf_matrix` and `df`. The global
    `df` is only read: similarity scores are attached to a filtered copy
    instead of being written back as a column, so repeated calls leave no
    state behind.

    Args:
        job_seeker_skills: Free-text skills, e.g. "python, machine learning".
        desired_min_salary: Lower bound; a posting's 'Min Salary' must be >= it.
        desired_max_salary: Upper bound; a posting's 'Max Salary' must be <= it.
        top_n: Maximum number of postings to return (non-positive -> none).

    Returns:
        DataFrame with 'Job Title', 'Company', 'skills', 'Salary Range' and
        'Skill Similarity', sorted by similarity in descending order.
    """
    columns = ['Job Title', 'Company', 'skills', 'Salary Range']

    # Step 1: similarity of the query to every posting
    seeker_skills_vector = tfidf.transform([job_seeker_skills.lower()])
    cosine_sim = cosine_similarity(seeker_skills_vector, tfidf_matrix).flatten()

    # Step 2: salary filter as a positional boolean mask, so the scores can be
    # sliced with it directly and no index alignment is involved
    in_salary_band = (
        (df['Min Salary'] >= desired_min_salary)
        & (df['Max Salary'] <= desired_max_salary)
    ).to_numpy()

    candidates = df.loc[in_salary_band, columns].assign(
        **{'Skill Similarity': cosine_sim[in_salary_band]}
    )

    # Step 3: best matches first
    ranked = candidates.sort_values(by='Skill Similarity', ascending=False)
    return ranked.head(max(top_n, 0))

# Example: skills query restricted to a salary band. The bounds are absolute
# amounts (60000 = $60K), matching how 'Min Salary'/'Max Salary' were scaled by 1000.
recommended_jobs_combined = recommend_jobs_combined("python, machine learning, data analysis", 60000, 100000)
print(recommended_jobs_combined)


              Job Title               Company  \
1182591  Data Scientist                  AGCO   
88352    Data Scientist           Wells Fargo   
659611   Data Scientist                Sempra   
939425   Data Scientist  State Farm Insurance   
134294   Data Scientist          Ryder System   

                                                    skills Salary Range  \
1182591  machine learning algorithms python programming...    $63K-$95K   
88352    machine learning algorithms python programming...    $60K-$84K   
659611   machine learning algorithms python programming...    $61K-$86K   
939425   machine learning algorithms python programming...    $62K-$98K   
134294   machine learning algorithms python programming...    $64K-$91K   

         Skill Similarity  
1182591          0.609868  
88352            0.609868  
659611           0.609868  
939425           0.609868  
134294           0.609868  


In [10]:
# Example: single-keyword query with a wide salary band (10000 to 120000)
recommended_jobs_combined = recommend_jobs_combined("electrical", 10000, 120000)
recommended_jobs_combined

Unnamed: 0,Job Title,Company,skills,Salary Range,Skill Similarity
1160895,Electrical Designer,Constellation Energy,electrical engineering circuit design electron...,$55K-$112K,0.467321
348030,Electrical Designer,AutoZone,electrical engineering circuit design electron...,$58K-$88K,0.467321
335545,Electrical Designer,Cisco Systems,electrical engineering circuit design electron...,$60K-$97K,0.467321
762816,Electrical Designer,Global Partners,electrical engineering circuit design electron...,$58K-$91K,0.467321
1068505,Electrical Designer,Pacific Life,electrical engineering circuit design electron...,$65K-$117K,0.467321


In [11]:
import spacy
import re

# Load spaCy's small pre-trained English pipeline; the 'en_core_web_sm' model
# has to be installed in the environment for this call to succeed.
nlp = spacy.load('en_core_web_sm')

# Example resume text used by the parsing demo in this cell
resume_text = """
John Doe
Python Developer with 5 years of experience in machine learning and web development.
Skills: Python, Django, TensorFlow, Docker, AWS
Experience: Python Developer at ABC Corp (2018-2023)
Education: BSc in Computer Science, XYZ University
Contact: johndoe@example.com, +1234567890
"""

# Preprocess the resume text
def clean_resume(text):
    """Strip contact details from raw resume text.

    Deletes every whitespace-free token containing '@' (e-mail addresses) and
    every '+' followed by nine or more digits (phone numbers). All other text
    is returned unchanged.
    """
    contact_patterns = (
        r'\S+@\S+',   # e-mail addresses
        r'\+\d{9,}',  # phone numbers
    )
    for pattern in contact_patterns:
        text = re.sub(pattern, '', text)
    return text

# Run the spaCy pipeline on the resume after e-mails/phone numbers are stripped
doc = nlp(clean_resume(resume_text))

# Print every named entity the model detected as "LABEL: text"
for ent in doc.ents:
    print(f"{ent.label_}: {ent.text}")

# Extract the labelled sections with regexes. These search the raw
# `resume_text` (not the cleaned text); each findall returns a list holding
# the remainder of the line after the label.
skills = re.findall(r"Skills: (.+)", resume_text)
experience = re.findall(r"Experience: (.+)", resume_text)
education = re.findall(r"Education: (.+)", resume_text)

print("Skills:", skills)
print("Experience:", experience)
print("Education:", education)


PERSON: John Doe
Python
DATE: 5 years
PERSON: Django
ORG: TensorFlow
PERSON: Docker
ORG: ABC Corp
DATE: 2018-2023
ORG: Computer Science
ORG: XYZ University
Skills: ['Python, Django, TensorFlow, Docker, AWS']
Experience: ['Python Developer at ABC Corp (2018-2023)']
Education: ['BSc in Computer Science, XYZ University']


In [12]:
def parse_resume(resume_text):
    """Extract the 'Skills', 'Experience' and 'Education' lines from a resume.

    Each section is expected on a single line of the form "Label: value".
    Only the first occurrence of each label is used. The extraction is purely
    regex-based, so no spaCy pipeline is run here.

    Args:
        resume_text: Raw resume text.

    Returns:
        dict with keys 'skills', 'experience' and 'education'. Each value is
        the text following the label with surrounding whitespace removed, or
        '' when the label is absent.
    """
    def first_value(label):
        # Tolerate any amount of spaces/tabs after the colon; `.` stops at the
        # newline, and strip() drops a trailing '\r' from CRLF input.
        match = re.search(rf"{label}:[ \t]*(.+)", resume_text)
        return match.group(1).strip() if match else ''

    return {
        'skills': first_value('Skills'),
        'experience': first_value('Experience'),
        'education': first_value('Education'),
    }

def recommend_jobs(resume_text, job_descriptions_df, top_n=5):
    """Recommend postings whose description best matches a resume's skills.

    NOTE: this reuses the name of the skills-string based `recommend_jobs`
    defined earlier in the notebook and replaces it once this cell has run.

    Args:
        resume_text: Raw resume text; only its 'Skills:' line (as returned by
            `parse_resume`) is used as the query.
        job_descriptions_df: DataFrame with at least 'Job Description',
            'Job Title', 'Company' and 'skills' columns.
        top_n: Maximum number of postings to return (default 5).

    Returns:
        Rows of `job_descriptions_df` restricted to 'Job Title', 'Company'
        and 'skills', ordered from most to least similar. Empty when `top_n`
        is non-positive or the resume has no skills line, because every
        similarity would be zero and any ranking would be arbitrary.
    """
    columns = ['Job Title', 'Company', 'skills']

    # Parse the resume
    resume_data = parse_resume(resume_text)
    resume_skills = resume_data['skills']

    if top_n <= 0 or not resume_skills.strip():
        return job_descriptions_df.iloc[0:0][columns]

    # Job descriptions from the dataset
    job_descriptions = job_descriptions_df['Job Description'].fillna('').tolist()

    # Learn the vocabulary/IDF weights from the postings only, then project
    # the resume into that space; the query is not part of the corpus.
    vectorizer = TfidfVectorizer(stop_words='english')
    job_vectors = vectorizer.fit_transform(job_descriptions)
    resume_vector = vectorizer.transform([resume_skills])

    # Calculate similarity of the resume to every posting
    cosine_sim = cosine_similarity(resume_vector, job_vectors).flatten()

    # argsort is ascending: take the last top_n positions, then reverse them
    top_matches = cosine_sim.argsort()[-top_n:][::-1]

    return job_descriptions_df.iloc[top_matches][columns]

In [13]:
# Example usage: rank postings against a resume. Only the 'Skills:' line of
# the resume feeds the similarity query.
resume_text = """
Harry Garry
Python Developer with 1 years of experience in machine learning.
Skills: Python, Javascript, TensorFlow
Experience: Python Developer at Osborn Corp (2018-2023)
Education: Bachelor in Computer Science, Harvard University
Contact: HG@example.com, +4421
"""

recommended_jobs = recommend_jobs(resume_text, df)
print(recommended_jobs)

                  Job Title                     Company  \
176479   Front-End Engineer                     Hershey   
299718   Front-End Engineer  Indiabulls Housing Finance   
502858   Front-End Engineer  Farmers Insurance Exchange   
1426315  Front-End Engineer      Tractor Supply Company   
641987   Front-End Engineer      Mondelez International   

                                                    skills  
176479   javascript programming frontend development fr...  
299718   javascript programming frontend development fr...  
502858   javascript programming frontend development fr...  
1426315  javascript programming frontend development fr...  
641987   javascript programming frontend development fr...  
