# **Job recommendation system**

![Alt Text](job.webp)

## **Import important packages**

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

## **Dataset reading**

In [14]:
jd = pd.read_csv('E:\\ERU\\Level 4\\S1\\ML\\Project\\.venv\\Job Posting\\postings.csv')
jd.head()

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,This position requires a baseline understandin...,1712896000000.0,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,1713452000000.0,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0


## **Preprocessing**

### **Data preprocessing**

In [18]:
final_df = jd_clean.drop_duplicates(subset='description')
final_df.shape

(79633, 12)

In [15]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(jd):
    # Drop column: 'zip_code'
    jd = jd.drop(columns=['zip_code'])
    # Drop column: 'company_id'
    jd = jd.drop(columns=['company_id'])
    # Replace missing values with 0 in column: 'remote_allowed'
    jd = jd.fillna({'remote_allowed': 0})
    # Change column type to int16 for column: 'remote_allowed'
    jd = jd.astype({'remote_allowed': 'int16'})
    # Drop column: 'application_url'
    jd = jd.drop(columns=['application_url'])
    # Drop column: 'closed_time'
    jd = jd.drop(columns=['closed_time'])
    # Drop column: 'skills_desc'
    jd = jd.drop(columns=['skills_desc'])
    # Drop column: 'sponsored'
    jd = jd.drop(columns=['sponsored'])
    # Drop column: 'expiry'
    jd = jd.drop(columns=['expiry'])
    # Drop column: 'listed_time'
    jd = jd.drop(columns=['listed_time'])
    # Drop column: 'med_salary'
    jd = jd.drop(columns=['med_salary'])
    # Drop column: 'min_salary'
    jd = jd.drop(columns=['min_salary'])
    # Drop column: 'applies'
    jd = jd.drop(columns=['applies'])
    # Drop column: 'original_listed_time'
    jd = jd.drop(columns=['original_listed_time'])
    # Drop column: 'work_type'
    jd = jd.drop(columns=['work_type'])
    # Drop column: 'currency'
    jd = jd.drop(columns=['currency'])
    # Drop column: 'compensation_type'
    jd = jd.drop(columns=['compensation_type'])
    # Drop column: 'normalized_salary'
    jd = jd.drop(columns=['normalized_salary'])
    # Drop rows with missing data in column: 'company_name'
    jd = jd.dropna(subset=['company_name'])
    # Drop rows with missing data in column: 'description'
    jd = jd.dropna(subset=['description'])
    # Drop column: 'max_salary'
    jd = jd.drop(columns=['max_salary'])
    # Drop rows with missing data in column: 'views'
    jd = jd.dropna(subset=['views'])
    # Change column type to int16 for column: 'views'
    jd = jd.astype({'views': 'int16'})
    # Drop rows with missing data in column: 'formatted_experience_level'
    jd = jd.dropna(subset=['formatted_experience_level'])
    # Drop column: 'posting_domain'
    jd = jd.drop(columns=['posting_domain'])
    # Drop column: 'pay_period'
    jd = jd.drop(columns=['pay_period'])
    return jd

jd_clean = clean_data(jd.copy())
jd_clean.head()

Unnamed: 0,job_id,company_name,title,description,location,views,formatted_work_type,remote_allowed,job_posting_url,application_type,formatted_experience_level,fips
70,2147609789,Revature,Entry Level Oracle Financial Technology Consul...,About RevatureRevature is one of the largest a...,"East Chicago, IN",2,Full-time,0,https://www.linkedin.com/jobs/view/2147609789/...,ComplexOnsiteApply,Entry level,18089.0
84,2457183642,Galerie Candy and Gifts,Quality Assurance Manager,Galerie is seeking an experienced Quality Assu...,"Hebron, KY",2,Full-time,0,https://www.linkedin.com/jobs/view/2457183642/...,ComplexOnsiteApply,Mid-Senior level,21015.0
101,2989631782,ActOne Group,Administrative Assistant - CONCUR,Global Financial Services firm is seeking an e...,"New York, NY",1,Full-time,0,https://www.linkedin.com/jobs/view/2989631782/...,ComplexOnsiteApply,Associate,36061.0
102,3018278978,Aston Carter,Seasonal Office Administrator,Seasonal Office Admin\n\nResponsibilities\n\nW...,"Dayton, OR",279,Contract,0,https://www.linkedin.com/jobs/view/3018278978/...,OffsiteApply,Entry level,41071.0
109,3177010992,ABC Farigua Division,Customer Service Representative,We are seeking future agents to join our team!...,"Greater Orlando, FL",6,Full-time,0,https://www.linkedin.com/jobs/view/3177010992/...,ComplexOnsiteApply,Entry level,12095.0


### **Add industry name to the dataset**

In [16]:
ji = pd.read_csv('Job Posting\jobs\job_industries.csv')
ji.head()

Unnamed: 0,job_id,industry_id
0,3884428798,82
1,3887473071,48
2,3887465684,41
3,3887467939,82
4,3887467939,80


In [17]:
ii = pd.read_csv('Job Posting\mappings\industries.csv')
ii.head()

Unnamed: 0,industry_id,industry_name
0,1,Defense and Space Manufacturing
1,3,Computer Hardware Manufacturing
2,4,Software Development
3,5,Computer Networking Products
4,6,"Technology, Information and Internet"


In [19]:
final_df.head()

Unnamed: 0,job_id,company_name,title,description,location,views,formatted_work_type,remote_allowed,job_posting_url,application_type,formatted_experience_level,fips
70,2147609789,Revature,Entry Level Oracle Financial Technology Consul...,About RevatureRevature is one of the largest a...,"East Chicago, IN",2,Full-time,0,https://www.linkedin.com/jobs/view/2147609789/...,ComplexOnsiteApply,Entry level,18089.0
84,2457183642,Galerie Candy and Gifts,Quality Assurance Manager,Galerie is seeking an experienced Quality Assu...,"Hebron, KY",2,Full-time,0,https://www.linkedin.com/jobs/view/2457183642/...,ComplexOnsiteApply,Mid-Senior level,21015.0
101,2989631782,ActOne Group,Administrative Assistant - CONCUR,Global Financial Services firm is seeking an e...,"New York, NY",1,Full-time,0,https://www.linkedin.com/jobs/view/2989631782/...,ComplexOnsiteApply,Associate,36061.0
102,3018278978,Aston Carter,Seasonal Office Administrator,Seasonal Office Admin\n\nResponsibilities\n\nW...,"Dayton, OR",279,Contract,0,https://www.linkedin.com/jobs/view/3018278978/...,OffsiteApply,Entry level,41071.0
109,3177010992,ABC Farigua Division,Customer Service Representative,We are seeking future agents to join our team!...,"Greater Orlando, FL",6,Full-time,0,https://www.linkedin.com/jobs/view/3177010992/...,ComplexOnsiteApply,Entry level,12095.0


#### **Merge the datasets**

In [20]:
# Merge ji with ii to get industry_name
ji_industry = ji.merge(ii, on='industry_id', how='left')

# Merge final_df with ji_industry to get industry_name in final_df
final_df = final_df.merge(ji_industry[['job_id', 'industry_name']], on='job_id', how='left')

# Display the updated final_df
final_df.head()

Unnamed: 0,job_id,company_name,title,description,location,views,formatted_work_type,remote_allowed,job_posting_url,application_type,formatted_experience_level,fips,industry_name
0,2147609789,Revature,Entry Level Oracle Financial Technology Consul...,About RevatureRevature is one of the largest a...,"East Chicago, IN",2,Full-time,0,https://www.linkedin.com/jobs/view/2147609789/...,ComplexOnsiteApply,Entry level,18089.0,Financial Services
1,2457183642,Galerie Candy and Gifts,Quality Assurance Manager,Galerie is seeking an experienced Quality Assu...,"Hebron, KY",2,Full-time,0,https://www.linkedin.com/jobs/view/2457183642/...,ComplexOnsiteApply,Mid-Senior level,21015.0,Manufacturing
2,2457183642,Galerie Candy and Gifts,Quality Assurance Manager,Galerie is seeking an experienced Quality Assu...,"Hebron, KY",2,Full-time,0,https://www.linkedin.com/jobs/view/2457183642/...,ComplexOnsiteApply,Mid-Senior level,21015.0,Consumer Services
3,2989631782,ActOne Group,Administrative Assistant - CONCUR,Global Financial Services firm is seeking an e...,"New York, NY",1,Full-time,0,https://www.linkedin.com/jobs/view/2989631782/...,ComplexOnsiteApply,Associate,36061.0,Financial Services
4,3018278978,Aston Carter,Seasonal Office Administrator,Seasonal Office Admin\n\nResponsibilities\n\nW...,"Dayton, OR",279,Contract,0,https://www.linkedin.com/jobs/view/3018278978/...,OffsiteApply,Entry level,41071.0,Staffing and Recruiting


### **Text preprocessing**

In [22]:
import re
def cleanText(txt):
    cleanText = re.sub('http\S+\s', ' ', txt) # This line removes any URLs from the text
    cleanText = re.sub('RT|cc', ' ', cleanText) # This line removes any RTs or cc from the text
    cleanText = re.sub('#\S+\s', ' ', cleanText) # This line removes any hashtags from the text
    cleanText = re.sub('@\S+', '  ', cleanText) # This line removes any @ from the text
    cleanText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText) # This line removes any punctuations from the text
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText) # This line removes any non-ASCII characters from the text
    cleanText = re.sub('\s+', ' ', cleanText) # This line removes any extra whitespaces from the text
    return cleanText

In [23]:
final_df['description'] = final_df['description'].apply(lambda x: cleanText(x))

#### **Export the dataset for streamlit**

In [24]:
final_df.to_csv('final_job_postings.csv', index=False)

## **Recommendation**

In [25]:
myresume = """I am a Business Analyst specializing in developing dashboards,
reports, and data models to drive performance insights.
Proficient in Python, R, SQL, Excel, and Power BI, I excel in data
analysis, advanced analytics, and automation of data processes.
Skilled in statistical analysis and data visualization, I derive
insights for data-driven decisions. Experienced in designing and
optimizing data warehouse solutions, managing ETL processes,
and ensuring data integrity and security. Additionally, I hold a
CCNA certification from Cisco, showcasing my knowledge in
networking.
"""

In [26]:
# Combine the resume and job descriptions into a single list
documents = [myresume] + final_df['description'].tolist()

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

In [27]:
# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

In [28]:
# Calculate the cosine similarity between the resume and all job descriptions
cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

In [29]:
# Create a DataFrame with the job descriptions and their similarity scores
similarity_df = pd.DataFrame({
    'post': final_df['description'],
    'Post link': final_df['job_posting_url'],
    'similarity': cosine_similarities
})

In [30]:
# Sort the DataFrame by similarity scores in descending order
similarity_df = similarity_df.drop_duplicates(subset='post').sort_values(by='similarity', ascending=False)

In [31]:
# Display the top 10 most similar job descriptions
top_10_similar_jobs = similarity_df.head(10)
print(top_10_similar_jobs)

                                                     post  \
100418  Do you love to build innovate create and colla...   
72961   Position Overview We are excited to welcome a ...   
46801   Note Open to Contract Full Time Part Time Over...   
79993   Dice is the leading career destination for tec...   
7942    PLEASE NOTE This position is NOT C2C 3rd Party...   
95476   NOTE Proficiency in Salesforce Analytics or CR...   
83450   Data Engineer Remote Competitive Salary Bonus ...   
61963   Zaddy Solutions is doing a search for an onsit...   
1978    Must be a US citizen or GC holder Local to Orl...   
17987   Job Title Enterprise Data Modeler ArchitectLoc...   

                                                Post link  similarity  
100418  https://www.linkedin.com/jobs/view/3906230754/...    0.462453  
72961   https://www.linkedin.com/jobs/view/3904394509/...    0.457235  
46801   https://www.linkedin.com/jobs/view/3901951800/...    0.445846  
79993   https://www.linkedin.com/jobs/vi