In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


In [2]:
jobs = pd.read_csv('postings.csv')

In [3]:
skills = pd.read_csv('job_skills.csv')
industries = pd.read_csv('job_industries.csv')
industries_name = pd.read_csv('industries.csv')
skill_name = pd.read_csv('skills.csv')
company_industries_name = pd.read_csv('company_industries.csv')
company_specialities_name = pd.read_csv('company_specialities.csv')

# Merge data

In [4]:
jobs = jobs.merge(skills, on='job_id', suffixes=('_jobs', '_skills'))

In [5]:
jobs = jobs.merge(industries, on='job_id', suffixes=('_jobs1', '_industries'))
jobs = jobs.merge(industries_name, on='industry_id', suffixes=('_jobs2', '_industries'))

In [6]:
jobs = jobs.merge(skill_name, on='skill_abr', suffixes=('_jobs3', '_skillname'))
jobs = jobs.merge(company_industries_name, on='company_id', suffixes=('_jobs4', '_companyindustriesname'))
jobs = jobs.merge(company_specialities_name, on='company_id', suffixes=('_jobs5', '_companyspecialitiesname'))

# Select relevant columns

In [7]:
columns_to_keep = [
    'company_name', 'title', 'description', 'location', 'job_posting_url', 'expiry',
    'skill_abr', 'skill_name', 'industry_name', 'industry'
]
jobs = jobs[columns_to_keep]


In [8]:
def collapse(L):
    L1 = [i.replace(" ","") for i in L]
    return L1

# Clean the data
jobs['title'] = jobs['title'].apply(lambda x: ''.join(collapse(x.split())))
jobs['skill_abr'] = jobs['skill_abr'].apply(lambda x: ''.join(collapse(x.split())))
jobs['skill_name'] = jobs['skill_name'].apply(lambda x: ''.join(collapse(x.split())))
jobs['industry'] = jobs['industry'].apply(lambda x: ''.join(collapse(x.split())))

In [22]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub('<.*?>', '', text)  # Remove HTML tags
    text = re.sub('[^a-z\s]', '', text)  # Remove special characters and digits
    return text

# Apply preprocessing
jobs['description'] = jobs['description'].apply(preprocess_text)

# Combine relevant columns to create tags
jobs['tags'] = (jobs['title'] + ' ' + jobs['skill_name'] + ' ' +
                jobs['industry_name'] + ' ' + jobs['description'])


In [23]:
jobs = jobs.sample(n=12000).reset_index(drop=True)

In [28]:
jobs.fillna('', inplace=True)


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
tfidf = TfidfVectorizer(stop_words='english')
vector = tfidf.fit_transform(jobs['tags'])

In [31]:
similarity = cosine_similarity(vector)

In [32]:
def recommend(job_title):
    job_title = preprocess_text(job_title)  # Preprocess the input
    matching_titles = jobs[jobs['title'].str.contains(job_title, case=False, na=False)]
    
    if matching_titles.empty:
        print("Job title not found. Please check the input.")
        return

    index = matching_titles.index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
    recommended_jobs = [jobs.iloc[i[0]].title for i in distances[1:6]]

    print(f"Recommendations for '{job_title}':")
    for job in recommended_jobs:
        print(job)


In [37]:
def recommend(job_title, top_n=5):
    job_title = preprocess_text(job_title)  # Preprocess the input
    matching_titles = jobs[jobs['title'].str.contains(job_title, case=False, na=False)]
    
    if matching_titles.empty:
        print("Job title not found. Please check the input.")
        return
    
    index = matching_titles.index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
    
    recommended_jobs = []
    for i in distances[1:top_n+1]:
        job = jobs.iloc[i[0]]
        recommended_jobs.append({
            'Title': job['title'],
            'Company': job['company_name'],
            'Location': job['location'],
            'URL': job['job_posting_url']
        })
    
    print(f"Recommendations for '{job_title}':")
    for job in recommended_jobs:
        print(f"Title: {job['Title']}")
        print(f"Company: {job['Company']}")
        print(f"Location: {job['Location']}")
        print(f"URL: {job['URL']}")
        print("---")


In [38]:
recommend('Analyst')

Recommendations for 'analyst':
Title: PowerBIDeveloper
Company: Ascendo Resources
Location: Metro Jacksonville
URL: https://www.linkedin.com/jobs/view/3904429144/?trk=jobs_biz_prem_srch
---
Title: FinancialBusinessIntelligenceDeveloper
Company: POWER Engineers
Location: Boise, ID
URL: https://www.linkedin.com/jobs/view/3906259167/?trk=jobs_biz_prem_srch
---
Title: FinancialSystemsAnalyst
Company: Creative Financial Staffing (CFS)
Location: Grand Rapids, MI
URL: https://www.linkedin.com/jobs/view/3885858365/?trk=jobs_biz_prem_srch
---
Title: Sr.BusinessAnalyst1
Company: Skyworks Solutions, Inc.
Location: Irvine, CA
URL: https://www.linkedin.com/jobs/view/3901963396/?trk=jobs_biz_prem_srch
---
Title: SharepointDeveloper
Company: BCforward
Location: Houston, TX
URL: https://www.linkedin.com/jobs/view/3902911945/?trk=jobs_biz_prem_srch
---


In [39]:
# Improved recommend function
def recommend(job_title, location, top_n=5):
    job_title = preprocess_text(job_title)  # Preprocess the input job title
    location = preprocess_text(location)  # Preprocess the input location
    
    matching_jobs = jobs[(jobs['title'].str.contains(job_title, case=False, na=False)) &
                         (jobs['location'].str.contains(location, case=False, na=False))]
    
    if matching_jobs.empty:
        print("No matching jobs found for the given job title and location. Please check the input.")
        return
    
    index = matching_jobs.index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
    
    recommended_jobs = []
    for i in distances[1:top_n+1]:
        job = jobs.iloc[i[0]]
        recommended_jobs.append({
            'Title': job['title'],
            'Company': job['company_name'],
            'Location': job['location'],
            'URL': job['job_posting_url']
        })
    
    print(f"Recommendations for '{job_title}' in '{location}':")
    for job in recommended_jobs:
        print(f"Title: {job['Title']}")
        print(f"Company: {job['Company']}")
        print(f"Location: {job['Location']}")
        print(f"URL: {job['URL']}")
        print("---")

Recommendations for 'analyst' in 'new york':
Title: DataAnalyst
Company: Genpact
Location: San Antonio, Texas Metropolitan Area
URL: https://www.linkedin.com/jobs/view/3903848583/?trk=jobs_biz_prem_srch
---
Title: DataAnalyst
Company: Genpact
Location: San Antonio, Texas Metropolitan Area
URL: https://www.linkedin.com/jobs/view/3903848583/?trk=jobs_biz_prem_srch
---
Title: DataEngineer-Remote
Company: Orbit Recruitment Group 
Location: New York, NY
URL: https://www.linkedin.com/jobs/view/3905208625/?trk=jobs_biz_prem_srch
---
Title: DataEngineer
Company: V2Soft
Location: Auburn Hills, MI
URL: https://www.linkedin.com/jobs/view/3902351966/?trk=jobs_biz_prem_srch
---
Title: Office/Administration-DataAnalystLv3#:24-02340
Company: HireTalent - Diversity Staffing & Recruiting Firm
Location: Wayzata, MN
URL: https://www.linkedin.com/jobs/view/3887718042/?trk=jobs_biz_prem_srch
---


Recommendations for 'salesperson' in 'new york':
Title: OutsideSalesperson
Company: The Viscusi Group
Location: New York City Metropolitan Area
URL: https://www.linkedin.com/jobs/view/3741582728/?trk=jobs_biz_prem_srch
---
Title: OutsideSalesperson
Company: The Viscusi Group
Location: New York City Metropolitan Area
URL: https://www.linkedin.com/jobs/view/3741582728/?trk=jobs_biz_prem_srch
---
Title: OutsideSalesperson
Company: The Viscusi Group
Location: New York City Metropolitan Area
URL: https://www.linkedin.com/jobs/view/3741582728/?trk=jobs_biz_prem_srch
---


In [43]:
# ...
# (previous code remains the same)
# ...

# Improved recommend function
def recommend(job_title, location, top_n=5):
    job_title = preprocess_text(job_title)  # Preprocess the input job title
    location = preprocess_text(location)  # Preprocess the input location
    
    matching_jobs = jobs[(jobs['title'].str.contains(job_title, case=False, na=False)) &
                         (jobs['location'].str.contains(location, case=False, na=False))]
    
    if matching_jobs.empty:
        # If no exact matches found, try partial matching for location
        matching_jobs = jobs[(jobs['title'].str.contains(job_title, case=False, na=False)) &
                             (jobs['location'].apply(lambda x: location in preprocess_text(x)))]
    
    if matching_jobs.empty:
        print("No matching jobs found for the given job title and location. Please check the input.")
        return
    
    job_indices = matching_jobs.index
    distances = sorted([(i, similarity[i][job_indices].mean()) for i in job_indices], reverse=True, key=lambda x: x[1])
    
    recommended_jobs = []
    recommended_urls = set()
    for i, _ in distances:
        job = matching_jobs.loc[i]
        if job['job_posting_url'] not in recommended_urls:
            recommended_jobs.append({
                'Title': job['title'],
                'Company': job['company_name'],
                'Location': job['location'],
                'URL': job['job_posting_url']
            })
            recommended_urls.add(job['job_posting_url'])
        
        if len(recommended_jobs) == top_n:
            break
    
    print(f"Recommendations for '{job_title}' in '{location}':")
    for job in recommended_jobs:
        print(f"Title: {job['Title']}")
        print(f"Company: {job['Company']}")
        print(f"Location: {job['Location']}")
        print(f"URL: {job['URL']}")
        print("---")

# Test the recommendation function
recommend('Analyst', 'New York', top_n=3)

Recommendations for 'analyst' in 'new york':
Title: PrivateEquityAnalyst
Company: Augment Jobs
Location: New York City Metropolitan Area
URL: https://www.linkedin.com/jobs/view/3898162368/?trk=jobs_biz_prem_srch
---
Title: SeniorAnalyst,StrategicFinance
Company: OneTrust
Location: New York City Metropolitan Area
URL: https://www.linkedin.com/jobs/view/3888479494/?trk=jobs_biz_prem_srch
---
Title: JuniorKYC/AMLAnalyst
Company: ApTask
Location: New York, NY
URL: https://www.linkedin.com/jobs/view/3894290208/?trk=jobs_biz_prem_srch
---


In [44]:
pickle.dump(jobs,open('jobs_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))