In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
from nltk.util import ngrams
import string
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz



In [2]:
df = pd.read_csv(r"C:\Users\Owner\Downloads\Jobs_NYC_Postings.csv")

In [3]:
df2 = pd.read_excel(r"C:\Users\Owner\Documents\UserDataset1.xlsx")

In [4]:
number_words = {
    'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
}

In [5]:
def extract_experience_for_degree(description, user_degree):
    if isinstance(description, str):
        pattern = r'(\d+|one|two|three|four|five|six|seven|eight|nine|ten)[\s-]*(?:years?|yrs?)\s*(?:of\s*)?(?:experience)?'

        match = re.search(fr'{user_degree}.*?{pattern}', description, flags=re.IGNORECASE)
        
        if match:
            years_exp = match.group(1).lower()
            return number_words.get(years_exp, 0)
        else:
            return 0  
    else:
        return 0  


In [6]:
def fuzzy_string_match(s1, s2):
    return fuzz.token_set_ratio(s1.lower(), s2.lower())

In [7]:
def extract_education_and_experience(description):
    education_levels = ['baccalaureate degree', 'high school graduation','master\'s', 'doctorate']

    if isinstance(description, str):
        pattern = r'(\d+|one|two|three|four|five|six|seven|eight|nine|ten)[\s-]*(?:years?|yrs?)\s*(?:of\s*)?(?:experience)?'

        extracted_data = {}
        highest_level_found = None  

        for level in education_levels:
            regex = fr'{level}.*?(\d+|one|two|three|four|five|six|seven|eight|nine|ten)[\s-]*(?:years?|yrs?)\s*(?:of\s*)?(?:experience)?'
            match = re.search(regex, description, flags=re.IGNORECASE)
            
            if match:
                highest_level_found = level
                years_exp = match.group(1).lower()
                numeric_years = number_words.get(years_exp, 0)
                extracted_data[level] = numeric_years
            elif highest_level_found:
                extracted_data[level] = 0  
            else:
                extracted_data[level] = 0  

        return pd.Series(extracted_data)
    else:
        return pd.Series({level: 0 for level in education_levels})

In [8]:
education_experience_df = df['Minimum Qual Requirements'].apply(extract_education_and_experience)

In [9]:
df = pd.concat([df, education_experience_df], axis=1)

In [10]:
df['Preferred Skills'] = df['Preferred Skills'].apply(lambda x: '' if pd.isna(x) or isinstance(x, float) else x)


In [11]:
users1 = df2.dropna()

In [12]:
user_item_matrix = pd.read_csv('User Item Matrix.csv')

In [13]:
user_item_matrix = user_item_matrix.pivot(index='Job ID', columns='User Id', values='Score')

In [14]:
user_degree_to_job_degree = {
    'bachelor\'s': 'baccalaureate degree',
    'high school graduation': 'high school graduation',
    'master\'s': 'master\'s',
    'doctorate': 'doctorate'
}

In [15]:
recommendations = {}
for user_idx, user_row in users1.iterrows():
    user_id = user_row['User Id']
    user_skills = set([skill.strip() for skill in user_row['Skills'].split(',')])
    user_degree = user_row['Degree']
    user_experience = user_row['Years of experience']
    user_field = user_row['Field']
    user_current_job = user_row['Current Job']


    vectorizer = TfidfVectorizer(max_features=1000)
    user_skills_tf_idf = vectorizer.fit_transform([",".join(user_skills)])

    job_similarity = {}
    for idx, job_row in df.iterrows():
        job_id = job_row['Job ID']
        job_description = job_row['Preferred Skills']
        job_business_title = job_row['Business Title']
        job_career_level = job_row['Career Level']

        job_skills_tf_idf = vectorizer.transform([job_description])


        skills_similarity = cosine_similarity(user_skills_tf_idf, job_skills_tf_idf)[0][0]


        job_degree = user_degree_to_job_degree.get(user_degree.lower(), None)

        if job_degree:  
            required_experience = job_row[job_degree] 

            education_experience_similarity = 1 if user_experience >= required_experience else 0
        else:
            education_experience_similarity = 0  


        field_similarity = 1 if user_field.lower() in job_business_title.lower() else 0


        if user_current_job.lower() == 'student':
            if job_career_level.lower() in ['entry-level', 'student']:
                current_job_similarity = 1
            else:
                current_job_similarity = 0
        else:
            similarity_score = fuzzy_string_match(user_current_job.lower(), job_business_title.lower())
            if similarity_score >= 90:
                current_job_similarity = 1
            elif similarity_score >= 80:
                current_job_similarity = 0.75
            elif similarity_score >= 70:
                current_job_similarity = 0.50
            else:
                current_job_similarity = 0
                
        skills_weight = 0.2
        education_experience_weight = 0.15
        field_weight = 0.15
        current_job_weight = 0.5
        
        combined_score = (
            (skills_similarity * skills_weight) +
            (education_experience_similarity * education_experience_weight) +
            (field_similarity * field_weight) +
            (current_job_similarity * current_job_weight)) 
        
        job_similarity[job_id] = combined_score  
        

    recommendations[user_id] = sorted(job_similarity.items(), key=lambda x: x[1], reverse=True)[:5]

In [16]:
recommendations

{2: [(607141, 0.6414213562373096),
  (614216, 0.5),
  (607092, 0.5),
  (607036, 0.5),
  (608859, 0.39142135623730956)],
 3: [(599962, 0.85),
  (583000, 0.85),
  (611713, 0.85),
  (601864, 0.85),
  (600449, 0.85)],
 4: [(605118, 0.6897366596101028),
  (594373, 0.6889822365046137),
  (609025, 0.6889822365046137),
  (611713, 0.6732050807568878),
  (605039, 0.6414213562373096)],
 6: [(591504, 0.8132993161855453),
  (568499, 0.7914213562373096),
  (549966, 0.7914213562373096),
  (554402, 0.7914213562373096),
  (549925, 0.7914213562373096)],
 7: [(564205, 0.6825741858350554),
  (595959, 0.6732050807568878),
  (581069, 0.6632993161855453),
  (597841, 0.6632993161855453),
  (597884, 0.6632993161855453)],
 8: [(611713, 0.7),
  (609025, 0.6897366596101028),
  (605039, 0.6889822365046137),
  (576363, 0.6889822365046137),
  (600449, 0.6732050807568878)],
 10: [(598072, 0.8232050807568878),
  (598056, 0.8232050807568878),
  (591504, 0.8132993161855453),
  (568499, 0.7914213562373096),
  (549966, 0.

In [18]:
data = []
for user, jobs in recommendations.items():
    for job, score in jobs:
        data.append({'User Id': user, 'Job ID': job, 'Score': score})

dff = pd.DataFrame(data)
dff1 = dff.merge(df[['Job ID','Business Title']], how='left', left_on = 'Job ID', right_on='Job ID')
dff1 = dff1.merge(users1[['User Id','Current Job']], how='left', on='User Id')
dff1.to_csv('ContentBased.csv')

In [19]:
def calculate_user_similarity(user1, user2):
    similarity_score = 0
    
    if user1['Major'] == user2['Major']:
        similarity_score += 1
    
    if user1['Degree'] == user2['Degree']:
        similarity_score += 1
    
    if user1['Field'] == user2['Field']:
        similarity_score += 1
    
    if user1['Current Job'] == user2['Current Job']:
        similarity_score += 1
    
    return similarity_score


In [20]:
recommendations = {}
for user_idx, user_row in users1.iterrows():
    user_id = user_row['User Id']
    user_skills = set([skill.strip() for skill in user_row['Skills'].split(',')])
    user_degree = user_row['Degree']
    user_experience = user_row['Years of experience']
    user_field = user_row['Field']
    user_current_job = user_row['Current Job']


    vectorizer = TfidfVectorizer(max_features=1000)
    user_skills_tf_idf = vectorizer.fit_transform([",".join(user_skills)])

    job_similarity = {}
    for idx, job_row in df.iterrows():
        job_id = job_row['Job ID']
        job_description = job_row['Preferred Skills']
        job_business_title = job_row['Business Title']
        job_career_level = job_row['Career Level']

        job_skills_tf_idf = vectorizer.transform([job_description])


        skills_similarity = cosine_similarity(user_skills_tf_idf, job_skills_tf_idf)[0][0]


        job_degree = user_degree_to_job_degree.get(user_degree.lower(), None)

        if job_degree:  
            required_experience = job_row[job_degree]  

            education_experience_similarity = 1 if user_experience >= required_experience else 0
        else:
            education_experience_similarity = 0  

    
        field_similarity = 1 if user_field.lower() in job_business_title.lower() else 0

        
        if user_current_job.lower() == 'student':
            if job_career_level.lower() in ['entry-level', 'student']:
                current_job_similarity = 1
            else:
                current_job_similarity = 0
        else:
            similarity_score = fuzzy_string_match(user_current_job.lower(), job_business_title.lower())
            if similarity_score >= 90:
                current_job_similarity = 1
            elif similarity_score >= 80:
                current_job_similarity = 0.75
            elif similarity_score >= 70:
                current_job_similarity = 0.50
            else:
                current_job_similarity = 0
                
        collaborative_filtering_score = 0  
        max_collaborative_score = float('-inf')  

        for other_user_idx, other_user_row in users1.iterrows():
            other_user_id = other_user_row['User Id']
            if other_user_id != user_id:  
                user_similarity_score = calculate_user_similarity(user_row, other_user_row)
                other_user_rating_for_job = user_item_matrix.loc[job_id, other_user_id]
                collaborative_filtering_contribution = (user_similarity_score * other_user_rating_for_job)
                collaborative_filtering_score += collaborative_filtering_contribution  
                max_collaborative_score = max(max_collaborative_score, collaborative_filtering_contribution)


        if max_collaborative_score != 0:  
            collaborative_filtering_score /= max_collaborative_score  
            collaborative_filtering_score = min(collaborative_filtering_score, 1)  
        
        
        skills_weight = 0.2
        education_experience_weight = 0.2
        field_weight = 0.15
        current_job_weight = 0.2
        collaborative_filter_weight = 0.25
        
      
        combined_score = (
            (skills_similarity * skills_weight) +
            (education_experience_similarity * education_experience_weight) +
            (field_similarity * field_weight) +
            (current_job_similarity * current_job_weight) +
            (collaborative_filtering_score * collaborative_filter_weight)  
        )
        
        job_similarity[job_id] = combined_score  
        
        

    recommendations[user_id] = sorted(job_similarity.items(), key=lambda x: x[1], reverse=True)[:5]

In [21]:
recommendations

{2: [(607141, 0.5914213562373095),
  (608859, 0.49142135623730954),
  (607046, 0.49142135623730954),
  (606803, 0.45000000000000007),
  (601864, 0.45000000000000007)],
 3: [(599962, 0.8500000000000001),
  (583000, 0.8500000000000001),
  (611713, 0.8500000000000001),
  (601864, 0.8500000000000001),
  (600449, 0.8500000000000001)],
 4: [(605118, 0.6397366596101028),
  (594373, 0.6389822365046136),
  (609025, 0.6389822365046136),
  (611713, 0.6232050807568879),
  (586459, 0.6)],
 6: [(591504, 0.7632993161855453),
  (568499, 0.7414213562373095),
  (549966, 0.7414213562373095),
  (554402, 0.7414213562373095),
  (549925, 0.7414213562373095)],
 7: [(564205, 0.6325741858350554),
  (595959, 0.6232050807568879),
  (581069, 0.6132993161855452),
  (597841, 0.6132993161855452),
  (597884, 0.6132993161855452)],
 8: [(611713, 0.65),
  (609025, 0.6397366596101028),
  (605039, 0.6389822365046136),
  (576363, 0.6389822365046136),
  (600449, 0.6232050807568879)],
 10: [(598072, 0.7732050807568878),
  (59

In [22]:
data = []
for user, jobs in recommendations.items():
    for job, score in jobs:
        data.append({'User Id': user, 'Job ID': job, 'Score': score})

dff = pd.DataFrame(data)
dff1 = dff.merge(df[['Job ID','Business Title']], how='left', left_on = 'Job ID', right_on='Job ID')
dff1 = dff1.merge(users1[['User Id','Current Job']], how='left', on='User Id')
dff1.to_csv('Hybrid.csv')

In [43]:
d = {'User Id': [100], 'Major': ['Data Science'], 'Degree': ['masters'],'Field':['Data Analytics'], 'Current Job': ['Data Analyst'], 'Years of experience': [1], 'Skills': ['Python, SQL, Excel, Power BI, Machine Learning']}

In [44]:
users2 = pd.DataFrame(data=d)

In [45]:
recommendations = {}
for user_idx, user_row in users2.iterrows():
    user_id = user_row['User Id']
    user_skills = set([skill.strip() for skill in user_row['Skills'].split(',')])
    user_degree = user_row['Degree']
    user_experience = user_row['Years of experience']
    user_field = user_row['Field']
    user_current_job = user_row['Current Job']

  # Vectorize user skills with TF-IDF
    vectorizer = TfidfVectorizer(max_features=1000)
    user_skills_tf_idf = vectorizer.fit_transform([",".join(user_skills)])

    job_similarity = {}
    for idx, job_row in df.iterrows():
        job_id = job_row['Job ID']
        job_description = job_row['Preferred Skills']
        job_business_title = job_row['Business Title']
        job_career_level = job_row['Career Level']

        job_skills_tf_idf = vectorizer.transform([job_description])

    # Calculate cosine similarity between user skills and job description
        skills_similarity = cosine_similarity(user_skills_tf_idf, job_skills_tf_idf)[0][0]

    # Matching user's degree to the respective job education level
        job_degree = user_degree_to_job_degree.get(user_degree.lower(), None)

        if job_degree:  # If a matching job education level is found for the user's degree
            required_experience = job_row[job_degree]  # Years of experience required for the job education level
      # Check if user's experience matches job requirements
            education_experience_similarity = 1 if user_experience >= required_experience else 0
        else:
            education_experience_similarity = 0  # If no matching job education level is found

    # Define thresholds for matching user's field and current job with job titles
        field_similarity = 1 if user_field.lower() in job_business_title.lower() else 0

        # If user's current job is "Student", compare it with job career level
        if user_current_job.lower() == 'student':
            if job_career_level.lower() in ['entry-level', 'student']:
                current_job_similarity = 1
            else:
                current_job_similarity = 0
        else:
            similarity_score = fuzzy_string_match(user_current_job.lower(), job_business_title.lower())
            if similarity_score >= 90:
                current_job_similarity = 1
            elif similarity_score >= 80:
                current_job_similarity = 0.75
            elif similarity_score >= 70:
                current_job_similarity = 0.50
            else:
                current_job_similarity = 0
                
        collaborative_filtering_score = 0  # Initialize collaborative filtering score
        max_collaborative_score = float('-inf')  # Initialize max collaborative score

        for other_user_idx, other_user_row in users1.iterrows():
            other_user_id = other_user_row['User Id']
            if other_user_id != user_id:  # Exclude the primary user
                user_similarity_score = calculate_user_similarity(user_row, other_user_row)
                other_user_rating_for_job = user_item_matrix.loc[job_id, other_user_id]
                collaborative_filtering_contribution = (user_similarity_score * other_user_rating_for_job)
                collaborative_filtering_score += collaborative_filtering_contribution  # Accumulate collaborative filtering contribution
                max_collaborative_score = max(max_collaborative_score, collaborative_filtering_contribution)

# Normalize collaborative filtering score
        if max_collaborative_score != 0:  # Ensure no division by zero
            collaborative_filtering_score /= max_collaborative_score  # Normalize to the maximum collaborative contribution
            collaborative_filtering_score = min(collaborative_filtering_score, 1)  # Cap the collaborative score at 1ribution
        
        # Assign weights to different criteria
        skills_weight = 0.2
        education_experience_weight = 0.2
        field_weight = 0.15
        current_job_weight = 0.2
        collaborative_filter_weight = 0.25
        
        # Combine scores from different criteria including collaborative filtering
        combined_score = (
            (skills_similarity * skills_weight) +
            (education_experience_similarity * education_experience_weight) +
            (field_similarity * field_weight) +
            (current_job_similarity * current_job_weight) +
            (collaborative_filtering_score * collaborative_filter_weight)  # Include collaborative filtering as a separate criteria
        )
        
        job_similarity[job_id] = combined_score  # Using 'Job ID' as the key
        
        
    # Sort job postings by combined score and recommend top matches
    recommendations[user_id] = sorted(job_similarity.items(), key=lambda x: x[1], reverse=True)[:5]

In [46]:
recommendations

{100: [(600449, 0.6190308509457034),
  (609025, 0.6103567451474547),
  (594373, 0.6100264528397278),
  (611713, 0.6011857892036909),
  (605118, 0.5934274331201272)]}

In [47]:
data = []
for user, jobs in recommendations.items():
    for job, score in jobs:
        data.append({'User Id': user, 'Job ID': job, 'Score': score})

dff = pd.DataFrame(data)
dff1 = dff.merge(df[['Job ID','Business Title']], how='left', left_on = 'Job ID', right_on='Job ID')
dff1 = dff1.merge(users1[['User Id','Current Job']], how='left', on='User Id')

In [48]:
dff1

Unnamed: 0,User Id,Job ID,Score,Business Title,Current Job
0,100,600449,0.619031,Analyst Data and Systems,
1,100,600449,0.619031,Analyst Data and Systems,
2,100,609025,0.610357,"Data Analyst Policy, Research & Analysis",
3,100,609025,0.610357,"Data Analyst Policy, Research & Analysis",
4,100,594373,0.610026,Senior Business Intelligence Data Analyst,
5,100,594373,0.610026,Senior Business Intelligence Data Analyst,
6,100,611713,0.601186,"Data Analyst, Bureau of Hepatitis, HIV, and STI",
7,100,611713,0.601186,"Data Analyst, Bureau of Hepatitis, HIV, and STI",
8,100,605118,0.593427,Senior Data Analyst,
9,100,605118,0.593427,Senior Data Analyst,
