In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the job postings; on_bad_lines='skip' drops malformed CSV rows instead of raising
jobs_df = pd.read_csv('it-analysis-main/DataStore/jobData.csv', on_bad_lines='skip')

# Sample user data: every key holds a one-element list, so the resulting
# DataFrame has exactly one row (one user profile)
data = {
    'Skills': ['c++, rust'],
    'Experience': ['mid'],
    'Education': ['Bachelor in Computer Science'],
    'Desired Salary': [80000]
}

# Create the user DataFrame
user_df = pd.DataFrame(data)

# Function to preprocess job and user data
def preprocess_data(jobs_df, user_df):
    """Prepare the job postings and the user profile for matching.

    Neither input frame is modified; both results are new DataFrames.

    Args:
        jobs_df: Job postings. Must contain a 'Salary' column.
        user_df: User profiles. Must contain an 'Experience' column.

    Returns:
        A ``(processed_jobs_df, processed_user_df)`` tuple where

        * ``processed_jobs_df`` is ``jobs_df`` with missing values replaced
          by ``'Unknown'`` and an added ``'normalized_salary'`` column, and
        * ``processed_user_df`` is ``user_df`` with an added
          ``'experience_level'`` column.
    """
    # Handle missing values (fillna returns a new frame, the input is untouched)
    processed_jobs_df = jobs_df.fillna('Unknown')

    # Work on a copy so the caller's user frame is not mutated. The original
    # code never defined this name locally and relied on a leftover global.
    processed_user_df = user_df.copy()

    # normalize_salary only returns the rows it could parse; assigning its
    # 'Salary' Series aligns on the index, so every other row becomes NaN.
    normalized = normalize_salary(processed_jobs_df[['Salary']])
    processed_jobs_df['normalized_salary'] = normalized['Salary']

    processed_user_df['experience_level'] = categorize_experience(
        processed_user_df['Experience']
    )

    return processed_jobs_df, processed_user_df

# Function to normalize salary
def normalize_salary(salary_column):
    """Extract the numeric lower bound from ``"<low> <high>"`` salary strings.

    Args:
        salary_column: DataFrame with a string-valued 'Salary' column. It is
            not modified.

    Returns:
        A new DataFrame holding only the rows whose 'Salary' consists of
        exactly two whitespace-separated tokens. In that frame 'Salary' is
        replaced by the first token converted to a number (NaN when the token
        is not numeric). The original index labels are kept, so the result
        can be aligned back onto the source frame.
    """
    # One pass: capture the first token of "<token> <token>" values. Rows that
    # do not match (including missing values) yield NaN instead of raising.
    first_token = salary_column['Salary'].str.extract(
        r'^(\S+)\s+\S+$', expand=False
    )
    matched = first_token.notna()

    # Copy the selected rows so the assignment below never writes into a view
    # of the caller's frame.
    result = salary_column.loc[matched].copy()

    # Converting to numeric and handling errors
    result['Salary'] = pd.to_numeric(first_token[matched], errors='coerce')

    return result


# Function to categorize experience
def categorize_experience(experience_column):
    """Map free-form experience labels to 'junior', 'mid-level' or 'senior'.

    Matching is case-insensitive and ignores surrounding whitespace.

    Args:
        experience_column: Iterable of experience labels (e.g. a Series).

    Returns:
        A list with one level per input entry, in input order. Labels that
        are not recognised, and entries that are not strings (such as None
        or NaN), map to 'unknown'.
    """
    level_by_label = {
        'entry': 'junior',
        'junior': 'junior',
        'intern': 'junior',
        'mid': 'mid-level',
        'mid-level': 'mid-level',
        'intermediate': 'mid-level',
        'senior': 'senior',
        'expert': 'senior',
        'lead': 'senior',
    }

    categorized_experience = []
    for experience in experience_column:
        if isinstance(experience, str):
            level = level_by_label.get(experience.strip().lower(), 'unknown')
        else:
            # Missing values arrive as None/NaN and have no .lower()
            level = 'unknown'
        categorized_experience.append(level)

    return categorized_experience

# Function for content-based filtering
def content_based_filtering(user_profile, jobs_df):
    """Rank jobs by TF-IDF cosine similarity to the user's skills.

    Note: a 'combined_info' column is added to ``jobs_df`` in place; it is
    also present in the returned rows.

    Args:
        user_profile: Mapping-like object (e.g. one DataFrame row) whose
            'Skills' entry is the text to match against the jobs.
        jobs_df: Job postings with 'Position', 'Used Technologies' and
            'Experience' columns.

    Returns:
        The rows of ``jobs_df`` selected by ``get_top_matching_jobs``.
    """
    # Build one text document per job: position, technologies, experience
    job_text = jobs_df['Position'] + ' ' + jobs_df['Used Technologies']
    job_text = job_text + ' ' + jobs_df['Experience']
    jobs_df['combined_info'] = job_text

    # Fit the vocabulary on the job documents, then project the user's skills
    # into the same vector space
    vectorizer = TfidfVectorizer()
    job_vectors = vectorizer.fit_transform(jobs_df['combined_info'])
    skills_vector = vectorizer.transform([user_profile['Skills']])

    # One row of similarities: the user against every job
    similarities = cosine_similarity(skills_vector, job_vectors)

    return get_top_matching_jobs(similarities, jobs_df)

def get_top_matching_jobs(cosine_sim, jobs_df, top_n=10):
    """Return the jobs with the highest similarity scores.

    Args:
        cosine_sim: 2-D similarity result; only its first row is used, and
            entry ``i`` of that row is the score of the job at position ``i``
            of ``jobs_df``.
        jobs_df: Job postings, in the same order as the scores.
        top_n: Maximum number of jobs to return. Defaults to 10.

    Returns:
        Up to ``top_n`` rows of ``jobs_df`` ordered by descending score.
        Jobs with equal scores keep their original relative order.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    scores = cosine_sim[0]

    # sorted() is stable even with reverse=True, so ties stay in input order
    ranked_positions = sorted(
        range(len(scores)), key=lambda position: scores[position], reverse=True
    )

    # Scores are positional, so select rows by position rather than by label
    return jobs_df.iloc[ranked_positions[:top_n]]

# Preprocess the data: yields the processed job frame and the processed user frame
processed_jobs_df, processed_user_df = preprocess_data(jobs_df, user_df)
# Debugging aids (uncomment to inspect the preprocessing result):
# print(processed_jobs_df.columns)
# print(processed_jobs_df[['Salary', 'normalized_salary']])

# Get job recommendations for the first row of the user frame
recommended_jobs = content_based_filtering(processed_user_df.iloc[0], processed_jobs_df)

# Show the first rows of the recommended jobs (rendered as the notebook cell output)
recommended_jobs.head()

Unnamed: 0,Position,Company,Experience,Salary,Used Technologies,Optional Technologies,normalized_salary,combined_info
0,Embedded Software Engineer,Fluke Corportaion,mid,13000 18000,"{'C': 'regular', 'C++': 'regular', 'Linux': 'r...",-,13000.0,"Embedded Software Engineer {'C': 'regular', 'C..."
1,Azure Platform Engineer (AI Department),Procter & Gamble,mid,-,"{'Microsoft Azure': 'advanced', 'Python': 'adv...",-,,Azure Platform Engineer (AI Department) {'Micr...
2,.NET Developer,UN7,mid,20000 28000,{'.Net': 'advanced'},-,20000.0,.NET Developer {'.Net': 'advanced'} mid
3,Spec. Projektant Robotyzacji Procesów,Credit Agricole Bank Polska S.A.,mid,-,"{'C#': 'regular', 'VB.Net': 'regular', 'Java':...",-,,Spec. Projektant Robotyzacji Procesów {'C#': '...
4,HT Functional Consultant with ERP Industry Sol...,Accenture,mid,-,"{'English': 'master', 'ERP': 'advanced', 'Prob...",-,,HT Functional Consultant with ERP Industry Sol...
