In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
# from sklearn.decomposition import SVD
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture



In [2]:
# Read linkedin_dataset.csv into a DataFrame, then preview its skills_desc column
# (the bare last expression is what the notebook renders as the cell output).
linkedin_dataset = pd.read_csv(filepath_or_buffer='linkedin_dataset.csv')
linkedin_dataset.loc[:, 'skills_desc']

0                                                      NaN
1                                                      NaN
2                                                      NaN
3        Bachelor's Degree in Mechanical Engineering pr...
4                                                      NaN
                               ...                        
95119                                                  NaN
95120                                                  NaN
95121                                                  NaN
95122                                                  NaN
95123    Must be a seasoned stylist with an existing bo...
Name: skills_desc, Length: 95124, dtype: object

In [3]:
# Columns whose missing values are replaced by the "Not Specified" placeholder.
cols = ['skills_desc', 'type', 'pay_period', 'currency', 'compensation_type', 'posting_domain',
        'application_url', 'formatted_experience_level', 'company_size', 'zip_code', 'address',
        'state', 'url', 'city', 'country', 'name']

# Columns whose missing values are replaced by 0.
cols_fill_zero = ['applies', 'views', 'follower_count', 'employee_count']

# One column -> fill value mapping, so every replacement happens in a single
# DataFrame.fillna call. The previous `df[col].fillna(..., inplace=True)` form is
# chained assignment: it warns on pandas 2.x and leaves the parent frame
# untouched when Copy-on-Write is enabled.
fill_values = {col: "Not Specified" for col in cols}
fill_values.update({col: 0 for col in cols_fill_zero})
fill_values['remote_allowed'] = "Unknown"
fill_values['description_x'] = "Not Specified"

# DataFrame.fillna silently skips dict keys that are not columns; fail loudly
# instead, as the per-column lookup used to.
missing_cols = [col for col in fill_values if col not in linkedin_dataset.columns]
if missing_cols:
    raise KeyError(f"Columns missing from linkedin_dataset: {missing_cols}")

linkedin_dataset = linkedin_dataset.fillna(value=fill_values)

# Drop fully duplicated rows. The explicit copy makes the result an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
linkedin_dataset_cleaned = linkedin_dataset.drop_duplicates().copy()


In [4]:
# pandas, TfidfVectorizer and cosine_similarity are already imported in the
# notebook's first cell, so they are not re-imported here.

# Text used to categorise each posting: the description followed by the skills
# text twice.
# NOTE(review): repeating skills_desc raises the term frequency of skills words
# in the TF-IDF vectors; it is kept unchanged here - confirm it is intentional.
posting_text = (
    linkedin_dataset_cleaned['description_x']
    + ' ' + linkedin_dataset_cleaned['skills_desc']
    + ' ' + linkedin_dataset_cleaned['skills_desc']
)

# Candidate categories; each posting is labelled with the one whose name is
# most similar to the posting text.
job_types = [
    'INFORMATION-TECHNOLOGY',
    'PUBLIC-RELATIONS',
    'SALES',
    'TEACHER',
    'FINANCE',
    'FITNESS',
    'HEALTHCARE',
    'HR',
    'CONSTRUCTION',
    'CONSULTANT',
    'DESIGNER',
    'DIGITAL-MEDIA',
    'ENGINEERING',
    'AVIATION',
    'BANKING',
    'BPO',
    'BUSINESS-DEVELOPMENT',
    'CHEF',
    'ACCOUNTANT',
    'ADVOCATE',
    'AGRICULTURE',
    'APPAREL',
    'ARTS',
    'AUTOMOBILE'
]

# Label for postings whose text shares no term with any category name. Without
# it, an all-zero similarity row makes argmax return index 0 and the posting is
# silently labelled 'INFORMATION-TECHNOLOGY'.
UNMATCHED_JOB_TYPE = 'Not Specified'

vectorizer = TfidfVectorizer()
# Collapse runs of whitespace before vectorising.
combined_text = posting_text.str.split().str.join(' ')
X = vectorizer.fit_transform(combined_text)
# One row per posting, one column per entry of job_types.
similarity_matrix = cosine_similarity(X, vectorizer.transform(job_types))

best_match = similarity_matrix.argmax(axis=1)
best_score = similarity_matrix.max(axis=1)
predicted_job_types = [
    job_types[idx] if score > 0 else UNMATCHED_JOB_TYPE
    for idx, score in zip(best_match, best_score)
]

# assign() returns a new frame, so re-running this cell is idempotent and no
# column is written onto a frame that pandas may have flagged as a copy.
linkedin_dataset_cleaned = linkedin_dataset_cleaned.assign(
    combined_text=posting_text,
    PredictedJobType=predicted_job_types,
)



In [5]:
def predict_job_type(skills_resume, category, years_of_experience, education, summary,
                     top_n=5, csv_file_path='best_predicted_jobs.csv'):
    """Return the top postings of one category for a resume and save them to CSV.

    Postings in ``linkedin_dataset_cleaned`` whose ``PredictedJobType`` equals
    ``category`` are scored by TF-IDF cosine similarity between their
    ``skills_desc`` text and the resume text. They are then sorted by
    ``med_salary``, ``max_salary``, ``min_salary`` and the similarity score (all
    descending, in that priority order), de-duplicated on
    ``(company_id, title)`` and truncated to ``top_n`` rows.

    Because salary columns come first in the sort, the similarity score only
    breaks ties between postings with identical salary values.

    Parameters
    ----------
    skills_resume : str
        Skills text taken from the resume.
    category : str
        Value of ``PredictedJobType`` to restrict the search to.
    years_of_experience : int or str
        Converted with ``str()`` and added to the resume text.
    education : str
        Education text added to the resume text.
    summary : str
        Summary text added to the resume text.
    top_n : int, default 5
        Number of postings to keep.
    csv_file_path : str, default 'best_predicted_jobs.csv'
        Destination of the CSV export.

    Returns
    -------
    pandas.DataFrame
        The selected postings without the helper columns ``combined_text``,
        ``PredictedJobType`` and ``Similarity_skills``.

    Raises
    ------
    ValueError
        If no posting has ``PredictedJobType == category``.
    """
    # Join with spaces so neighbouring parts stay separate tokens; plain
    # concatenation fused e.g. "python" + "5" + "Bachelor" into "python5Bachelor".
    resume_parts = (skills_resume, years_of_experience, education, summary)
    skills = ' '.join(str(part) for part in resume_parts if part is not None)

    in_category = linkedin_dataset_cleaned['PredictedJobType'] == category
    filtered_dataset = linkedin_dataset_cleaned[in_category]
    if filtered_dataset.empty:
        # Fitting TF-IDF on zero documents fails with an unrelated-looking
        # "empty vocabulary" error; report the real cause instead.
        raise ValueError(f'No job postings found for category {category!r}.')

    # Fit the vocabulary on this category's skills text only, then project the
    # resume into the same space.
    vectorizer = TfidfVectorizer()
    skills_vectorized = vectorizer.fit_transform(filtered_dataset['skills_desc'])
    resume_vectorized = vectorizer.transform([skills])

    similarity_scores = cosine_similarity(resume_vectorized, skills_vectorized).flatten()

    # assign() builds a new frame rather than writing into a filtered slice,
    # which avoids SettingWithCopyWarning.
    scored_dataset = filtered_dataset.assign(Similarity_skills=similarity_scores)

    sorted_dataset = scored_dataset.sort_values(
        by=['med_salary', 'max_salary', 'min_salary', 'Similarity_skills'],
        ascending=False,
    )

    # Keep only the best-ranked row per (company, title) pair.
    unique_sorted_dataset = sorted_dataset.drop_duplicates(subset=['company_id', 'title'])

    best_jobs = unique_sorted_dataset.head(top_n).drop(
        columns=['combined_text', 'PredictedJobType', 'Similarity_skills']
    )
    best_jobs.to_csv(csv_file_path, index=False)
    print(f'Data has been written to {csv_file_path}.')
    return best_jobs
