## Career Recommendation Model Using Deep Learning.

A career recommendation model that uses both content-based and collaborative filtering to make its recommendations.
The model takes in a user profile and applies collaborative filtering to find a career path in which the user has a high probability of success. That career path is then passed to the content-based filter, which finds other paths that are similar to it. The final output is three career paths: one from collaborative filtering and two from content-based filtering.

In [1]:
# necessary libraries
import pickle

import cloudpickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Exploratory Data Analysis

In [26]:
# Load the job postings; each posting is later labelled with a career field.
# The path is relative, so the CSV must sit in the notebook's working directory.
job_data = pd.read_csv("jobs_data.csv")
# Preview the first rows to check that the columns parsed as expected.
job_data.head()

Unnamed: 0,jobdescription,jobid,jobtitle,postdate,site_name,skills
0,Looking for Selenium engineers...must have sol...,Dice Id : 10110693,AUTOMATION TEST ENGINEER,1 hour ago,,SEE BELOW
1,The University of Chicago has a rapidly growin...,Dice Id : 10114469,Information Security Engineer,1 week ago,,"linux/unix, network monitoring, incident respo..."
2,"GalaxE.SolutionsEvery day, our solutions affec...",Dice Id : CXGALXYS,Business Solutions Architect,2 weeks ago,,"Enterprise Solutions Architecture, business in..."
3,Java DeveloperFull-time/direct-hireBolingbrook...,Dice Id : 10113627,"Java Developer (mid level)- FT- GREAT culture,...",2 weeks ago,,Please see job description
4,Midtown based high tech firm has an immediate ...,Dice Id : matrixga,DevOps Engineer,48 minutes ago,,"Configuration Management, Developer, Linux, Ma..."


In [41]:
# Load the career-path reference table (Field, Skills, Interest, difficulty).
# Its Skills and Interest text is what the TF-IDF vectoriser is fitted on later.
career_dataset = pd.read_csv("career_paths.csv")
# Preview the first rows to check that the columns parsed as expected.
career_dataset.head()

Unnamed: 0,Field,Skills,Interest,Unnamed: 3,difficulty
0,software engineering,"c++,java,python,testing and debugging,mobile d...","programming,problem solving,Project Management",,hard
1,data sciences,"python,R,sql,database,power bi,teablue,pandas,...","virtualization,research,Analytics,data analyst...",,hard
2,IT development,"Backend Frameworks,Documentation,Networking an...","Debugging and Troubleshooting,Deployment and D...",,hard
3,cyber security,"hacking,Cloud Computing,Operating System Secur...","Networking,Security,ethical hacking,Security A...",,medium
4,Robotics,"electronics, circuits, automation,dld,computer...","Mechatronics,Technology, Hardwares",,easy


In [23]:
# Summarise every column; for text columns describe() reports count/unique/top/freq,
# which exposes repeated descriptions and mostly-empty columns.
job_data.describe()

Unnamed: 0,jobdescription,jobid,jobtitle,postdate,site_name,skills
count,22000,22000,22000,22000,3490,21957
unique,20512,4415,15242,93,1,18967
top,Title IT Security Analyst – Threats and Vulner...,Dice Id : cybercod,Java Developer,2 weeks ago,www.dice.com,Telecommuting not available Travel not required
freq,10,335,174,3149,3490,141


In [14]:
# Count missing values per column to decide which rows/columns to drop.
job_data.isna().sum()

jobdescription        0
jobid                 0
jobtitle              0
postdate              0
site_name         18510
skills               43
dtype: int64

In [56]:
# Clean the job postings. The cell rebinds job_data, so it is written to be
# safe to re-run on an already-cleaned frame.

# Postings without skills text cannot contribute to the job_feature string
# built later. subset is given as a list, which every pandas version accepts.
job_data = job_data.dropna(subset=['skills'])
# Some descriptions appear more than once; keep only the first copy of each.
job_data = job_data.drop_duplicates(subset=['jobdescription'], keep='first')
# These columns are not used for categorisation. errors='ignore' turns the
# drop into a no-op when the columns are already gone (cell re-run), instead
# of raising KeyError.
job_data = job_data.drop(columns=['jobid', 'postdate', 'site_name'], errors='ignore')
job_data.describe()

Unnamed: 0,jobdescription,jobtitle,skills,job_feature,field
count,20470,20470,20470,20470,20470
unique,20470,15158,18929,20470,24
top,Looking for Selenium engineers...must have sol...,Java Developer,Telecommuting not available Travel not required,looking for selenium engineers...must have sol...,astronomy
freq,1,162,122,1,9206


In [57]:
# combining the job description and skills to form a job_feature
# fillna('') so a missing value yields an empty string instead of a NaN
# feature (str + NaN is NaN, which the vectoriser cannot transform)
job_data['job_feature'] = (
    job_data['jobdescription'].fillna('').str.lower()
    + ' '
    + job_data['skills'].fillna('').str.lower()
)

# the tf-idf vectoriser to categorise career path for each job post will make use of career paths dataset
# combining interests and skills into one text document per career field;
# fillna('') keeps a field usable when only one of the two columns is filled
career_dataset['combined_features'] = (
    career_dataset['Interest'].fillna('').str.lower()
    + ' '
    + career_dataset['Skills'].fillna('').str.lower()
)

# TF-IDF Vectorisation: the vocabulary is learnt from the career fields only,
# so job-posting words that never occur in a field description are ignored
vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(career_dataset['combined_features'])

# Calculating similarity scores between every pair of career fields
# (dot product of the TF-IDF rows)
content_similarity_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

# function to assign the closest career field(s) to a job posting
def categorise_path(job_feature, content_similarity_matrix, top_field=3):
    """Return the career fields whose TF-IDF profile best matches ``job_feature``.

    Relies on the notebook-level ``vectorizer``, ``tfidf_matrix`` and
    ``career_dataset`` objects being defined before the call.

    Args:
        job_feature: Text describing the job (description plus skills).
        content_similarity_matrix: Not used by the computation; kept in the
            signature so existing calls keep working.
        top_field: Maximum number of fields to return.

    Returns:
        A list of ``Field`` values ordered from most to least similar.
        Empty when ``top_field`` is zero or negative.
    """
    # Guard: scores.argsort()[-0:] is the *whole* array, so without this check
    # top_field=0 would return every field instead of none.
    if top_field <= 0:
        return []

    job_vector = vectorizer.transform([job_feature])
    scores = linear_kernel(job_vector, tfidf_matrix).flatten()
    # argsort is ascending: take the last top_field indices and reverse them
    # so the best match comes first
    top_field_indices = scores.argsort()[-top_field:][::-1]
    field = career_dataset['Field'].iloc[top_field_indices].to_list()

    return field

# Label every posting with its single best-matching career field, i.e. the
# first entry of the ranked list returned by categorise_path.
feats = [
    categorise_path(feature_text, content_similarity_matrix)[0]
    for feature_text in job_data['job_feature']
]

job_data['field'] = feats

job_data.head()

Unnamed: 0,jobdescription,jobtitle,skills,job_feature,field
0,Looking for Selenium engineers...must have sol...,AUTOMATION TEST ENGINEER,SEE BELOW,looking for selenium engineers...must have sol...,software engineering
1,The University of Chicago has a rapidly growin...,Information Security Engineer,"linux/unix, network monitoring, incident respo...",the university of chicago has a rapidly growin...,physics
2,"GalaxE.SolutionsEvery day, our solutions affec...",Business Solutions Architect,"Enterprise Solutions Architecture, business in...","galaxe.solutionsevery day, our solutions affec...",data sciences
3,Java DeveloperFull-time/direct-hireBolingbrook...,"Java Developer (mid level)- FT- GREAT culture,...",Please see job description,java developerfull-time/direct-hirebolingbrook...,software engineering
4,Midtown based high tech firm has an immediate ...,DevOps Engineer,"Configuration Management, Developer, Linux, Ma...",midtown based high tech firm has an immediate ...,IT development


  (0, 595)	0.07088726145189311
  (0, 592)	0.06282478755786006
  (0, 587)	0.03544363072594656
  (0, 580)	0.03544363072594656
  (0, 576)	0.03544363072594656
  (0, 573)	0.06282478755786006
  (0, 561)	0.4476718096470822
  (0, 549)	0.28271154401037024
  (0, 547)	0.15706196889465013
  (0, 541)	0.022988343542257716
  (0, 540)	0.03544363072594656
  (0, 536)	0.03544363072594656
  (0, 533)	0.0790009075847792
  (0, 530)	0.06282478755786006
  (0, 523)	0.03544363072594656
  (0, 521)	0.1772181536297328
  (0, 515)	0.03544363072594656
  (0, 509)	0.020489711752875582
  (0, 508)	0.07088726145189311
  (0, 507)	0.13166817930796534
  (0, 505)	0.10100219231474748
  (0, 494)	0.10633089217783967
  (0, 459)	0.028552185646908635
  (0, 448)	0.02452094869989211
  (0, 442)	0.03544363072594656
  :	:
  (0, 279)	0.40836111912609036
  (0, 230)	0.07088726145189311
  (0, 224)	0.06282478755786006
  (0, 207)	0.03544363072594656
  (0, 196)	0.06896503062677316
  (0, 195)	0.03141239377893003
  (0, 159)	0.1580018151695584
  (

## Collaborative Filtering

## Content based Filtering

In [27]:
# Reference table of career fields used by the content-based recommender.
content_data = pd.read_csv("career_paths.csv")

# combining interests and skills into one text document per career field
# fillna('') keeps a row usable when only one of the two columns is filled:
# str + NaN is NaN, and TfidfVectorizer refuses to fit on NaN documents
content_data['combined_features'] = (
    content_data['Interest'].fillna('') + ' ' + content_data['Skills'].fillna('')
)

# NOTE: this cell rebinds vectorizer, tfidf_matrix and content_similarity_matrix,
# which the job-categorisation cell earlier in the notebook also defines.
# categorise_path reads those globals, so after this cell it uses these objects.

# TF-IDF Vectorisation
vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(content_data['combined_features'])

# Calculating similarity scores between every pair of career fields
content_similarity_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

In [36]:
# function to get content based recommendations
def content_based_recommendation(user_profile, content_similarity_matrix, num_recommendations=2, verbose=False):
    """Recommend the career fields most similar to a free-text user profile.

    Relies on the notebook-level ``vectorizer``, ``tfidf_matrix`` and
    ``content_data`` objects being defined before the call.

    Args:
        user_profile: Text describing the user (e.g. skills and interests).
        content_similarity_matrix: Not used by the computation; kept in the
            signature so existing calls keep working.
        num_recommendations: Maximum number of fields to return.
        verbose: When True, print the intermediate vector, scores and result.
            Off by default so normal calls do not flood the output with the
            sparse-vector dump.

    Returns:
        A list of ``Field`` values ordered from most to least similar.
        Empty when ``num_recommendations`` is zero or negative.
    """
    # Guard: scores.argsort()[-0:] is the *whole* array, so without this check
    # num_recommendations=0 would return every field instead of none.
    if num_recommendations <= 0:
        return []

    if verbose:
        print('starting function')
    user_vector = vectorizer.transform([user_profile])
    if verbose:
        print(f'user vector: {user_vector}')
    scores = linear_kernel(user_vector, tfidf_matrix).flatten()
    if verbose:
        print(f'scores: {scores}')
    # finding indices of top similarity scores (argsort is ascending, so take
    # the tail and reverse it to put the best match first)
    top_field_indices = scores.argsort()[-num_recommendations:][::-1]
    recommended_fields = content_data['Field'].iloc[top_field_indices].to_list()
    if verbose:
        print(f'recommended fields: {recommended_fields}')

    return recommended_fields

In [37]:
# saving (not loading) the fitted artefacts so they can be reused outside this notebook
# NOTE: unpickling executes code, so only load these files from a trusted source.
with open('vectorizer.pickle', 'wb') as f:
    pickle.dump(vectorizer, f)


with open('similarity_matrix.pickle', 'wb') as f:
    pickle.dump(content_similarity_matrix, f)


# Use cloudpickle (imported with the other libraries at the top of the notebook)
# for the function: the standard pickle module stores a function only as a
# reference to its module and name, which cannot be resolved for a function
# defined in a notebook once it is loaded from another process. cloudpickle
# serialises the function itself.
with open('recommendation_function.pickle', 'wb') as f:
    cloudpickle.dump(content_based_recommendation, f)
