### Import the Required Libraries and Load the Sample Dataset

In [None]:
# import all the required libraries
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string 
import numpy as np
import pandas as pd

# Read the three sample CSV files (jobs, applications, users) from the sample
# data folder into DataFrames, in that order.
sample_data_path = "data/sample_data/"
jobs, apps, users = (
    pd.read_csv(sample_data_path + csv_name)
    for csv_name in ('jobs_sample_cleared.csv', 'apps_sample.csv', 'users_sample.csv')
)

### Data Transformation

In [None]:
# Build one text document per job by concatenating its descriptive columns,
# separated by single spaces, in the order listed below.
job_text_columns = ('Title', 'Description', 'Requirements', 'State', 'City')
jobs_collection = [
    "{} {} {} {} {}".format(
        *(jobs.loc[row_label, column] for column in job_text_columns)
    )
    # rows are addressed by the labels 0..len(jobs)-1
    for row_label in range(len(jobs))
]

### Data Cleaning

In [None]:

from nltk.corpus import stopwords
import re
# NOTE: this rebinds the name `stopwords` from the imported NLTK object to the
# value returned by words('english'); clean_string below only tests membership
# in it. Because the original object is no longer reachable under this name,
# re-running this statement in the same session presumably fails -- confirm.
stopwords = stopwords.words('english')
def clean_string(text):
    """Normalise a job text for vectorisation.

    Steps, in order: drop every ASCII punctuation character, lower-case the
    text, drop whitespace-separated words found in the module-level
    ``stopwords`` collection, then blank out every token that contains a digit.

    Tokens removed by the last step leave their surrounding spaces behind, so
    the result may contain runs of spaces.
    """
    # One pass that deletes all characters listed in string.punctuation.
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    lowered = without_punctuation.lower()

    # Membership tests against a set instead of scanning the list per word.
    stop_set = set(stopwords)
    kept_words = [token for token in lowered.split() if token not in stop_set]
    rejoined = ' '.join(kept_words)

    # Remove any run of word characters that contains at least one digit.
    return re.sub(r'\w*[0-9]\w*', '', rejoined)

In [None]:
# Apply clean_string to every job text, preserving the order of jobs_collection.
cleaned_jobs_collection = [clean_string(job_text) for job_text in jobs_collection]

### Transform Each Job Text String into a Similarity Matrix 

In [None]:
# Build a TF-IDF vectorizer with inverse-document-frequency weighting enabled.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Learn the vocabulary from the cleaned job texts and turn them into a
# TF-IDF matrix in a single step (one row per job text).
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(
    raw_documents=cleaned_jobs_collection
)

In [None]:
# For each job text keep at most its 10 highest-weighted TF-IDF terms, and of
# those only the terms whose TF-IDF value is at least 0.2.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Work on the sparse rows directly: only stored (non-zero) entries can reach the
# 0.2 threshold, so there is no need to build a dense vocabulary-sized table
# for every job.
tfidf_csr = tfidf_vectorizer_vectors.tocsr()
tfidf_jobs_description = []
for job_row in range(tfidf_csr.shape[0]):
    row_start = tfidf_csr.indptr[job_row]
    row_end = tfidf_csr.indptr[job_row + 1]
    term_columns = tfidf_csr.indices[row_start:row_end]
    term_weights = tfidf_csr.data[row_start:row_end]

    # Positions of the (up to) 10 largest weights, highest first; equal weights
    # keep the order in which the row stores them.
    top_positions = np.argsort(-term_weights, kind="stable")[:10]

    tfidf_list = [
        str(tfidf_feature_names[term_columns[pos]])
        for pos in top_positions
        if term_weights[pos] >= 0.2
    ]
    tfidf_jobs_description.append(tfidf_list)


In [None]:
# create the matrix with TF-IDF words as each column and job_id as row
# CountVectorizer expects one string per document; each entry of
# tfidf_jobs_description is a list of words, so join the words with spaces
# first (passing the lists directly fails when the vectorizer lower-cases them).
keyword_documents = [" ".join(words) for words in tfidf_jobs_description]
# NOTE: despite its name, `vectorizer` holds the fitted document-term matrix.
vectorizer = CountVectorizer().fit_transform(keyword_documents)
vectors = vectorizer.toarray()

In [None]:
# convert the number higher than 1 in the matrix to 1.
# The result must be stored back in `vectors`: that is the name the cosine
# similarity step and find_similar_job read, so assigning only to `vector`
# left the matrix un-binarised.
vectors = np.where(vectors > 0, 1, vectors)
# Keep the previously used name bound to the same array.
vector = vectors

In [None]:
# Pairwise cosine similarity between the rows of `vectors`;
# entry [a][b] is the similarity between row a and row b.
cos_sim_matrix = cosine_similarity(X=vectors)

In [None]:
# convert_job_id_to_index
def job_id_to_index(job_id):
    """Return the index label of the first row of ``jobs`` whose JobID equals job_id.

    Raises IndexError when no row of ``jobs`` has that JobID.
    """
    is_match = jobs['JobID'] == job_id
    matching_labels = jobs.index[is_match]
    return matching_labels.values[0]

# convert_job_index_to_job_id
def job_index_to_id(job_index):
    """Return the JobID stored in the row of ``jobs`` at position job_index."""
    job_row = jobs.iloc[job_index]
    return job_row['JobID']

# find similar job of a specific job
def find_similar_job(job_id):
    """Return the JobIDs of all other jobs whose cosine similarity to job_id is >= 0.1.

    The job itself is excluded. Results follow the row order of ``jobs``.
    """
    target_index = job_id_to_index(job_id)
    similarity_row = cos_sim_matrix[target_index]
    return [
        jobs.iloc[candidate]['JobID']
        for candidate in range(len(vectors))
        if similarity_row[candidate] >= 0.1 and candidate != target_index
    ]

In [None]:
# find the similarity between any two jobs
def cosine_sim(job_id_1, job_id_2):
    """Look up the precomputed cosine similarity between two jobs given their JobIDs."""
    row = job_id_to_index(job_id_1)
    col = job_id_to_index(job_id_2)
    similarities_for_first = cos_sim_matrix[row]
    return similarities_for_first[col]

In [None]:
# check whether a user applied a job or not
def user_applied_jobs_check(user_id, job_id):
    """Return 1 if ``apps`` contains a row for this user and this job, else 0.

    Args:
        user_id: value compared against the ``UserID`` column of ``apps``.
        job_id: value compared against the ``JobID`` column of ``apps``.

    Returns:
        int: 1 when the user applied to the job, 0 otherwise.
    """
    user_applied_jobs = apps.loc[apps['UserID'] == user_id, 'JobID']
    # Compare against the column VALUES. `x in series` tests the index labels,
    # and `a in b is False` is a chained comparison that never holds, so the
    # previous check returned 1 for every input.
    return 1 if (user_applied_jobs == job_id).any() else 0

# find what jobs the user has applied
def user_applied_jobs_list(user_id):
    """Return the ``JobID`` column of the rows of ``apps`` whose UserID equals user_id."""
    belongs_to_user = apps['UserID'] == user_id
    applications = apps.loc[belongs_to_user]
    return applications['JobID']

### Generate the Recommended Jobs for Users. 

In [None]:
# calculate the preference value for a user and a job
def user_job_pref(user_id, job_id):
    """Score how strongly a user is expected to prefer a job.

    The score is the sum, over every job that is both similar to job_id
    (per find_similar_job) and applied to by the user, of the cosine
    similarity to job_id multiplied by the user's applied flag for that job.
    Returns 0 when there is no such job.
    """
    candidates = set(find_similar_job(job_id))
    applied = set(user_applied_jobs_list(user_id))
    shared = candidates & applied
    if not shared:
        return 0

    score = 0
    for applied_job_id in shared:
        similarity = cosine_sim(job_id, applied_job_id)
        applied_flag = user_applied_jobs_check(user_id, applied_job_id)
        score = similarity * applied_flag + score
    return score
    

In [None]:
# find the recommended jobs for a specific user
def recommed_jobs(user_id):
    """Return the JobIDs recommended to a user, best match first.

    Every row of ``jobs`` is scored with user_job_pref; jobs whose preference
    is greater than 0.3 are kept and ordered by descending preference.

    Args:
        user_id: the user to recommend jobs for.

    Returns:
        pandas.Series: the ``JobID`` column of the kept jobs (empty when no
        job exceeds the threshold).
    """
    kept_rows = []
    # Score every loaded job instead of assuming a fixed number of rows.
    for job_position in range(len(jobs)):
        job_id = job_index_to_id(job_position)
        preference = user_job_pref(user_id, job_id)
        if preference > 0.3:
            kept_rows.append({'JobID': job_id, 'Preference': preference})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    job_id_recommend = pd.DataFrame(kept_rows, columns=['JobID', 'Preference'])
    job_id_recommendation = job_id_recommend.sort_values(by=['Preference'], ascending=False)
    return job_id_recommendation['JobID']

### Recommendation Result Evaluation

In [None]:
# calculate the precision and recall of the model based on (up to) 1000 users data.
TP = 0
TN = 0  # never updated below; kept so the name stays defined
FN = 0
FP = 0

# Evaluate the first 1000 users, or all of them when fewer are loaded, so a
# smaller sample does not raise an IndexError.
n_eval_users = min(1000, len(users))
for i in range(n_eval_users):
    user_id = users.iloc[i]['UserID']
    print(user_id)
    recommend_list = recommed_jobs(user_id)
    app_list = user_applied_jobs_list(user_id)

    # Build each set once and reuse it for all three counters.
    recommended_jobs = set(recommend_list)
    applied_jobs = set(app_list)
    correctly_recommended = recommended_jobs & applied_jobs
    print(correctly_recommended)

    TP += len(correctly_recommended)                       # recommended and applied
    FN += len(applied_jobs) - len(correctly_recommended)   # applied but not recommended
    FP += len(recommended_jobs) - len(correctly_recommended)  # recommended but not applied

# Report 0.0 instead of dividing by zero when nothing was recommended
# (precision) or nothing was applied to (recall).
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0