In [1]:
import json
import pandas as pd
import nltk
import numpy as np
import os

# Build NLP Content-Based Recommender System 

## Clean up Candidate Data

In [2]:
# consolidate 30 samples of candidate data into a list of dict.
def consolidate_data(path):
    data = []
    for x in os.listdir("./Candidate"):
        candidate = json.load(open(path+"/"+x, encoding='utf8'))
        data.append(candidate[0])
    return data

candidates_data = consolidate_data("./Candidate")
print("Total Number of Candidates: ", len(candidates_data))
print("Sample data: \n",candidates_data[0])

Total Number of Candidates:  30
Sample data: 
 {'data': {'certifications': [], 'location': {'city': 'Seremban', 'state': 'Negeri Sembilan', 'country': 'Malaysia'}, 'objective': '', 'publications': [], 'sections': [{'sectionType': 'Summary', 'text': 'PROFESSIONAL SUMMARY To secure a challenging position in a reputable organization, to expand my knowledge and skills for better experience. To develop good relationship and network among personnel involved, build up individual skills and leadership while dealing with parties involved in the industries. Strength: Optimistic, good team work, interpersonal skills and self-motivated.'}, {'sectionType': 'WorkExperience', 'text': "PROFESSIONAL EXPERIENCE March 2018 – Present Shucare Malaysia Sdn Bhd Sales and Marketing Executive • Monitoring and coordinating sales progress for local market. • Input monthly and quantity sales for local market (Al-Ikhsan, Royal Sporting House, Larrie, Parkson & Metrojaya). • Prepare and calculate costings of produc

In [3]:
# check unique data features
def check_feature(data):
    cols = []
    for candidate in data:
        for col in candidate['data'].keys():
            if col not in cols:
                cols.append(col)
    return cols

cols = check_feature(candidates_data)
print("Unique Features Number:", len(cols))
print(cols)

Unique Features Number: 9
['certifications', 'location', 'objective', 'publications', 'sections', 'languages', 'summary', 'totalYearsExperience', 'profession']


In [4]:
# check unique sectionType
def check_sectionType(data):
    sectionType = []
    for candidate in data:
        for i in range(len(candidate['data']['sections'])):
            if candidate['data']['sections'][i]['sectionType'] not in sectionType:
                sectionType.append(candidate['data']['sections'][i]['sectionType'])
    return sectionType

sect_cols = ["sectionType_"+i for i in check_sectionType(candidates_data)]
print("Unique Features in Sections:", len(sect_cols))
print(sect_cols)

Unique Features in Sections: 11
['sectionType_Summary', 'sectionType_WorkExperience', 'sectionType_Education', 'sectionType_Footer', 'sectionType_Skills/Interests/Languages', 'sectionType_Extracurriculars/Leadership', 'sectionType_Training/Certifications', 'sectionType_Achievements', 'sectionType_Projects', 'sectionType_AdditionalInformation', 'sectionType_Organisations']


In [5]:
# transform candidate json data into dataframe
def transfrom_json2df(data, cols):
    final_data = {}
    for col in cols:
        final_data[col] = []
    for candidate in data:
        for col in cols:
            final_data[col].append(candidate['data'][col])
    return final_data   

cd_df = pd.DataFrame(transfrom_json2df(candidates_data, cols))
display(cd_df.head())

Unnamed: 0,certifications,location,objective,publications,sections,languages,summary,totalYearsExperience,profession
0,[],"{'city': 'Seremban', 'state': 'Negeri Sembilan...",,[],"[{'sectionType': 'Summary', 'text': 'PROFESSIO...","[Bahasa, English]",To secure a challenging position in a reputabl...,5,Management Trainee
1,[],"{'city': 'Ipoh', 'state': 'Perak', 'country': ...",,[],"[{'sectionType': 'Summary', 'text': 'Objective...","[Malay, English]",March 2022 – 22 July 2022 Objective A sales ex...,0,Waiter
2,[],"{'city': 'Neodesha', 'state': 'Kansas', 'count...",,[],"[{'sectionType': 'Summary', 'text': 'PROFILE C...",[English],Committed person with an ability to generate a...,2,Associate
3,[2003 2004 Completed Foundation for Graphic De...,"{'city': 'Kuala Lumpur', 'state': 'Wilayah Per...",,[],"[{'sectionType': 'Education', 'text': 'Educati...","[Bahasa, English]",,13,Operations Manager
4,"[ITIL Foundation V3, ITIL Service LevelManagem...","{'city': 'Subang Jaya', 'state': 'Selangor', '...",,[],"[{'sectionType': 'WorkExperience', 'text': 'Fe...","[German, English]",A multi-skilled IT manager with good all round...,7,System Administrator


In [6]:
# expand sections data into different column.
df = {}
for idx in range(len(cd_df['sections'])):
    df[idx] = {'sectionType_Summary':[], 
               'sectionType_WorkExperience':[], 
               'sectionType_Education':[], 
               'sectionType_Footer':[], 
               'sectionType_Skills/Interests/Languages':[], 
               'sectionType_Extracurriculars/Leadership':[], 
               'sectionType_Training/Certifications':[], 
               'sectionType_Achievements':[], 
               'sectionType_Projects':[], 
               'sectionType_AdditionalInformation':[], 
               'sectionType_Organisations':[]}
    for i in range(len(cd_df['sections'][idx])):
        sectType = cd_df['sections'][idx][i]['sectionType']
        text = cd_df['sections'][idx][i]['text']
        df[idx]['sectionType_'+sectType].append(text)

In [7]:
# download/update nltk package
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lowka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lowka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lowka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lowka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
stop = stopwords.words('english')
stop_words_ = set(stopwords.words('english'))
wn = WordNetLemmatizer()

# return text if not stop words and punctuation
def black_txt(token):
    return  token not in stop_words_ and token not in list(string.punctuation)  and len(token)>2 

# text cleansing for numeric number and html tag, lowercase and lemmatize words.
def clean_txt(text):
    clean_html = re.compile(r'<.*?>')
    clean_text = clean_html.sub('', text) # clean html tag in dataset
    clean_number = re.compile(r'[0-9]')
    clean_text = clean_number.sub('', clean_text) # clean number in dataset
    clean_text = clean_text.replace("nbsp", "")
    clean_text = [wn.lemmatize(word, pos="v") for word in word_tokenize(clean_text.lower()) if black_txt(word)]
    clean_text = tokenizer.tokenize(" ".join(clean_text))
    return " ".join(clean_text)

In [9]:
# process cleansing data
clean_data = cd_df.merge(pd.DataFrame(df).T, left_index=True, right_index=True)
selected_cols = ['certifications', 
                 'objective',
                 'summary', 
                 'profession', 
                 'sectionType_Summary', 
                 'sectionType_WorkExperience', 
                 'sectionType_Education', 
                 'sectionType_Skills/Interests/Languages',
                 'sectionType_Extracurriculars/Leadership',
                 'sectionType_Achievements',
                 'sectionType_Projects'] 
clean_data = clean_data[selected_cols]

#merge data into single text column
clean_data['text'] = clean_data['certifications'].map(str) + " " + clean_data['objective'].map(str) + " " + clean_data['summary'].map(str) + " " + clean_data['profession'].map(str) + " " + clean_data['sectionType_Summary'].map(str) + clean_data['sectionType_WorkExperience'].map(str) + " " + clean_data['sectionType_Education'].map(str) + " " + clean_data['sectionType_Skills/Interests/Languages'].map(str) + " " + clean_data['sectionType_Extracurriculars/Leadership'].map(str) + " " + clean_data['sectionType_Achievements'].map(str) + " " + clean_data['sectionType_Projects'].map(str)
clean_data = clean_data.reset_index()[['index', 'text']]
clean_data.columns = ['CandidateID', 'text']
clean_data['text'] = clean_data['text'].apply(clean_txt)

display(clean_data.head())

Unnamed: 0,CandidateID,text
0,0,secure challenge position reputable organizati...
1,1,march july objective sales executive business ...
2,2,commit person ability generate implement effec...
3,3,complete foundation graphic design centre adva...
4,4,itil foundation itil service levelmanagementpr...


## Clean up Job Data

In [10]:
# load and clean data using same function above
job_df = pd.read_csv("./Job/jobs.csv")
job_df = job_df.reset_index()
job_df['text'] = job_df['title'] + " " + job_df['description'] + " " + job_df['requirements'] + " " + job_df['short_description']
job_df = job_df[['index', 'title', 'text']]
job_df.columns = ['JobID', 'title', 'text']
job_df['text'] = job_df['text'].apply(clean_txt) 
display(job_df)

Unnamed: 0,JobID,title,text
0,0,Business Development Executive,business development executive research identi...
1,1,Data Scientist,data scientist need convert data actionable in...
2,2,Software Developer,software developer software coder passionate d...
3,3,Mobile App Developer,mobile app developer mobile application develo...
4,4,Graphic Designer,graphic designer product shoot company product...
5,5,Customer Happiness Associate,customer happiness associate join meaningful c...
6,6,Customer Happiness Officer,customer happiness officer role customer happi...
7,7,Digital Marketing Specialist,digital market specialist strong grasp current...
8,8,Digital Marketing Executive,digital market executive teamwe market team bi...
9,9,Sales Executive,sales executive role within company lead expan...


# Building Recommender System

## Recommendation using cosine similarity

In [11]:
# select a candidate ID use for testing purpose later.
u = 25 # 0~29
index = np.where(clean_data['CandidateID'] == u)[0][0]
user_q = clean_data.iloc[[index]]
print(user_q)
print("="*100)
print("Text after clean: \n", user_q['text'][u])

    CandidateID                                               text
25           25  basic program bsc hons information technology ...
Text after clean: 
 basic program bsc hons information technology fresh graduate passionate develop user friendly website mobile applications excellent problem solving skills ability perform well team look fulltime position backend developers web developers mobile application developer internship bsc hons information technology fresh graduate passionate develop user friendly website mobile applications excellent problem solving skills ability perform well team look fulltime position backend developers web developers mobile application developer work experince internship backend development webby group sdn bhd achievements tasks design implement rule business logic use graphql query laravel php implement various function mobile front end development use graphql api laravel php fix debug function errors mobile frontend development use graphql api laravel p

### 1. Using TFIDF

In [12]:
# Initializing tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

tfidf_jobid = tfidf_vectorizer.fit_transform((job_df['text'])) # fitting and transforming job text into vector

In [13]:
# Computing cosine similarity using tfidf
from sklearn.metrics.pairwise import cosine_similarity

candidate_tfidf = tfidf_vectorizer.transform(user_q['text']) # transforming candidate text into vetors

# Computing cosine similarity between candidate vector and job vectors using tfidf
cos_similarity_tfidf = map(lambda x: cosine_similarity(candidate_tfidf, x),tfidf_jobid)
output = list(cos_similarity_tfidf)

In [14]:
def recom_by_tfid(job_df, output):

    list_scores = output
    recommendation = {'JobID': [],'Title': [], 'Score': list_scores}
    
    for i in range(len(list_scores)):
        recommendation['JobID'].append(job_df[job_df['JobID']==i].JobID[i])  
        recommendation['Title'].append(job_df[job_df['JobID']==i].title[i]) 
        
    df = pd.DataFrame(recommendation).sort_values("Score", ascending=False)
    df['Score'] = df['Score'].astype('float')
    return df

In [15]:
recom_by_tfid(job_df, output)

Unnamed: 0,JobID,Title,Score
3,3,Mobile App Developer,0.302903
2,2,Software Developer,0.207543
7,7,Digital Marketing Specialist,0.143324
8,8,Digital Marketing Executive,0.133703
4,4,Graphic Designer,0.102582
9,9,Sales Executive,0.089486
1,1,Data Scientist,0.08854
5,5,Customer Happiness Associate,0.060773
0,0,Business Development Executive,0.058976
6,6,Customer Happiness Officer,0.054184


### 2. Using Count Vectorizer

In [16]:
# Computing cosine similarity using CountVector
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()

count_jobid = count_vectorizer.fit_transform((job_df['text'])) # fitting and transforming the vector
count_jobid

<10x745 sparse matrix of type '<class 'numpy.int64'>'
	with 1094 stored elements in Compressed Sparse Row format>

In [17]:
user_count = count_vectorizer.transform(user_q['text'])  # transforming candidate text into vetors

# Computing cosine similarity between candidate vector and job vectors using tfidf
cos_similarity_countv = map(lambda x: cosine_similarity(user_count, x),count_jobid)
output = list(cos_similarity_countv)

In [18]:
def recom_by_CV(job_df, output):

    list_scores = output
    recommendation = {'JobID': [],'Title': [], 'Score': list_scores}
    
    for i in range(len(list_scores)):
        recommendation['JobID'].append(job_df[job_df['JobID']==i].JobID[i])  
        recommendation['Title'].append(job_df[job_df['JobID']==i].title[i]) 
        
    df = pd.DataFrame(recommendation).sort_values("Score", ascending=False)
    df['Score'] = df['Score'].astype('float')
    return df

In [19]:
display(recom_by_CV(job_df, output))
print("Candidate Summary: \n", cd_df.iloc[u]['summary'])
print("="*100)
print("Candidate Sections: \n", cd_df.iloc[u]['sections'])

Unnamed: 0,JobID,Title,Score
3,3,Mobile App Developer,0.338712
2,2,Software Developer,0.245903
8,8,Digital Marketing Executive,0.18016
4,4,Graphic Designer,0.163681
7,7,Digital Marketing Specialist,0.161915
9,9,Sales Executive,0.111667
1,1,Data Scientist,0.106399
0,0,Business Development Executive,0.102763
5,5,Customer Happiness Associate,0.094355
6,6,Customer Happiness Officer,0.085383


Candidate Summary: 
 BSc (Hons) Information Technology fresh graduate passionate about developing user-friendly website and mobile applications. Excellent problem-solving skills and ability to perform well in a team. Looking for fulltime positions Backend developers, web developers and mobile application developer.
Candidate Sections: 
 [{'sectionType': 'Summary', 'text': 'BSc (Hons) Information Technology fresh graduate passionate about developing user-friendly website and mobile applications. Excellent problem-solving skills and ability to perform well in a team. Looking for fulltime positions Backend developers, web developers and mobile application developer.'}, {'sectionType': 'Education', 'text': 'EDUCATION Bsc (Hons) Information Technology Sunway University 03/2020 - 03/2022 Projects Create and design a basic e-Books Rental Software using Scala and css. Create a UI UX mobile app prototype with Figma. Diploma in Information Technology Sunway College 01/2018 - 03/2020 Projects Cre

# Recommendation using Spacy

In [20]:
# !pip install spacy
# !python -m spacy download en_core_web_lg
import spacy

In [21]:
nlp = spacy.load('/Users/lowka/anaconda3/Lib/site-packages/en_core_web_lg/en_core_web_lg-3.2.0')

In [22]:
%%time
list_docs = []
for i in range(len(job_df)):
    doc = nlp("u'" + job_df['text'][i] + "'")
    list_docs.append((doc,i))
print(len(list_docs))

10
Wall time: 332 ms


In [23]:
def calculateSimWithSpaCy(nlp, df, user_text):
    # Calculate similarity using spaCy
    list_sim =[]
    doc1 = nlp("u'" + user_text + "'")
    for i in df.index:
        try:
            doc2 = list_docs[i][0]
            score = doc1.similarity(doc2)
            list_sim.append((doc1, doc2, list_docs[i][1],score))
        except:
            continue

    return  list_sim 

In [24]:
def recom_by_spacy(nlp, job_df, candidate_data, n):
    df = calculateSimWithSpaCy(nlp, job_df, candidate_data.text[n])
    df_recom_spacy = pd.DataFrame(df).sort_values([3], ascending=False).head(10)
    df_recom_spacy.reset_index(inplace=True)
    index_spacy = df_recom_spacy[2]
    list_scores = df_recom_spacy[3]
    recommendation = {'JobID': index_spacy,'Title': [], 'Score': list_scores}
    
    for i in range(len(recommendation['JobID'])):
        recommendation['Title'].append(job_df[job_df['JobID']==i].title[i])    
    df = pd.DataFrame(recommendation)
    return df

In [25]:
recom_by_spacy(nlp, job_df, clean_data, u)

Unnamed: 0,JobID,Title,Score
0,2,Business Development Executive,0.935271
1,3,Data Scientist,0.933031
2,7,Software Developer,0.906458
3,8,Mobile App Developer,0.876016
4,9,Graphic Designer,0.866656
5,1,Customer Happiness Associate,0.864753
6,5,Customer Happiness Officer,0.852899
7,4,Digital Marketing Specialist,0.852706
8,0,Digital Marketing Executive,0.840486
9,6,Sales Executive,0.838619


# KNN Recomender System

In [26]:
from sklearn.neighbors import NearestNeighbors

def recom_by_KNN(job_df, candidate_data):
    
    KNN = NearestNeighbors(p=2)
    KNN.fit(tfidf_jobid)
    NNs = KNN.kneighbors(candidate_tfidf, return_distance=True) 

    index_spacy = NNs[1][0][1:]
    list_scores = NNs[0][0][1:]
    recommendation = {'JobID': index_spacy,'Title': [], 'Score': list_scores}
    
    for i in range(len(recommendation['JobID'])):
        recommendation['Title'].append(job_df[job_df['JobID']==i].title[i])    
    df = pd.DataFrame(recommendation)
    return df

In [27]:
recom_by_KNN(job_df, clean_data)

Unnamed: 0,JobID,Title,Score
0,2,Business Development Executive,1.258934
1,7,Data Scientist,1.308951
2,8,Software Developer,1.31628
3,4,Mobile App Developer,1.339715


# Conclusion

We builded 4 content-based recommender systems, **cosine similarity** using **count vectorizer** seems to show a better results compare with the other three systems.

Howerver, do note that the sample size used in this notebook consider very small (30 candidates and 10 jobs).