# Project Background:

#### Companies often receive thousands of resumes for each job posting and employ dedicated screening officers to screen qualified candidates.

#### Hiring the right talent is a challenge for all businesses. This challenge is magnified by the high volume of applicants if the business is labour-intensive, growing, and facing high attrition rates.

#### IT departments in growing markets are short of talent. In a typical service organization, professionals with a variety of technical skills and business domain expertise are hired and assigned to projects to resolve customer issues. The task of selecting the best talent from among many applicants is known as Resume Screening.

<img src="giphy.gif">

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

import nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('stopwords')

from nltk import bigrams
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

import docx2txt
import os

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

## Concatenate new resumes into resume dataset

In [8]:
# List the Word-format resumes waiting to be screened; only .docx files are read.
doc_list = [x for x in os.listdir('./resumes') if x.endswith(".docx")]
print(doc_list)

# Extract the plain text of every resume up front, then append all of them to
# the dataset with a single concat instead of re-copying df_resume once per
# file. The new rows carry only a 'Resume' value, so every other column of
# df_resume is NaN for them.
resume_texts = [docx2txt.process(f'./resumes/{doc}') for doc in doc_list]
if resume_texts:
    new_resumes = pd.DataFrame({'Resume': resume_texts})
    df_resume = pd.concat([df_resume, new_resumes], ignore_index=True)
       

In [49]:
# Rows without a Category label are located by index label so they can be
# scored against the job description.
missing_category = df_resume['Category'].isnull()
start_idx = df_resume.index[missing_category].tolist()
print(start_idx)
# Display the raw text of those rows.
df_resume.loc[start_idx, 'Resume']

[962, 963, 964, 965]


962    candidate details\t\t\t\t\n\n\t\t\t\t\t\t\t\t\...
963    candidate details\n\n\n\nname\t\t\t\t\t: Xisi\...
964    Yang Gui Fei\n\n101 Ang Mo Kio Avenue 1 • #02-...
965    POSITION\n\nBusiness Analyst\n\n\n\n\n\nPERSON...
Name: Resume, dtype: object

In [12]:
# Extract the job description's plain text from its .docx file; jd is the
# document each resume is compared against when scoring.
jd = docx2txt.process('./Job_description/JD Business Analyst.docx')

In [None]:
# Similarity between document 0 and document 1 of score_array.
# NOTE(review): score_array is only assigned in a later cell, so this cell
# raises NameError unless that cell has been run first.
score_array[0][1]

In [50]:
# Score every unlabelled resume against the job description.
doc_score_dict = {}
score_list = []
count = CountVectorizer()
for idx in start_idx:
    # Two-document corpus: the resume first, the job description second.
    text = [df_resume.loc[idx, 'Resume'], jd]
    # The vocabulary is refitted on each resume/JD pair.
    count_matrix = count.fit_transform(text)
    pairwise = cos_sim(count_matrix)
    # Entry [0][1] is the resume-vs-JD similarity.
    score_list.append(pairwise[0][1])
    


In [57]:
# Pair each resume file name with its similarity score, rounded to two
# decimals. Scores are matched to file names purely by position.
doc_score_dict = {
    doc_list[pos]: round(score, 2) for pos, score in enumerate(score_list)
}

In [64]:
def rank_score(dict, top_n=3):
    """Print the highest-scoring entries of a score mapping, best first.

    Args:
        dict: Mapping of document name to similarity score. The name shadows
            the builtin; it is kept so existing keyword callers keep working.
        top_n: Number of entries to print. Defaults to 3.

    Returns:
        None. One ``name : score`` line is printed per entry.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative')

    scores = dict
    # sorted() is stable, so entries with equal scores keep their insertion
    # order even with reverse=True.
    ranked = sorted(scores, key=scores.get, reverse=True)
    for name in ranked[:top_n]:
        print(f'{name} : {scores[name]}')

In [65]:
# Print the best-scoring resumes for the job description, highest score first.
rank_score(doc_score_dict)

Resume - Xisi.docx : 0.77
ChenMeiMei.docx : 0.71
Resume - YangGuiFei.docx : 0.71


In [None]:
# Pairwise cosine-similarity matrix of the rows in count_matrix.
# NOTE(review): count_matrix is whatever the most recently run vectorizer
# cell left behind — confirm which documents it holds before reading scores.
score_array = cos_sim(count_matrix)

# Objective 1: Filter Resume based on Job Description

## Read Dataset to DataFrame

In [7]:
# Labelled resume dataset.
df_resume = pd.read_csv('resume_dataset.csv')
# Job descriptions, stored as cp1252; the 'JD' text column is renamed to
# 'Resume' so both frames share the same text column name.
jd_df = pd.read_csv('jd_data.csv', encoding='cp1252').rename(columns={'JD': 'Resume'})

## Merge JD and Resume dataset

In [None]:
# Stack the job-description rows underneath the resumes and renumber the
# index from 0.
df_resume = pd.concat([df_resume, jd_df], ignore_index=True)

## Exploratory Data Visualization

### Dataset Size

In [None]:
df_resume

In [None]:
df_resume.shape

In [None]:
df_resume.sample(5)

### Distribution of Job Categories

In [None]:
# Bar chart of resumes per category, most frequent category first.
plt.figure(figsize=(20, 5))
plt.xticks(rotation=90)
category_order = df_resume['Category'].value_counts().index
# Only the first 962 rows are plotted.
ax = sns.countplot(
    x="Category",
    data=df_resume[:962],
    order=category_order,
    palette="husl",
)
# Write each bar's count just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x() * 1.01, bar_height * 1.01))
plt.grid()

## Data Cleaning

### Remove Unusual Characters Using Regular Expressions (Regex)

In [None]:
def unusual_text_remover(text):
    """Strip mis-encoded character sequences and punctuation from text.

    Args:
        text: Raw resume (or job description) text.

    Returns:
        The lower-cased text with the known mis-encoded sequences removed and
        every run of non-word characters collapsed into a single space.
    """
    # Order matters: the longer sequences must be replaced before their
    # single-character remnants ('Â', 'Ã¼') are stripped.
    replacements = (
        ('â\x80¢Â', ' '),
        ('â\x9c¶', ' '),
        ('Ã¼Â', ' '),
        ('Â', ''),
        ('Ã¼', ''),
        ('ï', ''),
    )
    for pattern, substitute in replacements:
        text = re.sub(pattern, substitute, text)
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    return re.sub(r'\W+', ' ', text.lower())

In [None]:
# Cleaned copy of every resume, kept as a standalone Series so the cleaning
# step can be inspected on its own.
usual_text = df_resume['Resume'].apply(unusual_text_remover)

In [None]:
# Store the cleaned text next to the raw text; later steps work on this column.
df_resume['Processed_Resume'] = df_resume['Resume'].apply(unusual_text_remover)

## Data Preprocessing

### Convert words to their root form using Lemmatization

In [None]:
def lemmatization(text):
    """Return text with every token replaced by its WordNet lemma.

    The text is split with NLTK's word_tokenize, each token is lemmatized
    with the lemmatizer's default settings, and the results are joined back
    together with single spaces.
    """
    wnl = WordNetLemmatizer()
    lemmas = map(wnl.lemmatize, word_tokenize(text))
    return ' '.join(lemmas)

In [None]:
# Lemmatized version of the cleaned text, kept as a standalone Series for
# before/after comparison.
lemmatized_text = usual_text.apply(lemmatization)

In [None]:
# Lemmatize the cleaned column in place. Re-running this cell lemmatizes the
# already lemmatized text again.
df_resume['Processed_Resume'] = df_resume['Processed_Resume'].apply(lemmatization)

### Resume before data cleaning and preprocessing (in-progress)

In [None]:
df_resume.iloc[18,1]

### Resume after data cleaning 

In [None]:
usual_text[18]

### Resume after data cleaning and preprocessing (in-progress)

In [None]:
lemmatized_text[18]

## Filter Resume based on keywords and Job Title

In [None]:
key_words = ['12 month', '1 year', 'python']
role = 'Data Science'

# A resume qualifies when any keyword occurs in it as a single token or as a
# pair of adjacent tokens. Comparing keywords against bigrams only would mean
# a one-word keyword such as 'python' could never match. Keywords longer than
# two words are not supported.
keyword_set = set(key_words)

# Without a Category column the role cannot be checked, so rows are then
# selected on keywords alone.
has_category = 'Category' in df_resume.columns

matched_rows = []
for idx in range(df_resume.shape[0]):
    # Rows are addressed by position throughout, so the result does not
    # depend on df_resume's index labels.
    if has_category and df_resume['Category'].iloc[idx] != role:
        continue
    tokens = df_resume['Processed_Resume'].iloc[idx].split()
    candidates = set(tokens)
    candidates.update(' '.join(gram) for gram in bigrams(tokens))
    if not keyword_set.isdisjoint(candidates):
        matched_rows.append(idx)

# Select all matching rows at once (DataFrame.append no longer exists in
# pandas 2.x), then keep one row per distinct processed text. Selecting from
# df_resume keeps its columns even when nothing matches, so drop_duplicates
# cannot fail on a missing 'Processed_Resume' column.
df_filtered = df_resume.iloc[matched_rows].drop_duplicates(subset=['Processed_Resume'])
      

In [None]:
df_filtered

### Write Dataframe to csv file

In [None]:
# Persist the filtered resumes; with the default settings the DataFrame index
# is written as the first CSV column.
df_filtered.to_csv('filtered_resume.csv')

# Objective 2: Classify Job Category from Incoming Resume

In [None]:
from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

from wordcloud import WordCloud

## Data Preprocessing 

### Remove Stop-words in Resume

In [None]:
# English stop-word list from NLTK (needs the 'stopwords' corpus to be
# downloaded once via nltk.download).
stop = stopwords.words('english')

In [None]:
# Membership tests against a set are O(1); testing against the stop-word
# list would scan it once for every token.
stop_words = set(stop)

total_words = []  # every kept token across all documents, in order
sentences = []    # one stop-word-free string per document
for text in df_resume['Processed_Resume']:
    para = [word for word in word_tokenize(text) if word not in stop_words]
    sentences.append(' '.join(para))
    # extend() grows the list in place instead of copying it each iteration.
    total_words.extend(para)


In [None]:
len(sentences)

In [None]:
# Count whole words. Passing the joined string instead would make FreqDist
# iterate over it character by character and report letter frequencies.
wordfreqdist = nltk.FreqDist(total_words)
mostcommon = wordfreqdist.most_common(50)
print(mostcommon)

In [None]:
# Build a word cloud from every kept token and show it without axes.
corpus_text = ' '.join(total_words)
wc = WordCloud().generate(corpus_text)

plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

### Transform words into feature vectors

In [None]:
count = CountVectorizer()

In [None]:
tfidf = TfidfTransformer(use_idf=True, norm = 'l2', smooth_idf =True)

In [None]:
tfidf.fit_transform(count.fit_transform(sentences))

In [None]:
# Print NumPy floats with two decimals.
np.set_printoptions(precision= 2)
# Dense TF-IDF matrix with one row per entry of sentences, built from the raw
# term counts. Both the vectorizer and the transformer are fitted on all
# documents at once.
a = tfidf.fit_transform(count.fit_transform(sentences)).toarray()

### Train the model

In [None]:
# The first 962 rows are used for training/evaluation; every row after them
# is held back as the job-description test set.
n_resumes = 962
labels = df_resume['Category']

x, x_test_jd = a[:n_resumes], a[n_resumes:]
y, y_test_jd = labels[:n_resumes], labels[n_resumes:]

In [None]:
print(y.shape)
print(x.shape)
print(y_test_jd.shape)

In [None]:
# Hold out 30% of the rows for testing; stratifying on y keeps the category
# proportions the same in both splits, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Create object clf from class OneVsRestClassifier and KNeighborsClassifier
clf = OneVsRestClassifier(KNeighborsClassifier())
# Input training sets into the object (model)
clf.fit(X_train, y_train)
# Predict target variable by using test set on the trained model
prediction = clf.predict(X_test)

In [None]:
# Mean accuracy on the training split, then on the test split, rounded to
# three decimals; a large gap between the two suggests overfitting.
print(round(clf.score(X_train, y_train),3))
print(round(clf.score(X_test, y_test),3))

In [None]:
# Show the estimator's configuration, then the per-category
# precision/recall/F1 report for the test split.
print(f'{clf} \n')
print(metrics.classification_report(y_test, prediction))

# Objective 3: Run JD data on trained model

In [None]:
# Predict a category for every held-back row in x_test_jd and print the
# predictions followed by the expected labels for comparison.
jd_prediction = clf.predict(x_test_jd)
print(jd_prediction)
print(y_test_jd)

In [None]:
# Show the estimator's configuration, then the per-category
# precision/recall/F1 report for the held-back job-description rows.
print(f'{clf} \n')
print(metrics.classification_report(y_test_jd, jd_prediction))