# Resume Screening
Version 1.0

***
## Pipeline

1. Resume Parsing  
    * The first subtask is parsing the resume, i.e., extracting information in a structured format from the document. 
2. Resume Matching
    * The second subtask is matching the resume, i.e., extracting semantic information from the parsed document and understanding its underlying content.
3. Database Formation

***

## Import Libraries

In [None]:
# !pip install PyMuPDF
# !pip install docx2pdf

In [27]:
import numpy
import sys
import fitz
from docx2pdf import convert
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
#nltk.download('punkt')
#nltk.download('stopwords')

## Reading the file

In [None]:
def read_file(fname):
    """Extract the plain text of a resume stored as a PDF or docx file.

    A docx input is first passed to ``convert`` (docx2pdf); the converted
    PDF is then expected at the same path with a ``.pdf`` extension. The PDF
    is read page by page with PyMuPDF (``fitz``).

    Args:
        fname: Path of the resume file. The file type is taken from the
            final extension only, compared case-insensitively, so paths
            containing additional dots (``./cv.v2.pdf``) are handled.

    Returns:
        The concatenated text of all pages with line breaks preserved, or
        ``None`` (after printing a message) when the file is neither a PDF
        nor a docx document.
    """
    import os

    # splitext only inspects the last extension and never raises, unlike
    # indexing the result of fname.split(".").
    base, ext = os.path.splitext(fname)
    ext = ext.lower()

    if ext == ".docx":
        convert(fname)
        fname = base + ".pdf"
    elif ext != ".pdf":
        print("Only PDF and docx types are supported!")
        return None

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(fname) as doc:
        return "".join(str(page.get_text()) for page in doc)

In [None]:
# Extract the raw text of the sample resume (a .docx file, so read_file
# converts it to PDF before reading).
resume = read_file('Ahmed Osama-Long Version.docx')
# Bare expression: the notebook shows the extracted text as the cell output.
resume

## Pre-processing
* Pre-processing consists of the following main steps:
    * Lowercase
    * Removing Punctuation
    * Tokenization
    * Stopword Filtering
    * Stemming
    * Lemmatization

### Lowercase

In [None]:
# Normalise case so that later token comparisons are case-insensitive.
resume_lower = resume.lower()
# Bare expression: display the lower-cased text.
resume_lower

### Removing Punctuation

In [None]:
# Delete every ASCII punctuation mark plus the bullet glyphs "●" and "•".
# NOTE(review): string.punctuation includes '+' and '@', so both are removed
# here even though the original notes flag '+' (e.g. "C++") and '@' as
# characters that should be kept. Quotes are removed as well.
resume_punc = resume_lower.translate(str.maketrans("", "", string.punctuation + "●•"))
print(resume_punc)

### Tokenization

In [None]:
# Split the punctuation-free text into word tokens with NLTK's tokenizer.
resume_tokens = nltk.word_tokenize(resume_punc)
# Bare expression: display the token list.
resume_tokens

### Stopword Filtering

In [None]:
# English stop-word list provided by NLTK.
stop_words = stopwords.words('english')
# Keep only the tokens that are not stop words, preserving their order.
resume_filtered_words = list(filter(lambda word: word not in stop_words, resume_tokens))
# Bare expression: display the filtered tokens.
resume_filtered_words

### Stemming

In [None]:
# Reduce every remaining token to its Porter stem.
porter = PorterStemmer()
stemmed_resume = list(map(porter.stem, resume_filtered_words))
print(stemmed_resume)

## Feature Extraction

### Words Count (Frequency)

In [None]:
# Term frequency of the single resume: stem -> number of occurrences,
# in order of first appearance.
resume_freq = {}
for word in stemmed_resume:
    if word in resume_freq:
        resume_freq[word] += 1
    else:
        resume_freq[word] = 1
print(resume_freq)

# V2.0

## Reading the Data

In [5]:
import pandas as pd
import numpy as np

In [40]:
# Load the resume dataset from the CSV file into a DataFrame.
resume_df = pd.read_csv("Datasets/resume/Resume.csv")
# Preview the first rows.
resume_df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [41]:
# Summary statistics of the dataset's numeric columns.
resume_df.describe()

Unnamed: 0,ID
count,2484.0
mean,31826160.0
std,21457350.0
min,3547447.0
25%,17544300.0
50%,25210310.0
75%,36114440.0
max,99806120.0


## Preprocessing

### Cleaning data

In [42]:
##resume_df = resume_df[np.isfinite(pd.to_numeric(resume_df.ID, errors="coerce"))]
# Coerce the ID column to a numeric dtype; values that cannot be parsed
# become NaN because of errors='coerce'.
resume_df[['ID']] = resume_df[['ID']].apply(pd.to_numeric, errors='coerce')

In [43]:
# Remove the Resume_html column from resume_df in place; the later steps
# only read Resume_str and Category.
resume_df.drop(columns=['Resume_html'], inplace = True)

### Lowercase

In [44]:
def to_lower(df):
    """Lower-case the text columns of the resume DataFrame.

    Args:
        df: DataFrame with string columns ``Resume_str`` and ``Category``.

    Returns:
        The same DataFrame object; both columns are overwritten in place
        with their lower-cased values.
    """
    for column in ("Resume_str", "Category"):
        df[column] = df[column].str.lower()
    return df

In [46]:
# to_lower modifies its argument and returns it, so lowered_df and
# resume_df refer to the same DataFrame object.
lowered_df = to_lower(resume_df)
lowered_df.head()

Unnamed: 0,ID,Resume_str,Category
0,16852973,hr administrator/marketing associate\...,hr
1,22323967,"hr specialist, us hr operations ...",hr
2,33176873,hr director summary over 2...,hr
3,27018550,hr specialist summary dedica...,hr
4,17812897,hr manager skill highlights ...,hr


### Punctuation

In [47]:
def rem_punct(df):
    """Add a ``punct_sent`` column: ``Resume_str`` with punctuation removed.

    Every character listed in ``string.punctuation`` is deleted; all other
    characters (including whitespace and line breaks) are kept unchanged.

    Args:
        df: DataFrame with a string column ``Resume_str``.

    Returns:
        The same DataFrame object, with the new ``punct_sent`` column added
        in place.
    """
    # NOTE(review): bullet glyphs such as "●" and "•" are not part of
    # string.punctuation and are therefore kept — confirm whether they
    # should be stripped here as well.
    # Build the deletion table once; translate then removes all listed
    # characters in a single pass per string.
    strip_table = str.maketrans("", "", string.punctuation)
    # Column-wise string accessor instead of a row-wise apply: no per-row
    # Series is created, missing values stay missing instead of raising,
    # and an empty frame is handled.
    df['punct_sent'] = df['Resume_str'].str.translate(strip_table)
    return df

In [48]:
# rem_punct adds the punct_sent column to the frame it receives and returns
# that same frame, so punct_df is the same object as lowered_df.
punct_df = rem_punct(lowered_df)
punct_df.head()

Unnamed: 0,ID,Resume_str,Category,punct_sent
0,16852973,hr administrator/marketing associate\...,hr,hr administratormarketing associate\n...
1,22323967,"hr specialist, us hr operations ...",hr,hr specialist us hr operations ...
2,33176873,hr director summary over 2...,hr,hr director summary over 2...
3,27018550,hr specialist summary dedica...,hr,hr specialist summary dedica...
4,17812897,hr manager skill highlights ...,hr,hr manager skill highlights ...


### Tokenization

In [49]:
def to_tokens(df):
    """Tokenize each punctuation-free resume into a list of words.

    Args:
        df: DataFrame with a ``punct_sent`` column of strings.

    Returns:
        The same DataFrame object, with a ``tokenized_sents`` column holding
        the result of ``nltk.word_tokenize`` for every row.
    """
    def _tokenize_row(row):
        # One resume per row; tokenize its cleaned text.
        return nltk.word_tokenize(row['punct_sent'])

    df['tokenized_sents'] = df.apply(_tokenize_row, axis=1)
    return df

In [50]:
# lowered_df is passed rather than punct_df; this works because rem_punct
# added punct_sent to that very frame in place.
tokenized_df = to_tokens(lowered_df)
tokenized_df.head()

Unnamed: 0,ID,Resume_str,Category,punct_sent,tokenized_sents
0,16852973,hr administrator/marketing associate\...,hr,hr administratormarketing associate\n...,"[hr, administratormarketing, associate, hr, ad..."
1,22323967,"hr specialist, us hr operations ...",hr,hr specialist us hr operations ...,"[hr, specialist, us, hr, operations, summary, ..."
2,33176873,hr director summary over 2...,hr,hr director summary over 2...,"[hr, director, summary, over, 20, years, exper..."
3,27018550,hr specialist summary dedica...,hr,hr specialist summary dedica...,"[hr, specialist, summary, dedicated, driven, a..."
4,17812897,hr manager skill highlights ...,hr,hr manager skill highlights ...,"[hr, manager, skill, highlights, hr, skills, h..."


### Stop Words

In [51]:
def rem_stop_words(df):
    """Add a ``stop_words`` column: the tokens of each row minus stop words.

    Despite its name, the new column holds the tokens that remain after the
    English stop words have been filtered out, in their original order.

    Args:
        df: DataFrame with a ``tokenized_sents`` column of token lists.

    Returns:
        The same DataFrame object, with the ``stop_words`` column added in
        place.
    """
    # Build the lookup once as a set: membership tests are constant time,
    # whereas the plain list was scanned linearly for every single token.
    stop_words = frozenset(stopwords.words('english'))
    df['stop_words'] = df.apply(
        lambda row: [word for word in row['tokenized_sents'] if word not in stop_words],
        axis=1,
    )
    return df

In [52]:
# Filter stop words; rem_stop_words returns the frame it was given with the
# stop_words column added.
stop_words_df = rem_stop_words(tokenized_df)
stop_words_df.head()

Unnamed: 0,ID,Resume_str,Category,punct_sent,tokenized_sents,stop_words
0,16852973,hr administrator/marketing associate\...,hr,hr administratormarketing associate\n...,"[hr, administratormarketing, associate, hr, ad...","[hr, administratormarketing, associate, hr, ad..."
1,22323967,"hr specialist, us hr operations ...",hr,hr specialist us hr operations ...,"[hr, specialist, us, hr, operations, summary, ...","[hr, specialist, us, hr, operations, summary, ..."
2,33176873,hr director summary over 2...,hr,hr director summary over 2...,"[hr, director, summary, over, 20, years, exper...","[hr, director, summary, 20, years, experience,..."
3,27018550,hr specialist summary dedica...,hr,hr specialist summary dedica...,"[hr, specialist, summary, dedicated, driven, a...","[hr, specialist, summary, dedicated, driven, d..."
4,17812897,hr manager skill highlights ...,hr,hr manager skill highlights ...,"[hr, manager, skill, highlights, hr, skills, h...","[hr, manager, skill, highlights, hr, skills, h..."


In [53]:
# Bare expression: display the whole frame (the notebook truncates the output).
stop_words_df

Unnamed: 0,ID,Resume_str,Category,punct_sent,tokenized_sents,stop_words
0,16852973,hr administrator/marketing associate\...,hr,hr administratormarketing associate\n...,"[hr, administratormarketing, associate, hr, ad...","[hr, administratormarketing, associate, hr, ad..."
1,22323967,"hr specialist, us hr operations ...",hr,hr specialist us hr operations ...,"[hr, specialist, us, hr, operations, summary, ...","[hr, specialist, us, hr, operations, summary, ..."
2,33176873,hr director summary over 2...,hr,hr director summary over 2...,"[hr, director, summary, over, 20, years, exper...","[hr, director, summary, 20, years, experience,..."
3,27018550,hr specialist summary dedica...,hr,hr specialist summary dedica...,"[hr, specialist, summary, dedicated, driven, a...","[hr, specialist, summary, dedicated, driven, d..."
4,17812897,hr manager skill highlights ...,hr,hr manager skill highlights ...,"[hr, manager, skill, highlights, hr, skills, h...","[hr, manager, skill, highlights, hr, skills, h..."
...,...,...,...,...,...,...
2479,99416532,rank: sgt/e-5 non- commissioned offic...,aviation,rank sgte5 non commissioned officer i...,"[rank, sgte5, non, commissioned, officer, in, ...","[rank, sgte5, non, commissioned, officer, char..."
2480,24589765,"government relations, communications ...",aviation,government relations communications a...,"[government, relations, communications, and, o...","[government, relations, communications, organi..."
2481,31605080,geek squad agent professional...,aviation,geek squad agent professional...,"[geek, squad, agent, professional, profile, it...","[geek, squad, agent, professional, profile, su..."
2482,21190805,program director / office manager ...,aviation,program director office manager ...,"[program, director, office, manager, summary, ...","[program, director, office, manager, summary, ..."


### TF in document

In [72]:
from collections import Counter

In [73]:
def TF_doc(df):
    """Compute the per-document term frequency of every resume.

    Args:
        df: DataFrame with a ``stop_words`` column of token lists.

    Returns:
        The same DataFrame object, with a ``TF_doc`` column holding a
        ``Counter`` (token -> occurrence count) for every row.
    """
    def _count_terms(row):
        # Frequency table of the filtered tokens of one resume.
        return Counter(row['stop_words'])

    df['TF_doc'] = df.apply(_count_terms, axis=1)
    return df

In [74]:
# Add the per-resume term counts; TF_doc returns the frame it was given
# with the TF_doc column added.
TF_doc_df = TF_doc(stop_words_df)
TF_doc_df

Unnamed: 0,ID,Resume_str,Category,punct_sent,tokenized_sents,stop_words,TF_doc
0,16852973,hr administrator/marketing associate\...,hr,hr administratormarketing associate\n...,"[hr, administratormarketing, associate, hr, ad...","[hr, administratormarketing, associate, hr, ad...","{'hr': 4, 'administratormarketing': 2, 'associ..."
1,22323967,"hr specialist, us hr operations ...",hr,hr specialist us hr operations ...,"[hr, specialist, us, hr, operations, summary, ...","[hr, specialist, us, hr, operations, summary, ...","{'hr': 7, 'specialist': 3, 'us': 3, 'operation..."
2,33176873,hr director summary over 2...,hr,hr director summary over 2...,"[hr, director, summary, over, 20, years, exper...","[hr, director, summary, 20, years, experience,...","{'hr': 8, 'director': 3, 'summary': 1, '20': 1..."
3,27018550,hr specialist summary dedica...,hr,hr specialist summary dedica...,"[hr, specialist, summary, dedicated, driven, a...","[hr, specialist, summary, dedicated, driven, d...","{'hr': 3, 'specialist': 3, 'summary': 1, 'dedi..."
4,17812897,hr manager skill highlights ...,hr,hr manager skill highlights ...,"[hr, manager, skill, highlights, hr, skills, h...","[hr, manager, skill, highlights, hr, skills, h...","{'hr': 32, 'manager': 3, 'skill': 1, 'highligh..."
...,...,...,...,...,...,...,...
2479,99416532,rank: sgt/e-5 non- commissioned offic...,aviation,rank sgte5 non commissioned officer i...,"[rank, sgte5, non, commissioned, officer, in, ...","[rank, sgte5, non, commissioned, officer, char...","{'rank': 5, 'sgte5': 2, 'non': 2, 'commissione..."
2480,24589765,"government relations, communications ...",aviation,government relations communications a...,"[government, relations, communications, and, o...","[government, relations, communications, organi...","{'government': 7, 'relations': 7, 'communicati..."
2481,31605080,geek squad agent professional...,aviation,geek squad agent professional...,"[geek, squad, agent, professional, profile, it...","[geek, squad, agent, professional, profile, su...","{'geek': 2, 'squad': 2, 'agent': 2, 'professio..."
2482,21190805,program director / office manager ...,aviation,program director office manager ...,"[program, director, office, manager, summary, ...","[program, director, office, manager, summary, ...","{'program': 4, 'director': 6, 'office': 4, 'ma..."


In [90]:
#for item in TF_doc_df.at[1, 'TF_doc']:
# Print every term of the resume at index label 50 together with its count.
for item, value in TF_doc_df.at[50, 'TF_doc'].items():
    print(item, value)

hr 5
assistantpayroll 3
highlights 1
paychex 1
mastertax 1
abra 2
time 2
star 1
infotronics 1
store 1
master 1
dominion 1
quickbooks 2
ceridian 2
peoplesoft 2
sap 2
microsoft 2
word 2
excel 2
powerpoint 2
crystal 2
reports 3
experience 2
company 11
name 10
－ 11
city 11
state 17
human 9
resources 9
professional 1
offering 1
15 1
years 1
combined 1
payroll 21
manufacturing 2
academic 2
settings 1
accurate 5
timely 3
preparation 1
employee 13
tax 9
information 16
benefits 10
administration 2
highly 1
effective 1
employer 1
representation 1
unemployment 11
claims 7
hearings 3
012014 2
coordinator 2
process 13
weekly 11
100 2
hourly 3
salaried 3
associates 3
garnishment 3
levy 3
child 3
support 3
processing 3
compile 3
labor 3
hours 3
enter 4
new 7
hire 6
hrispayroll 5
system 7
assist 7
annual 9
open 4
enrollment 4
status 4
changes 4
personnel 4
benefit 6
records 5
respond 5
establish 3
written 4
policies 4
procedures 3
conflict 4
resolution 4
workers 5
compensation 5
accident 4
reporting 9

### TF in a Category

In [54]:
# List the distinct category labels present in the dataset.
stop_words_df.Category.unique()

array(['hr', 'designer', 'information-technology', 'teacher', 'advocate',
       'business-development', 'healthcare', 'fitness', 'agriculture',
       'bpo', 'sales', 'consultant', 'digital-media', 'automobile',
       'chef', 'finance', 'apparel', 'engineering', 'accountant',
       'construction', 'public-relations', 'banking', 'arts', 'aviation'],
      dtype=object)

In [69]:
# Empty table with one row per distinct category and no columns yet.
# NOTE(review): presumably term columns are to be added later to hold the
# per-category term frequencies — confirm against the rest of the notebook.
TF_cat = pd.DataFrame(
                  index=pd.Index( stop_words_df.Category.unique()),
                  columns=pd.Index([]))
TF_cat.head()

hr
designer
information-technology
teacher
advocate


In [71]:
#TF_cat["yes"] = np.nan
# Bare expression: display the (still column-less) category table.
TF_cat

hr
designer
information-technology
teacher
advocate
business-development
healthcare
fitness
agriculture
bpo
sales
