# Data Cleaning
## Importing Packages

In [4]:
import string
import re
import glob as glob
import slate3k as slate
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

## Extracting Data

In [5]:
# Read every resume PDF matched by `folder_path` and collect its text.
# NOTE(review): `folder_path` is an absolute, machine-specific location; consider
# making it configurable so the notebook can be re-run elsewhere.
count = 0  # NOTE(review): not used in the visible cells; kept in case a later cell reads it
folder_path = r'C:\Users\Bhagya\Resume Analyzer\ACTUAL PROJECT\Resumes_SF\*.pdf'
resume = []  # one extracted text string per PDF
names = []   # path of each PDF, in the same order as `resume`
restr = ''   # text of the file currently being assembled

# glob does not promise any particular ordering, so sort the paths to keep
# `names` / `resume` (and everything derived from them) stable between runs.
for filenames in sorted(glob.glob(folder_path)):
    names.append(filenames)
    with open(filenames, 'rb') as fi:
        # presumably one text string per page -- verify against the slate3k docs
        doc = slate.PDF(fi, word_margin=0)
        # The per-page variable is called `page_text` rather than `string`, which
        # silently replaced the imported `string` module for the rest of the notebook.
        # Non-breaking spaces and form feeds are turned into ordinary spaces.
        restr = ''.join(
            page_text.replace('\xa0', ' ').replace('\x0c', ' ')
            for page_text in doc
        )
    resume.append(restr)



In [6]:
# Candidate name = the part of each path after the "Resumes_SF\" folder
# and before the first ".pdf".
Candidates = [
    path.split('Resumes_SF\\')[1].split('.pdf')[0]
    for path in names
]

In [7]:
# Sanity check: how many resume texts were extracted.
parsed_total = len(resume)
print("Total Resumes parsed:", parsed_total)

Total Resumes parsed: 201


In [8]:
# Sanity check: the number of candidate names should equal the number of resumes.
candidate_total = len(Candidates)
print("Total Candidates:", candidate_total)

Total Candidates: 201


## Data Cleaning

In [9]:
def clean_text(text):
    '''
    Blank out line breaks, form feeds and "Page X of Y" footers.

    The first pattern is a character class, so every single newline, form
    feed, "(", ")" and "*" is replaced by one space (runs are not collapsed).
    The second pattern replaces each "Page <n> of <m>" marker with one space.

    text: the string to clean.
    Returns the cleaned string.
    '''
    without_breaks = re.sub('[%s]' % '(\\n)*(\\x0c)*', ' ', text)
    return re.sub(r'Page [0-9]+ of [0-9]+', ' ', without_breaks)

In [10]:
def remove_punctuation(text):
    '''
    Removes punctuation
    Did not remove few characters such as .,$%-~:;?!
    '''
    clean_punct =  re.compile('[%s]' % re.escape('"#&\()*+/<=>@[\\]^_{|}')).sub(' ', text) 
    return clean_punct                                                                     

In [11]:
def text_treatment(text):
    '''
    Delete unwanted control characters, bullet glyphs and filler sequences.

    Every fragment listed below is removed (replaced by the empty string),
    one fragment after another in the listed order. The order matters:
    removing one fragment can create a sequence that a later entry removes
    (e.g. "-*-" becomes "--" once "*" is dropped, and "--" is dropped later).

    text: the string to process.
    Returns the string with all listed fragments removed.
    '''
    unwanted = (
        "\x00", "\x01", "\x02", "\x03",
        "\x04", "\x05", "\x06", "\x07", "\x08",
        "\x0e", "\x11", "\x12", "\x10", "\x19",
        "\x1b", "\x14", "\x15", '/', '=', "〓",
        "»", "«", "¬", '`', "•", "▬▬▬▬▬▬▬▬▬▬▬▬▬▬▬▬▬",
        "”", "§", "¨", "©", "›", "■", "ifttt",
        "→", "⇨", "∎", "√", "□", "~~~", "★",
        "*", "&", "►", "◊", "☞", "#",
        # NOTE(review): the empty-string entries here and in the last row are
        # no-ops as written; they may be glyphs lost in copy/paste -- confirm.
        "❖", "➠", "➢", "", "✓", "--",
        "√", "✔", "♦", "◦", "●", "▫",
        "▪", "…", "þ", "®", '', "...",
    )
    for fragment in unwanted:
        text = text.replace(fragment, '')
    return text

In [12]:
def masters(text):
    '''
    Flag education text that mentions a Masters/PhD-level qualification.

    The search is case-sensitive and looks for any of: Master's, Master,
    M.S., MS, M.Sc., MSc, PhD, Ph.D. or Honors, anywhere in the text.

    text: the education string to inspect.
    Returns 1 when one of the keywords is found, otherwise 0.
    '''
    # The dots are escaped so they only match a literal "." -- unescaped, "M.S."
    # matched any four characters shaped like "M?S?" (e.g. "M SP" in "IBM SPSS").
    # The final dot of each abbreviation is optional so "M.S," or a trailing
    # "Ph.D" are still recognised.
    # NOTE(review): there are no word boundaries, so "MS" still matches inside
    # longer upper-case tokens; tighten only after checking real degree strings.
    patterns = re.compile(r"(Master's|Master|M\.S\.?|MS|M\.Sc\.?|MSc|PhD|Ph\.D\.?|Honors)")
    return 1 if patterns.search(text) else 0

In [13]:
def get_experience(text):
    '''
    Add up the durations of all dated positions found in the text.

    A position is a date range immediately followed by a duration, e.g.
    'June 2011 - December 2012  ' followed by '1 year 7 months'. Durations
    that are not preceded by such a date range are ignored.

    text: the experience string to scan.
    Returns the total experience in years, rounded to 2 decimals
    (0 when nothing matches).
    '''
    positions = re.findall(
        r"([a-zA-Z]+\s\d+\s-\s\D+\s\d*\s\s?)(\d+\syears?\s\d+ months?|\d+ years?|\d+ months?)",
        text,
    )
    years = 0
    months = 0
    for _date_range, duration in positions:
        # Capture the whole number. Slicing the matched text kept only the
        # first digit of the years (so "12 years" counted as 1) and at most
        # two characters of the months.
        match_years = re.search(r"([0-9]+)\syears?", duration)
        if match_years is not None:
            years += int(match_years.group(1))
        match_months = re.search(r"([0-9]+)\smonths?", duration)
        if match_months is not None:
            months += int(match_months.group(1))
    return round(years + (months / 12), 2)

In [63]:
def Get_contact(text):
    '''
    Pick one piece of contact information out of the Contact text.

    The patterns are tried in priority order -- a ".com" e-mail address,
    then a github.com profile, then a phone number shaped like
    123-456-7890 -- and the first hit of the first matching pattern wins.

    text: the contact string to search.
    Returns the matched string, or None when no pattern matches.
    '''
    prioritized_patterns = (
        r'[a-zA-Z0-9.-]+@[a-zA-Z-]+\.com+',
        r'github\.com/\s?[a-zA-Z0-9_]+',
        r'\d{3}-\d{3}-\d{4}',
    )
    for pattern in prioritized_patterns:
        hits = re.findall(pattern, text)
        if hits:
            return hits[0]
    return None

In [73]:
# One row per candidate, holding the raw resume text.
df = pd.DataFrame({'Candidate_Name': Candidates, 'Resume': resume})

# Keep only resumes that contain all three section headings; each heading is
# accepted in either of the two whitespace layouts checked below.
resume_text = df['Resume'].str
has_summary = resume_text.contains('\n\nSummary\n\n') | resume_text.contains('\n \n Summary\n\n')
has_experience = resume_text.contains('\n\nExperience\n\n') | resume_text.contains('\n \n Experience\n\n')
has_education = resume_text.contains('\n\nEducation\n\n') | resume_text.contains('\n \n Education\n\n')
df2 = df[has_summary & has_experience & has_education].reset_index(drop=True)

In [74]:
# Number of resumes that survived the section filter.
complete_total = df2.shape[0]
print("Total resumes having summary, experience and education fields:", complete_total)

Total resumes having summary, experience and education fields: 143


In [75]:
# Cut each resume into its sections, using the section headings as delimiters.
# Each entry is (new column, text that opens the section, text that ends it);
# Education has no end marker and runs to the end of the resume.
raw_resume = df2['Resume']
section_markers = (
    ('Contact', 'Contact\n\n', '\n\nSummary'),
    ('Summary', 'Summary\n\n', '\n\nExperience'),
    ('Experience', 'Experience\n\n', '\n\nEducation'),
    ('Education', 'Education\n\n', None),
)
for column, opening, closing in section_markers:
    # Everything after the first occurrence of the opening marker...
    remainder = raw_resume.str.split(opening).str[1]
    # ...trimmed at the first occurrence of the closing marker, when there is one.
    df2[column] = remainder if closing is None else remainder.str.split(closing).str[0]

In [76]:
# Run the cleaning helpers over the extracted sections.
# Contact and Education only get clean_text, so the punctuation that the later
# extraction steps rely on is left alone.
for column in ('Contact', 'Education'):
    df2[column] = df2[column].apply(clean_text)

# The free-text sections get the full pipeline:
# clean_text -> remove_punctuation -> text_treatment.
for column in ('Summary', 'Experience'):
    cleaned = df2[column].apply(clean_text)
    df2[column] = cleaned.apply(remove_punctuation).apply(text_treatment)

In [77]:
# Show up to 100 characters per cell when a frame is displayed.
pd.options.display.max_colwidth = 100

In [78]:
# Derive the summary columns from the cleaned sections.
df2["Masters"] = df2["Education"].apply(masters)
df2['Total_Experience'] = df2['Experience'].apply(get_experience)

# LinkedIn profile: collect the URL matches from the Contact text, join them,
# keep what precedes the first ' L', then strip any remaining spaces.
# NOTE(review): `[L$]` is a character class (a literal "L" or "$"), not an
# end-of-string anchor, and the dots in the host name are unescaped -- confirm
# this is intended before changing it.
linkedin_hits = df2['Contact'].str.findall(r'www.linkedin.com/in/\s?[a-zA-Z0-9_%-]+\s*?[a-zA-Z0-9_%-]+\s?\s?[L$]')
linkedin_joined = linkedin_hits.apply(''.join)
df2['Linkedin_Profile'] = linkedin_joined.str.split(' L').str[0].replace(" ", '', regex=True)

df2['Contact_Info'] = df2['Contact'].apply(Get_contact)

# The raw text columns are no longer needed once the fields above exist.
df2 = df2.drop(columns=["Resume", "Contact"])

In [81]:
# Preview the first five rows of the cleaned frame.
df2.head(5)

Unnamed: 0,Candidate_Name,Summary,Experience,Education,Masters,Total_Experience,Linkedin_Profile,Contact_Info
0,Aadil Hussaini,"Experienced Data Scientist with strong business acumen, with a demonstrated history of driving p...",Lyft Data Scientist November 2019 - Present 5 months Facebook 3 years 5 months Data Scien...,"Stanford University Graduate Certificate, Data Mining and Statistics · 2015 Univer...",1,7.5,www.linkedin.com/in/aadilhussaini,
1,Abhimanyu Mitra,Experienced Researcher in Data Science with a passion for data- driven decision making and build...,Walmart Labs 7 years 1 month Principal Data Scientist September 2016 - Present 3 years 7 mon...,"Cornell University PhD, Operations Research, Concentration: Applied Probability & Statistics ·...",1,8.67,www.linkedin.com/in/abhimanyu-mitra-84848011,
2,Alison Hung,Certified Scrum agile product development expert. Experienced Product Manager with a demonstrate...,Advantech Product Manager June 2016 - Present 3 years 10 months San Francisco Bay Area - Ma...,"University of Illinois at Urbana-Champaign Master of Science MS , Technology Management · 201...",1,7.33,www.linkedin.com/in/alisonhung,
3,Alvira Swalin,"IIT Bombay graduate 2016 and MSDS alum at USF. Keen to apply Mathematical, Statistical and Mac...",Uber 1 year 8 months Data Scientist II February 2020 - Present 2 months San Francisco Bay A...,"University of San Francisco Master's degree, Analytics · 2017 - 2018 Indian Institute of Tec...",1,3.33,www.linkedin.com/in/alvira-swalin,
4,Amrit Bulusu,"With a strong background in Cognitive Science and User Experience research, I am passionate in m...",Samsung Electronics America 2 years 3 months Data Scientist January 2020 - Present 3 months ...,"Rice University Doctor of Philosophy Ph.D. , Psychology · 2010 - 2016 Rice University Mast...",1,12.58,www.linkedin.com/in/yuhsuan1,


In [80]:
# Save the cleaned frame as CSV, without the index column.
output_csv = 'Resume_Data.csv'
df2.to_csv(output_csv, index=False)