---
# Importing Necessary Dependencies
---

In [66]:
import pandas as pd      #package for data analysis
import neattext as nt    #package for text cleaning
import os                #package for dealing with system paths
import PyPDF2           #package for data extraction from pdf
import docx2txt         #python package for data extraction from word file

---
# Data Extraction from Resumes
---

In [67]:
def extract_text(filename):
    """
    Extracts text from PDFs and Word docs.
    """
    _, ext = os.path.splitext(filename)
    if ext == '.pdf':
        with open(filename, 'rb') as file:
            text=''
             # Create a PDF reader object
            reader = PyPDF2.PdfFileReader(file)
            # Get the number of pages in the PDF
            num_pages = reader.getNumPages()
            # Loop through each page in the PDF
            for page_num in range(num_pages):
                # Get the current page
                page = reader.getPage(page_num)
                # Extract the text from the current page
                text += page.extractText()
                return text
    elif ext == '.docx':
        document= docx2txt.process(filename)
        doc_text = str(document)
        return doc_text

def read_files(directory):
    """
    Reads all files in a directory and extracts text from PDFs and Word docs.
    Returns a dictionary of file names and text.
    """
    files = {}
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            text = extract_text(filepath)
            if text:
                files[filename] = text
    return files

# Example usage
directory = './'
files = read_files(directory)
df=pd.DataFrame(files.items())

In [68]:
df.columns=['resume_name','resume_text']

---
# Personal Informations Extraction
---

### Emails and Phone No. Extraxtion

In [69]:
df['candidate_email']=df['resume_text'].apply(lambda x:'|'.join(nt.extract_emails(x)))
df['candidate_email']=df['candidate_email'].apply(lambda x:[email for email  in x.split("|")][0])
df['candidate_phone']=df['resume_text'].apply(lambda x:'|'.join(nt.extract_phone_numbers(x)))
df['candidate_phone']=df['candidate_phone'].apply(lambda x:[phone for phone  in x.split("|")][0])

In [70]:
df[['resume_name','candidate_phone','candidate_email']]

Unnamed: 0,resume_name,candidate_phone,candidate_email
0,coolfreecv_resume_en_01.docx,+923067326316,imrnazir8@gmail.com
1,CV1.docx,,
2,CV10.docx,(222) 949-5578,SharaArmour@gmail.com
3,CV11.docx,(254) 844-5629,JudyWelch@gmail.com
4,CV12.docx,(627) 367-2287,WilliamBaltazar@gmail.com
5,CV13.docx,(433) 686-6674,SusanVillanueva@gmail.com
6,CV14.docx,,LeonDean@gmail.com
7,CV15.docx,(545) 885-2686,BettyRedden@gmail.com
8,CV16.docx,(863) 435-4986,LanceDuffie@gmail.com
9,CV17.docx,,


### Text Formatting

In [71]:
def resume_text_formatting(resume_text):
    resume_text=resume_text.split('\n')
    resume_text=[line.strip().lower() for line in resume_text if len(line)>3]
    return ' $ '.join(resume_text)

In [72]:
df['f_resume_text']=df['resume_text'].apply(lambda x:resume_text_formatting(x))

### Links Extraction

In [73]:
links=[]

In [74]:
def get_links(resume_text):
    links=[]
    return ''.join(nt.extract_urls(resume_text))

In [75]:
df['links']=df['f_resume_text'].apply(lambda x:get_links(x))

---
# Resume Ranking
---

### Predefined Evaluation Metrics

In [76]:
ds_tech_skills=pd.read_csv("data_science_skills.csv",index_col=0)
ds_education=pd.read_csv("data_science_educations.csv",index_col=0)
ds_experiences=pd.read_csv("data_science_experiences.csv",index_col=0)
soft_skills=pd.read_csv("soft_skills.csv",index_col=0)
ds_certificates=pd.read_csv("data_science_certificates.csv",index_col=0)

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer    #importing tf-idf for text vectorizer
from sklearn.metrics.pairwise import cosine_similarity         #importing cosine similarity for text similarity
# initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

In [78]:
similar_educations=[]
education_simlrty_score=[]
education_rank={}

In [79]:
def get_similar_educations(resume_text,educations):
    '''
    this function gets predefined educations and find most similar educations from candidate resume.
    it calculates average similarity score between predefined educations and candidate education.
    It return the results only if candidate education 80% matches with predefined educations
    '''
    similar_educations=[]
    education_simlrty_score=[]
    education_rank={}
    for line in resume_text.split('$'):
        for edu in educations:
            # fit and transform the documents
            tfidf_matrix = vectorizer.fit_transform([line, edu])
            # compute cosine similarity between the documents
            cosine_similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1]).flatten()
            cosine_similarities=cosine_similarities[0]
            if cosine_similarities>=0.80:
                education_simlrty_score.append(cosine_similarities)
                similar_educations.append(line)
                #print(line,edu,cosine_similarities)
    temp=sum(education_simlrty_score)/len((educations))
    education_rank[temp]=similar_educations
    return education_rank

In [80]:
df['educations']=df['f_resume_text'].apply(lambda x:get_similar_educations(x,ds_education.values.reshape(-1)))

In [111]:
similar_certificates=[]
certificates_simlrty_score=[]
certificates_rank={}

In [112]:
def get_similar_certifications(resume_text,certificates):
    '''
    this function gets predefined certificates and find most similar certificates from candidate resume.
    it calculates average similarity score between predefined certificates and candidate certificates.
    It return the results only if candidate certificates 80% matches with predefined certificates
    '''
    similar_certificates=[]
    certificates_simlrty_score=[]
    certificates_rank={}
    for line in resume_text.split('$'):
        for cert in certificates:
            # fit and transform the documents
            tfidf_matrix = vectorizer.fit_transform([line, cert])
            # compute cosine similarity between the documents
            cosine_similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1]).flatten()
            cosine_similarities=cosine_similarities[0]
            if cosine_similarities>=0.80:
                certificates_simlrty_score.append(cosine_similarities)
                similar_certificates.append(line)
               # print(line,cert,cosine_similarities)
    temp=sum(certificates_simlrty_score)/len((certificates))
    certificates_rank[temp]=similar_certificates
    return certificates_rank

In [113]:
df['certificates']=df['f_resume_text'].apply(lambda x:get_similar_certifications(x,ds_certificates.values.reshape(-1)))

In [84]:
similar_experiences=[]
experiences_simlrty_score=[]
experiences_rank={}

In [85]:
def get_similar_experiences(resume_text,experiences):
    '''
    this function gets predefined experiences and find most similar experiences from candidate resume.
    it calculates average similarity score between predefined experiences and candidate experiences.
    It return the results only if candidate experiences 80% matches with predefined experiences
    '''
    similar_experiences=[]
    experiences_simlrty_score=[]
    experiences_rank={}
    for line in resume_text.split('$'):
        for expr in experiences:
            # fit and transform the documents
            tfidf_matrix = vectorizer.fit_transform([line, expr])
            # compute cosine similarity between the documents
            cosine_similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1]).flatten()
            cosine_similarities=cosine_similarities[0]
            if cosine_similarities>=0.80:
                experiences_simlrty_score.append(cosine_similarities)
                similar_experiences.append(line)
               # print(line,expr,cosine_similarities)
    temp=sum(experiences_simlrty_score)/len((experiences))
    experiences_rank[temp]=similar_experiences
    return experiences_rank

In [86]:
df['experience']=df['f_resume_text'].apply(lambda x:get_similar_experiences(x,ds_experiences.values.reshape(-1)))

In [87]:
tech_skills_rank=0
soft_skills_rank=0

In [118]:
tech_skills_ls=[]
tech_skills_dict={}
soft_skills_ls=[]
soft_skills_dict={}

In [121]:
def check_tech_skills(resume_text,tech_skills):
    #checking the predefined skill is present in candidate resume or not
    tech_skills_rank=0
    tech_skills_ls=[]
    tech_skills_dict={}
    for skill in tech_skills:
        if skill.lower() in resume_text:
            tech_skills_rank+=1
            tech_skills_ls.append(skill)
    temp=tech_skills_rank/len(tech_skills)
    tech_skills_dict[temp]=','.join(tech_skills_ls)
    return tech_skills_dict

In [124]:
def check_soft_skills(resume_text,soft_skills):
    #checking the predefined soft skill is present in candidate resume or not
    soft_skills_rank=0
    soft_skills_ls=[]
    soft_skills_dict={}
    for skill in soft_skills:
        if skill.lower() in resume_text:
            soft_skills_rank+=1
            soft_skills_ls.append(skill)
    temp=soft_skills_rank/len(soft_skills)
    soft_skills_dict[temp]=','.join(tech_skills_ls)
    return soft_skills_dict

In [125]:
df['tech_skills']=df['f_resume_text'].apply(lambda x:check_tech_skills(x,ds_tech_skills.values.reshape(-1)))

In [126]:
df['soft_skills']=df['f_resume_text'].apply(lambda x:check_soft_skills(x,soft_skills.values.reshape(-1)))

In [130]:
df['tech_skills_score']=df['tech_skills'].apply(lambda x:list(x.keys()))
df['tech_skills']=df['tech_skills'].apply(lambda x:list(x.values()))

In [131]:
df['soft_skills_score']=df['soft_skills'].apply(lambda x:list(x.keys()))
df['soft_skills']=df['soft_skills'].apply(lambda x:list(x.values()))

In [96]:
df['educations_score']=df['educations'].apply(lambda x:list(x.keys()))
df['educations']=df['educations'].apply(lambda x:list(x.values()))

In [115]:
df['certificates_score']=df['certificates'].apply(lambda x:list(x.keys()))
df['certificates']=df['certificates'].apply(lambda x:list(x.values()))

In [99]:
df['experience_score']=df['experience'].apply(lambda x:list(x.keys()))
df['experience']=df['experience'].apply(lambda x:list(x.values()))

In [133]:
df['candidate_score']=(df['educations_score']+df['certificates_score']+df['experience_score']+df['tech_skills_score']+df['soft_skills_score'])

In [136]:
df['candidate_score']=df['candidate_score'].apply(lambda x:sum(x)/len(x)*100)

---
# Final Resutls
---

In [28]:
#the results are sorted on score of skills,mean a candidate who has most similar skills, will stand on top

In [140]:
results=df[['resume_name','candidate_phone','candidate_email','candidate_score','links', 'tech_skills','soft_skills','educations', 'certificates','experience']]

In [143]:
results.sort_values(by=['candidate_score'],ascending=False)

Unnamed: 0,resume_name,candidate_phone,candidate_email,candidate_score,links,tech_skills,soft_skills,educations,certificates,experience
1,CV1.docx,,,40.0,,"[Data Analysis,Data Visualization,Machine Lear...",[],[[]],"[[ ibm data science professional certificate ,...",[[]]
18,CV8.docx,(745) 368-7944,DonaldBritt@gmail.com,24.045977,,"[Data Analysis,Data Visualization,Machine Lear...",[],[[]],[[]],[[ experience with python programming and data...
0,coolfreecv_resume_en_01.docx,+923067326316,imrnazir8@gmail.com,8.172389,https://www.linkedin.com/in/imran-nazir-898b43...,"[Data Analysis,Data Visualization,Data Mining,...",[],"[[ bachelor of computer science , bachelor of...",[[]],[[]]
20,Imran_Nazir_002_MPhil_DS_NLP.docx,+923067326316,imrnazir8@gmail.com,8.172389,https://www.linkedin.com/in/imran-nazir-898b43...,"[Data Analysis,Data Visualization,Data Mining,...",[],"[[ bachelor of computer science , bachelor of...",[[]],[[]]
17,CV7.docx,(526) 372-3648,GayleHawkins@gmail.com,2.712644,,"[Data Analysis,Classification]",[],[[]],[[]],[[]]
16,CV6.docx,(685) 682-3747,VirgilRasmussen@gmail.com,2.712644,,"[Data Analysis,Classification]",[],[[]],[[]],[[]]
15,CV5.docx,(937) 794-8772,BonniePelt@gmail.com,2.022989,http://caremanager.optum.com,"[Data Analysis,Data Mining]",[],[[]],[[]],[[]]
19,CV9.docx,(873) 929-6922,CarmenNakata@gmail.com,2.022989,,"[Data Analysis,Data Visualization]",[],[[]],[[]],[[]]
5,CV13.docx,(433) 686-6674,SusanVillanueva@gmail.com,1.37931,,[],[],[[]],[[]],[[]]
11,CV19.docx,(289) 863-9669,FredStewart@gmail.com,1.356322,,[Clustering],[],[[]],[[]],[[]]


---
# Savin the Results
---

In [31]:
for candidate in range(results.shape[0]): #it will save the details of all the candidates in csv form 
    results.to_csv(results.iloc[candidate,]['resume_name'].split('.')[0]+'.csv')

In [146]:
for candidate in range(results.shape[0]):
    with open(results.iloc[candidate,]['resume_name'].split('.')[0]+'.txt','a+') as f:
        f.write('Email:\n')
        f.write(results.iloc[candidate,]['candidate_email'])
        f.write('\n')
        f.write('\n')
        f.write('Phone:\n')
        f.write(results.iloc[candidate,]['candidate_phone'])
        f.write('\n')
        f.write('\n')
        f.write('Links:\n')
        f.write(str(results.iloc[candidate,]['links']))
        f.write('\n')
        f.write('\n')
        f.write('Technical Skills:\n')
        f.write(str(results.iloc[candidate,]['tech_skills']))
        f.write('\n')
        f.write('\n')
        f.write('Educations:\n')
        f.write(str(results.iloc[candidate,]['educations']))
        f.write('\n')
        f.write('\n')
        f.write('Experiences:\n')
        f.write(str(results.iloc[candidate,]['experience']))
        f.write('\n')
        f.write('\n')
        f.write('Certificates:\n')
        f.write(str(results.iloc[candidate,]['certificates']))
        f.write('\n')
        f.write('\n')
        f.write('Soft Skills:\n')
        f.write(str(results.iloc[candidate,]['soft_skills']))
        f.close()