---
# Importing Necessary Dependencies
---

In [1]:
import pandas as pd      #package for data analysis
import neattext as nt    #package for text cleaning
import os                #package for dealing with system paths
import PyPDF2           #package for data extraction from pdf
import docx2txt         #python package for data extraction from word file

---
# Data Extraction from Resumes
---

In [2]:
def extract_text(filename):
    """
    Extracts text from PDFs and Word docs.
    """
    _, ext = os.path.splitext(filename)
    if ext == '.pdf':
        with open(filename, 'rb') as file:
            text=''
             # Create a PDF reader object
            reader = PyPDF2.PdfFileReader(file)
            # Get the number of pages in the PDF
            num_pages = reader.getNumPages()
            # Loop through each page in the PDF
            for page_num in range(num_pages):
                # Get the current page
                page = reader.getPage(page_num)
                # Extract the text from the current page
                text += page.extractText()
                return text
    elif ext == '.docx':
        document= docx2txt.process(filename)
        doc_text = str(document)
        return doc_text

def read_files(directory):
    """
    Reads all files in a directory and extracts text from PDFs and Word docs.
    Returns a dictionary of file names and text.
    """
    files = {}
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            text = extract_text(filepath)
            if text:
                files[filename] = text
    return files

# Example usage
directory = './'
files = read_files(directory)
df=pd.DataFrame(files.items())

In [3]:
df.columns=['resume_name','resume_text']

---
# Personal Informations Extraction
---

### Emails and Phone No. Extraxtion

In [4]:
df['candidate_email']=df['resume_text'].apply(lambda x:'|'.join(nt.extract_emails(x)))
df['candidate_email']=df['candidate_email'].apply(lambda x:[email for email  in x.split("|")][0])
df['candidate_phone']=df['resume_text'].apply(lambda x:'|'.join(nt.extract_phone_numbers(x)))
df['candidate_phone']=df['candidate_phone'].apply(lambda x:[phone for phone  in x.split("|")][0])

In [5]:
df[['resume_name','candidate_phone','candidate_email']]

Unnamed: 0,resume_name,candidate_phone,candidate_email
0,coolfreecv_resume_en_01.docx,+923067326316,imrnazir8@gmail.com
1,CV1.docx,,
2,CV10.docx,(222) 949-5578,SharaArmour@gmail.com
3,CV11.docx,(254) 844-5629,JudyWelch@gmail.com
4,CV12.docx,(627) 367-2287,WilliamBaltazar@gmail.com
5,CV13.docx,(433) 686-6674,SusanVillanueva@gmail.com
6,CV14.docx,,LeonDean@gmail.com
7,CV15.docx,(545) 885-2686,BettyRedden@gmail.com
8,CV16.docx,(863) 435-4986,LanceDuffie@gmail.com
9,CV17.docx,,


### Text Formatting

In [6]:
def resume_text_formatting(resume_text):
    resume_text=resume_text.split('\n')
    resume_text=[line.strip() for line in resume_text if len(line)>3]
    return ' $ '.join(resume_text)

In [7]:
df['f_resume_text']=df['resume_text'].apply(lambda x:resume_text_formatting(x))

### Links Extraction

In [8]:
links=[]

In [9]:
def get_links(resume_text):
    links=[]
    return nt.extract_urls(resume_text)

In [10]:
df['links']=df['f_resume_text'].apply(lambda x:get_links(x))

---
# Resume Ranking
---

### Predefined Evaluation Metrics

In [11]:
ds_tech_skills=pd.read_csv("data_science_skills.csv",index_col=0)
ds_education=pd.read_csv("data_science_educations.csv",index_col=0)
ds_experiences=pd.read_csv("data_science_experiences.csv",index_col=0)
soft_skills=pd.read_csv("soft_skills.csv",index_col=0)
ds_certificates=pd.read_csv("data_science_certificates.csv",index_col=0)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer    #importing tf-idf for text vectorizer
from sklearn.metrics.pairwise import cosine_similarity         #importing cosine similarity for text similarity
# initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

In [13]:
similar_educations=[]
education_simlrty_score=[]
education_rank={}

In [14]:
def get_similar_educations(resume_text,educations):
    '''
    this function gets predefined educations and find most similar educations from candidate resume.
    it calculates average similarity score between predefined educations and candidate education.
    It return the results only if candidate education 80% matches with predefined educations
    '''
    similar_educations=[]
    education_simlrty_score=[]
    education_rank={}
    for line in resume_text.split('$'):
        for edu in educations:
            # fit and transform the documents
            tfidf_matrix = vectorizer.fit_transform([line, edu])
            # compute cosine similarity between the documents
            cosine_similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1]).flatten()
            cosine_similarities=cosine_similarities[0]
            if cosine_similarities>=0.80:
                education_simlrty_score.append(cosine_similarities)
                similar_educations.append(line)
                print(line,edu,cosine_similarities)
    temp=sum(education_simlrty_score)/len((educations))
    education_rank[temp]=similar_educations
    return education_rank

In [15]:
df['education_score']=df['f_resume_text'].apply(lambda x:get_similar_educations(x,ds_education.values.reshape(-1)))

 Bachelor of Computer Science  Bachelor of Science in Computer Science 0.8344777154850027
 Bachelor of Computer Science  Bachelor of Science in Computer Science 0.8344777154850027
 Bachelor of Computer Science  Bachelor of Science in Computer Science 0.8344777154850027
 Bachelor of Computer Science  Bachelor of Science in Computer Science 0.8344777154850027


In [16]:
similar_certificates=[]
certificates_simlrty_score=[]
certificates_rank={}

In [17]:
def get_similar_certifications(resume_text,certificates):
    '''
    this function gets predefined certificates and find most similar certificates from candidate resume.
    it calculates average similarity score between predefined certificates and candidate certificates.
    It return the results only if candidate certificates 80% matches with predefined certificates
    '''
    similar_ceritificates=[]
    ceritificates_simlrty_score=[]
    ceritificates_rank={}
    for line in resume_text.split('$'):
        for cert in certificates:
            # fit and transform the documents
            tfidf_matrix = vectorizer.fit_transform([line, cert])
            # compute cosine similarity between the documents
            cosine_similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1]).flatten()
            cosine_similarities=cosine_similarities[0]
            if cosine_similarities>=0.80:
                certificates_simlrty_score.append(cosine_similarities)
                similar_certificates.append(line)
               # print(line,cert,cosine_similarities)
    temp=sum(certificates_simlrty_score)/len((certificates))
    certificates_rank[temp]=similar_ceritificates
    return certificates_rank

In [18]:
df['certificates_score']=df['f_resume_text'].apply(lambda x:get_similar_certifications(x,ds_certificates.values.reshape(-1)))

 IBM Data Science Professional Certificate  IBM Data Science Professional Certificate 1.0000000000000002
 Microsoft Certified: Azure Data Scientist Associate  Microsoft Certified: Azure Data Scientist Associate 1.0000000000000002
 Google Data Analytics Professional Certificate  Google Data Analytics Professional Certificate 1.0000000000000002
 Cloudera Certified Data Analyst  Cloudera Certified Data Analyst 1.0
 DataCamp Certified Data Scientist  DataCamp Certified Data Scientist 1.0
 Python Institute Certified Data Science Associate  Python Institute Certified Data Science Associate 1.0000000000000002
 Hortonworks Certified Associate (HCA) for Apache Hadoop  Hortonworks Certified Associate (HCA) for Apache Hadoop 1.0000000000000004
 SAS Certified Big Data Professional  SAS Certified Big Data Professional 1.0000000000000002
 Amazon Web Services (AWS) Certified Data Analytics - Specialty  Amazon Web Services (AWS) Certified Data Analytics - Specialty 1.0000000000000002


In [19]:
similar_experiences=[]
experiences_simlrty_score=[]
experiences_rank={}

In [20]:
def get_similar_experiences(resume_text,experiences):
    '''
    this function gets predefined experiences and find most similar experiences from candidate resume.
    it calculates average similarity score between predefined experiences and candidate experiences.
    It return the results only if candidate experiences 80% matches with predefined experiences
    '''
    similar_experiences=[]
    experiences_simlrty_score=[]
    experiences_rank={}
    for line in resume_text.split('$'):
        for expr in experiences:
            # fit and transform the documents
            tfidf_matrix = vectorizer.fit_transform([line, expr])
            # compute cosine similarity between the documents
            cosine_similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1]).flatten()
            cosine_similarities=cosine_similarities[0]
            if cosine_similarities>=0.80:
                experiences_simlrty_score.append(cosine_similarities)
                similar_experiences.append(line)
               # print(line,expr,cosine_similarities)
    temp=sum(experiences_simlrty_score)/len((experiences))
    experiences_rank[temp]=similar_experiences
    return experiences_rank

In [21]:
df['experience_score']=df['f_resume_text'].apply(lambda x:get_similar_experiences(x,ds_experiences.values.reshape(-1)))

 Experience with Python programming and data analysis libraries such as Pandas, NumPy, and SciPy  Experience with Python programming and data analysis libraries such as Pandas, NumPy, and SciPy 1.0
 Proficiency in SQL and experience with databases such as MySQL, PostgreSQL, or Oracle  Proficiency in SQL and experience with databases such as MySQL, PostgreSQL, or Oracle 1.0000000000000007
 Familiarity with machine learning algorithms and frameworks such as scikit-learn, TensorFlow, or PyTorch  Familiarity with machine learning algorithms and frameworks such as scikit-learn, TensorFlow, or PyTorch 1.0000000000000007
 Ability to work with large and complex datasets, and to clean and preprocess data as necessary  Ability to work with large and complex datasets, and to clean and preprocess data as necessary 1.0
 Experience with data visualization tools such as Tableau, D3.js, or Matplotlib  Experience with data visualization tools such as Tableau, D3.js, or Matplotlib 1.0000000000000002
 St

In [22]:
tech_skills_rank=0
soft_skills_rank=0

In [23]:
def check_tech_skills(resume_text,tech_skills):
    #checking the predefined skill is present in candidate resume or not
    tech_skills_rank=0
    for skill in tech_skills:
        if skill[0] in resume_text:
            tech_skills_rank+=1
    return tech_skills_rank/len(tech_skills)

In [24]:
def check_soft_skills(resume_text,soft_skills):
    #checking the predefined soft skill is present in candidate resume or not
    soft_skills_rank=0
    for skill in soft_skills:
        if skill[0] in resume_text:
            soft_skills_rank+=1
    return soft_skills_rank/len(soft_skills)

In [25]:
df['tech_skills_score']=df['f_resume_text'].apply(lambda x:check_tech_skills(x,ds_tech_skills.values.reshape(-1)))

In [26]:
df['soft_skills_score']=df['f_resume_text'].apply(lambda x:check_soft_skills(x,soft_skills.values.reshape(-1)))

---
# Final Resutls
---

In [None]:
#the results are sorted on score of skills,mean a candidate who has most similar skills, will stand on top

In [28]:
results=df[['resume_name','candidate_phone','candidate_email','links', 'tech_skills_score','soft_skills_score','education_score', 'certificates_score','experience_score']].sort_values(by=['tech_skills_score'],ascending=False)

In [31]:
results[0:2]

Unnamed: 0,resume_name,candidate_phone,candidate_email,links,tech_skills_score,soft_skills_score,education_score,certificates_score,experience_score
11,CV19.docx,(289) 863-9669,FredStewart@gmail.com,[],1.0,1.0,{0.0: []},{0.0: []},{0.0: []}
20,CV9.docx,(873) 929-6922,CarmenNakata@gmail.com,[],1.0,1.0,{0.0: []},{0.0: []},{0.0: []}


---
# Savin the Results
---

In [36]:
for candidate in range(10): #it will save the details of top 10 candidates in csv form 
    results.to_csv(results.iloc[candidate,]['resume_name'].split('.')[0]+'.csv')