# Resume Screening
Version 1.0

***
## Pipeline

1. Resume Parsing  
    * The first subtask is parsing the resume, i.e., extracting information in a structured format from the document. 
2. Resume Matching
    * The second sub-task is extracting semantic information and actually understanding the underlying information.
3. Database Formation

***

## Import Libraries

In [None]:
# !pip install PyMuPDF
# !pip install docx2pdf

In [None]:
import numpy
import sys
import fitz
from docx2pdf import convert
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
#nltk.download('punkt')
#nltk.download('stopwords')

## Reading the file

In [None]:
def read_file(fname):
    if fname.split(".")[1] == 'docx':
        convert(fname)
        fname = fname.split(".")[0] + ".pdf"
    elif fname.split(".")[1] == 'pdf':
        pass
    else:
        print("Only PDF and docx types are supported!")
        return
    
    doc = fitz.open(fname)
    text = ""
    for page in doc:
        text = text + str(page.get_text())

    tx = " ".join(text.split('\n'))  
    #print(text)
    return text

In [None]:
resume = read_file('Ahmed Osama-Long Version.docx')
resume

## Pre-processing
* It consists of some main steps
    * Lowercase
    * Removing Punctuation
    * Tokenization
    * Stopword Filtering
    * Stemming
    * Lemmatization

### Lowercase

In [None]:
resume_lower = resume.lower()
resume_lower

### Removing Punctuation

In [None]:
#print(string.punctuation)
## Note + needs to be kept -> C++
## Note @ needs to be kept -> May remove emails 
## Remove bullet points
## Remove Qoutes
resume_punc = "".join([char for char in resume_lower if char not in string.punctuation + "●•"])
print(resume_punc)

### Tokenization

In [None]:
resume_tokens = nltk.word_tokenize(resume_punc)
resume_tokens

### Stopword Filtering

In [None]:
stop_words = stopwords.words('english')
#print(stop_words)
resume_filtered_words = [word for word in resume_tokens if word not in stop_words]
#print(resume_filtered_words)
resume_filtered_words

### Stemming

In [None]:
porter = PorterStemmer()
stemmed_resume = [porter.stem(word) for word in resume_filtered_words]
print(stemmed_resume)

## Feature Extraction

### Words Count (Frequency)

In [34]:
resume_freq = dict()
for word in stemmed_resume:
    resume_freq[word] = resume_freq.get(word,0)+1
print(resume_freq)

{'ahm': 1, 'osama': 1, 'moham': 1, 'address': 1, 'giza': 1, 'egypt': 1, 'mobil': 1, '201118862871': 1, 'email': 1, 'ahmadosgalalgmailcom': 1, 'linkedin': 1, 'linkedincominahmaddosama': 1, 'github': 1, 'githubcomahmadosgal': 1, 'hackerrank': 1, 'hackerrankcomahmadgalal99': 1, 'work': 1, 'experi': 1, 'softwar': 1, 'develop': 3, 'internship': 1, 'shadi': 1, 'system': 9, 'summer': 1, '2021': 3, 'implement': 6, 'databas': 3, 'queri': 3, 'store': 3, 'procedur': 1, 'build': 1, 'web': 4, 'applic': 2, 'call': 1, '“': 3, 'smart': 1, '”': 3, 'use': 10, 'sql': 4, 'microsoft': 1, 'server': 1, 'postman': 1, 'api': 2, 'test': 2, 'aspnet': 1, 'core': 1, 'document': 2, 'featur': 2, 'educ': 2, 'certif': 1, 'comput': 2, 'scienc': 1, 'cairo': 1, 'univers': 1, 'faculti': 1, 'engin': 4, 'credit': 1, 'hour': 1, '2017': 2, '–': 3, '2022': 1, 'commun': 1, 'ccec': 1, 'cumul': 1, 'gpa': 1, '378400': 1, 'secondari': 1, 'dar': 1, 'el': 1, 'tarbiah': 1, 'school': 1, '2014': 1, 'technolog': 2, 'languag': 3, 'c': 5, 