# Importing Dependencies

In [209]:
import pandas as pd           #python package for data analysis
import numpy as np            #python package for handling arrays
import os                     #python package for dealing with system directory paths
import docx2txt               #python package for getting data from word documents
import neattext as nt         #python package for text cleaning

# Reading the Documents

In [274]:
files=os.listdir('./Anonymous_CVs/')  #give address of directory where your documents are present

# Converting the Documents into the DataFrame

In [275]:
documents=dict()   #dictomary of documents

In [276]:
for file in files[0:5]:
    document= docx2txt.process('./Anonymous_CVs/'+file)   #getting text from word documents
    documents[file]=document

In [277]:
documents_df=pd.DataFrame(documents.items(),columns=['Document Name','Documents'])

# Documents DataFrame

In [278]:
documents_df

Unnamed: 0,Document Name,Documents
0,CV1.docx,Donald Petrovich\n\nEmail: DonaldPetrovich@gma...
1,CV2.docx,Helen Grant\n\n(922) 679-9797\nHelenGrant@gmai...
2,CV3.docx,Clarence Price\n\n(786) 324-2395 ClarencePri...
3,CV4.docx,Jennifer Gillman\n\nJenniferGillman@gmail.com\...
4,CV7.docx,Gayle Hawkins\n\n\tSr. Business Systems Analys...


# Doccuments Cleaning

In [279]:
def docs_cleaning(text):
    text=nt.remove_emails(text)
    text=nt.remove_numbers(text)
    text=nt.remove_stopwords(text)
    text=nt.remove_special_characters(text)
    text=nt.remove_emojis(text)
    text=nt.remove_phone_numbers(text)
    text=nt.remove_multiple_spaces(text)
    text=' '.join(text.split())
    return text.lower()

In [280]:
documents_df['Cleand_Documents']=documents_df['Documents'].apply(lambda x:docs_cleaning(x))

# Text Lemmatization

In [281]:
import nltk
from nltk.stem import 	WordNetLemmatizer   #nltk word lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()    #creating object of word lemmatizer

In [282]:
lemmarized_words=[]

In [283]:
#method for text lemmatization
def lemmatization(text):
    lemmarized_words=[]
    tokenization = nltk.word_tokenize(text) #word tokenization using nltk word tokenizer
    for word in tokenization:
        word=wordnet_lemmatizer.lemmatize(word)
        lemmarized_words.append(word)
    return ' '.join(lemmarized_words)
    

In [284]:
documents_df['Lemmatized_Documents']=documents_df['Cleand_Documents'].apply(lambda x:lemmatization(x))

# Creating Word Dictionary

In [285]:
word_dict=dict()

In [286]:
def word_dictionary(text):
    word_dict=dict()
    for word in text.split():
        if word in word_dict.keys():
            word_dict[word]+=1
        else:
            word_dict[word]=1
    return word_dict

In [287]:
documents_df['Word_Dictionary']=documents_df['Lemmatized_Documents'].apply(lambda x:word_dictionary(x))

In [288]:
# Preprocessing on the Document that we are going to match with our previous documents

In [289]:
def pre_processing(text):
    text=nt.remove_emails(text)
    text=nt.remove_numbers(text)
    text=nt.remove_stopwords(text)
    text=nt.remove_special_characters(text)
    text=nt.remove_emojis(text)
    text=nt.remove_phone_numbers(text)
    text=nt.remove_multiple_spaces(text)
    text=text.lower()
    #lemmatization
    tokenization = nltk.word_tokenize(text)
    lemmarized_words=[]     
    for word in tokenization:
        word=wordnet_lemmatizer.lemmatize(word)
        lemmarized_words.append(word)
    return ' '.join(lemmarized_words)

# Reading our Searched Document

In [290]:
searched_doc= docx2txt.process('./Anonymous_CVs/CV1.docx')   #reading the document to be searched

In [291]:
searched_doc=pre_processing(searched_doc)   #applying preprocessing on the document to be searched

In [292]:
# creating word dictionary for the document to be searched
def searched_doc_word_dictionary(text):
    for word in text.split():
        if word in word_dict.keys():
            word_dict[word]+=1
        else:
            word_dict[word]=1
    return word_dict

In [293]:
searched_doc_dict=searched_doc_word_dictionary(searched_doc)

# Calculating the Similarity Between All the Docs with Searched Doc

In [294]:
#sklearn package for text vectorization
from sklearn.feature_extraction.text import CountVectorizer 
cv=CountVectorizer()  #creating object of text vectorizer

In [295]:
#importing cosine similarity for finding documents similarity
from sklearn.metrics.pairwise import cosine_similarity   

In [296]:
def get_similar_docs(documents,searched_document):
    content=[str(documents.keys()),str(searched_document.keys())]
    matrix=cv.fit_transform(content)
    similarity_matrix=cosine_similarity(matrix)[0][1]
    similarity=round((similarity_matrix*100),3)
    return similarity

In [299]:
documents_df['Similarity']=documents_df['Word_Dictionary'].apply(lambda x:get_similar_docs(x,searched_doc_dict))

# Similarity Results

In [300]:
documents_df[['Similarity','Document Name']]

Unnamed: 0,Similarity,Document Name
0,100.0,CV1.docx
1,51.123,CV2.docx
2,16.639,CV3.docx
3,25.643,CV4.docx
4,30.385,CV7.docx
