# Resume Parsing
Version 3.0

# Imports

In [None]:
!pip install PyMuPDF
!pip install docx2pdf
import pandas as pd
import numpy as np
import sys
import fitz
from docx2pdf import convert
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from sklearn.model_selection import train_test_split


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyMuPDF
  Downloading PyMuPDF-1.20.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 6.0 MB/s 
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.20.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting docx2pdf
  Downloading docx2pdf-0.1.8-py3-none-any.whl (6.7 kB)
Installing collected packages: docx2pdf
Successfully installed docx2pdf-0.1.8


In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

# Kaggle API

In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 67 bytes


In [None]:
!kaggle datasets download -d snehaanbhawal/resume-dataset

Downloading resume-dataset.zip to /content
 78% 49.0M/62.5M [00:00<00:00, 90.6MB/s]
100% 62.5M/62.5M [00:00<00:00, 99.7MB/s]


In [None]:
!unzip resume-dataset.zip

Archive:  resume-dataset.zip
  inflating: Resume/Resume.csv       
  inflating: data/data/ACCOUNTANT/10554236.pdf  
  inflating: data/data/ACCOUNTANT/10674770.pdf  
  inflating: data/data/ACCOUNTANT/11163645.pdf  
  inflating: data/data/ACCOUNTANT/11759079.pdf  
  inflating: data/data/ACCOUNTANT/12065211.pdf  
  inflating: data/data/ACCOUNTANT/12202337.pdf  
  inflating: data/data/ACCOUNTANT/12338274.pdf  
  inflating: data/data/ACCOUNTANT/12442909.pdf  
  inflating: data/data/ACCOUNTANT/12780508.pdf  
  inflating: data/data/ACCOUNTANT/12802330.pdf  
  inflating: data/data/ACCOUNTANT/13072019.pdf  
  inflating: data/data/ACCOUNTANT/13130984.pdf  
  inflating: data/data/ACCOUNTANT/13294301.pdf  
  inflating: data/data/ACCOUNTANT/13491889.pdf  
  inflating: data/data/ACCOUNTANT/13701259.pdf  
  inflating: data/data/ACCOUNTANT/14055988.pdf  
  inflating: data/data/ACCOUNTANT/14126433.pdf  
  inflating: data/data/ACCOUNTANT/14224370.pdf  
  inflating: data/data/ACCOUNTANT/14449423.pdf  
  

# Pre-processing
* It consists of some main steps
    * Lowercase
    * Removing Punctuation
    * Tokenization
    * Stopword Filtering
    * Stemming
    * Lemmatization

## Cleaning

In [None]:
def clean(df):
  resume_df = df.copy()
  resume_df[['ID']] = resume_df[['ID']].apply(pd.to_numeric, errors='coerce')
  resume_df.drop(columns=['Resume_html'], inplace = True)
  return resume_df

## Lowercase

In [None]:
def to_lower(df):
  lower_df = df.copy()
  lower_df["Resume_str"] = lower_df["Resume_str"].str.lower()
  lower_df["Category"] = lower_df["Category"].str.lower()
  return lower_df

## Removing Punctuation

In [None]:
def rem_punct(df):
  punct_df = df.copy()
  punct_df['punct_sent'] = punct_df.apply(lambda row: "".join([char for char in row['Resume_str'] if char not in string.punctuation]), axis=1)
  return punct_df

## Tokenization

In [None]:
def to_tokens(df):
  tokens_df = df.copy()
  tokens_df['tokenized_sents'] = tokens_df.apply(lambda row: nltk.word_tokenize(row['punct_sent']), axis=1)
  return tokens_df

## Stop Words

In [None]:
def rem_stop_words(df):
  stop_df = df.copy()
  stop_words = stopwords.words('english')
  stop_df['stop_words'] = stop_df.apply(lambda row: [word for word in row['tokenized_sents'] if word not in stop_words], axis=1)
  return stop_df

## Stemming

In [None]:
def stemming(df):
  new_df = df.copy()
  porter = PorterStemmer()
  new_df['Stemmed'] = df.apply(lambda row:[porter.stem(word) for word in row['stop_words']], axis = 1)
  return new_df

## Lemmatization

In [None]:
def lemmatizing(df):
  new_df = df.copy()
  lemmatizer = WordNetLemmatizer()
  new_df['lemmatized'] = df.apply(lambda row:[lemmatizer.lemmatize(word) for word in row['stop_words']], axis = 1)
  return new_df

# Feature Extraction

## TF

In [None]:
def TF_doc(df):
  TF_document_df = df.copy()
  #### Retrun to stemming / lemmetization
  TF_document_df['TF_doc'] = TF_document_df.apply(lambda row: Counter(row['stop_words']), axis=1)
  return TF_document_df

## TF for each Category

In [None]:
def TF_category(df):
  ## Getting the Categories Names
  TF_cat = pd.DataFrame(
                  columns=pd.Index( df.Category.unique()),
                  index=pd.Index([]))
  ## Calculating words frequency within each category
  for index, row in df.iterrows():
    for item, value in row["TF_doc"].items():
      if item not in TF_cat.index:
        TF_cat.loc[item] = 0
        TF_cat.loc[item, row["Category"]] = value
      else:      
        TF_cat.loc[item, row["Category"]] += value

  return TF_cat

In [None]:
def TF_Normalize(df):
  TF_Normalized = df.copy()
  for col in TF_Normalized:
    TF_Normalized[col]/=TF_Normalized[col].sum()
  return TF_Normalized

## IDF

In [None]:
def IDF(df):
  # Make a Copy to work with
  IDF_DF = df.copy()
  
  # 1+ loge(No of documents in corpus/No. of documents containing the word)
  IDF_DF['IDF'] = IDF_DF.apply(lambda row: 1+ np.log(len(IDF_DF.columns)/row.astype(bool).sum()), axis=1)

  return IDF_DF

## TF-IDF

In [None]:
def TF_IDF(TF, IDF):
  TF_IDF_DF = TF.copy()
  TF_IDF_DF = TF_IDF_DF.multiply(IDF["IDF"], axis="index")
  return TF_IDF_DF

# Main

In [None]:
def preprocess(file_name):
  resume_df = pd.read_csv(file_name)
  cleaned_df = clean(resume_df)
  lowered_df = to_lower(cleaned_df)
  punct_df = rem_punct(lowered_df)
  tokenized_df = to_tokens(punct_df)
  stop_words_df = rem_stop_words(tokenized_df)
  stemmed_df = stemming(stop_words_df)
  lemma_df = lemmatizing(stemmed_df)

  return lemma_df

In [None]:
train_result = preprocess("Resume/Resume.csv")

In [None]:
x = train_result.loc[:, train_result.columns != "Category"]

In [None]:
y = train_result["Category"]

In [None]:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 22)

# Training

In [None]:
def train(input_df):
  TF_doc_df = TF_doc(input_df)
  TF_cat_df = TF_category(TF_doc_df)
  TF_Norm_df = TF_Normalize(TF_cat_df)
  IDF_df = IDF(TF_cat_df)
  TF_IDF_df = TF_IDF(TF_Norm_df, IDF_df)

  return TF_IDF_df

In [None]:
training_df_comb = X_train.join(y_train)

In [None]:
weights = train(training_df_comb)

# Evaluation

## TF

In [None]:
def TF_doc(df):
  TF_document_df = df.copy()
  #### Retrun to stemming / lemmetization
  TF_document_df['TF_doc'] = TF_document_df.apply(lambda row: Counter(row['Stemmed']), axis=1)
  return TF_document_df

## TF for each Category

In [None]:
def TF_category(df):
  ## Getting the Categories Names
  TF_cat = pd.DataFrame(
                  columns=pd.Index( ["Test"]),
                  index=pd.Index([]))
  ## Calculating words frequency within each category
  for index, row in df.iterrows():
    for item, value in row["TF_doc"].items():
      if item not in TF_cat.index:
        TF_cat.loc[item] = 0
        TF_cat.loc[item, "Test"] = value
      else:      
        TF_cat.loc[item, "Test"] += value

  return TF_cat

In [None]:
def TF_Normalize(df):
  TF_Normalized = df.copy()
  for col in TF_Normalized:
    TF_Normalized[col]/=TF_Normalized[col].sum()
  return TF_Normalized

## IDF

In [None]:
def IDF(df):
  # Make a Copy to work with
  IDF_DF = df.copy()
  
  # 1+ loge(No of documents in corpus/No. of documents containing the word)
  IDF_DF['IDF'] = IDF_DF.apply(lambda row: 1+ np.log(len(IDF_DF.columns)/row.astype(bool).sum()), axis=1)

  return IDF_DF

## TF-IDF

In [None]:
def TF_IDF(TF, IDF):
  TF_IDF_DF = TF.copy()
  TF_IDF_DF = TF_IDF_DF.multiply(IDF["IDF"], axis="index")
  return TF_IDF_DF

## Cosine Similarity

In [None]:
def cosine_similarity(vec1, vec2):
  return (np.dot(vec1,vec2))/(np.sqrt(sum(np.square(vec1)))*np.sqrt(sum(np.square(vec2))))

## Predict

In [None]:
def predict(train_inp_df, inp_X_test, inp_y_test):
  temp = TF_doc(inp_X_test)
  counter = 0
  correct = 0
  for i in range(len(temp)):
    resume_1 = temp.iloc[i]
    resume_1 = resume_1.to_frame().transpose()
    temp_2 = TF_category(resume_1)
    temp_3 = TF_Normalize(temp_2)
    temp_4 = IDF(temp_2)
    temp_5 = TF_IDF(temp_3, temp_4)
    TF_cat = pd.DataFrame(
                  columns=pd.Index(train_inp_df.columns),
                  index=pd.Index([]))
    #for item in TF_IDF_test
    for index, row in temp_5.iterrows():
      if index not in train_inp_df.index:
        TF_cat.loc[index] = 0
      else:      
        TF_cat.loc[index] = train_inp_df.loc[index]

    scores_dict = dict()
    ## Calculating Similarity
    TF_IDF_arr_query = np.array(temp_5["Test"])
    max_score = -1
    cat_sim = ""
    #print("Similarities:#########")
    for col in TF_cat.columns:
      temp_arr = np.array(TF_cat[col])
      sim_score = cosine_similarity(TF_IDF_arr_query, temp_arr)
      scores_dict[col] = sim_score
      if sim_score> max_score:
        max_score = sim_score
        cat_sim = col
      #print(col, sim_score)

    # print("########\nWinning Category:", cat_sim, "\nWith Score:", max_score, "\nActual Category:", inp_y_test.iloc[i])
    counter += 1
    if cat_sim == inp_y_test.iloc[i] :
      correct += 1
  print("Correct =", str(correct), "\nTotal =", str(counter), "\nAccuracy = ", str(correct / counter * 100))

In [None]:
predict(weights,X_test,y_test)

Correct = 120 
Total = 249 
Accuracy =  48.19277108433735


## Predict Top N

In [None]:
def predict_top_N(train_inp_df, inp_X_test, inp_y_test, N=3):
  temp = TF_doc(inp_X_test)
  counter = 0
  correct = 0
  for i in range(len(temp)):
    resume_1 = temp.iloc[i]
    resume_1 = resume_1.to_frame().transpose()
    temp_2 = TF_category(resume_1)
    temp_3 = TF_Normalize(temp_2)
    temp_4 = IDF(temp_2)
    temp_5 = TF_IDF(temp_3, temp_4)
    TF_cat = pd.DataFrame(
                  columns=pd.Index(train_inp_df.columns),
                  index=pd.Index([]))
    #for item in TF_IDF_test
    for index, row in temp_5.iterrows():
      if index not in train_inp_df.index:
        TF_cat.loc[index] = 0
      else:      
        TF_cat.loc[index] = train_inp_df.loc[index]

    scores_dict = dict()
    ## Calculating Similarity
    TF_IDF_arr_query = np.array(temp_5["Test"])
    max_score = -1
    cat_sim = ""
    #print("Similarities:#########")
    for col in TF_cat.columns:
      temp_arr = np.array(TF_cat[col])
      sim_score = cosine_similarity(TF_IDF_arr_query, temp_arr)
      scores_dict[col] = sim_score
      if sim_score> max_score:
        max_score = sim_score
        cat_sim = col
      #print(col, sim_score)

    # print("########\nWinning Category:", cat_sim, "\nWith Score:", max_score, "\nActual Category:", inp_y_test.iloc[i])
    
    scores_dict = sorted(scores_dict.items(), key=lambda item: item[1], reverse= True)
    sorted_dict = dict()
    for k, v in scores_dict:
        sorted_dict[k] = v
    
    # print(sorted_dict)
    iter = N
    for k,v in sorted_dict.items():
      iter -= 1
      
      if k == inp_y_test.iloc[i] :
        correct += 1
        break

      if iter == 0:
        break

    counter += 1

    # if i == 10:
    #   break
  accuracy = correct / counter * 100  
  print("N = ", N, "Correct =", str(correct), "\nTotal =", str(counter), "\nAccuracy = ", str(accuracy))
  
  return accuracy

In [None]:
predict_top_N(weights,X_test,y_test)

N =  3 Correct = 173 
Total = 249 
Accuracy =  69.47791164658635


69.47791164658635

In [None]:
results = dict()
for i in range(1, 11):
  acc = predict_top_N(weights,X_test,y_test, i)
  results[i] = acc

N =  1 Correct = 120 
Total = 249 
Accuracy =  48.19277108433735
N =  2 Correct = 154 
Total = 249 
Accuracy =  61.84738955823293
N =  3 Correct = 173 
Total = 249 
Accuracy =  69.47791164658635
N =  4 Correct = 185 
Total = 249 
Accuracy =  74.29718875502009
N =  5 Correct = 190 
Total = 249 
Accuracy =  76.30522088353415
N =  6 Correct = 199 
Total = 249 
Accuracy =  79.91967871485943
N =  7 Correct = 204 
Total = 249 
Accuracy =  81.92771084337349
N =  8 Correct = 207 
Total = 249 
Accuracy =  83.13253012048193
N =  9 Correct = 213 
Total = 249 
Accuracy =  85.54216867469879
N =  10 Correct = 215 
Total = 249 
Accuracy =  86.34538152610442


In [None]:
results

{1: 48.19277108433735,
 2: 61.84738955823293,
 3: 69.47791164658635,
 4: 74.29718875502009,
 5: 76.30522088353415,
 6: 79.91967871485943,
 7: 81.92771084337349,
 8: 83.13253012048193,
 9: 85.54216867469879,
 10: 86.34538152610442}