In [1]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rahulram\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Job Description

job_df = pd.read_csv("Job_Descriptions.csv")
job_df.drop(["Unnamed: 0"], axis=1, inplace=True)
job_df.head()

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,HCA Healthcare,registrar\n\n\nhca florida memorial hospital\n...,Registrar,2223,"{\n ""Core Responsibilities"": ""Interview pati..."
1,Major League Baseball,want your work displayed across major league b...,Live Content Creator (Seasonal),2336,"{\n ""Core Responsibilities"": ""Capture pregam..."
2,Teaneck Public Schools,jobid \n\nposition type\n\ndistrictart\n\ndat...,Art Teacher,625,"{\n ""Core Responsibilities"": ""Teach art clas..."
3,Volt,volt has partnered with a leading manufacturin...,Talent Acquisition Specialist / Recruiter,2859,"{\n ""Core Responsibilities"": ""primary recrui..."
4,Lear Corporation,we are hiring a customer service representativ...,Customer Service Representative,1265,"{\n ""Core Responsibilities"": ""Responding pro..."


In [3]:
# Resume / Candidate

resume_df = pd.read_csv("Extracted_Data.csv")
resume_df.rename(columns={"Unnamed: 0" : "Id"}, inplace=True)
resume_df.head()

Unnamed: 0,Id,Category,Pages,Data
0,10554236,ACCOUNTANT,5,ACCOUNTANT\nSummary\nFinancial Accountant spec...
1,10674770,ACCOUNTANT,2,STAFF ACCOUNTANT\nSummary\nHighly analytical a...
2,11163645,ACCOUNTANT,2,ACCOUNTANT\nProfessional Summary\nTo obtain a ...
3,11759079,ACCOUNTANT,2,SENIOR ACCOUNTANT\nExperience\nCompany Name Ju...
4,12065211,ACCOUNTANT,2,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...


### Pre-Processing on Resume DataFrame 

In [4]:
# Drop Duplicates

print("No of Duplicate Records :", resume_df.duplicated().sum())
print("No of Duplicate Data :", resume_df["Data"].duplicated().sum())

# Drop Duplicates 
resume_df = resume_df[resume_df["Data"].duplicated() == False]

# Testing
print("Testing - No of Duplicate Data :", resume_df["Data"].duplicated().sum())

No of Duplicate Records : 0
No of Duplicate Data : 2
Testing - No of Duplicate Data : 0


In [5]:
# NA Values

print("NA Values : ")
print(resume_df.isna().sum(), end="\n\n")

print("NA Record")
print(resume_df[resume_df["Data"].isna()], end="\n\n")

# Drop Na Row
resume_df.dropna(inplace=True)

# Testing 
print("Testing - NA Values :", resume_df.isna().any().sum())

NA Values : 
Id          0
Category    0
Pages       0
Data        1
dtype: int64

NA Record
           Id              Category  Pages Data
802  12632728  BUSINESS-DEVELOPMENT      1  NaN

Testing - NA Values : 0


#### Pre-processing Text

The `pre_process_text` function performs the following tasks:

- Removes hyperlinks and weblinks.
- Removes punctuation.
- Removes stop words.

**NOTE**
- I chose not to lemmatize the text as it might remove action words from the resume, which are equally important. 
- However, for extensive research on a resume, it's equally important to consider hyperlinks and weblinks.


In [6]:
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize

def pre_process_text(string):
    
    # Remove any Links [HyperLink / WebLink] and Punctuations 
    string = re.sub(r'https?://\S+|www\.\S+', "", string)
    string = re.sub(r'[!\"#$%&\'()*+,-./:;<=>?@[\\\]]', "", string) 
    
    # Convert string to Lowercase
    string = string.lower()
    
    processed_string = ""
    
    # Removing Stopwords
    for sent in sent_tokenize(string):
        sub_string = " ".join([word for word in word_tokenize(sent) if word not in stopwords.words('english')])
        processed_string += sub_string + " "
        
    return processed_string

In [7]:
%%time

resume_df["Processed_Data"] = resume_df["Data"].apply(pre_process_text)
resume_df.to_csv("Processed_Data.csv")

CPU times: total: 3min 10s
Wall time: 4min 45s


### Pre-Processing on Job Description DataFrame

In [8]:
# Drop description_length column
job_df.drop(["description_length"], axis=1, inplace=True)
print("Columns :", job_df.columns.values)


print("No of NULL Record :", job_df.isna().any().sum())
print("No of Duplicate Record :", job_df.duplicated().sum())

Columns : ['company_name' 'job_description' 'position_title' 'model_response']
No of NULL Record : 0
No of Duplicate Record : 0


In [9]:
# For My Understanding

# for i in range(10):
#     print(job_df["position_title"][i])
#     print(job_df["job_description"][i])
#     print(job_df["model_response"][i])

#### Note

- From my initial understanding of the job description dataset, it seems sufficient to focus on the `model_response` rather than the job description. 
- Additionally, I need to rework the preprocessing of the resume dataset. Instead of transforming the entire resume into features, I should extract only the **skills**, **education**, **responsibilities**, and **experience**.


In [10]:
# Job Description Feature Extraction

job_df["Feature"] = [" ".join(re.findall(r'"(.*?)"', feature.strip())[:8]) for feature in job_df["model_response"]]
job_df["Processed_Feature"] = job_df["Feature"].apply(pre_process_text)

#### Pre-processing Resume Data

The `resume_data_feature_extraction` function performs the following tasks:
- Removes hyperlinks and weblinks.
- Removes stop words.
- Extracts data containing keywords **skills**, **education**, **responsibilities**, and **experience**.

In [11]:
# Re Work on resume Data

def resume_data_feature_extraction(string):
    
    # Remove any Links [HyperLink / WebLink]
    string = re.sub(r'https?://\S+|www\.\S+', "", string) 
    
    # Convert string to Lowercase
    string = string.lower()
    
    mp = {
          "responsibilities" : [],
          "skill" : [],
          "education" : [],
          "experience" : [],
          "highlights" :[],
          "summary" :[],
         }
    
    for sent in sent_tokenize(string):
        for req in mp: # REQ -> responsibilities, skill, education, experience, highlights, summary
            if(req in sent): 
                sub_string = " ".join([word for word in word_tokenize(sent) if word not in stopwords.words('english')]) # Removing Stopwords
                mp[req].append(sub_string)
                break # To avoid Duplicates
    
    return_string = ""
    for req in mp:
        if(len(mp[req]) == 0): # If any of the columns in empty, skip it
            continue
        return_string += " " + req + " " + " ".join(mp[req])
    
    return_string = re.sub(r'[!\"#$%&\'()*+,-./:;<=>?@[\\\]]', "", return_string) 
    
    return return_string

In [12]:
%%time

resume_df["Processed_Feature"] = resume_df["Data"].apply(resume_data_feature_extraction)
resume_df.to_csv("Processed_Data.csv")

CPU times: total: 1min 23s
Wall time: 2min 7s


In [13]:
# Read DataFrame

resume_df = pd.read_csv("Processed_Data.csv")
resume_df.drop(["Unnamed: 0"], axis=1, inplace=True)
resume_df.head()

Unnamed: 0,Id,Category,Pages,Data,Processed_Data,Processed_Feature
0,10554236,ACCOUNTANT,5,ACCOUNTANT\nSummary\nFinancial Accountant spec...,accountant summary financial accountant specia...,responsibilities briefing identified team mem...
1,10674770,ACCOUNTANT,2,STAFF ACCOUNTANT\nSummary\nHighly analytical a...,staff accountant summary highly analytical det...,skill excel problem solving strategic plannin...
2,11163645,ACCOUNTANT,2,ACCOUNTANT\nProfessional Summary\nTo obtain a ...,accountant professional summary obtain positio...,skill accountant professional summary obtain ...
3,11759079,ACCOUNTANT,2,SENIOR ACCOUNTANT\nExperience\nCompany Name Ju...,senior accountant experience company name june...,skill developed writing skills drafting forty...
4,12065211,ACCOUNTANT,2,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...,senior accountant professional summary senior ...,skill senior accountant professional summary ...


#### Difference Between `pre_process_text` and `resume_data_feature_extraction` Functions

In [14]:
for index in [10, 100, 200, 1000, 2000]:
    print("Length of Data :", len(resume_df["Data"][index]))
    print("Length of Data after pre_process_text :", len(resume_df["Processed_Data"][index]))
    print("Length of Data after resume_data_feature_extraction :", len(resume_df["Processed_Feature"][index]), end="\n\n")

Length of Data : 6066
Length of Data after pre_process_text : 5143
Length of Data after resume_data_feature_extraction : 710

Length of Data : 4797
Length of Data after pre_process_text : 4171
Length of Data after resume_data_feature_extraction : 1992

Length of Data : 7362
Length of Data after pre_process_text : 6357
Length of Data after resume_data_feature_extraction : 2995

Length of Data : 5306
Length of Data after pre_process_text : 4740
Length of Data after resume_data_feature_extraction : 2843

Length of Data : 5179
Length of Data after pre_process_text : 4585
Length of Data after resume_data_feature_extraction : 1686



In [15]:
index = 69
print("Processed_Data : ", resume_df["Processed_Data"][index], end="\n\n", sep="\n")
print("Processed_Feature : ", resume_df["Processed_Feature"][index], end="\n\n", sep="\n")

Processed_Data : 
senior accountant summary 8 years accomplished experience field accounting team organizational training major global public corporation exceptionally fast efficient organized knowledge accounting functions gl pl bs budgets forecasting variance analysis trend analysis financial reporting reconciliations work papers journal entries accruals ap ar experience gaap statutory accounting monthly yearend closing processes highlights oracle financial peoplesoft microsoft dynamics nav microsoft office suite outlook lotus notes experience senior accountant 062015 current company name city state prepare examine analyze accounting records financial statements financial reports assess accuracy completeness conformance reporting procedural standards process prepare maintain reporting related inventory associated recurring andor ad hoc journal entries account analysis financial reporting account reconciliation system interface analysis cogs rebates adjustments revenues accordance est

#### It's evident that `resume_data_feature_extraction` nearly reduced more than half of the data extracted by `pre_process_text`.


## Model Building 

### Reference
- https://www.pinecone.io/learn/semantic-search/

### SentenceTransformers Documentation
- https://www.sbert.net/index.html

In [16]:
# ! pip install -U sentence-transformers

In [17]:
print("Max Length of Resume Processed_Feature :", max(resume_df["Processed_Feature"].apply(lambda x : len(x))))
print("Max Length of Job Processed_Feature : ", max(job_df["Processed_Feature"].apply(lambda x : len(x))), end="\n\n")

print("Max Words of Resume Processed_Feature :", max(resume_df["Processed_Feature"].apply(lambda x : len(x.split()))))
print("Max Words of Job Processed_Feature : ", max(job_df["Processed_Feature"].apply(lambda x : len(x.split()))), end="\n\n")

print("Mean No of Words in Resume Processed_Feature :", np.mean(resume_df["Processed_Feature"].apply(lambda x : len(x.split()))))
print("Mean No of Words in Job Processed_Feature : ", np.mean(job_df["Processed_Feature"].apply(lambda x : len(x.split()))), end="\n\n")

Max Length of Resume Processed_Feature : 20336
Max Length of Job Processed_Feature :  2000

Max Words of Resume Processed_Feature : 2414
Max Words of Job Processed_Feature :  246

Mean No of Words in Resume Processed_Feature : 243.53123740427247
Mean No of Words in Job Processed_Feature :  99.4



- Since the mean number of words in a resume is around 240, and the maximum is 2414, it's better to set the max_seq_length to 512.

### Model : gtr-t5-large

* https://huggingface.co/sentence-transformers/gtr-t5-large

In [18]:
# New DataFrame

resume = resume_df[["Id", "Category", "Processed_Feature"]]
job = job_df[["company_name", "position_title", "Processed_Feature"]]

In [19]:
from sentence_transformers import SentenceTransformer

model  = SentenceTransformer("gtr-t5-large")
model.max_seq_length = 512
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: T5EncoderModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Dense({'in_features': 1024, 'out_features': 768, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
  (3): Normalize()
)


In [20]:
%%time
# Apply Embeddings

resume["Embedding"] = resume["Processed_Feature"].apply(lambda x : model.encode(x).reshape(1, -1))
job["Embedding"] = job["Processed_Feature"].apply(lambda x : model.encode(x).reshape(1, -1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: total: 1min 54s
Wall time: 2min 48s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [21]:
# Cosine Similarity

from sklearn.metrics.pairwise import cosine_similarity

for index in range(job.shape[0]):
    
    # Compute Cosine Similarity
    similarity = np.array([cosine_similarity(job["Embedding"][index], resume["Embedding"][resume_index])[0][0] for resume_index in range(resume.shape[0])])
    
    # Retrieve Top 5 Resumes
    top_5_resume_index = np.argsort(similarity)[:-6:-1]
    top_5_resume = resume.iloc[top_5_resume_index].head()
    top_5_resume["Similarity"] = np.sort(similarity)[:-6:-1]
    
    # Display
    print("Company :", job["company_name"][index])
    print("Position :", job["position_title"][index], end="\n\n")
    print(top_5_resume[["Category", "Id", "Similarity"]].to_string(index=False), end="\n\n")

Company : HCA Healthcare
Position : Registrar

  Category       Id  Similarity
CONSULTANT 18856440    0.760448
  ADVOCATE 23427369    0.759660
  ADVOCATE 22259475    0.756234
  ADVOCATE 28206098    0.752234
HEALTHCARE 45907524    0.751526

Company : Major League Baseball
Position : Live Content Creator (Seasonal)

        Category       Id  Similarity
PUBLIC-RELATIONS 24559558    0.779076
   DIGITAL-MEDIA 70196518    0.774543
   DIGITAL-MEDIA 11270462    0.766067
   DIGITAL-MEDIA 16536141    0.763968
PUBLIC-RELATIONS 16620172    0.759614

Company : Teaneck Public Schools
Position : Art Teacher

Category       Id  Similarity
    ARTS 12386670    0.760832
    ARTS 28629430    0.743643
    ARTS 23752500    0.732657
 APPAREL 14413257    0.731279
 TEACHER 35421497    0.727940

Company : Volt
Position : Talent Acquisition Specialist / Recruiter

        Category       Id  Similarity
PUBLIC-RELATIONS 13727873    0.854944
              HR 30862904    0.848326
              HR 73077810    0.847

#### Note

- I initially thought of reducing the search space by filtering resumes based on categories with the help of job positions. However, it would have been a blunder if I had implemented it. 
- For example, in the case of Company: Lear Corporation and Position: Customer Service Representative, the top resumes are from healthcare, automobile, and fitness backgrounds.

In [22]:
print("Lear Corporation, Role : Customer Service Representative", end="\n\n")
print(job[job["company_name"] == "Lear Corporation"]["Processed_Feature"].values, end="\n\n")

print(resume_df[resume_df["Id"] == 34594746][["Id", "Category"]].to_string(index=False), end="\n\n")
print(resume_df[resume_df["Id"] == 34594746]["Processed_Feature"].values, end="\n\n")

Lear Corporation, Role : Customer Service Representative

['core responsibilities responding promptly customer inquiries communicating customers various channels acknowledging resolving customer complaints processing orders forms applications requests keeping records customer interactions transactions comments complaints communicating coordinating colleagues necessary providing feedback efficiency customer service process managing team junior customer service representatives ensuring customer satisfaction providing professional customer support required skills ability stay calm customers stressed upset comfortable using computers experience working customer support educational requirements high school diploma general education degree equivalent experience level na ']

      Id   Category
34594746 HEALTHCARE

[' responsibilities position responsibilities outlined way construed encompassing  duties  responsibilities  qualifications may required andor assigned necessary  skill audit recov

### Model : all-mpnet-base-v2
- https://huggingface.co/sentence-transformers/all-mpnet-base-v2

In [23]:
# New DataFrame

resume = resume_df[["Id", "Category", "Processed_Feature"]]
job = job_df[["company_name", "position_title", "Processed_Feature"]]

In [24]:
from sentence_transformers import SentenceTransformer

model  = SentenceTransformer("all-mpnet-base-v2")
model.max_seq_length = 512
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)


In [25]:
%%time
# Apply Embeddings

resume["Embedding"] = resume["Processed_Feature"].apply(lambda x : model.encode(x).reshape(1, -1))
job["Embedding"] = job["Processed_Feature"].apply(lambda x : model.encode(x).reshape(1, -1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: total: 51.2 s
Wall time: 1min 11s


In [26]:
# Cosine Similarity

from sklearn.metrics.pairwise import cosine_similarity

for index in range(job.shape[0]):
    
    # Compute Cosine Similarity
    similarity = np.array([cosine_similarity(job["Embedding"][index], resume["Embedding"][resume_index])[0][0] for resume_index in range(resume.shape[0])])
    
    # Retrieve Top 5 Resumes
    top_5_resume_index = np.argsort(similarity)[:-6:-1]
    top_5_resume = resume.iloc[top_5_resume_index].head()
    top_5_resume["Similarity"] = np.sort(similarity)[:-6:-1]
    
    # Display
    print("Company :", job["company_name"][index])
    print("Position :", job["position_title"][index], end="\n\n")
    print(top_5_resume[["Category", "Id", "Similarity"]].to_string(index=False), end="\n\n")

Company : HCA Healthcare
Position : Registrar

  Category       Id  Similarity
  ADVOCATE 20544228    0.698301
  ADVOCATE 13342150    0.683802
HEALTHCARE 33803142    0.676823
      ARTS 38115035    0.664016
HEALTHCARE 10568183    0.659261

Company : Major League Baseball
Position : Live Content Creator (Seasonal)

        Category       Id  Similarity
   DIGITAL-MEDIA 19444529    0.629020
   DIGITAL-MEDIA 40311088    0.616245
PUBLIC-RELATIONS 98086373    0.612260
   DIGITAL-MEDIA 29915354    0.600370
         FITNESS 12019284    0.595799

Company : Teaneck Public Schools
Position : Art Teacher

Category       Id  Similarity
    ARTS 12386670    0.633650
    ARTS 78107631    0.622798
    ARTS 11555549    0.606932
 TEACHER 33704389    0.603569
    CHEF 12155206    0.598780

Company : Volt
Position : Talent Acquisition Specialist / Recruiter

  Category       Id  Similarity
        HR 30862904    0.809733
HEALTHCARE 17864043    0.777453
        HR 19179079    0.765547
        HR 18297650 

### Model : all-MiniLM-L12-v2
- https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2

In [27]:
# New DataFrame

resume = resume_df[["Id", "Category", "Processed_Feature"]]
job = job_df[["company_name", "position_title", "Processed_Feature"]]

In [28]:
from sentence_transformers import SentenceTransformer

model  = SentenceTransformer("all-MiniLM-L12-v2")
model.max_seq_length = 512
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)


In [29]:
%%time
# Apply Embeddings

resume["Embedding"] = resume["Processed_Feature"].apply(lambda x : model.encode(x).reshape(1, -1))
job["Embedding"] = job["Processed_Feature"].apply(lambda x : model.encode(x).reshape(1, -1))

CPU times: total: 28.3 s
Wall time: 43.1 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [30]:
# Cosine Similarity

from sklearn.metrics.pairwise import cosine_similarity

for index in range(job.shape[0]):
    
    # Compute Cosine Similarity
    similarity = np.array([cosine_similarity(job["Embedding"][index], resume["Embedding"][resume_index])[0][0] for resume_index in range(resume.shape[0])])
    
    # Retrieve Top 5 Resumes
    top_5_resume_index = np.argsort(similarity)[:-6:-1]
    top_5_resume = resume.iloc[top_5_resume_index].head()
    top_5_resume["Similarity"] = np.sort(similarity)[:-6:-1]
    
    # Display
    print("Company :", job["company_name"][index])
    print("Position :", job["position_title"][index], end="\n\n")
    print(top_5_resume[["Category", "Id", "Similarity"]].to_string(index=False), end="\n\n")

Company : HCA Healthcare
Position : Registrar

  Category       Id  Similarity
   FITNESS 15932017    0.705378
  ADVOCATE 12544735    0.702153
   FITNESS 10969918    0.677800
  ADVOCATE 13342150    0.673177
HEALTHCARE 25834360    0.663054

Company : Major League Baseball
Position : Live Content Creator (Seasonal)

     Category       Id  Similarity
      FITNESS 27903191    0.663730
DIGITAL-MEDIA 29915354    0.661354
      FITNESS 19975121    0.654657
DIGITAL-MEDIA 19444529    0.646088
DIGITAL-MEDIA 28109594    0.638440

Company : Teaneck Public Schools
Position : Art Teacher

     Category       Id  Similarity
DIGITAL-MEDIA 14761906    0.624654
      TEACHER 90363254    0.612468
     DESIGNER 62312955    0.608341
         ARTS 31273413    0.606782
      TEACHER 20230207    0.602979

Company : Volt
Position : Talent Acquisition Specialist / Recruiter

  Category       Id  Similarity
        HR 30862904    0.741534
HEALTHCARE 17864043    0.710526
        HR 19179079    0.703621
        