# Imports

In [1]:
from IPython.display import clear_output

In [2]:
!pip install pyspellchecker
clear_output()

In [3]:
!pip install spacy
!python -m spacy download en_core_web_sm
clear_output()

In [4]:
# Importing libraries

import pandas as pd

# text processing libraries
import re
import string
from spellchecker import SpellChecker
import spacy

In [5]:
# Importing libraries
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!cd /content/drive/MyDrive/Datasets/jarvis-hiring-dataset/

In [7]:
# Importing dataset

resume_df = pd.read_csv("/content/drive/MyDrive/Datasets/jarvis-hiring-dataset/Resume.csv")

# EDA

In [18]:
resume_df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [19]:
resume_df.shape

(2484, 4)

In [20]:
features = resume_df.columns
print(features)

Index(['ID', 'Resume_str', 'Resume_html', 'Category'], dtype='object')


# Text preprocessing

In [21]:
steps = []

### Html tag removal

In [22]:
def remove_html_tags(text):

  if text != None or text != np.nan:
    text = str(text)
    clean_text = re.sub("<.*?>"," ", text)
    return clean_text
  else:
    return text

steps.append(remove_html_tags)

### URL Removal

In [23]:
# remove url

def remove_urls(text):
    """
    Explanation

    https?://\S+ matches URLs that start with "http://" or "https://".
    www\.\S+ matches URLs that start with "www.".
    \S+\.\S{2,} matches URLs that have no scheme but contain a dot (e.g., "example.com").

    This method is efficient for processing text and works with various URL formats.
    """

    if text != None:
      text = str(text)
      url_pattern = r'https?://\S+|www\.\S+|\S+\.\S{2,}'
      cleaned_text = re.sub(url_pattern, "", text)
      return cleaned_text
    else:
      return text

steps.append(remove_urls)

### Punctuation removal

In [24]:
import string
string.punctuation

def remove_punc(text):

    """
    Explanation:

    string.punctuation contains all punctuation characters (!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`).

    str.maketrans('', '', string.punctuation) creates a translation table mapping each punctuation character to None.

    text.translate(translator) removes all punctuation based on this table.

    This method is efficient and works well for removing punctuation in Python.
    """
    if text != None:
      text = str(text)
      translator = str.maketrans("","", string.punctuation)
      return text.translate(translator)
    else:
      return text



steps.append(remove_punc)

### Stopwords removal

In [25]:
nlp = spacy.load("en_core_web_sm")

def remove_stop_words_spacy(text):
    if text != None:
      text = str(text)
      doc = nlp(text)
      return " ".join([token.text for token in doc if not token.is_stop])
    else:
      return text


steps.append(remove_stop_words_spacy)

### Lower case

In [26]:
def lower_case(text):
  if text != None:
    text = str(text)
    return text.lower()
  else:
    return text

steps.append(lower_case)

### List unicodes removal


In [27]:
def list_unicode_remove(text):
    if text != None:
      text = str(text)
      cleaned_text = text.translate(str.maketrans('', '', '\xa0'))
      return cleaned_text
    else:
      return text

steps.append(list_unicode_remove)

# Text vectorisation

In [30]:
vectorised = {}

## Term Frequency - Inverse Document Frequency

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vector(data):
  tfidf = TfidfVectorizer()
  x_tfidf = tfidf.fit_transform(data).toarray()
  return x_tfidf

vectorised["tfidf"] = tfidf_vector

## Bag of words

In [32]:
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(data):
  cv = CountVectorizer()
  x_bow = cv.fit_transform(data).toarray()
  return x_bow

vectorised["bow"] = bag_of_words

In [33]:
# Bag of words N-grams

def bag_of_words_22(data):
  cv = CountVectorizer(ngram_range=(2,2))
  x_bow = cv.fit_transform(data).toarray()
  return x_bow

vectorised["bow_22"] = bag_of_words_22

In [34]:
# def bag_of_words_max(data):
#   cv = CountVectorizer(ngram_range=(2,2), max_features=1500)
#   x_bow = cv.fit_transform(data).toarray()
#   return x_bow


# vectorised["bow_22_max"] = bag_of_words_max

# Machine learning

In [35]:
classifiers = []

### Evaluation

In [36]:
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score
import numpy as np


def model_evoluation(y_true, y_pred,y_pred_prob, name, conf):
    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred_prob, multi_class="ovr")
    f1 = f1_score(y_true, y_pred,average="weighted")

    # Store them in a dictionary
    metrics = {
        'model':name,
        'config':conf,
        'Accuracy': accuracy,
        'Balanced Accuracy': balanced_accuracy,
        'ROC AUC': roc_auc,
        'F1 Score': f1
    }
    return metrics

### Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

def logistic_regression(x_train, x_test, y_train, y_test, conf):
  model = "Logistic Regression"
  lr = LogisticRegression()
  lr.fit(x_train, y_train)
  y_pred = lr.predict(x_test)
  y_pred_prob = lr.predict_proba(x_test)
  temp = model_evoluation(y_test, y_pred, y_pred_prob, model, conf)
  return temp


classifiers.append(logistic_regression)

### Random Forest Classifier

In [38]:
from sklearn.ensemble import RandomForestClassifier

def random_forest_classifier(x_train, x_test, y_train, y_test, conf):
  model= "Random Forest Classifier"
  rf = RandomForestClassifier()
  rf.fit(x_train, y_train)
  y_pred = rf.predict(x_test)
  y_pred_prob = rf.predict_proba(x_test)
  temp = model_evoluation(y_test, y_pred, y_pred_prob, model, conf)
  return temp


classifiers.append(random_forest_classifier)

### Support Vector Machine

In [39]:
# from sklearn.svm import SVC

# def support_vector_machine(x_train, x_test, y_train, y_test, conf):
#   model = "Support Vector Machine"
#   svc = SVC(probability= True)
#   svc.fit(x_train, y_train)
#   y_pred = svc.predict(x_test)
#   y_pred_prob = svc.predict_proba(x_test)
#   temp = model_evoluation(y_test, y_pred,y_pred_prob, model, conf)
#   return temp

# classifiers.append(support_vector_machine)

### Guassian Naive Bayes

In [40]:
from sklearn.naive_bayes import GaussianNB

def gaussian_naive_bayes(x_train, x_test, y_train, y_test, conf):
  model = "Gaussian Naive Bayes"
  gnb = GaussianNB()
  gnb.fit(x_train, y_train)
  y_pred = gnb.predict(x_test)
  y_pred_prob = gnb.predict_proba(x_test)
  temp = model_evoluation(y_test, y_pred, y_pred_prob, model, conf)
  return temp

classifiers.append(gaussian_naive_bayes)

# Different Strategies

### Label Encoding

In [41]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(resume_df["Category"])

In [42]:
y_encoded

array([19, 19, 19, ...,  6,  6,  6])

## Strategy 1 - Resume_str, Category

In [None]:
df = resume_df[["Resume_str","Category"]]
resume_df.shape

(2484, 4)

In [None]:
feature = "Resume_str"
target = "Category"

In [None]:
for step in steps:
  print(f"Step:{step}")
  df[feature] = df[feature].apply(step)

Step:<function remove_html_tags at 0x7f6675db7010>
Step:<function remove_urls at 0x7f6675db4ee0>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = df[feature].apply(step)


Step:<function remove_punc at 0x7f674ea03490>
Step:<function remove_stop_words_spacy at 0x7f6675db51b0>
Step:<function lower_case at 0x7f6675e28160>
Step:<function list_unicode_remove at 0x7f6675e2b520>


In [None]:
x = df[feature]
y = df[target]

In [None]:
x

Unnamed: 0,Resume_str
0,hr administratormarketing associate ...
1,hr specialist hr operations s...
2,hr director summary 20 ...
3,hr specialist summary ded...
4,hr manager skill highlights...
...,...
2479,rank sgte5 non commissioned officer ...
2480,government relations communications ...
2481,geek squad agent profession...
2482,program director office manager ...


### Text vectorisation

In [None]:
# Text Vectorisation --> Features summary and title

x_tfidf = tfidf_vector(x)

In [None]:
# Text Vectorization --> Features summary and title - bag of words

# x_summary = bag_of_words(df["summary"])
# x_title = bag_of_words(df["title"])

In [None]:
# Text Vectorisation --> Features summary and title - bag of words 2 by 2

x_summary = bag_of_words_22(df["summary"])
x_title = bag_of_words_22(df["title"])

In [None]:
x_tfidf.shape

(2484, 51221)

### train_test_split

In [None]:
from sklearn.model_selection import  train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y_encoded, test_size = 0.25, random_state=1)

In [None]:
classifiers

[<function __main__.logistic_regression(x_train, x_test, y_train, y_test, conf)>,
 <function __main__.random_forest_classifier(x_train, x_test, y_train, y_test, conf)>,
 <function __main__.support_vector_machine(x_train, x_test, y_train, y_test, conf)>,
 <function __main__.gaussian_naive_bayes(x_train, x_test, y_train, y_test, conf)>]

In [None]:
models = []
for classifier in classifiers:
  conf = "featured_2_bow_22"
  temp = classifier(x_train, x_test, y_train, y_test, conf)
  models.append(temp)

In [None]:
models

[{'model': 'Logistic Regression',
  'config': 'featured_2_bow_22',
  'Accuracy': 0.6086956521739131,
  'Balanced Accuracy': 0.56854280426218,
  'ROC AUC': 0.9402553917704456,
  'F1 Score': 0.5911853294025081},
 {'model': 'Random Forest Classifier',
  'config': 'featured_2_bow_22',
  'Accuracy': 0.5990338164251208,
  'Balanced Accuracy': 0.5667915642563399,
  'ROC AUC': 0.9229776365343082,
  'F1 Score': 0.5675862673103368},
 {'model': 'Support Vector Machine',
  'config': 'featured_2_bow_22',
  'Accuracy': 0.5974235104669887,
  'Balanced Accuracy': 0.5580868029986843,
  'ROC AUC': 0.9354351654548395,
  'F1 Score': 0.5925038333295309},
 {'model': 'Gaussian Naive Bayes',
  'config': 'featured_2_bow_22',
  'Accuracy': 0.37037037037037035,
  'Balanced Accuracy': 0.35456979478983097,
  'ROC AUC': 0.6635515969387396,
  'F1 Score': 0.37134513870499963}]

### saving the progress

In [None]:
model_evaluation = pd.DataFrame(models)
model_evaluation.to_csv("/content/drive/MyDrive/Datasets/jarvis-hiring-dataset/models_strategy1_tfidf.csv")

## Strategy 2 - Feature extraction

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/jarvis-hiring-dataset/df_preprocessed_strat2.csv")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2484 non-null   int64  
 1   summary       2112 non-null   object 
 2   skills        9 non-null      object 
 3   education     7 non-null      object 
 4   experience    95 non-null     object 
 5   projects      1 non-null      object 
 6   achievements  12 non-null     object 
 7   certificates  2 non-null      object 
 8   hobbies       1 non-null      object 
 9   others        18 non-null     object 
 10  references    1 non-null      object 
 11  title         2472 non-null   object 
 12  Category      2483 non-null   object 
 13  0             0 non-null      float64
dtypes: float64(1), int64(1), object(12)
memory usage: 271.8+ KB


In [None]:
df.describe()

Unnamed: 0.1,Unnamed: 0,0
count,2484.0,0.0
mean,1241.5,
std,717.213357,
min,0.0,
25%,620.75,
50%,1241.5,
75%,1862.25,
max,2483.0,


In [None]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
summary,372
skills,2475
education,2477
experience,2389
projects,2483
achievements,2472
certificates,2482
hobbies,2483
others,2466


In [None]:
df.columns

Index(['Unnamed: 0', 'summary', 'skills', 'education', 'experience',
       'projects', 'achievements', 'certificates', 'hobbies', 'others',
       'references', 'title', 'Category', '0'],
      dtype='object')

In [None]:
df.drop(['Unnamed: 0', 'skills', 'education', 'experience',
       'projects', 'achievements', 'certificates', 'hobbies', 'others',
       'references', '0'], inplace=True, axis=1)

In [None]:
df.dropna(axis=0, inplace=True)

In [None]:
df.isnull().sum()

Unnamed: 0,0
summary,0
title,0
Category,0


In [None]:
y_strat2 = df["Category"]

### Preprocessing

In [None]:
for step in steps:
  for i in df.columns[:-1]:
    df[i]=df[i].apply(step)

In [None]:
df["summary"].head()


Unnamed: 0,summary
0,dedicated customer service manager 15 year...
1,versatile media professional background ...
2,20 years experience recruiting 15 plus...
3,dedicated driven dynamic 20 years customer ...
5,dedicated focused administrative assistant...


In [None]:
df["title"].head()

Unnamed: 0,title
0,hr administratormarketing associatehr administ...
1,hr specialist hr operations
2,hr director
3,hr specialist
5,hr generalist


## Strategy 3 - Deep Feature extraction


In [None]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/jarvis-hiring-dataset/df_featured_new.csv")

In [None]:
df.drop(["Unnamed: 0","PICT","ERTL"], axis = 1, inplace = True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   highlights         1955 non-null   object
 1   language           240 non-null    object
 2   affiliations       449 non-null    object
 3   accomplishments    925 non-null    object
 4   title              2475 non-null   object
 5   education          2465 non-null   object
 6   summary            2260 non-null   object
 7   skills             2378 non-null   object
 8   additional         541 non-null    object
 9   interests          349 non-null    object
 10  links              17 non-null     object
 11  professional_work  375 non-null    object
 12  experience         2483 non-null   object
 13  RTTL               3 non-null      object
 14  RELO               11 non-null     object
 15  Category           2484 non-null   object
dtypes: object(16)
memory usage: 310.6+ KB


### Preprocessing

In [None]:
steps

[<function __main__.remove_html_tags(text)>,
 <function __main__.remove_urls(text)>,
 <function __main__.remove_punc(text)>,
 <function __main__.remove_stop_words_spacy(text)>,
 <function __main__.lower_case(text)>,
 <function __main__.list_unicode_remove(text)>]

In [None]:
columns = df.columns

In [None]:
for feature in df.columns[:-1]:

  for step in steps:

    df[feature] = df[feature].apply(step)

In [None]:
df.to_csv("/content/drive/MyDrive/Datasets/jarvis-hiring-dataset/df_preprocessed_strat3.csv")

In [43]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/jarvis-hiring-dataset/df_preprocessed_strat3.csv")

In [44]:
df.columns

Index(['Unnamed: 0', 'highlights', 'language', 'affiliations',
       'accomplishments', 'title', 'education', 'summary', 'skills',
       'additional', 'interests', 'links', 'professional_work', 'experience',
       'RTTL', 'RELO', 'Category'],
      dtype='object')

### Missing values analysis

In [54]:
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [67]:
feature_null = pd.DataFrame(df.isnull().sum().sort_values( ascending=True))


In [68]:
feature_null

Unnamed: 0,0
Unnamed: 0,0
Category,0
experience,1
title,9
education,19
skills,106
summary,224
highlights,529
accomplishments,1559
additional,1943


In [58]:
row_null

Unnamed: 0,0,count
0,8,0.314815
1,9,0.298309
2,7,0.161836
3,10,0.111514
4,6,0.069646
5,5,0.022142
6,11,0.016908
7,4,0.003623
8,3,0.000805
9,15,0.000403


In [70]:

fig = make_subplots(rows=1,
                    cols=1,
                    column_titles = ["features"] ,
                    x_title="Missing Values")

fig.add_trace(go.Bar(x=feature_null[0],
                     y=feature_null.index,
                     orientation="h",
                    marker=dict(color=[n for n in range(18)],
                                line_color='rgb(0,0,0)' ,
                                line_width = 2,
                                coloraxis="coloraxis")),
              1, 1)


fig.update_layout(showlegend=False, title_text="Column wise Null Value Distribution", title_x=0.5)


In [73]:
temp = [feature for feature in df.columns if df[feature].isnull().sum()<1000]
temp

['Unnamed: 0',
 'highlights',
 'title',
 'education',
 'summary',
 'skills',
 'experience',
 'Category']

In [53]:
df = df[temp]

In [75]:
df_non = df[[feature for feature in df.columns if feature not in temp]]

In [77]:
df_non["extra"] = np.nan



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [111]:
def extra(text):
  temp = ''
  for feature in text.index:
    if text[feature]!=np.nan and text[feature]!=None:
      temp = f"{temp} {text[feature]}"

  return temp


In [113]:
for row in df_non.index:

  df_non["extra"][row] = extra(df_non.iloc[row])




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [114]:
df_non["extra"]

Unnamed: 0,extra
0,nan nan accomplishments missouri dot supervis...
1,nan nan nan nan nan nan nan nan nan nan nan ...
2,nan activities honors topeka chamber commerce...
3,nan nan nan nan nan nan nan nan nan nan nan ...
4,nan professional affiliations society human r...
...,...
2479,nan nan accomplishments noncommissioned offic...
2480,nan nan nan additional information nonprofit ...
2481,nan nan nan nan nan nan nan nan nan nan nan ...
2482,nan nan nan additional information hobbies vo...


In [54]:
df.drop(["Unnamed: 0"],axis = 1, inplace = True)

### filling nan with " "

In [55]:
df.isnull().sum()

Unnamed: 0,0
highlights,529
title,9
education,19
summary,224
skills,106
experience,1
Category,0


In [56]:
df.fillna(" ",  inplace=True)

### target encoding

In [57]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df["Category"])

### Text vectorisation

In [82]:
vectorised

{'bow': <function __main__.bag_of_words(data)>}

In [83]:
classifiers

[<function __main__.random_forest_classifier(x_train, x_test, y_train, y_test, conf)>,
 <function __main__.gaussian_naive_bayes(x_train, x_test, y_train, y_test, conf)>,
 <function __main__.logistic_regression(x_train, x_test, y_train, y_test, conf)>]

In [69]:
from sklearn.model_selection import  train_test_split

In [84]:
#models = []
for key, value in vectorised.items():
  conf = key
  x_vector = pd.DataFrame()
  print(f"Vectorisation: {key}")
  for feature in df.columns[:-1]:
    temp = value(df[feature])
    x_vector = pd.concat([x_vector, pd.DataFrame(temp)],axis=1)

  print(x_vector.shape)

  x_train, x_test, y_train, y_test = train_test_split(x_vector, y_encoded, test_size = 0.25, random_state=1)

  for classifier in classifiers:

    temp = classifier(x_train, x_test, y_train, y_test, conf)
    models.append(temp)

model_evaluation = pd.DataFrame(models)
model_evaluation.to_csv("/content/drive/MyDrive/Datasets/jarvis-hiring-dataset/models_strategy3.csv")

Vectorisation: bow
(2484, 74555)


In [85]:
models

[{'model': 'Logistic Regression',
  'config': 'tfidf',
  'Accuracy': 0.8099838969404187,
  'Balanced Accuracy': 0.7466218099673675,
  'ROC AUC': 0.9699716063397384,
  'F1 Score': 0.799243246495523},
 {'model': 'Random Forest Classifier',
  'config': 'tfidf',
  'Accuracy': 0.7342995169082126,
  'Balanced Accuracy': 0.6796066769080765,
  'ROC AUC': 0.9458034243721918,
  'F1 Score': 0.7129857600764342},
 {'model': 'Gaussian Naive Bayes',
  'config': 'tfidf',
  'Accuracy': 0.4830917874396135,
  'Balanced Accuracy': 0.45396889627284515,
  'ROC AUC': 0.7157178466876767,
  'F1 Score': 0.4855611493275111},
 {'model': 'Random Forest Classifier',
  'config': 'bow',
  'Accuracy': 0.7262479871175523,
  'Balanced Accuracy': 0.668767100137894,
  'ROC AUC': 0.9412574291102814,
  'F1 Score': 0.6978516998553943},
 {'model': 'Gaussian Naive Bayes',
  'config': 'bow',
  'Accuracy': 0.391304347826087,
  'Balanced Accuracy': 0.37752507239958577,
  'ROC AUC': 0.6762118927175759,
  'F1 Score': 0.391525747243

In [None]:
# Text Vectorisation --> Features summary and title

# x_summary = tfidf_vector(df["summary"])
# x_title = tfidf_vector(df["title"])

In [None]:
# Text Vectorization --> Features summary and title - bag of words

# x_summary = bag_of_words(df["summary"])
# x_title = bag_of_words(df["title"])

In [None]:
# Text Vectorisation --> Features summary and title - bag of words 2 by 2

# x_summary = bag_of_words_22(df["summary"])
# x_title = bag_of_words_22(df["title"])

In [None]:
# Text Vectorisation --> Features summary and title - bag of words

# x_summary = bag_of_words_max(df["summary"])
# x_title = bag_of_words_max(df["title"])

# Result Comparison

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import plotly as py
import matplotlib.pyplot as plt

In [None]:
results = pd.read_csv("/content/drive/MyDrive/Datasets/jarvis-hiring-dataset/results.csv")

In [None]:
results.columns

Index(['Unnamed: 0', 'model', 'config', 'Accuracy', 'Balanced Accuracy',
       'ROC AUC', 'F1 Score', 'mode'],
      dtype='object')

In [None]:
results.sort_values(by="Accuracy", ascending=False)