In [1]:
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

In [2]:
import gensim
from gensim import corpora

In [3]:
from spacy import displacy
import numpy as np 
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jsonlines
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords', 'wordnet'])

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package stopwords to C:\Users\Harshvardhan
[nltk_data]     Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Harshvardhan
[nltk_data]     Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import warnings 
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('Resume.csv')
df=df.reindex(np.random.permutation(df.index))


In [6]:
data = df.copy().iloc[
    0:200,
]
data.head()


Unnamed: 0,ID,Resume_str,Resume_html,Category
1728,25930778,ENGINEERING TECHNICIAN Summ...,"<div class=""fontsize fontface vmargins hmargin...",ENGINEERING
827,10428916,RECREATION & SPORTS COORDINATOR ...,"<div class=""fontsize fontface vmargins hmargin...",FITNESS
108,27847081,HR SPECIALIST/ HORIZONTAL ENGINEER ...,"<div class=""fontsize fontface vmargins hmargin...",HR
1926,99433371,CONSTRUCTION Executive Summary ...,"<div class=""fontsize fontface vmargins hmargin...",CONSTRUCTION
2428,27502951,INTERNATIONAL CERTIFICATION PROGRAM M...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION


In [7]:
data.info()
data.value_counts()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1728 to 1967
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           200 non-null    int64 
 1   Resume_str   200 non-null    object
 2   Resume_html  200 non-null    object
 3   Category     200 non-null    object
dtypes: int64(1), object(3)
memory usage: 7.8+ KB


ID        Resume_str                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [8]:
nlp = spacy.load("en_core_web_lg")
skill_pattern_path = "jz_skill_patterns.jsonl"

In [9]:
ruler=nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [10]:
#Extract the skills
def get_skills(text):
    doc = nlp(text)
    myset = []
    subset = []
    for ent in doc.ents:
        if ent.label_ == 'SKILL':
            subset.append(ent.text)
    myset.append(subset)
    return subset
def unique_skills(x):
    return list(set(x)) 

In [11]:
#Text cleaning
clean = []
for i in range(data.shape[0]):
    review = re.sub(
        '(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"',
        " ",
        data["Resume_str"].iloc[i],
    )
    review = review.lower()
    review = review.split()
    lm = WordNetLemmatizer()
    review = [
        lm.lemmatize(word)
        for word in review
        if not word in set(stopwords.words("english"))
    ]
    review = " ".join(review)
    clean.append(review)

In [12]:
data["Clean_Resume"]= clean
data["skills"]=data["Clean_Resume"].str.lower().apply(get_skills)
data["skills"]=data["skills"].apply(unique_skills)
data.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_Resume,skills
1728,25930778,ENGINEERING TECHNICIAN Summ...,"<div class=""fontsize fontface vmargins hmargin...",ENGINEERING,engineering technician summary creative innova...,"[design, engineering, material, knowledge base]"
827,10428916,RECREATION & SPORTS COORDINATOR ...,"<div class=""fontsize fontface vmargins hmargin...",FITNESS,recreation sport coordinator objective gain re...,"[software, support, database]"
108,27847081,HR SPECIALIST/ HORIZONTAL ENGINEER ...,"<div class=""fontsize fontface vmargins hmargin...",HR,hr specialist horizontal engineer professional...,"[support, data analysis, ambiguity, analytics,..."
1926,99433371,CONSTRUCTION Executive Summary ...,"<div class=""fontsize fontface vmargins hmargin...",CONSTRUCTION,construction executive summary find internship...,"[business administration, adobe photoshop, lib..."
2428,27502951,INTERNATIONAL CERTIFICATION PROGRAM M...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,international certification program management...,"[engineering, schedule, aeronautics, support, ..."


In [13]:
import plotly.express as px
fig = px.histogram(
    data, x="Category", title="Distribution of Jobs Categories"
).update_xaxes(categoryorder="total descending")
fig.show()

In [14]:
Job_cat = data["Category"].unique()
Job_cat = np.append(Job_cat, "ALL")

In [15]:
sent = nlp(data["Resume_str"].iloc[0])
displacy.render(sent, style="ent", jupyter=True)

In [16]:
displacy.render(sent[0:10], style="dep", jupyter=True, options={"distance": 90})

In [17]:
patterns = df.Category.unique()
for a in patterns:
    ruler.add_patterns([{"label": "Job-Category", "pattern": a}])

In [18]:
colors = {
    "Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
    "ORG": "#ffd966",
    "PERSON": "#e06666",
    "GPE": "#9fc5e8",
    "DATE": "#c27ba0",
    "ORDINAL": "#674ea7",
    "PRODUCT": "#f9cb9c",
}
options = {
    "ents": [
        "Job-Category",
        "SKILL",
        "ORG",
        "PERSON",
        "GPE",
        "DATE",
        "ORDINAL",
        "PRODUCT",
    ],
    "colors": colors,
}
sent = nlp(data["Resume_str"].iloc[5])
displacy.render(sent, style="ent", jupyter=True, options=options)

In [19]:
input_resume = "Shantanu GuptaAspiring Software DeveloperRaipur, Chhattisgarh+91 9752005866shantanugupta0412@gmail.comEDUCATIONIIIT Naya Raipur — B.Tech in Computer Science andEngineeringNovember 2022 - PresentCGPA: 8.35Swami Vivekananda Senior Secondary School, Raipur —12th CBSE2022Grade: 92.4%Bharatiya Vidya Bhavans R.K. Sarda Vidya Mandir, Raipur— 10th CBSE2020Grade: 95%PROJECTSSocial Media Sentiment Analysis — ML and Web DevelopmentJuly 2023 - December 2023-Designed and Built an ML model for sentiment analysis of Twitter, YouTube andAmazon reviews.-Developed a full stack web application on React and Flask for manual testing ofthe model.Fake News Detection using Machine Learning — MLMarch 2023 - June 2023-Built an NLP based ML model which detects fake news from news articles.-Implemented it on a Tkinter based user interface.Smart Cradle and Baby Monitoring System — Raspberry Pi andPythonMarch 2023 - June 2023Built a baby cradle with multiple sensors including temperature, humidity andaccelerometer. A camera was trained using a CV algorithm for detection of the babyas well as a mailing system was developed to notify of any discrepancies detected.Raspberry Pi acted as a controlling hub and the programs were coded in Python.SKILLSProficient With:C++, JavaScript, React, HTML,CSSFamiliar With:Node.js, Python(NumPy andPandas), Flask, SQL, MachineLearning, Raspberry Pi,ArduinoCERTIFICATIONSCisco:● JavaScript Essentials 1● JavaScript Essentials 2Forage:● J.P. Morgan ChaseSoftware EngineeringVirtual ExperienceLANGUAGESWorking Proficiency:EnglishNative Proficiency:HindiHOBBIES● Reading● Cinema● SpeedCubing"
sent2 = nlp(input_resume)
displacy.render(sent2, style="ent", jupyter=True, options=options)

In [20]:
#Match score
input_skills = "C++,Python,JavaScript,SQL,Machine learning"
req_skills = input_skills.lower().split(",")
resume_skills = unique_skills(get_skills(input_resume.lower()))
score = 0
for x in req_skills:
    if x in resume_skills:
        score += 1
req_skills_len = len(req_skills)
match = round(score / req_skills_len * 100, 1)

print(f"The current Resume is {match}% matched to your requirements")

The current Resume is 40.0% matched to your requirements
