In [1]:
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

In [2]:
import gensim
from gensim import corpora

In [3]:
from spacy import displacy
import numpy as np 
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jsonlines
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords', 'wordnet'])

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package stopwords to C:\Users\Harshvardhan
[nltk_data]     Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Harshvardhan
[nltk_data]     Sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import warnings 
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('Resume.csv')
df=df.reindex(np.random.permutation(df.index))


In [6]:
data = df.copy().iloc[
    0:200,
]
data.head()


Unnamed: 0,ID,Resume_str,Resume_html,Category
1335,24703983,ACCOUNTANT Summary Experienc...,"<div class=""fontsize fontface vmargins hmargin...",AUTOMOBILE
1282,73282756,DIRECTOR OF DIGITAL INNOVATION AND ST...,"<div class=""fontsize fontface vmargins hmargin...",DIGITAL-MEDIA
2029,12826414,DIRECTOR OF PIPELINE OPERATIONS ...,"<div class=""fontsize fontface vmargins hmargin...",CONSTRUCTION
395,69532425,PRE-SERVICE TEACHER Summary ...,"<div class=""fontsize fontface vmargins hmargin...",TEACHER
829,19774173,SALES ASSOCIATE Objective To...,"<div class=""fontsize fontface vmargins hmargin...",FITNESS


In [7]:
data.info()
data.value_counts()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1335 to 1848
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           200 non-null    int64 
 1   Resume_str   200 non-null    object
 2   Resume_html  200 non-null    object
 3   Category     200 non-null    object
dtypes: int64(1), object(3)
memory usage: 7.8+ KB


ID        Resume_str                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [8]:
nlp = spacy.load("en_core_web_lg")
skill_pattern_path = "jz_skill_patterns.jsonl"

In [9]:
ruler=nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [10]:
#Extract the skills
def get_skills(text):
    doc = nlp(text)
    myset = []
    subset = []
    for ent in doc.ents:
        if ent.label_ == 'SKILL':
            subset.append(ent.text)
    myset.append(subset)
    return subset
def unique_skills(x):
    return list(set(x)) 

In [11]:
#Text cleaning
clean = []
for i in range(data.shape[0]):
    review = re.sub(
        '(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"',
        " ",
        data["Resume_str"].iloc[i],
    )
    review = review.lower()
    review = review.split()
    lm = WordNetLemmatizer()
    review = [
        lm.lemmatize(word)
        for word in review
        if not word in set(stopwords.words("english"))
    ]
    review = " ".join(review)
    clean.append(review)

In [12]:
data["Clean_Resume"]= clean
data["skills"]=data["Clean_Resume"].str.lower().apply(get_skills)
data["skills"]=data["skills"].apply(unique_skills)
data.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_Resume,skills
1335,24703983,ACCOUNTANT Summary Experienc...,"<div class=""fontsize fontface vmargins hmargin...",AUTOMOBILE,accountant summary experienced accounting prof...,"[software, accounting]"
1282,73282756,DIRECTOR OF DIGITAL INNOVATION AND ST...,"<div class=""fontsize fontface vmargins hmargin...",DIGITAL-MEDIA,director digital innovation strategy executive...,"[statistical model, google, google analytics, ..."
2029,12826414,DIRECTOR OF PIPELINE OPERATIONS ...,"<div class=""fontsize fontface vmargins hmargin...",CONSTRUCTION,director pipeline operation executive profile ...,"[schedule, architectural engineering, business..."
395,69532425,PRE-SERVICE TEACHER Summary ...,"<div class=""fontsize fontface vmargins hmargin...",TEACHER,pre service teacher summary seeking middle gra...,[]
829,19774173,SALES ASSOCIATE Objective To...,"<div class=""fontsize fontface vmargins hmargin...",FITNESS,sale associate objective obtain position growi...,"[business administration, testing, accounting]"


In [13]:
import plotly.express as px
fig = px.histogram(
    data, x="Category", title="Distribution of Jobs Categories"
).update_xaxes(categoryorder="total descending")
fig.show()

In [14]:
Job_cat = data["Category"].unique()
Job_cat = np.append(Job_cat, "ALL")

In [15]:
sent = nlp(data["Resume_str"].iloc[0])
displacy.render(sent, style="ent", jupyter=True)

In [16]:
displacy.render(sent[0:10], style="dep", jupyter=True, options={"distance": 90})

In [17]:
patterns = df.Category.unique()
for a in patterns:
    ruler.add_patterns([{"label": "Job-Category", "pattern": a}])

In [18]:
colors = {
    "Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
    "ORG": "#ffd966",
    "PERSON": "#e06666",
    "GPE": "#9fc5e8",
    "DATE": "#c27ba0",
    "ORDINAL": "#674ea7",
    "PRODUCT": "#f9cb9c",
}
options = {
    "ents": [
        "Job-Category",
        "SKILL",
        "ORG",
        "PERSON",
        "GPE",
        "DATE",
        "ORDINAL",
        "PRODUCT",
    ],
    "colors": colors,
}
sent = nlp(data["Resume_str"].iloc[5])
displacy.render(sent, style="ent", jupyter=True, options=options)

In [19]:
input_resume = "Harshvardhan Sharma  +91 7067133966 # hvsharma2164@gmail.com /in/HarshvardhanSharma github.com/Harshvardhan2164 Summary I am on a quest to evolve into a successful data and business analyst—a maestro of insights and predictions. My ambition is to harness the power of data, unleashing its potential to propel businesses to new summits. Education Dr. SPM International Institute of Information Technology Naya Raipur Nov 2022 – Present Bachelor of Technology, Data Science and Artificial Intelligence CGPA: 9.50 Ryan International School June 2021 – July 2022 CBSE Board, Class - XII 97.2% Relevant Coursework • Data Structures • Data Analytics • Machine Learning • Database Management • Artificial Intelligence • Front-end Development Projects Applicant Tracking System | Python, NLP, Machine Learning, Full Stack Web Development Present • This enhanced system collects and parses resumes and job descriptions, utilizing NLP to extract key information and standardize data. • ML algorithms then employ semantic matching to identify the most relevant candidates, providing accurate resume ranking and scoring. Candidate profiling is enriched with qualitative aspects extracted by NLP, creating comprehensive candidate profiles. • The user interacts with the ATS through a full stack web interface, featuring real-time updates, notifications, and a secure environment that complies with data protection regulations. Social Media Sentiment Analysis | Python, NLP, Machine Learning, Full Stack Web Development December 2023 • This project aims to develop a machine learning model for sentiment analysis on social media platforms such as Twitter and YouTube and E-commerce websites like Amazon. • Social media plays a pivotal role in shaping people’s views and ideas, making it essential to understand the sentiments expressed on these platforms. House Price Prediction Model | Python, Machine Learning, Full Stack Web Development June 2023 • This project is about finalizing the appropriate house for an individual inclined based on their conditions and requirement • It uses the fundamental Machine Learning perspective for EDA and Data Visualization. • It includes an introduction to Linear Regression for continuous data evaluation and modeling using algorithms. Technical Skills Languages: Python, C, C++, HTML/CSS, JavaScript, SQL Technologies/Frameworks: MySQL, Tableau, Microsoft Excel, GitHub, Numpy, Pandas, Matplotlib Languages • English : Professional Proficiency • Hindi : Native Proficiency Hobbies • Cinephile • Fitness Enthusiast • Sports Enthusiast"
sent2 = nlp(input_resume)
displacy.render(sent2, style="ent", jupyter=True, options=options)

In [28]:
#Match score
input_skills = "C++,Python,SQL,Machine learning,Javascript"
req_skills = input_skills.lower().split(",")
resume_skills = unique_skills(get_skills(input_resume.lower()))
score = 0
for x in req_skills:
    if x in resume_skills:
        score += 1
req_skills_len = len(req_skills)
match = round(score / req_skills_len * 100, 1)

print(f"The current Resume is {match}% matched to your requirements")

The current Resume is 60.0% matched to your requirements
