In [1]:
import nltk
import spacy
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download resources
nltk.download('stopwords')
from nltk.corpus import stopwords
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
resume_text = """
Harshal Lad
Python Developer with experience in Machine Learning, Data Science, and NLP.
Worked with libraries like NumPy, Pandas, Scikit-learn, TensorFlow.
Also experienced in MERN stack development, SQL and MongoDB.
Looking for opportunities in AI/ML and Full-stack development.
"""

jd_text = """
We are looking for a Machine Learning Engineer with strong skills in Python,
Data Science, and Deep Learning frameworks like TensorFlow or PyTorch.
Experience with NLP and web technologies such as MERN stack is a plus.
"""


In [5]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

clean_resume = preprocess(resume_text)
clean_jd = preprocess(jd_text)

print("Cleaned Resume:\n", clean_resume[:500])
print("Cleaned JD:\n", clean_jd[:500])


Cleaned Resume:
 
harshal lad
python developer with experience in machine learning data science and nlp
worked with libraries like numpy pandas scikitlearn tensorflow
also experienced in mern stack development sql and mongodb
looking for opportunities in aiml and fullstack development

Cleaned JD:
 
we are looking for a machine learning engineer with strong skills in python
data science and deep learning frameworks like tensorflow or pytorch
experience with nlp and web technologies such as mern stack is a plus



In [7]:
stop_words = set(stopwords.words('english'))

def tokenize_and_filter(text):
    words = text.split()
    return [w for w in words if w not in stop_words]

resume_tokens = tokenize_and_filter(clean_resume)
jd_tokens = tokenize_and_filter(clean_jd)

print("Filtered Resume Tokens:", resume_tokens[:50])


Filtered Resume Tokens: ['harshal', 'lad', 'python', 'developer', 'experience', 'machine', 'learning', 'data', 'science', 'nlp', 'worked', 'libraries', 'like', 'numpy', 'pandas', 'scikitlearn', 'tensorflow', 'also', 'experienced', 'mern', 'stack', 'development', 'sql', 'mongodb', 'looking', 'opportunities', 'aiml', 'fullstack', 'development']


In [8]:
resume_set = set(resume_tokens)
jd_set = set(jd_tokens)

matched = resume_set.intersection(jd_set)
missing = jd_set - resume_set
match_percent = (len(matched) / len(jd_set)) * 100

print("Matched Skills:", matched)
print("Missing Skills:", missing)
print(f"Match Score: {match_percent:.2f}%")


Matched Skills: {'tensorflow', 'science', 'learning', 'mern', 'like', 'looking', 'stack', 'machine', 'python', 'experience', 'data', 'nlp'}
Missing Skills: {'technologies', 'web', 'strong', 'deep', 'pytorch', 'frameworks', 'skills', 'plus', 'engineer'}
Match Score: 57.14%


In [9]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([clean_resume, clean_jd])
similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

print(f"TF-IDF Similarity Score: {similarity*100:.2f}%")


TF-IDF Similarity Score: 42.89%
