<a href="https://colab.research.google.com/github/MashfikaJahan/AI_Resume_Screening/blob/main/AI_Resume_Screening.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Mount Google Drive at /content/drive so files stored in Drive can be read
# by path in later cells. Requires the google.colab package, so this cell is
# expected to run inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pandas as pd

# Read the resume dataset from the mounted Drive, then report its dimensions
# and column names before previewing the first rows.
csv_path = "/content/drive/MyDrive/AI_Resume_Screening.csv"
df = pd.read_csv(csv_path)

dataset_shape = df.shape
print(dataset_shape)

column_names = list(df.columns)
print(column_names)

df.head()


(1000, 11)
['Resume_ID', 'Name', 'Skills', 'Experience (Years)', 'Education', 'Certifications', 'Job Role', 'Recruiter Decision', 'Salary Expectation ($)', 'Projects Count', 'AI Score (0-100)']


Unnamed: 0,Resume_ID,Name,Skills,Experience (Years),Education,Certifications,Job Role,Recruiter Decision,Salary Expectation ($),Projects Count,AI Score (0-100)
0,1,Ashley Ali,"TensorFlow, NLP, Pytorch",10,B.Sc,,AI Researcher,Hire,104895,8,100
1,2,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL",10,MBA,Google ML,Data Scientist,Hire,113002,1,100
2,3,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux",1,MBA,Deep Learning Specialization,Cybersecurity Analyst,Hire,71766,7,70
3,4,Elizabeth Carney,"Python, Pytorch, TensorFlow",7,B.Tech,AWS Certified,AI Researcher,Hire,46848,0,95
4,5,Julie Hill,"SQL, React, Java",4,PhD,,Software Engineer,Hire,87441,9,100


In [9]:
# Count missing values per column, then show the 15 columns with the most gaps.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(15)


Unnamed: 0,0
Certifications,274
Name,0
Resume_ID,0
Skills,0
Experience (Years),0
Education,0
Job Role,0
Recruiter Decision,0
Salary Expectation ($),0
Projects Count,0


In [10]:
import pandas as pd

# Step 1: replace missing certifications with empty strings so they show up
# as blank in the combined text instead of a missing-value marker.
df["Certifications"] = df["Certifications"].fillna("")

# Step 2: build one labelled text field per resume. Each part carries its own
# label and separator; the parts are concatenated in a fixed order.
skills_part = "Skills: " + df["Skills"].astype(str)
education_part = " | Education: " + df["Education"].astype(str)
certifications_part = " | Certifications: " + df["Certifications"].astype(str)
experience_part = " | Experience Years: " + df["Experience (Years)"].astype(str)

df["resume_text"] = skills_part + education_part + certifications_part + experience_part

# Preview the identifier, role and assembled text for the first few resumes.
preview_columns = ["Resume_ID", "Job Role", "resume_text"]
df[preview_columns].head()


Unnamed: 0,Resume_ID,Job Role,resume_text
0,1,AI Researcher,"Skills: TensorFlow, NLP, Pytorch | Education: ..."
1,2,Data Scientist,"Skills: Deep Learning, Machine Learning, Pytho..."
2,3,Cybersecurity Analyst,"Skills: Ethical Hacking, Cybersecurity, Linux ..."
3,4,AI Researcher,"Skills: Python, Pytorch, TensorFlow | Educatio..."
4,5,Software Engineer,"Skills: SQL, React, Java | Education: PhD | Ce..."


In [11]:
from collections import Counter

# Upper bound on how many distinct skills are listed in the generated JD.
TOP_N_SKILLS = 25

# Work with the role that has the most resumes (value_counts() sorts by
# frequency, so the first index entry is the most common role).
role = df["Job Role"].value_counts().index[0]   # most common role
role_df = df[df["Job Role"] == role].copy()

# Build a job description (JD) using the top skills for that role.
# Each "Skills" cell is a comma-separated list (e.g. "Deep Learning, Python"),
# so rows are joined with ", " and the text is split on commas. Splitting on
# whitespace would break multi-word skills such as "Deep Learning" into the
# separate tokens "deep" and "learning" and distort the frequency counts.
all_skills_text = ", ".join(role_df["Skills"].astype(str).tolist()).lower()

# Trim surrounding whitespace and stray punctuation from each skill, and drop
# anything that ends up empty (e.g. from a trailing comma).
tokens = [skill.strip(" \t\n,|;/()") for skill in all_skills_text.split(",")]
tokens = [skill for skill in tokens if skill]

# most_common() orders by count; ties keep first-seen order.
common = [skill for skill, _count in Counter(tokens).most_common(TOP_N_SKILLS)]

jd_text = f"Job Role: {role}. Required skills: " + ", ".join(common)
print(jd_text)


Job Role: AI Researcher. Required skills: tensorflow, nlp, python, pytorch


In [12]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Draw a reproducible sample of 10 resumes from the selected role.
sample = role_df.sample(n=10, random_state=42).copy()

# Two versions of each resume: the untouched text ("control") and a variant in
# which the standalone word "Python" is rewritten as "Python programming".
control_texts = sample["resume_text"]
variant_texts = control_texts.str.replace(
    r"\bPython\b", "Python programming", regex=True
)
sample["control"] = control_texts
sample["variant_python_programming"] = variant_texts

# Fit one TF-IDF vocabulary over the JD plus every control and variant text.
# Row 0 is the JD, followed by all controls, then all variants.
vectorizer = TfidfVectorizer(stop_words="english")
docs = [jd_text, *sample["control"], *sample["variant_python_programming"]]
X = vectorizer.fit_transform(docs)

n_resumes = len(sample)
jd_vec = X[0]
control_vecs = X[1:n_resumes + 1]
variant_vecs = X[n_resumes + 1:]

# Cosine similarity of each resume version against the JD, and the change
# in score caused by the phrase substitution.
sample["score_control"] = cosine_similarity(control_vecs, jd_vec).ravel()
sample["score_variant"] = cosine_similarity(variant_vecs, jd_vec).ravel()
sample["delta"] = sample["score_variant"] - sample["score_control"]

# Show the resumes ordered from largest to smallest score change.
report_columns = ["Resume_ID","Job Role","score_control","score_variant","delta"]
sample[report_columns].sort_values("delta", ascending=False).head(10)


Unnamed: 0,Resume_ID,Job Role,score_control,score_variant,delta
637,638,AI Researcher,0.112677,0.112677,0.0
27,28,AI Researcher,0.140403,0.140403,0.0
382,383,AI Researcher,0.134112,0.134112,0.0
23,24,AI Researcher,0.141827,0.13444,-0.007387
465,466,AI Researcher,0.16473,0.153456,-0.011274
83,84,AI Researcher,0.141787,0.130446,-0.011341
95,96,AI Researcher,0.173335,0.160339,-0.012995
283,284,AI Researcher,0.142056,0.122723,-0.019333
757,758,AI Researcher,0.183279,0.160551,-0.022729
710,711,AI Researcher,0.220777,0.195628,-0.025148
