## **Task 8: Resume Screening Using NLP**

**1. Setup (install libraries)**

In [None]:
!pip -q install -U sentence-transformers pandas scikit-learn spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!pip install streamlit pyngrok

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyngrok, pydeck, streamlit
Successfully installed pydeck-0.9.1 pyngrok-7.3.0 streamlit-1.49.1


In [None]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import spacy

**2. Load dataset**

In [None]:
# Read the two Kaggle CSV exports from /content.
resumes_csv = "/content/UpdatedResumeDataSet.csv"   # Kaggle Resume dataset
jobs_csv = "/content/job_descriptions.csv"          # Kaggle Job dataset
df_resumes = pd.read_csv(resumes_csv)
df_jobs = pd.read_csv(jobs_csv)

# Report (rows, columns) of each frame as a quick sanity check.
for label, frame in (("Resumes shape:", df_resumes), ("Jobs shape:", df_jobs)):
    print(label, frame.shape)

Resumes shape: (962, 2)
Jobs shape: (1615940, 23)


**3. Preprocess text**

In [None]:
def clean_text(t):
    """Normalize a raw text field.

    Lower-cases the input, collapses every run of whitespace into a single
    space and trims leading/trailing whitespace.

    Args:
        t: Value to clean. Anything that is not a ``str`` (for example a NaN
            read from an empty CSV cell) is treated as missing.

    Returns:
        The normalized string, or ``""`` when ``t`` is not a string.
    """
    if isinstance(t, str):
        lowered = t.lower()
        return re.sub(r'\s+', ' ', lowered).strip()
    return ""

# Keep only the first rows of each dataset (presumably to keep the embedding
# step quick — confirm the intended sample sizes).
# .copy() turns each subset into an independent frame: adding a column to the
# bare .head() slice is what triggers pandas' SettingWithCopyWarning.
df_resumes = df_resumes.head(700).copy()
df_jobs = df_jobs.head(70).copy()

# Normalized text columns that the embedding cell encodes.
df_resumes['resume_text'] = df_resumes['Resume'].apply(clean_text)
df_jobs['job_text'] = df_jobs['Responsibilities'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_resumes['resume_text'] = df_resumes['Resume'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_jobs['job_text'] = df_jobs['Responsibilities'].apply(clean_text)


**4. Embeddings**

In [None]:
from sentence_transformers import SentenceTransformer

# Pre-trained sentence-embedding model used for both resumes and jobs.
model = SentenceTransformer("paraphrase-MiniLM-L3-v2")

# Encode the cleaned texts; row i of each embedding array corresponds to
# row i (by position) of the source DataFrame.
resume_texts = df_resumes['resume_text'].tolist()
job_texts = df_jobs['job_text'].tolist()
resume_embeddings = model.encode(resume_texts, show_progress_bar=True)
job_embeddings = model.encode(job_texts, show_progress_bar=True)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import os

# Persist both embedding arrays under outputs/, creating the directory first
# if it does not exist yet.
os.makedirs("outputs", exist_ok=True)
for target, vectors in (
    ("outputs/resume_embeddings.npy", resume_embeddings),
    ("outputs/job_embeddings.npy", job_embeddings),
):
    np.save(target, vectors)

**5. Matching**

In [None]:
# Number of best-matching resumes kept per job.
TOP_K = 5

# sim_matrix[i, j] is the cosine similarity between job i and resume j, where
# i and j are row *positions* in job_embeddings / resume_embeddings.
sim_matrix = cosine_similarity(job_embeddings, resume_embeddings)

results = []
# enumerate() supplies the positional row into sim_matrix. The label yielded
# by iterrows() only coincides with that position while df_jobs still has a
# default 0..n-1 index, so it must not be used to index the matrix.
for job_pos, (_, job) in enumerate(df_jobs.iterrows()):
    sims = sim_matrix[job_pos]
    top_idx = np.argsort(-sims)[:TOP_K]  # positions of the highest scores, best first
    for rank, resume_pos in enumerate(top_idx, start=1):
        results.append({
            "job_id": job["Job Id"],
            "job_title": job["Job Title"],
            # Store the index *label* so later `df_resumes.loc[resume_id, ...]`
            # lookups resolve to the same row.
            "resume_id": df_resumes.index[resume_pos],
            # Positional access: resume_pos comes from argsort, not from the index.
            "candidate_category": df_resumes["Category"].iloc[resume_pos],
            "score": float(sims[resume_pos]),
            "rank": rank
        })

df_matches = pd.DataFrame(results)
# NOTE(review): the saved output shows the top matches for a job sharing one
# identical score, which suggests duplicate resume texts in the dataset —
# confirm and consider de-duplicating before ranking.
print(df_matches.head(10))

os.makedirs("outputs", exist_ok=True)
df_matches.to_csv("outputs/resume_matches.csv", index=False)

             job_id                     job_title  resume_id  \
0  1089843540111562  Digital Marketing Specialist        136   
1  1089843540111562  Digital Marketing Specialist        118   
2  1089843540111562  Digital Marketing Specialist        124   
3  1089843540111562  Digital Marketing Specialist        130   
4  1089843540111562  Digital Marketing Specialist        106   
5   398454096642776                 Web Developer        157   
6   398454096642776                 Web Developer        147   
7   398454096642776                 Web Developer        142   
8   398454096642776                 Web Developer        140   
9   398454096642776                 Web Developer        152   

  candidate_category     score  rank  
0               Arts  0.258925     1  
1               Arts  0.258925     2  
2               Arts  0.258925     3  
3               Arts  0.258925     4  
4               Arts  0.258925     5  
5      Web Designing  0.361682     1  
6      Web Designing  

**6. Justifications (keyword overlap)**

In [None]:
nlp = spacy.load("en_core_web_sm")

def extract_keywords(text, topn=15):
    """Return the most frequent noun / proper-noun lemmas found in ``text``.

    Tokens whose surface text is two characters or shorter are skipped.
    Lemmas are lower-cased before counting.

    Args:
        text: String to analyse with the spaCy pipeline ``nlp``.
        topn: Maximum number of keywords to return.

    Returns:
        A list of at most ``topn`` lemmas ordered by descending frequency;
        lemmas with equal counts keep their first-occurrence order.
    """
    counts = {}
    for token in nlp(text):
        if token.pos_ not in ("NOUN", "PROPN") or len(token.text) <= 2:
            continue
        lemma = token.lemma_.lower()
        counts[lemma] = counts.get(lemma, 0) + 1
    # sorted() stays stable with reverse=True, so ties remain in insertion order.
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:topn]

# Explain the best match of the first three jobs by listing the keywords the
# job text and the top-ranked resume have in common.
justifications = []
for job_id in df_jobs['Job Id'].head(3):   # demo: first 3 jobs only
    # Rows of df_matches for this job with rank 1 (at most one is used).
    top_match = df_matches[(df_matches['job_id']==job_id) & (df_matches['rank']==1)].head(1)
    if not top_match.empty:
        top_match = top_match.iloc[0]
        # resume_id is an index label of df_resumes, hence .loc.
        resume_text = df_resumes.loc[top_match['resume_id'], 'resume_text']
        job_text = df_jobs[df_jobs['Job Id']==job_id]['job_text'].iloc[0]

        overlap = set(extract_keywords(resume_text)) & set(extract_keywords(job_text))

        justifications.append({
            "job_id": job_id,
            "top_resume_category": top_match['candidate_category'],
            "match_score": round(top_match['score'], 3),
            # sorted() gives a reproducible order; iterating the set directly
            # can yield a different order (and a different first 10) per run.
            "shared_keywords": sorted(overlap)[:10]
        })

df_just = pd.DataFrame(justifications)
print(df_just)

             job_id top_resume_category  match_score shared_keywords
0  1089843540111562                Arts        0.259              []
1   398454096642776       Web Designing        0.362       [website]
2   481640072963533                 PMO        0.379          [team]


In [None]:
# `nlp` and `extract_keywords` are defined in the previous cell; the identical
# copies that were repeated here have been dropped so there is one definition.

# Build one justification record for each of the first three jobs and save
# them to outputs/match_justifications.csv.
justifications = []
for job_id in df_jobs['Job Id'].head(3):  # demo for first 3 jobs
    # Bracket access is required for the 'rank' column: `df_matches.rank` is
    # the DataFrame.rank *method*, so `df_matches.rank == 1` is always False
    # and every job used to fall through to the "no match" branch below.
    is_top_for_job = (df_matches['job_id'] == job_id) & (df_matches['rank'] == 1)
    top_matches_for_job = df_matches[is_top_for_job]
    if not top_matches_for_job.empty: # Check if there is a rank 1 match
        top_match = top_matches_for_job.iloc[0]
        # resume_id is an index label of df_resumes, hence .loc.
        resume_text = df_resumes.loc[top_match['resume_id'], 'resume_text']
        job_text = df_jobs[df_jobs['Job Id']==job_id]['job_text'].iloc[0]
        overlap = set(extract_keywords(resume_text)) & set(extract_keywords(job_text))
        justifications.append({
            "job_id": job_id,
            "top_resume_category": top_match['candidate_category'],
            "match_score": round(top_match['score'], 3),
            # sorted() keeps the keyword list reproducible across kernel runs.
            "shared_keywords": sorted(overlap)[:10]
        })
    else:
        # Handle the case where no rank 1 match is found for this job_id
        justifications.append({
            "job_id": job_id,
            "top_resume_category": "No rank 1 match found",
            "match_score": 0.0,
            "shared_keywords": []
        })


df_just = pd.DataFrame(justifications)
# Make sure the target directory exists even if this cell is run on its own.
os.makedirs("outputs", exist_ok=True)
df_just.to_csv("outputs/match_justifications.csv", index=False)
print(df_just)

             job_id    top_resume_category  match_score shared_keywords
0  1089843540111562  No rank 1 match found          0.0              []
1   398454096642776  No rank 1 match found          0.0              []
2   481640072963533  No rank 1 match found          0.0              []
