#

#Data Preprocessing :

###DataSet :

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
# Set file paths
resumes_df = pd.read_csv('/content/drive/MyDrive/Resume DataSet/Resume.csv')


###Installations :

In [3]:
!pip install gensim




###Imports :

In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import _treebank_word_tokenizer
from nltk.data import load
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity


####NLTK SetUp :

In [5]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab', download_dir='/root/nltk_data')

# Forced load 'punkt'
nltk.data.path.append('/root/nltk_data')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


###CLEANING + TOKENIZATION + LEMMATIZATION

In [6]:
import re
import spacy
from collections import Counter

# Load spaCy English model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# ---------------------------
# Step 1: Basic cleaning function
# ---------------------------
def clean_text_spacy(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and len(token) > 2]
    return list(dict.fromkeys(tokens))  # remove duplicates

# ---------------------------
# Step 2: Initial cleaning for resumes
# ---------------------------
resumes_df['temp_cleaned'] = resumes_df['Resume_str'].astype(str).apply(clean_text_spacy)

# ---------------------------
# Step 3: Automatically detect common resume words
# ---------------------------
all_resume_tokens = [token for sublist in resumes_df['temp_cleaned'] for token in sublist]
resume_counts = Counter(all_resume_tokens)
num_resumes = len(resumes_df)
resume_threshold = 0.5
common_resume_words = [word for word, count in resume_counts.items() if count / num_resumes > resume_threshold]

# ---------------------------
# Step 4: Final cleaning for resumes
# ---------------------------
def clean_resume(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc
              if not token.is_stop and len(token) > 2 and token.lemma_ not in common_resume_words]
    return list(dict.fromkeys(tokens))

resumes_df['Resume_clean'] = resumes_df['Resume_str'].astype(str).apply(clean_resume)
resumes_df.drop(columns=['temp_cleaned'], inplace=True)


print("✅ First cleaned resume:")
print(resumes_df[['Resume_str', 'Resume_clean']].iloc[0])

✅ First cleaned resume:
Resume_str               HR ADMINISTRATOR/MARKETING ASSOCIATE\...
Resume_clean    [administrator, marketing, associate, dedicate...
Name: 0, dtype: object


In [8]:
# -----------------------------------------
# 5. SECTION EXTRACTION (regex-based)
# -----------------------------------------
# Keep your dictionary of regex patterns for sections
section_patterns = {
    "education": r'education\b|degree|university|college|diploma',
    "experience": r'experience\b|work history|employment',
    "skills": r'skills?\b|competencies|expertise',
    "projects": r'projects?\b|portfolio|initiative',
    "certifications": r'certifications?|licenses?|awards?',
    "languages": r'languages?\b|english|french|arabic|german',
}

def extract_sections(text):
    """
    Extract key sections from resumes.
    Uses regex patterns.
    If no match → return 'NOT FOUND'
    """
    sections = {}
    for name, pattern in section_patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        sections[name] = match.group(0) if match else "NOT FOUND"
    return sections

# Apply extraction
resumes_df['Sections'] = resumes_df['Resume_str'].apply(extract_sections)


In [10]:
import re

def extract_full_sections(text):
    """
    Extract text for each section in a resume.
    Returns a dictionary {section_name: section_text}.
    """
    sections = {}
    text = text.lower()  # normalize
    # Find positions of all section headers
    header_positions = []
    for name, pattern in section_patterns.items():
        match = re.search(pattern, text)
        if match:
            header_positions.append((match.start(), name))
    # Sort headers by position
    header_positions.sort()

    # Extract content between headers
    for i, (start, name) in enumerate(header_positions):
        end = header_positions[i + 1][0] if i + 1 < len(header_positions) else len(text)
        section_text = text[start:end].strip()
        sections[name] = section_text
    # Fill missing sections with NOT FOUND
    for name in section_patterns.keys():
        if name not in sections:
            sections[name] = "NOT FOUND"
    return sections


In [11]:
sections_expanded = resumes_df['Sections'].apply(pd.Series)
resumes_df = pd.concat([resumes_df, sections_expanded], axis=1)

# Now you can view it like a table
print(resumes_df[['Resume_str', 'education', 'experience', 'skills', 'projects', 'certifications', 'languages']].head(5))


                                          Resume_str   education   education  \
0           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   Education   Education   
1           HR SPECIALIST, US HR OPERATIONS      ...   Education   Education   
2           HR DIRECTOR       Summary      Over 2...  University  University   
3           HR SPECIALIST       Summary    Dedica...   Education   Education   
4           HR MANAGER         Skill Highlights  ...   Education   Education   

   experience  experience     skills     skills    projects    projects  \
0  experience  experience     skills     skills   NOT FOUND   NOT FOUND   
1  Experience  Experience     Skills     Skills  initiative  initiative   
2  experience  experience     Skills     Skills     Project     Project   
3  experience  experience  expertise  expertise   NOT FOUND   NOT FOUND   
4  Employment  Employment      Skill      Skill     Project     Project   

  certifications certifications  languages  languages  
0  Certifica

###EMBEDDING MODELS :

####CBOW :

In [15]:
from gensim.models import Word2Vec
import torch
from transformers import BertTokenizer, BertModel

# ---- CBOW ----
class CBOWEmbedder:
    def __init__(self, vector_size=768, window=5, min_count=2):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.model = None

    def train(self, texts):
        # Split cleaned text into tokens for training
        sentences = [t.split() for t in texts if len(str(t)) > 0]
        self.model = Word2Vec(sentences, vector_size=self.vector_size,
                              window=self.window, min_count=self.min_count, sg=0)
        return self.model

    def get_doc_embedding(self, text):
        # Average of word vectors
        tokens = text.split()
        vecs = [self.model.wv[t] for t in tokens if t in self.model.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(self.vector_size)


#### Skip-gram :


In [16]:
class SkipGramEmbedder(CBOWEmbedder):
    def train(self, texts):
        # texts is a list of token lists
        sentences = [t for t in texts if len(t) > 0]

        # Initialize the Skip-gram Word2Vec model
        self.model = Word2Vec(
            sentences,
            vector_size=768,  # 768-dimensional embeddings
            window=self.window,
            min_count=self.min_count,
            sg=1  # sg=1 for Skip-gram
        )
        return self.model


####BERT :

In [19]:
# ---- BERT ----
import torch
from transformers import BertTokenizer, BertModel

class BERTEmbedder:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()

    def get_embedding(self, tokens):
        # Join token list into a string
        text = " ".join(tokens)
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Mean pooling over tokens
        mask = inputs['attention_mask'].unsqueeze(-1).expand(outputs.last_hidden_state.size()).float()
        sum_emb = torch.sum(outputs.last_hidden_state * mask, 1)
        sum_mask = torch.clamp(mask.sum(1), min=1e-9)
        return (sum_emb / sum_mask).squeeze().numpy()


### models results :

In [20]:
texts = resumes_df['Resume_clean'].tolist()

# CBOW
cbow = CBOWEmbedder()
cbow.train(texts)
emb1 = cbow.get_doc_embedding(texts[0])

# Skip-gram
texts = resumes_df['Resume_clean'].tolist()  # list of token lists

sg = SkipGramEmbedder()
sg.train(texts)
emb_sg = sg.get_doc_embedding(texts[0])


# BERT
bert = BERTEmbedder()
emb_bert = bert.get_embedding(texts[0])
print("BERT shape:", emb_bert.shape)

print("CBOW shape:", emb1.shape)
print("Skip-gram shape:", emb_sg.shape)


AttributeError: 'list' object has no attribute 'split'

###Word Similarity

In [7]:


# CV-related word pairs with human similarity scores
word_pairs = [
    ("python", "java"),
    ("developer", "programmer"),
    ("accountant", "engineer"),
    ("excel", "spreadsheet"),
    ("analyst", "manager")
]
human_scores = [0.8, 0.9, 0.3, 0.85, 0.5]

# Evaluation for Word2Vec models (CBOW / Skip-gram)
def evaluate_similarity(model, pairs, human_scores):
    model_scores = []
    for (w1, w2) in pairs:
        if w1 in model.wv and w2 in model.wv:
            sim = model.wv.similarity(w1, w2)
        else:
            sim = 0
        model_scores.append(sim)
    corr, _ = spearmanr(human_scores, model_scores)
    return corr, model_scores

print("CBOW:", evaluate_similarity(cbow.model, word_pairs, human_scores))
print("Skip-gram:", evaluate_similarity(sg.model, word_pairs, human_scores))

# BERT similarity
def bert_similarity(bert, w1, w2):
    v1 = bert.get_embedding(w1).reshape(1, -1)
    v2 = bert.get_embedding(w2).reshape(1, -1)
    return cosine_similarity(v1, v2)[0][0]

bert_scores = [bert_similarity(bert, w1, w2) for (w1, w2) in word_pairs]
print("BERT similarity scores:", bert_scores)

NameError: name 'cbow' is not defined

####Word Analogies

In [None]:
# Example resume-domain analogies

# Developer specializations
print("Analogy: developer - java + python ≈ ?")
print("CBOW:", cbow.model.wv.most_similar(positive=['developer','python'], negative=['java'], topn=3))
print("Skip-gram:", sg.model.wv.most_similar(positive=['developer','python'], negative=['java'], topn=3))

# Data career path
print("\nAnalogy: data - analyst + scientist ≈ ?")
print("CBOW:", cbow.model.wv.most_similar(positive=['data','scientist'], negative=['analyst'], topn=3))
print("Skip-gram:", sg.model.wv.most_similar(positive=['data','scientist'], negative=['analyst'], topn=3))

# Machine learning engineer
print("\nAnalogy: machine - learning + engineer ≈ ?")
print("CBOW:", cbow.model.wv.most_similar(positive=['machine','engineer'], negative=['learning'], topn=3))
print("Skip-gram:", sg.model.wv.most_similar(positive=['machine','engineer'], negative=['learning'], topn=3))


BERT is not naturally trained for analogies; you'd need vector arithmetic but it’s less effective

###Word Clustering

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Choose CV-related words
words = ["developer", "designer", "manager", "analyst", "engineer",
         "scientist", "consultant", "intern"]

# Get vectors from CBOW
X = [cbow.model.wv[w] for w in words if w in cbow.model.wv]

# Dimensionality reduction
pca = PCA(n_components=2)
coords = pca.fit_transform(X)

# Plot
plt.figure(figsize=(8,6))
plt.scatter(coords[:,0], coords[:,1], c='blue')

for i, word in enumerate(words):
    plt.annotate(word, (coords[i,0], coords[i,1]))

plt.title("CBOW Word Clustering (CV Job Roles)")
plt.show()


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

words = ["developer", "designer", "manager", "analyst", "engineer",
         "scientist", "consultant", "intern"]

# Get vectors from Skip-gram
X = [sg.model.wv[w] for w in words if w in sg.model.wv]

coords = PCA(n_components=2).fit_transform(X)

plt.figure(figsize=(8,6))
plt.scatter(coords[:,0], coords[:,1], c='red')

for i, word in enumerate(words):
    plt.annotate(word, (coords[i,0], coords[i,1]))

plt.title("Skip-gram Word Clustering (CV Job Roles)")
plt.show()


In [None]:
import numpy as np

words = ["developer", "designer", "manager", "analyst", "engineer",
         "scientist", "consultant", "intern"]

# Get embeddings from BERT
X = [bert.get_embedding(w) for w in words]

coords = PCA(n_components=2).fit_transform(X)

plt.figure(figsize=(8,6))
plt.scatter(coords[:,0], coords[:,1], c='purple')

for i, word in enumerate(words):
    plt.annotate(word, (coords[i,0], coords[i,1]))

plt.title("BERT Word Clustering (CV Job Roles)")
plt.show()


###Extrinsic Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = [cbow.get_doc_embedding(t) for t in texts]
y = resumes_df['Category']  # assuming you have labels like "Data Science", "HR", etc.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("CBOW accuracy:", accuracy_score(y_test, clf.predict(X_test)))


In [None]:
X_sg = [sg.get_doc_embedding(t) for t in texts]

X_train, X_test, y_train, y_test = train_test_split(X_sg, y, test_size=0.2, random_state=42)
clf_sg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Skip-gram accuracy:", accuracy_score(y_test, clf_sg.predict(X_test)))


In [None]:
X_bert = [bert.get_embedding(t) for t in texts]

X_train, X_test, y_train, y_test = train_test_split(X_bert, y, test_size=0.2, random_state=42)
clf_bert = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("BERT accuracy:", accuracy_score(y_test, clf_bert.predict(X_test)))


###Vector Comparisons

In [None]:
from numpy.linalg import norm
import numpy as np

def compare_vecs(v1, v2):
    cos = np.dot(v1, v2) / (norm(v1) * norm(v2))
    euc = norm(v1 - v2)
    return cos, euc

# Compare embeddings (emb1 = CBOW, emb2 = Skip-gram, emb3 = BERT)
pairs = {
    "CBOW vs Skip-gram": (emb1, emb2),
    "CBOW vs BERT": (emb1, emb3),
    "Skip-gram vs BERT": (emb2, emb3)
}

for name, (v1, v2) in pairs.items():
    cos, euc = compare_vecs(v1, v2)
    print(f"{name} -> Cosine similarity: {cos:.4f}, Euclidean distance: {euc:.4f}")
