In [None]:
import pandas as pd
import numpy as np
import os
import scipy.stats as stats


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

All of our Google Drive files look the same, so that's where we are pulling our data from.

In [None]:
import os

# data_path = "/content/drive/MyDrive/cadence 1a/data"
# print(os.listdir(data_path))

We read the resume dataset and do some preliminary analysis to see how it looks. We look at the number of rows and columns, as well as the count of null values per column. We then do the same for the job posts dataset.

In [None]:
# resume_df = pd.read_csv(f"{data_path}/resumes.csv")
# Load the resume dataset from the working directory.
resume_df = pd.read_csv("resumes.csv")
# A bare `resume_df.shape` in the middle of a cell is never rendered (only the
# last expression is), so print it explicitly.
print(resume_df.shape)
resume_df.head()
# resume_df.isnull().sum()

In [None]:
# job_posts_df = pd.read_csv(f"{data_path}/jobs.csv")
# Load the job postings dataset from the working directory.
job_posts_df = pd.read_csv("jobs.csv")
# A bare `job_posts_df.shape` in the middle of a cell is never rendered (only
# the last expression is), so print it explicitly.
print(job_posts_df.shape)
job_posts_df.head()
# job_posts_df.isnull().sum()

Columns that have missing values in the job posts dataset are being turned into a list.

In [None]:
# Count nulls per column once, then keep the names of the columns that have any.
null_counts = job_posts_df.isnull().sum()
condition = null_counts != 0
columnlist = null_counts[condition].index.tolist()
columnlist

There are no numerical values in this data set; everything is stored in string/object format.

In [None]:
job_posts_df[columnlist].dtypes

## Addressing Null values in job_posts_df
Since there are many columns in job_posts_df with null values, we can use reasoning to drop some of the rows or columns. Some columns can be cut if they have too many null values or if they are not really relevant to the problem.  For example, the column "AnnouncementCode" has 17793 null values and the unique non-null values are a random string of letters. It was most likely used to identify the job posting on its original website. As well, "Opening Date" and "Deadline" may not have many null values, but these values might not be very useful to determine whether or not a candidate would be a good fit for a job.

In [None]:
columns_to_drop = ['AnnouncementCode', 'Term', 'Eligibility', 'Audience', 'StartDate', 'Duration', 'OpeningDate', 'Deadline', 'Notes', 'Attach']
job_posts_df = job_posts_df.drop(columns=columns_to_drop)
job_posts_df.isnull().sum()

There are still null values, but the columns are too contextually important to the ML problem. We can drop the examples that have null values in these columns since they most likely do not have the information we need to train the model accurately. After we drop these rows, our job_posts_df dataset no longer has any null values.

In [None]:
columns_to_check = ['Title', 'JobDescription', 'JobRequirment', 'Company', 'Location', 'RequiredQual', 'Salary', 'AboutC']

job_posts_df = job_posts_df.dropna(subset=columns_to_check)

job_posts_df.shape

In [None]:
job_posts_df.isnull().sum()

Removing duplicates from both datasets.

In [None]:
job_posts_df = job_posts_df.drop_duplicates()
print(job_posts_df.duplicated().sum())
resume_df = resume_df.drop_duplicates()
print(resume_df.duplicated().sum())

Find the columns that contain HTML tags. None of the job-post columns contain HTML that needs removing; only the resume dataset does.

In [None]:
import re
def has_html(text):
    """Report whether `text` is a string containing something shaped like an HTML tag.

    Non-string values (NaN, numbers, None) are reported as False.
    """
    if not isinstance(text, str):
        return False
    # Non-greedy: a '<' followed later (without crossing a newline) by a '>'.
    tag_match = re.search(r'<.*?>', text)
    return tag_match is not None

In [None]:
columns_with_html = [col for col in job_posts_df.columns if job_posts_df[col].apply(has_html).any()]
print(columns_with_html)

columns_with_html = [col for col in resume_df.columns if resume_df[col].apply(has_html).any()]
print(columns_with_html)

In [None]:
import re
def clean_html(text):
  """Strip HTML-like tags (`<...>`) from `text`.

  Args:
      text: Cell value to clean. Normally a string.

  Returns:
      The string with every `<...>` span removed. Non-string values
      (e.g. NaN for a missing cell) are returned unchanged instead of
      raising TypeError, mirroring the guard in `has_html`.
  """
  if not isinstance(text, str):
    return text
  return re.sub(r'<[^<]+?>', '', text)

resume_df['Resume_html'] = resume_df['Resume_str'].apply(clean_html)

After stripping the tags, Resume_html holds the same text as Resume_str, so the redundant Resume_html column is dropped.

In [None]:
resume_df[['Resume_html', 'Resume_str']].head()
resume_df.drop(columns=['Resume_html'], inplace= True)

Creates a corpus for all job post related details

In [None]:
# Build one labelled text blob per job post from its four descriptive columns.
# Missing values are replaced by '' so the concatenation never yields NaN.
job_posts_df["job_text"] = (
    "Description: " + job_posts_df["JobDescription"].fillna('')
    + " Requirements: " + job_posts_df["JobRequirment"].fillna('')
    + " Qualifications: " + job_posts_df["RequiredQual"].fillna('')
    + " About Company: " + job_posts_df["AboutC"].fillna('')
)


# Checkpoint #2 - Text Normalization
Apply tokenization, lowercasing, stopword removal, and lemmatization.

Import NLP and text-processing tools:
- nltk for natural language processing utilities (downloads WordNet for lemmatization)
- TfidfVectorizer and ENGLISH_STOP_WORDS from sklearn to convert text into numerical features and remove common stop words
- WordNetLemmatizer to reduce words to their base (dictionary) form
- word_tokenize: splits sentences into individual words
- stopwords: provides common words (like "the", "and") to remove from text

In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('wordnet')

Make sure NLTK resources are available

In [None]:
nltk.download("punkt_tab")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")

Initialize the WordNet lemmatizer and define a set of English stopwords for text cleaning

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


 TEXT PREPROCESSING PIPELINE FOR RESUMES AND JOB POSTS

 Purpose:
   This section standardizes and cleans all text data to prepare it
   for accurate keyword and semantic matching in the scoring system.

 Description:
   The preprocess_text() function normalizes text by performing:
     1. Lowercasing – ensures consistent word comparisons.
     2. Tokenization – splits text into individual words.
     3. Stopword removal – removes common filler words like "the", "and", "is".
     4. Filtering – keeps only alphabetic tokens (drops numbers/punctuation).
     5. Lemmatization – reduces words to their base form
        (e.g., “running” → “run”, “analyses” → “analysis”).
     6. Reconstruction – joins cleaned tokens back into a single string.

   This preprocessing is applied to:
     • resume_df["Resume_str"]  → creates resume_df["Resume_clean"]
     • job_posts_df["job_text"] → creates job_posts_df["job_text_clean"]

 Outcome:
   Produces standardized, lemmatized, and stopword-free text columns
   for both resumes and job postings, enabling fair and consistent
   matching in later scoring functions.


In [None]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmas.

    The input is coerced to `str`, lowercased and split with `word_tokenize`.
    Tokens that are not purely alphabetic, or that appear in the notebook-level
    `stop_words` set, are dropped; the remaining tokens are lemmatized with the
    notebook-level `lemmatizer` and joined back together with single spaces.
    """
    lowered = str(text).lower()
    kept_lemmas = []
    for token in word_tokenize(lowered):
        if not token.isalpha() or token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(kept_lemmas)

resume_df["Resume_clean"] = resume_df["Resume_str"].apply(preprocess_text)
job_posts_df['job_text_clean'] = job_posts_df['job_text'].apply(preprocess_text)

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

resume_embeddings = model.encode(resume_df["Resume_clean"].tolist(), show_progress_bar=True)
job_embeddings = model.encode(job_posts_df["job_text_clean"].tolist(), show_progress_bar=True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

resume_lengths = resume_df['Resume_str'].str.len()
job_lengths = job_posts_df['JobDescription'].str.len()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
ax1.hist(resume_lengths, bins=50, alpha=0.7)
ax1.set_title('Resume Text Length Distribution')
ax1.set_xlabel('Character Count')

ax2.hist(job_lengths, bins=50, alpha=0.7)
ax2.set_title('Job Description Length Distribution')
ax2.set_xlabel('Character Count')
plt.show()

In [None]:
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Get most common words in resumes and job postings
def get_top_words(text_series, n=45):
    """Return the `n` most frequent words across all texts in `text_series`.

    All texts are joined, lowercased and split on whitespace (punctuation
    stays attached to words). NLTK English stopwords and words of two
    characters or fewer are ignored. The result is a list of
    `(word, count)` tuples as produced by `Counter.most_common`.
    """
    tokens = ' '.join(text_series).lower().split()
    english_stopwords = set(stopwords.words('english'))
    frequencies = Counter(
        token for token in tokens
        if token not in english_stopwords and len(token) > 2
    )
    return frequencies.most_common(n)

top_resume_words = get_top_words(resume_df['Resume_str'])
top_job_words = get_top_words(job_posts_df['JobDescription'])
print(top_resume_words)
print(top_job_words)
# Plot word frequencies
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

words, counts = zip(*top_resume_words)
ax1.barh(words, counts)
ax1.set_title('Top Words in Resumes')

words, counts = zip(*top_job_words)
ax2.barh(words, counts)
ax2.set_title('Top Words in Job Descriptions')
plt.tight_layout()
plt.show()

In [None]:
# Common skills extraction (simplified)
# Add/update list with keywords we are interested in
skills_keywords = ['python', 'java', 'sql', 'machine learning', 'aws',
                   'docker', 'kubernetes', 'react', 'node.js', 'tensorflow']

def count_skills(text, skills_list):
    """Count the entries of `skills_list` that occur as substrings of `text`.

    Only `text` is lowercased; entries of `skills_list` are compared as given.
    """
    haystack = text.lower()
    hits = 0
    for skill in skills_list:
        if skill in haystack:
            hits += 1
    return hits

# Count skills in resumes and job postings
# Lowercase each text column once instead of once per keyword.
resume_text_lower = resume_df['Resume_str'].str.lower()
job_text_lower = job_posts_df['JobDescription'].str.lower()

for skill in skills_keywords:
    # regex=False: match the keyword literally. With the default regex=True the
    # '.' in 'node.js' is a wildcard (it would also match e.g. 'node js').
    # na=False: a missing text counts as "skill not present" rather than NaN.
    resume_df[f'resume_has_{skill}'] = resume_text_lower.str.contains(skill, regex=False, na=False)
    job_posts_df[f'job_has_{skill}'] = job_text_lower.str.contains(skill, regex=False, na=False)

# Plot skills frequency
resume_skills_count = resume_df[[f'resume_has_{skill}' for skill in skills_keywords]].sum()
job_skills_count = job_posts_df[[f'job_has_{skill}' for skill in skills_keywords]].sum()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
resume_skills_count.plot(kind='barh', ax=ax1)
ax1.set_title('Skills Frequency in Resumes')
job_skills_count.plot(kind='barh', ax=ax2)
ax2.set_title('Skills Frequency in Job Postings')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram analysis
def plot_top_ngrams(text_series, n=2, top_k=45):
    """Draw a horizontal bar chart of frequent n-grams in `text_series`.

    A `CountVectorizer` limited to n-grams of exactly length `n`, using its
    built-in English stop-word list and `max_features=top_k`, is fitted on
    the texts. Each retained n-gram is plotted against its total count over
    all documents. Nothing is returned; the figure is shown.
    """
    ngram_counter = CountVectorizer(
        ngram_range=(n, n),
        stop_words='english',
        max_features=top_k,
    )
    doc_term_matrix = ngram_counter.fit_transform(text_series)
    ngram_labels = ngram_counter.get_feature_names_out()
    # Column sums give the corpus-wide count of each n-gram, flattened to 1-D.
    ngram_totals = doc_term_matrix.sum(axis=0).A1

    plt.figure(figsize=(10, 6))
    plt.barh(ngram_labels, ngram_totals)
    plt.title(f'Top {n}-grams')
    plt.show()

# Compare bigrams in resumes vs job postings
plot_top_ngrams(resume_df['Resume_str'], n=2)
plot_top_ngrams(job_posts_df['JobDescription'], n=2)

# Checkpoint #3 - Data Annotation

Define weights for skills, experience, education, semantic similarity, and domain.
Create labels (good match / top 25% = 1, bad match / bottom 25% = 0).


# SCORING CONFIGURATION AND KEYWORD DEFINITIONS

 This section defines:
 1. Feature weights — how much each category contributes to the final composite score.
 2. Domain-specific keywords — used to detect the job domain (e.g., HR, finance, IT).
 3. Common skill keywords — used to evaluate skill overlap between job and resume.
4. Education levels — mapped to numeric values for scoring academic background.
 5. Experience indicators — keywords representing seniority or management experience.



In [None]:
from collections import Counter

In [None]:
#define weights
weights = {
    'skills': 0.35,
    'experience': 0.20,
    'education': 0.15,
    'semantic': 0.15,
    'domain': 0.15
}


domain_keywords = {
    'hr': [
        'human resources', 'hr', 'recruitment', 'recruiting', 'hiring',
        'payroll', 'benefits', 'employee relations', 'compensation',
        'performance management', 'talent acquisition', 'training',
        'onboarding', 'diversity', 'compliance', 'employee engagement',
        'career development', 'hr policies', 'conflict resolution',
        'organizational development', 'leadership', 'communication skills',
        'workplace safety', 'microsoft office', 'workforce planning',
        'hr analytics', 'labor law', 'employee retention'
    ],

    'finance': [
        'finance', 'financial', 'accounting', 'budget', 'budgeting', 'audit', 'tax',
        'bookkeeping', 'financial analysis', 'forecasting', 'financial modeling',
        'cash flow', 'profit', 'loss', 'ledger', 'accounts payable',
        'accounts receivable', 'payable', 'receivable', 'valuation',
        'cost analysis', 'financial reporting', 'economics', 'treasury',
        'capital markets', 'credit', 'debit', 'banking', 'investment',
        'excel', 'power bi', 'data analysis'
    ],

    'it': [
        'programming', 'software', 'development', 'software development',
        'software engineer', 'python', 'java', 'sql', 'database', 'web development',
        'network', 'system administration', 'cloud', 'aws', 'azure', 'gcp',
        'devops', 'docker', 'kubernetes', 'linux', 'git', 'version control',
        'testing', 'debugging', 'api', 'backend', 'frontend', 'node.js', 'react',
        'data science', 'machine learning', 'tensorflow', 'automation',
        'cybersecurity'
    ],

    'sales': [
        'sales', 'business development', 'account management', 'revenue', 'crm',
        'client', 'customer', 'lead generation', 'cold calling', 'prospecting',
        'presentation', 'closing deals', 'negotiation', 'pipeline', 'quota',
        'target', 'territory', 'upselling', 'cross-selling', 'b2b', 'b2c',
        'account executive', 'retail', 'merchandising', 'promotion', 'marketing',
        'sales strategy', 'partnerships', 'client relations', 'relationship management',
        'sales operations', 'business partnerships'
    ],

    'administration': [
        'administrative', 'secretary', 'assistant', 'coordination', 'office',
        'organization', 'communication', 'customer service', 'documentation',
        'inventory', 'scheduling', 'calendar management', 'data entry', 'filing',
        'record keeping', 'reception', 'travel arrangements', 'correspondence',
        'procurement', 'clerical', 'executive assistant', 'meeting planning',
        'office management', 'support staff', 'event planning', 'vendor management',
        'front desk', 'logistics', 'supplies', 'records management',
        'budget tracking', 'document control', 'office coordination',
        'front office', 'mail management'
    ],

    'research': [
        'research', 'analyst', 'analysis', 'data analysis', 'methodology',
        'report', 'evaluation', 'literature review', 'hypothesis', 'experiment',
        'survey', 'study', 'quantitative', 'qualitative', 'statistics', 'modeling',
        'scientific', 'investigation', 'findings', 'insight', 'insights',
        'publication', 'predictive modeling', 'data visualization',
        'policy analysis', 'impact assessment', 'data collection', 'r',
        'spss', 'tableau', 'power bi'
    ]
}


skills = ['excel', 'word', 'powerpoint', 'sql', 'python', 'project management', 'data analysis', 'ms office', 'microsoft office']

education_levels = {'phd': 4, 'master': 3, 'bachelor': 2, 'associate': 1, 'diploma': 0.5}

experience_words = ['manager', 'director', 'senior', 'lead', 'specialist', 'analyst']

In [None]:
def count_keywords_in_series(text_series, domain_keywords):
    """Tally whole-word keyword occurrences over every text in a Series.

    Args:
        text_series: pandas Series of strings to search.
        domain_keywords: mapping of domain name -> list of keywords/phrases.

    Returns:
        tuple: `(domain_counts, keyword_counts)` where `domain_counts` maps
        each domain to its total number of keyword occurrences (0 when none)
        and `keyword_counts` is a Counter keyed by `(domain, keyword)` that
        only contains keywords found at least once.
    """
    totals_by_domain = dict.fromkeys(domain_keywords, 0)
    totals_by_keyword = Counter()

    for domain_name in domain_keywords:
        for keyword in domain_keywords[domain_name]:
            # Escape the keyword and anchor it on word boundaries so only
            # full words/phrases are counted; matching ignores case.
            whole_word = r'\b' + re.escape(keyword.lower()) + r'\b'
            per_row_hits = text_series.str.count(whole_word, flags=re.IGNORECASE)
            occurrences = per_row_hits.sum()
            if not occurrences > 0:
                continue
            totals_by_domain[domain_name] += occurrences
            totals_by_keyword[(domain_name, keyword)] += occurrences

    return totals_by_domain, totals_by_keyword

resume_domain_counts, resume_kw_counts = count_keywords_in_series(resume_df['Resume_str'], domain_keywords)
job_domain_counts, job_kw_counts = count_keywords_in_series(job_posts_df['job_text'], domain_keywords)


domain_summary = pd.DataFrame({
    'domain': list(domain_keywords.keys()),
    'resume_keyword_count': [resume_domain_counts[d] for d in domain_keywords],
    'job_keyword_count': [job_domain_counts[d] for d in domain_keywords]
}).sort_values(by='resume_keyword_count', ascending=False)

# Optional: detailed keyword-level breakdown
resume_kw_df = pd.DataFrame(resume_kw_counts.items(), columns=['(domain, keyword)', 'resume_count'])
job_kw_df = pd.DataFrame(job_kw_counts.items(), columns=['(domain, keyword)', 'job_count'])




In [None]:
# Show the 30 most frequent domain keywords, first for resumes and then for job posts
print("=== Resume Keyword Counts (Top 30) ===")
display(resume_kw_df.sort_values('resume_count', ascending=False).head(30))

print("=== Job Keyword Counts (Top 30) ===")
display(job_kw_df.sort_values('job_count', ascending=False).head(30))


In [None]:

# Per-domain totals for resumes vs. job posts.
print("=== Domain-Level Keyword Summary ===")
print(domain_summary, "\n")

# The header promises the top 10, so limit the output to 10 rows instead of
# dumping the entire keyword table.
print("=== Sample Keyword-Level Breakdown (Top 10) ===")
print(resume_kw_df.sort_values('resume_count', ascending=False).head(10))

In [None]:
kw_compare = pd.merge(
    resume_kw_df, job_kw_df,
    on='(domain, keyword)', how='outer'
).fillna(0)

kw_compare['difference'] = kw_compare['resume_count'] - kw_compare['job_count']

# Show top 20 overrepresented in resumes
kw_compare.sort_values('difference', ascending=False).head(20)


Computes a basic semantic similarity between a resume and job posting.

Steps:
1. Converts both texts to lowercase.
2. Splits them into sets of unique words.
3. Finds the overlap (intersection) between the two word sets.
4. Returns the proportion of job words also present in the resume.

Note:
    This is a simple lexical overlap metric, not a deep semantic one.
    It can later be replaced with embedding-based cosine similarity.

Args:
    resume_text (str): Resume text.
    job_text (str): Job posting text.

Returns:
    float: Overlap ratio between 0 and 1.

In [None]:
def calculate_semantic_score(resume_text, job_text):
    """Return the share of distinct job-posting words that also occur in the resume.

    Both texts are lowercased and split on whitespace into sets of unique
    words. Returns 0 when the job text has no words or nothing overlaps,
    otherwise `len(shared) / len(job words)`.
    """
    resume_vocab = set(resume_text.lower().split())
    job_vocab = set(job_text.lower().split())
    if len(job_vocab) == 0:
        return 0
    shared = resume_vocab & job_vocab
    if not shared:
        return 0
    return len(shared) / len(job_vocab)


Calculates how well a resume matches the skills required by a job posting.

Steps:
1. Converts both resume and job descriptions to lowercase.
2. Extracts all skill keywords that appear in the job text.
3. Checks which of those required skills also appear in the resume.
4. Returns the ratio of matched skills to total job-required skills.

Args:
    resume_text (str): Full resume text.
    job_text (str): Combined job description and requirements text.

Returns:
    float: Skill match score between 0 and 1.
            Returns 0 if no skills were found in the job text.

In [None]:
def calculate_skills_score(resume_text, job_text, skills_list=None):
    """Score how many of the skills a job asks for also appear in the resume.

    Skills are matched case-insensitively as whole words/phrases, consistent
    with `count_keywords_in_series`. Plain substring matching produced false
    positives such as 'excel' in 'excellent' or 'word' in 'password'.

    Args:
        resume_text (str): Full resume text.
        job_text (str): Job posting text.
        skills_list (iterable of str, optional): Skill vocabulary to look for.
            Defaults to the notebook-level `skills` list.

    Returns:
        float | int: matched skills / job-required skills, in [0, 1].
        Returns 0 if the job text mentions none of the skills.
    """
    if skills_list is None:
        skills_list = skills

    def mentions(skill, text):
        # Lookarounds act like \b but also work for skills that start or end
        # with a non-word character (e.g. 'c++').
        return re.search(rf'(?<!\w){re.escape(skill.lower())}(?!\w)', text) is not None

    resume_lower = resume_text.lower()
    job_lower = job_text.lower()
    job_skills = [skill for skill in skills_list if mentions(skill, job_lower)]
    if not job_skills:
        return 0
    matched_skills = [skill for skill in job_skills if mentions(skill, resume_lower)]
    return len(matched_skills) / len(job_skills)


Estimates the candidate's experience level from their resume.

Steps:
1. Converts text to lowercase.
2. Searches for any mention of "X years" to estimate years of experience.
3. Counts seniority-related keywords like 'manager', 'senior', etc.
4. Combines both measures into a normalized score:
    (years / 10) + (experience_keywords / 5), capped at 1.0.

Args:
    resume_text (str): Full resume text.

Returns:
    float: Experience score between 0 and 1.

In [None]:
def calculate_experience_score(resume_text):
    """Estimate an experience score in [0, 1] from a resume.

    Combines the largest number found directly before 'year(s)'/'yr(s)'
    (divided by 10) with the number of notebook-level `experience_words`
    that occur in the lowercased text (divided by 5), capped at 1.0.
    """
    lowered = resume_text.lower()
    year_mentions = re.findall(r'(\d+)\s*(?:years?|yrs?)', lowered)
    if year_mentions:
        most_years = max(int(digits) for digits in year_mentions)
    else:
        most_years = 0
    seniority_hits = len([term for term in experience_words if term in lowered])
    raw_score = (most_years / 10) + (seniority_hits / 5)
    return min(raw_score, 1.0)

Determines the highest education level mentioned in the resume.

Steps:
1. Converts resume to lowercase.
2. Checks for mentions of education keywords ('phd', 'master', etc.).
3. Maps the highest degree found to a numeric value from education_levels.
4. Normalizes by dividing by 4 (the highest possible score).

Args:
    resume_text (str): Full resume text.

Returns:
    float: Education score between 0 and 1.
            Higher degrees produce higher scores.

In [None]:
def calculate_education_score(resume_text):
    """Score the highest education level mentioned in a resume, in [0, 1].

    Every key of the notebook-level `education_levels` mapping that occurs
    as a substring of the lowercased text contributes its numeric value; the
    largest one (0 if none is found) is divided by 4 and capped at 1.0.
    """
    lowered = resume_text.lower()
    found_scores = [
        score for level, score in education_levels.items() if level in lowered
    ]
    # Seed with 0 so a resume with no education keyword scores 0.
    highest = max([0] + found_scores)
    return min(highest / 4, 1.0)

Evaluates whether the resume and job post belong to the same domain/industry.

Steps:
1. Identifies the job's domain (HR, IT, finance, etc.) based on domain_keywords.
2. Counts how many domain-specific keywords appear in the job description.
3. Checks how many of those same keywords appear in the resume.
4. Returns the ratio of matching domain keywords.
    If the job domain cannot be determined, returns a neutral 0.5.

Args:
    resume_text (str): Resume text.
    job_text (str): Job posting text.

Returns:
    float: Domain relevance score between 0 and 1.

In [None]:
def calculate_domain_score(resume_text, job_text, keywords_by_domain=None):
    """Score how well a resume covers the keywords of the job's dominant domain.

    Keywords are matched case-insensitively as whole words/phrases. With
    plain substring matching, short keywords such as 'r' or 'hr' matched
    almost any text ('through', 'three'), which inflated both the domain
    detection and the resume score.

    Args:
        resume_text (str): Resume text.
        job_text (str): Job posting text.
        keywords_by_domain (dict, optional): domain name -> list of keywords.
            Defaults to the notebook-level `domain_keywords`.

    Returns:
        float: Fraction of the detected domain's keywords found in the
        resume (0 to 1), or a neutral 0.5 when the job text contains no
        keyword from any domain.
    """
    if keywords_by_domain is None:
        keywords_by_domain = domain_keywords

    def mentions(keyword, text):
        # Lookarounds act like \b but also work for keywords that start or
        # end with a non-word character.
        return re.search(rf'(?<!\w){re.escape(keyword.lower())}(?!\w)', text) is not None

    resume_lower = resume_text.lower()
    job_lower = job_text.lower()

    # The job's domain is the one with the most keyword hits; ties keep the
    # domain listed first.
    job_domain = None
    best_hits = 0
    for domain, keywords in keywords_by_domain.items():
        hits = sum(1 for keyword in keywords if mentions(keyword, job_lower))
        if hits > best_hits:
            best_hits = hits
            job_domain = domain
    if job_domain is None:
        return 0.5

    domain_terms = keywords_by_domain[job_domain]
    matches = sum(1 for keyword in domain_terms if mentions(keyword, resume_lower))
    return min(matches / len(domain_terms), 1.0)

Calculates the overall match score between a resume and a job posting
by combining all five sub-scores using predefined feature weights.

Steps:
1. Calls each of the five scoring functions:
        - calculate_skills_score
        - calculate_experience_score
        - calculate_education_score
        - calculate_domain_score
        - calculate_semantic_score
2. Stores each sub-score in a dictionary for transparency.
3. Computes a weighted average using the 'weights' dictionary defined earlier.
4. Returns both the final composite score and the individual component scores.

Formula:
    final_score = Σ (weight_i × score_i)
    where i ∈ {skills, experience, education, domain, semantic}

Args:
    resume_text (str): Full resume text.
    job_text (str): Combined job description and requirements text.

Returns:
    tuple:
        - final_score (float): Overall weighted fit score (0–1 range).
        - scores (dict): Dictionary of component scores for analysis and debugging.

In [None]:
#final score
def calculate_composite_score(resume_text, job_text):
    """Combine the five component scores into one weighted fit score.

    Returns:
        tuple: `(final_score, scores)` where `scores` maps each component
        name ('skills', 'experience', 'education', 'domain', 'semantic') to
        its value and `final_score` is the sum of each value multiplied by
        its entry in the notebook-level `weights` dict.
    """
    component_scores = {}
    component_scores['skills'] = calculate_skills_score(resume_text, job_text)
    component_scores['experience'] = calculate_experience_score(resume_text)
    component_scores['education'] = calculate_education_score(resume_text)
    component_scores['domain'] = calculate_domain_score(resume_text, job_text)
    component_scores['semantic'] = calculate_semantic_score(resume_text, job_text)

    weighted_total = sum(
        weights[name] * value for name, value in component_scores.items()
    )
    return weighted_total, component_scores

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# ----- Resume Feature Extraction -----
def get_resume_features(resume_text):
    """Build the per-resume feature dict used for clustering.

    The pairwise scorers expect a (resume, job) pair. Calling them with an
    empty job text made 'skills' always 0 and 'domain' always 0.5, i.e. two
    constant columns carrying no information. The resume is therefore
    profiled on its own here.

    Args:
        resume_text (str): Full resume text.

    Returns:
        dict: keys 'skills', 'experience', 'education', 'domain'.
    """
    text_lower = resume_text.lower()
    # Share of the notebook-level `skills` vocabulary mentioned as whole words.
    skills_found = sum(
        1 for skill in skills
        if re.search(rf'(?<!\w){re.escape(skill)}(?!\w)', text_lower)
    )
    return {
        'skills': skills_found / len(skills) if skills else 0,
        'experience': calculate_experience_score(resume_text),
        'education': calculate_education_score(resume_text),
        # Using the text as its own reference scores how much of its own
        # dominant domain's keyword list it covers.
        'domain': calculate_domain_score(resume_text, resume_text)
    }

resume_features = []
for _, row in resume_df.iterrows():
    features = get_resume_features(row['Resume_str'])
    resume_features.append({'resume_id': row['ID'], **features})

resume_features_df = pd.DataFrame(resume_features)


# ----- Job Feature Extraction -----
def get_job_features(job_text):
    """Build the per-job feature dict used for clustering.

    The pairwise scorers expect a (resume, job) pair. Calling them with an
    empty second text made 'skills' always 0 and 'domain' always 0.5, i.e.
    two constant columns carrying no information. The job text is therefore
    profiled on its own here.

    Args:
        job_text (str): Combined job posting text.

    Returns:
        dict: keys 'skills', 'experience', 'education', 'domain'.
    """
    text_lower = job_text.lower()
    # Share of the notebook-level `skills` vocabulary mentioned as whole words.
    skills_found = sum(
        1 for skill in skills
        if re.search(rf'(?<!\w){re.escape(skill)}(?!\w)', text_lower)
    )
    return {
        'skills': skills_found / len(skills) if skills else 0,
        'experience': calculate_experience_score(job_text),
        'education': calculate_education_score(job_text),
        # Using the text as its own reference scores how much of its own
        # dominant domain's keyword list it covers.
        'domain': calculate_domain_score(job_text, job_text)
    }

job_features = []
for _, row in job_posts_df.iterrows():
    features = get_job_features(row['job_text'])
    job_features.append({'job_id': row.get('ID', _), 'title': row['Title'], **features})

job_features_df = pd.DataFrame(job_features)


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# --- Detect domain of each job post ---
def detect_domain(text, domain_keywords):
    """Pick the domain whose keywords occur most often (as substrings) in `text`.

    Each domain scores one point per keyword found in the lowercased text.
    The highest-scoring domain is returned (the first one wins ties), or
    "other" when no keyword from any domain is present.
    """
    lowered = text.lower()
    hits_by_domain = {}
    for domain, keywords in domain_keywords.items():
        hits_by_domain[domain] = len([kw for kw in keywords if kw in lowered])
    best_domain = max(hits_by_domain, key=hits_by_domain.get)
    if hits_by_domain[best_domain] > 0:
        return best_domain
    return "other"

# Assign each job post the domain whose keywords it mentions most.
job_posts_df["domain"] = job_posts_df["job_text_clean"].apply(lambda x: detect_domain(x, domain_keywords))

# --- Cluster jobs by domain using K-Means ---
domain_clusters = {}
job_posts_df["cluster"] = -1  # default value

# job_embeddings was built from job_posts_df row by row, so row i of the array
# must belong to the i-th row of the frame.
assert len(job_embeddings) == len(job_posts_df), "job_embeddings is out of sync with job_posts_df"

for domain, group in job_posts_df.groupby("domain"):
    if len(group) < 3:
        print(f"Skipping domain '{domain}' (too few samples: {len(group)})")
        continue

    # The embedding array is positional, but group.index holds index *labels*.
    # After the earlier dropna/drop_duplicates the labels have gaps, so using
    # them directly would select wrong rows or run past the end of the array.
    # Translate labels to row positions first.
    row_positions = job_posts_df.index.get_indexer(group.index)
    embeddings_subset = job_embeddings[row_positions]

    # heuristic: choose cluster count based on group size (between 2 and 8)
    n_clusters = max(2, min(8, len(group) // 10))  # e.g., 1 cluster per ~10 jobs
    print(f"Clustering domain '{domain}' with n_clusters={n_clusters}")

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    cluster_labels = kmeans.fit_predict(embeddings_subset)

    job_posts_df.loc[group.index, "cluster"] = cluster_labels
    domain_clusters[domain] = kmeans

print("✅ Domain-based K-Means clustering complete!")


In [None]:
# NOTE(review): cluster_entities_dbscan is not defined anywhere in the visible
# notebook, so on a fresh kernel this cell would raise NameError unless the
# helper is defined or imported elsewhere — TODO confirm and restore it.
# Each call passes a feature frame, a label string, and eps/min_samples values.
resume_clusters = cluster_entities_dbscan(resume_features_df, "Resume", eps=0.4, min_samples=3)
job_clusters = cluster_entities_dbscan(job_features_df, "Job", eps=0.45, min_samples=3)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.pairplot(resume_features_df[['skills','experience','education','domain']])
plt.show()


In [None]:
# Mean feature values per cluster. Only the last expression of a cell is
# rendered, so display both tables explicitly; otherwise the resume table is
# silently discarded.
display(resume_clusters.groupby('cluster')[['skills','experience','education','domain']].mean())
display(job_clusters.groupby('cluster')[['skills','experience','education','domain']].mean())


GENERATE TRAINING PAIRS: JOB POSTINGS ↔ TOP-MATCHING RESUMES

Purpose:
This section builds the annotated dataset that pairs each job posting
with its top matching resumes, using the composite scoring system.

Process:
1. Takes a sample of job postings (first 100 for efficiency).
2. For each job posting:
   • Retrieves the combined job text (title, description, requirements, etc.)
   • Iterates through every resume in the dataset.
   • Uses calculate_composite_score() to compute a weighted “fit score”
     based on skills, experience, education, domain, and semantic similarity.
3. Stores each (job, resume) pair along with its detailed component scores.
4. Sorts the resumes by their final score and keeps only the top 10 matches per job.
5. Appends all top results into a single DataFrame called training_pairs.

Outcome:
A structured dataset containing job–resume pairs, ranked by relevance scores,
which will later be used for labeling and potential supervised model training.



In [None]:
# from joblib import Parallel, delayed
# import numpy as np
# import pandas as pd

# def process_one_job(job_idx, job_row, resume_df, calculate_composite_score):
#     job_title = job_row['Title']
#     job_text = job_row['job_text']

#     resume_results = []

#     for resume_idx, resume_row in resume_df.iterrows():
#         resume_text = resume_row['Resume_str']
#         resume_id = resume_row['ID']
#         final_score, component_scores = calculate_composite_score(resume_text, job_text)
        
#         resume_results.append({
#             'job_idx': job_idx,
#             'resume_idx': resume_idx,
#             'job_title': job_title,
#             'resume_id': resume_id,
#             'final_score': final_score,
#             **component_scores  # Include component breakdowns
#         })

#     # Pick top 10 efficiently
#     resume_results.sort(key=lambda x: x['final_score'], reverse=True)
#     return resume_results[:10]


# def generate_training_pairs_parallel(job_posts_df, resume_df, calculate_composite_score):
#     job_sample = job_posts_df
    
#     training_data = Parallel(n_jobs=-1, backend="loky")(
#         delayed(process_one_job)(idx, row, resume_df, calculate_composite_score)
#         for idx, row in job_sample.iterrows()
#     )

#     training_data_flat = [x for sublist in training_data for x in sublist]
#     training_pairs = pd.DataFrame(training_data_flat)

#     print(f"Processed {len(job_sample)} jobs in parallel.")
#     return training_pairs


# # Run it
# training_pairs = generate_training_pairs_parallel(job_posts_df, resume_df, calculate_composite_score)


"""
LABEL GENERATION: CONVERT CONTINUOUS SCORES INTO CATEGORICAL CLASSES

Purpose:
This section transforms the continuous "final_score" values into discrete
labels (1 = good fit, 0 = poor fit) to enable supervised learning or
evaluation later on.

Process:
1. Extracts all final composite scores from the training_pairs DataFrame.
2. Calculates score thresholds using quartiles:
   • High threshold (75th percentile) → top-performing resumes.
   • Low threshold (25th percentile) → least suitable resumes.
3. Assigns labels based on these cutoffs:
   • 1  → score ≥ high_threshold (good fit)
   • 0  → score ≤ low_threshold (poor fit)
   • -1 → scores in the middle range (ambiguous, excluded)
4. Filters out all -1 entries to keep only clearly positive and negative examples.

Outcome:
Creates labeled_training_data — a balanced dataset of strong and weak
resume–job matches suitable for training or evaluating future models.
"""


In [None]:
# #labels
# scores = training_pairs['final_score']
# high_threshold = scores.quantile(0.75)
# low_threshold = scores.quantile(0.25)

# print(f"Score thresholds: High={high_threshold:.3f}, Low={low_threshold:.3f}")

# labels = []
# for score in scores:
#     if score >= high_threshold:
#         labels.append(1)
#     elif score <= low_threshold:
#         labels.append(0)
#     else:
#         labels.append(-1)

# training_pairs['label'] = labels
# labeled_training_data = training_pairs[training_pairs['label'] != -1].copy()


In [None]:
# # annotated dataset
# output_file = 'resume_job_training_data.csv'
# labeled_training_data.to_csv(output_file, index=False)

In [None]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(8,5))
# plt.hist(training_pairs["final_score"], bins=30, color='skyblue', edgecolor='black')
# plt.title("Distribution of Resume–Job Fit Scores")
# plt.xlabel("Final Composite Score")
# plt.ylabel("Frequency")
# plt.show()
