In [None]:
import pandas as pd
import numpy as np
import os
import scipy.stats as stats


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

All of our Google Drive files look the same, so that's where we are pulling our data from.

In [None]:
import os
# Folder on the mounted Drive from which the CSV files are read.
data_path = "/content/drive/MyDrive/cadence 1a/data"
# List the folder contents.
print(os.listdir(data_path))

We read the resume dataset and do some preliminary analysis to see how it looks: the number of rows and columns, a preview of the first rows, and the count of null values per column. We then do the same for the job posts dataset.

In [None]:
# Load the resume dataset, report its (rows, columns) and preview the first rows.
resume_df = pd.read_csv(f"{data_path}/Resume.csv")
# Printed explicitly: a notebook only auto-displays the last expression of a cell,
# so a bare `resume_df.shape` on this line would be silently discarded.
print(resume_df.shape)
resume_df.head()
# resume_df.isnull().sum()

In [None]:
# Load the job-posts dataset, report its (rows, columns) and preview the first rows.
job_posts_df = pd.read_csv(f"{data_path}/data job posts.csv")
# Printed explicitly: a notebook only auto-displays the last expression of a cell,
# so a bare `job_posts_df.shape` on this line would be silently discarded.
print(job_posts_df.shape)
job_posts_df.head()
# job_posts_df.isnull().sum()

Columns that have missing values in the job posts dataset are being turned into a list.

In [None]:
# Count the nulls in every column once and reuse the result.
null_counts = job_posts_df.isnull().sum()
# Boolean mask over the column names: True where the column has at least one null.
condition = null_counts != 0
# Names of the columns that contain missing values.
columnlist = list(null_counts[condition].index)
columnlist

There are no numerical values in this data set; everything is stored in string/object format.

In [None]:
# Show the dtype of each column that contains missing values.
job_posts_df[columnlist].dtypes

## Addressing Null values in job_posts_df
Since there are many columns in job_posts_df with null values, we can use reasoning to drop some of the rows or columns. Some columns can be cut if they have too many null values or if they are not really relevant to the problem.  For example, the column "AnnouncementCode" has 17793 null values and the unique non-null values are a random string of letters. It was most likely used to identify the job posting on its original website. As well, "Opening Date" and "Deadline" may not have many null values, but these values might not be very useful to determine whether or not a candidate would be a good fit for a job.

In [None]:
# Columns removed from the job-posts table before modelling.
columns_to_drop = ['AnnouncementCode', 'Term', 'Eligibility', 'Audience', 'StartDate', 'Duration', 'OpeningDate', 'Deadline', 'Notes', 'Attach']
# errors="ignore" keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop() would raise KeyError.
job_posts_df = job_posts_df.drop(columns=columns_to_drop, errors="ignore")
# Null count per remaining column.
job_posts_df.isnull().sum()

There are still null values, but the remaining columns are too important to the ML problem to drop. Instead, we drop the rows that have null values in these columns, since they most likely lack the information we need to train the model accurately. After we drop these rows, our job_posts_df dataset no longer has any null values.

In [None]:
# Columns that must all be populated for a posting to be kept.
columns_to_check = ['Title', 'JobDescription', 'JobRequirment', 'Company', 'Location', 'RequiredQual', 'Salary', 'AboutC']

# Drop every posting that has a null in at least one of the columns above.
job_posts_df = job_posts_df.dropna(subset=columns_to_check)

# (rows, columns) after the row drop.
job_posts_df.shape

In [None]:
# Re-count nulls per column after the row drop.
job_posts_df.isnull().sum()

Removing duplicates from both datasets.

In [None]:
# Remove exact duplicate rows from both tables.
job_posts_df = job_posts_df.drop_duplicates()
resume_df = resume_df.drop_duplicates()

# Print the number of duplicates still present: job posts first, then resumes.
for deduplicated in (job_posts_df, resume_df):
    print(deduplicated.duplicated().sum())

Finding columns with HTML tags. No column in job post has any HTML tags we need to remove. Only resume.

In [None]:
import re
def has_html(text):
    """Tell whether a value looks like it contains an HTML tag.

    Args:
        text: Any cell value. Only ``str`` values are inspected.

    Returns:
        bool: True when ``text`` is a string in which a ``<`` is followed,
        without an intervening newline, by a ``>``; False for every other
        string and for all non-string values (e.g. NaN or None).
    """
    if not isinstance(text, str):
        return False
    return re.search(r'<.*?>', text) is not None

In [None]:
# For each table in turn (job posts, then resumes) print the names of the
# columns in which at least one value looks like it contains an HTML tag.
for frame in (job_posts_df, resume_df):
    columns_with_html = []
    for col in frame.columns:
        if frame[col].apply(has_html).any():
            columns_with_html.append(col)
    print(columns_with_html)

In [None]:
import re
def clean_html(text):
  """Strip HTML-style tags from a string.

  Every ``<...>`` span that contains no nested ``<`` is removed; the text
  between tags is kept as is.

  Args:
      text: The value to clean. Usually a string.

  Returns:
      The string with tags removed. Non-string values (e.g. NaN or None from
      an empty cell) are returned unchanged instead of raising TypeError.
  """
  # re.sub() only accepts str/bytes; pass anything else through untouched so
  # that Series.apply(clean_html) does not crash on missing values.
  if not isinstance(text, str):
    return text
  return re.sub('<[^<]+?>', '', text)

# Write a tag-stripped copy of Resume_str into the Resume_html column
# (replacing any existing column of that name).
resume_df['Resume_html'] = resume_df['Resume_str'].apply(clean_html)

Resume_html and Resume_str are the same column

In [None]:
# NOTE(review): this preview is not the last expression of the cell, so the
# notebook does not display it; also, re-running the cell after the drop below
# raises KeyError because Resume_html no longer exists.
resume_df[['Resume_html', 'Resume_str']].head()
# Remove the Resume_html column from resume_df in place.
resume_df.drop(columns=['Resume_html'], inplace= True)

Creates a corpus for all job post related details

In [None]:
# Build one labelled text blob per posting from its four free-text fields.
# Each field is prefixed with its label; nulls are treated as empty strings.
job_posts_df["job_text"] = (
    "Description: " + job_posts_df["JobDescription"].fillna("")
    + " Requirements: " + job_posts_df["JobRequirment"].fillna("")
    + " Qualifications: " + job_posts_df["RequiredQual"].fillna("")
    + " About Company: " + job_posts_df["AboutC"].fillna("")
)


# Checkpoint #2 - Text Normalization
Apply tokenization, lowercasing, stopword removal, and lemmatization.

Import NLP and text-processing tools:
- nltk for natural language processing utilities (downloads WordNet for lemmatization)
- TfidfVectorizer and ENGLISH_STOP_WORDS from sklearn to convert text into numerical features and remove common stop words
- WordNetLemmatizer to reduce words to their base (dictionary) form
- word_tokenize: splits sentences into individual words
- stopwords: provides common words (like "the", "and") to remove from text

In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('wordnet')

Make sure NLTK resources are available

In [None]:
# Download the NLTK data packages used for tokenization ("punkt_tab", "punkt"),
# lemmatization ("wordnet", "omw-1.4") and stopword removal ("stopwords").
nltk.download("punkt_tab")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")

Initialize the WordNet lemmatizer and define a set of English stopwords for text cleaning

In [None]:
# Shared lemmatizer instance used by preprocess_text().
lemmatizer = WordNetLemmatizer()
# English stopwords as a set, so membership tests are constant-time.
stop_words = set(stopwords.words("english"))


 TEXT PREPROCESSING PIPELINE FOR RESUMES AND JOB POSTS

 Purpose:
   This section standardizes and cleans all text data to prepare it
   for accurate keyword and semantic matching in the scoring system.

 Description:
   The preprocess_text() function normalizes text by performing:
     1. Lowercasing – ensures consistent word comparisons.
     2. Tokenization – splits text into individual words.
     3. Stopword removal – removes common filler words like "the", "and", "is".
     4. Filtering – keeps only alphabetic tokens (drops numbers/punctuation).
     5. Lemmatization – reduces words to their base form
        (e.g., “running” → “run”, “analyses” → “analysis”).
     6. Reconstruction – joins cleaned tokens back into a single string.

   This preprocessing is applied to:
     • resume_df["Resume_str"]  → creates resume_df["Resume_clean"]
     • job_posts_df["job_text"] → creates job_posts_df["job_text_clean"]

 Outcome:
   Produces standardized, lemmatized, and stopword-free text columns
   for both resumes and job postings, enabling fair and consistent
   matching in later scoring functions.


In [None]:
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmatized words.

    The input is converted with ``str()``, lowercased and tokenized. Tokens
    that are not purely alphabetic or that appear in ``stop_words`` are
    discarded; the remaining tokens are lemmatized and joined with spaces.

    Args:
        text: Value to normalise (any object; it is passed through ``str()``).

    Returns:
        str: The cleaned text, or an empty string if no token survives.
    """
    kept_tokens = []
    for token in word_tokenize(str(text).lower()):
        # Skip numbers, punctuation, mixed tokens and stopwords.
        if not token.isalpha() or token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return " ".join(kept_tokens)

# Add a normalised-text column next to the raw text in each table.
resume_df["Resume_clean"] = resume_df["Resume_str"].apply(preprocess_text)
job_posts_df['job_text_clean'] = job_posts_df['job_text'].apply(preprocess_text)

Tokenizes input text, converts it to lowercase, removes punctuation and stopwords, and lemmatizes each word to its base form for cleaner, standardized text analysis.

In [None]:
# def lemmatize_and_tokenize(text):
#     tokens = word_tokenize(str(text).lower())   # lowercase + tokenize
#     tokens = [lemmatizer.lemmatize(t) for t in tokens
#               if t.isalpha() and t not in stop_words]  # keep only words, no stopwords
#     return tokens

Create a TF-IDF vectorizer that uses the custom lemmatize_and_tokenize function to transform resume text into numerical feature vectors.
Apply it only to the 'Resume_str' column of the DataFrame, which contains the text data.
The resulting TF-IDF matrix shows how many resumes (rows) and unique terms (columns) were processed.

In [None]:
# vectorizer = TfidfVectorizer(tokenizer=lemmatize_and_tokenize)

# # Important: pass only the resume text column
# tfidf_matrix_resumes = vectorizer.fit_transform(resume_df["Resume_str"])

# print("Shape of tfidf_matrix_resumes:", tfidf_matrix_resumes.shape)

In [None]:
# #Fit/transform our the resumes

# # This does everything in one call:
# # 1. Normalizes (lowercase, stopwords, lemmatization)
# # 2. Creates the vocabulary
# # 3. Calculates TF-IDF vectors
# tfidf_matrix_resumes = vectorizer.fit_transform(resume_df)

# print("Shape of tfidf_matrix_resumes:", tfidf_matrix_resumes.shape)

In [None]:


# # Pick a few resumes to inspect
# sample_indices = [0, 1, 2]  # change these to any resume indices you want
# pd.set_option("display.max_colwidth", None)
# for idx in sample_indices:
#     original = resume_df["Resume_str"].iloc[idx][:500]  # first 500 chars only for readability
#     normalized = lemmatize_and_tokenize(resume_df["Resume_str"].iloc[idx])

#     print(f"=== Resume {idx} ===")
#     print("Original snippet:")
#     print(original, "\n")
#     print("Normalized tokens:")
#     print(normalized[:30], "...")  # show first 30 tokens
#     print("\n" + "="*60 + "\n")


In [None]:
# # Show the full text in each cell
# pd.set_option("display.max_colwidth", None)

# # Now when you print, you’ll see everything
# resume_df["Resume_str"].head()



In [None]:
# See the structure
# print(job_posts_df.info())

# See a few sample job posts



In [None]:
# pd.set_option("display.max_colwidth", None)
# job_posts_df.head(5)

# Checkpoint #3 - Data Annotation

define weights for: skills, experience, education, semantic similarity, domain
creates labels (good match/top 25% = 1, bad match/bottom 25% = 0)


# SCORING CONFIGURATION AND KEYWORD DEFINITIONS

 This section defines:
 1. Feature weights — how much each category contributes to the final composite score.
 2. Domain-specific keywords — used to detect the job domain (e.g., HR, finance, IT).
 3. Common skill keywords — used to evaluate skill overlap between job and resume.
4. Education levels — mapped to numeric values for scoring academic background.
 5. Experience indicators — keywords representing seniority or management experience.



In [None]:
from collections import Counter

In [None]:
# Weight of each component in the composite score (the five weights sum to 1.0).
# The keys must match the keys of the score dict built in calculate_composite_score().
weights = {
    'skills': 0.35,
    'experience': 0.20,
    'education': 0.15,
    'semantic': 0.15,
    'domain': 0.15
}


# Lowercase keywords/phrases per job domain, consumed by calculate_domain_score().
domain_keywords = {
    'hr': ['human resources', 'hr', 'recruitment', 'recruiting', 'hiring', 'payroll', 'benefits', 'employee relations', 'compensation', 'performance management'],
    'finance': ['finance', 'financial', 'accounting', 'budget', 'audit', 'tax', 'bookkeeping', 'financial analysis'],
    'it': ['programming', 'software', 'development', 'python', 'java', 'sql', 'database', 'web development', 'network', 'system administration'],
    'sales': ['sales', 'business development', 'account management', 'revenue', 'crm'],
    'administration': ['administrative', 'secretary', 'assistant', 'coordination', 'office'],
    'research': ['research', 'analyst', 'analysis', 'data analysis', 'methodology']
}

# Lowercase skill terms looked up in job and resume text by calculate_skills_score().
skills = ['excel', 'word', 'powerpoint', 'sql', 'python', 'project management', 'data analysis', 'ms office', 'microsoft office']

# Education keyword -> numeric level; calculate_education_score() divides by 4 (the 'phd' value).
education_levels = {'phd': 4, 'master': 3, 'bachelor': 2, 'associate': 1, 'diploma': 0.5}

# Seniority/role terms counted by calculate_experience_score().
experience_words = ['manager', 'director', 'senior', 'lead', 'specialist', 'analyst']

   Computes a basic semantic similarity between a resume and job posting.

    Steps:
    1. Converts both texts to lowercase.
    2. Splits them into sets of unique words.
    3. Finds the overlap (intersection) between the two word sets.
    4. Returns the proportion of job words also present in the resume.

    Note:
        This is a simple lexical overlap metric, not a deep semantic one.
        It can later be replaced with embedding-based cosine similarity.

    Args:
        resume_text (str): Resume text.
        job_text (str): Job posting text.

    Returns:
        float: Overlap ratio between 0 and 1.

In [None]:
def calculate_semantic_score(resume_text, job_text):
    """Fraction of the job posting's distinct words that also occur in the resume.

    Both texts are lowercased and split on whitespace; punctuation stays
    attached to the words it touches.

    Args:
        resume_text (str): Resume text.
        job_text (str): Job posting text.

    Returns:
        int | float: ``0`` when the job text has no words or nothing overlaps,
        otherwise ``shared_words / distinct_job_words`` (at most 1.0).
    """
    resume_vocab = set(resume_text.lower().split())
    job_vocab = set(job_text.lower().split())
    if len(job_vocab) == 0:
        return 0
    shared = len(resume_vocab & job_vocab)
    if shared > 0:
        return shared / len(job_vocab)
    return 0


    Calculates how well a resume matches the skills required by a job posting.

    Steps:
    1. Converts both resume and job descriptions to lowercase.
    2. Extracts all skill keywords that appear in the job text.
    3. Checks which of those required skills also appear in the resume.
    4. Returns the ratio of matched skills to total job-required skills.

    Args:
        resume_text (str): Full resume text.
        job_text (str): Combined job description and requirements text.

    Returns:
        float: Skill match score between 0 and 1.
               Returns 0 if no skills were found in the job text.

In [None]:
def calculate_skills_score(resume_text, job_text, skill_list=None):
    """Share of the skills requested by a job posting that the resume mentions.

    Skills are matched as whole terms, not raw substrings, so that e.g.
    "excellent" does not count as "excel" and "password" does not count as
    "word".

    Args:
        resume_text (str): Full resume text.
        job_text (str): Job posting text.
        skill_list (list[str] | None): Lowercase skill terms to look for.
            Defaults to the notebook-level ``skills`` list.

    Returns:
        int | float: ``0`` if the job text mentions none of the skills,
        otherwise ``matched / required`` in the range 0..1.
    """
    if skill_list is None:
        skill_list = skills
    resume_lower = resume_text.lower()
    job_lower = job_text.lower()

    def mentions(term, text):
        # Lookarounds instead of \b so a term that starts or ends with a
        # non-word character (e.g. "c++") is still delimited correctly.
        pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
        return re.search(pattern, text) is not None

    job_skills = [skill for skill in skill_list if mentions(skill, job_lower)]
    if not job_skills:
        return 0
    resume_skills = [skill for skill in job_skills if mentions(skill, resume_lower)]
    return len(resume_skills) / len(job_skills)


    Estimates the candidate's experience level from their resume.

    Steps:
    1. Converts text to lowercase.
    2. Searches for any mention of "X years" to estimate years of experience.
    3. Counts seniority-related keywords like 'manager', 'senior', etc.
    4. Combines both measures into a normalized score:
       (years / 10) + (experience_keywords / 5), capped at 1.0.

    Args:
        resume_text (str): Full resume text.

    Returns:
        float: Experience score between 0 and 1.

In [None]:
def calculate_experience_score(resume_text):
    """Heuristic experience score derived from a resume.

    Combines the largest number that directly precedes "year(s)"/"yr(s)" in
    the text with the count of ``experience_words`` entries that occur as
    substrings of the lowercased text.

    Args:
        resume_text (str): Full resume text.

    Returns:
        float: ``years / 10 + keyword_hits / 5``, capped at 1.0.
    """
    lowered = resume_text.lower()
    year_counts = [int(digits) for digits in re.findall(r'(\d+)\s*(?:years?|yrs?)', lowered)]
    longest_span = max(year_counts) if year_counts else 0
    seniority_hits = len([word for word in experience_words if word in lowered])
    combined = (longest_span / 10) + (seniority_hits / 5)
    return min(combined, 1.0)

    Determines the highest education level mentioned in the resume.

    Steps:
    1. Converts resume to lowercase.
    2. Checks for mentions of education keywords ('phd', 'master', etc.).
    3. Maps the highest degree found to a numeric value from education_levels.
    4. Normalizes by dividing by 4 (the highest possible score).

    Args:
        resume_text (str): Full resume text.

    Returns:
        float: Education score between 0 and 1.
               Higher degrees produce higher scores.

In [None]:
def calculate_education_score(resume_text):
    """Score the highest education level whose keyword occurs in the resume.

    Each key of ``education_levels`` is searched as a substring of the
    lowercased text; the largest associated value is divided by 4.

    Args:
        resume_text (str): Full resume text.

    Returns:
        float: Highest matched level / 4, capped at 1.0; 0.0 if nothing matches.
    """
    lowered = resume_text.lower()
    matched_levels = [value for keyword, value in education_levels.items() if keyword in lowered]
    # default=0 reproduces the "nothing found" case.
    highest = max(matched_levels, default=0)
    return min(highest / 4, 1.0)

    Evaluates whether the resume and job post belong to the same domain/industry.

    Steps:
    1. Identifies the job's domain (HR, IT, finance, etc.) based on domain_keywords.
    2. Counts how many domain-specific keywords appear in the job description.
    3. Checks how many of those same keywords appear in the resume.
    4. Returns the ratio of matching domain keywords.
       If the job domain cannot be determined, returns a neutral 0.5.

    Args:
        resume_text (str): Resume text.
        job_text (str): Job posting text.

    Returns:
        float: Domain relevance score between 0 and 1.

In [None]:
def calculate_domain_score(resume_text, job_text, keywords_by_domain=None):
    """Score how well a resume fits the domain a job posting belongs to.

    The job's domain is the one with the most keyword hits in the job text
    (the earliest domain wins ties). The score is the share of that domain's
    keywords that also occur in the resume.

    Keywords are matched as whole terms, not raw substrings, so that e.g.
    "through" does not count as "hr", "syntax" as "tax" or "javascript" as
    "java".

    Args:
        resume_text (str): Resume text.
        job_text (str): Job posting text.
        keywords_by_domain (dict[str, list[str]] | None): Lowercase keywords
            per domain. Defaults to the notebook-level ``domain_keywords``.

    Returns:
        float: Domain relevance between 0 and 1; a neutral 0.5 when no domain
        keyword occurs in the job text.
    """
    if keywords_by_domain is None:
        keywords_by_domain = domain_keywords
    resume_lower = resume_text.lower()
    job_lower = job_text.lower()

    def mentions(keyword, text):
        # Lookarounds instead of \b so keywords that start or end with a
        # non-word character are still delimited correctly.
        pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
        return re.search(pattern, text) is not None

    # None means "no domain detected"; a sentinel avoids clashing with a real
    # domain name supplied by the caller.
    job_domain = None
    max_domain_score = 0
    for domain, keywords in keywords_by_domain.items():
        domain_score = sum(1 for keyword in keywords if mentions(keyword, job_lower))
        # Strict '>' keeps the first domain on ties.
        if domain_score > max_domain_score:
            max_domain_score = domain_score
            job_domain = domain
    if job_domain is None:
        return 0.5
    domain_keywords_list = keywords_by_domain[job_domain]
    matches = sum(1 for keyword in domain_keywords_list if mentions(keyword, resume_lower))
    return min(matches / len(domain_keywords_list), 1.0)

    Calculates the overall match score between a resume and a job posting
    by combining all five sub-scores using predefined feature weights.

    Steps:
    1. Calls each of the five scoring functions:
         - calculate_skills_score
         - calculate_experience_score
         - calculate_education_score
         - calculate_domain_score
         - calculate_semantic_score
    2. Stores each sub-score in a dictionary for transparency.
    3. Computes a weighted average using the 'weights' dictionary defined earlier.
    4. Returns both the final composite score and the individual component scores.

    Formula:
        final_score = Σ (weight_i × score_i)
        where i ∈ {skills, experience, education, domain, semantic}

    Args:
        resume_text (str): Full resume text.
        job_text (str): Combined job description and requirements text.

    Returns:
        tuple:
            - final_score (float): Overall weighted fit score (0–1 range).
            - scores (dict): Dictionary of component scores for analysis and debugging.

In [None]:
#final score
def calculate_composite_score(resume_text, job_text):
    """Weighted overall fit score for one (resume, job posting) pair.

    Args:
        resume_text (str): Full resume text.
        job_text (str): Job posting text.

    Returns:
        tuple: ``(final_score, scores)`` where ``scores`` maps 'skills',
        'experience', 'education', 'domain' and 'semantic' (in that order) to
        their component scores and ``final_score`` is the sum of each
        component multiplied by its entry in ``weights``.
    """
    scores = {}
    scores['skills'] = calculate_skills_score(resume_text, job_text)
    scores['experience'] = calculate_experience_score(resume_text)
    scores['education'] = calculate_education_score(resume_text)
    scores['domain'] = calculate_domain_score(resume_text, job_text)
    scores['semantic'] = calculate_semantic_score(resume_text, job_text)

    # Accumulate in dict order, starting from 0.
    final_score = 0
    for component, value in scores.items():
        final_score += weights[component] * value
    return final_score, scores

In [None]:
# #job text field
# job_posts_df['job_text'] = job_posts_df.apply(
#     lambda row: ' '.join([str(row[field]) for field in ['Title', 'JobDescription', 'JobRequirment', 'RequiredQual'] if pd.notna(row[field])]),
#     axis=1
# )

GENERATE TRAINING PAIRS: JOB POSTINGS ↔ TOP-MATCHING RESUMES

Purpose:
This section builds the annotated dataset that pairs each job posting
with its top matching resumes, using the composite scoring system.

Process:
1. Takes a sample of job postings (first 100 for efficiency).
2. For each job posting:
   • Retrieves the combined job text (description, requirements, qualifications, and company information)
   • Iterates through every resume in the dataset.
   • Uses calculate_composite_score() to compute a weighted “fit score”
     based on skills, experience, education, domain, and semantic similarity.
3. Stores each (job, resume) pair along with its detailed component scores.
4. Sorts the resumes by their final score and keeps only the top 10 matches per job.
5. Appends all top results into a single DataFrame called training_pairs.

Outcome:
A structured dataset containing job–resume pairs, ranked by relevance scores,
which will later be used for labeling and potential supervised model training.



In [None]:
# generate training pairs
# For each of the first 100 job posts, score every resume against the raw job
# text and keep the 10 highest-scoring resumes.
training_data = []
job_sample = job_posts_df.head(100)

# The resume columns are the same for every job, so extract them once instead
# of re-iterating the DataFrame row by row inside the job loop.
resume_records = list(zip(resume_df.index, resume_df['ID'], resume_df['Resume_str']))

# jobs_done is a true 1-based counter. job_idx is the DataFrame index label,
# which has gaps after the earlier row drops and cannot be used for progress.
for jobs_done, (job_idx, job_row) in enumerate(job_sample.iterrows(), start=1):
    job_text = job_row['job_text']
    job_title = job_row['Title']
    resume_scores = []

    for resume_idx, resume_id, resume_text in resume_records:
        final_score, component_scores = calculate_composite_score(resume_text, job_text)
        resume_scores.append({
            'job_idx': job_idx,
            'resume_idx': resume_idx,
            'job_title': job_title,
            'resume_id': resume_id,
            'final_score': final_score,
            **component_scores
        })

    # sort and keep top 10
    resume_scores.sort(key=lambda x: x['final_score'], reverse=True)
    training_data.extend(resume_scores[:10])

    if jobs_done % 25 == 0:
        print(f"Processed {jobs_done} jobs...")

# One row per kept (job, resume) pair, with the final and component scores.
training_pairs = pd.DataFrame(training_data)


"""
LABEL GENERATION: CONVERT CONTINUOUS SCORES INTO CATEGORICAL CLASSES

Purpose:
This section transforms the continuous "final_score" values into discrete
labels (1 = good fit, 0 = poor fit) to enable supervised learning or
evaluation later on.

Process:
1. Extracts all final composite scores from the training_pairs DataFrame.
2. Calculates score thresholds using quartiles:
   • High threshold (75th percentile) → top-performing resumes.
   • Low threshold (25th percentile) → least suitable resumes.
3. Assigns labels based on these cutoffs:
   • 1  → score ≥ high_threshold (good fit)
   • 0  → score ≤ low_threshold (poor fit)
   • -1 → scores in the middle range (ambiguous, excluded)
4. Filters out all -1 entries to keep only clearly positive and negative examples.

Outcome:
Creates labeled_training_data — a balanced dataset of strong and weak
resume–job matches suitable for training or evaluating future models.
"""


In [None]:
#labels
# Quartile cut-offs over all kept pair scores.
scores = training_pairs['final_score']
low_threshold = scores.quantile(0.25)
high_threshold = scores.quantile(0.75)

print(f"Score thresholds: High={high_threshold:.3f}, Low={low_threshold:.3f}")

# 1 = at or above the upper quartile, 0 = at or below the lower quartile,
# -1 = in between. The "high" test is evaluated first, so it wins if the two
# thresholds coincide.
labels = [
    1 if value >= high_threshold else (0 if value <= low_threshold else -1)
    for value in scores
]

training_pairs['label'] = labels
# Keep only the pairs labelled 0 or 1.
labeled_training_data = training_pairs[training_pairs['label'].ne(-1)].copy()


In [None]:
# annotated dataset: write the labelled pairs to a CSV (relative path, no index column)
output_file = 'resume_job_training_data.csv'
labeled_training_data.to_csv(output_file, index=False)

In [None]:
import matplotlib.pyplot as plt
# The fit scores live in training_pairs["final_score"]; resume_df has no
# "label" column, so plotting resume_df["label"] raised KeyError.
plt.hist(training_pairs["final_score"], bins=30)
plt.title("Distribution of Resume–Job Fit Scores")
plt.xlabel("Composite fit score")
plt.ylabel("Number of job–resume pairs")
plt.show()
