# Prepare Training Data with Clustering Features

This notebook integrates clustering results from notebooks 5, 6, and 7 with Jessica's scoring system to create a comprehensive training dataset for the AI recruitment model.

## Objectives:
1. Load clustered resume and job data from previous notebooks
2. Merge cluster features (cluster ID, domain labels, PCA coordinates)
3. Create cluster-based features (same_domain_cluster, cluster_distance)
4. Integrate with Jessica's scoring metrics
5. Save final training dataset with all features

## Feature Set:
- **Jessica's Scores**: skills, experience, education, semantic, domain
- **Cluster Features**: resume cluster, job cluster, domain labels
- **Derived Features**: same_domain_cluster, cluster_distance
- **Labels**: Match quality (1 = good match, 0 = poor match)


In [None]:
# 1. Imports & Setup
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Add utils to path. Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
UTILS_DIR = os.path.join(os.pardir, "utils")
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)

# Set random seeds for reproducibility
SEED = 42
np.random.seed(SEED)

# Configure plotting
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

print("âœ… Libraries imported and environment configured")


In [None]:
# 2. Define Paths
DATA_DIR = os.path.join(os.pardir, "data")
EMB_DIR = os.path.join(DATA_DIR, "embeddings")

# Clustered data from different notebooks
resumes_hybrid_path = os.path.join(DATA_DIR, "resumes_clustered_hybrid.csv")  # from 5clustering
jobs_hybrid_path = os.path.join(DATA_DIR, "jobs_clustered_hybrid.csv")        # from 5clustering
# NOTE: this path is defined but is not part of the existence check below.
resumes_semantic_path = os.path.join(DATA_DIR, "resumes_clustered.csv")       # from 6clustering

# Embeddings
resume_emb_path = os.path.join(EMB_DIR, "resume_embeddings.npy")
job_emb_path = os.path.join(EMB_DIR, "job_embeddings.npy")

# Verify paths exist. All missing files are collected first so that a single
# run reports every missing input instead of stopping at the first one.
required_paths = [resumes_hybrid_path, jobs_hybrid_path, resume_emb_path, job_emb_path]
missing_paths = [path for path in required_paths if not os.path.exists(path)]
assert not missing_paths, "File not found: " + ", ".join(missing_paths)

print("âœ… All required files found")


## Step 1: Load Clustered Data and Embeddings

In [None]:
# Load clustered data
resumes_clustered = pd.read_csv(resumes_hybrid_path)
jobs_clustered = pd.read_csv(jobs_hybrid_path)

# Load embeddings
resume_embeddings = np.load(resume_emb_path)
job_embeddings = np.load(job_emb_path)

print(f"Loaded {len(resumes_clustered)} resumes with {len(resumes_clustered.columns)} columns")
print(f"Loaded {len(jobs_clustered)} jobs with {len(jobs_clustered.columns)} columns")
print(f"\nResume embeddings shape: {resume_embeddings.shape}")
print(f"Job embeddings shape: {job_embeddings.shape}")

# The pair-generation cell indexes the embedding arrays with the row position
# of each table, so a row-count mismatch is a strong hint that rows and
# embeddings are not aligned. This only warns (it does not stop the run),
# because a table that is a leading subset of the embedded data would still
# be paired correctly.
for _kind, _table, _emb in (
    ("resume", resumes_clustered, resume_embeddings),
    ("job", jobs_clustered, job_embeddings),
):
    if len(_table) != len(_emb):
        print(
            f"WARNING: {len(_table)} {_kind} rows but {len(_emb)} {_kind} embeddings; "
            "row-to-embedding pairing by position may be misaligned"
        )

# Display available columns
print("\nðŸ“Š Resume columns:", list(resumes_clustered.columns))
print("\nðŸ“Š Job columns:", list(jobs_clustered.columns))


## Step 2: Define Jessica's Scoring Functions

These functions compute match scores based on:
- **Skills**: Overlap between resume and job skills
- **Experience**: Years of experience and seniority indicators
- **Education**: Highest degree mentioned
- **Domain**: Industry/field alignment
- **Semantic**: Embedding-based similarity (cosine similarity between the precomputed resume and job embeddings)


In [None]:
# Scoring configuration (from new.ipynb)
import re

# Feature weights: contribution of each component score to the composite score
weights = dict(
    skills=0.35,
    experience=0.20,
    education=0.15,
    semantic=0.15,
    domain=0.15,
)

# Skills keywords searched for in job and resume text
skills = [
    'excel',
    'word',
    'powerpoint',
    'sql',
    'python',
    'project management',
    'data analysis',
    'ms office',
    'microsoft office',
]

# Education levels: keyword -> rank (the highest rank found in a resume is used)
education_levels = dict(phd=4, master=3, bachelor=2, associate=1, diploma=0.5)

# Experience indicators: seniority / role words counted in resume text
experience_words = [
    'manager',
    'director',
    'senior',
    'lead',
    'specialist',
    'analyst',
]

# Domain keywords (simplified for this example).
# Key order matters: when two domains tie on keyword hits, the earlier one wins.
domain_keywords = dict(
    hr=['human resources', 'hr', 'recruitment', 'hiring', 'payroll', 'benefits'],
    finance=['finance', 'accounting', 'budget', 'audit', 'tax', 'bookkeeping'],
    it=['programming', 'software', 'python', 'java', 'sql', 'cloud', 'aws'],
    sales=['sales', 'business development', 'crm', 'client', 'revenue'],
    administration=['administrative', 'secretary', 'assistant', 'coordination'],
    research=['research', 'analyst', 'analysis', 'data analysis', 'study'],
)

print("âœ… Scoring configuration loaded")


In [None]:
# Scoring functions
def calculate_skills_score(resume_text, job_text, skill_keywords=None):
    """Fraction of the skills mentioned in the job that also appear in the resume.

    A skill is "required" when it occurs in ``job_text`` and "covered" when it
    also occurs in ``resume_text``. Matching is case-insensitive and on whole
    words/phrases, so ``excel`` is not triggered by "excellent" and ``word``
    is not triggered by "password". As a consequence a keyword embedded in a
    longer token (e.g. ``sql`` inside "mysql") is not counted either.

    Args:
        resume_text: Resume text; coerced with ``str()`` before matching.
        job_text: Job posting text; coerced with ``str()`` before matching.
        skill_keywords: Optional iterable of skill keywords/phrases. Defaults
            to the notebook-level ``skills`` list.

    Returns:
        ``covered / required`` as a float in ``[0, 1]``, or ``0`` when the job
        text mentions none of the keywords.
    """
    if skill_keywords is None:
        skill_keywords = skills
    resume_lower = str(resume_text).lower()
    job_lower = str(job_text).lower()

    def mentions(keyword, text):
        # Lookarounds rather than \b so the boundary test also behaves for
        # keywords that begin or end with a non-word character.
        pattern = r'(?<!\w)' + re.escape(keyword.lower()) + r'(?!\w)'
        return re.search(pattern, text) is not None

    job_skills = [skill for skill in skill_keywords if mentions(skill, job_lower)]
    if not job_skills:
        return 0
    resume_skills = [skill for skill in job_skills if mentions(skill, resume_lower)]
    return len(resume_skills) / len(job_skills)

def calculate_experience_score(resume_text):
    """Score experience from stated years plus seniority words, capped at 1.0.

    The largest "<number> years/yrs" figure in the text contributes
    ``years / 10`` and each entry of ``experience_words`` found in the text
    (substring match) contributes ``1 / 5``.
    """
    lowered = str(resume_text).lower()

    # Every number directly followed by "year(s)" / "yr(s)".
    stated_years = [int(number) for number in re.findall(r'(\d+)\s*(?:years?|yrs?)', lowered)]
    top_years = max(stated_years) if stated_years else 0

    seniority_hits = len([word for word in experience_words if word in lowered])

    raw_score = (top_years / 10) + (seniority_hits / 5)
    return min(raw_score, 1.0)

def calculate_education_score(resume_text):
    """Score the highest education keyword found in the resume, scaled to [0, 1].

    Each key of ``education_levels`` is looked up as a substring of the
    lower-cased text; the largest associated rank is divided by 4. A resume
    with no education keyword scores 0.
    """
    lowered = str(resume_text).lower()
    matched_ranks = [rank for degree, rank in education_levels.items() if degree in lowered]
    # Leading 0 is the baseline used when nothing matches.
    best_rank = max([0, *matched_ranks])
    return min(best_rank / 4, 1.0)

def calculate_domain_score(resume_text, job_text, keywords_by_domain=None):
    """Score how well the resume covers the keywords of the job's domain.

    The job's domain is the one whose keywords occur most often in
    ``job_text`` (the first domain wins ties). The score is the fraction of
    that domain's keywords that also occur in ``resume_text``.

    Matching is case-insensitive and on whole words/phrases, so short
    keywords do not fire inside unrelated words (``hr`` in "three", ``tax``
    in "syntax", ``java`` in "javascript"). As a consequence inflected forms
    such as "clients" do not match the keyword ``client``.

    Args:
        resume_text: Resume text; coerced with ``str()`` before matching.
        job_text: Job posting text; coerced with ``str()`` before matching.
        keywords_by_domain: Optional mapping ``domain -> list of keywords``.
            Defaults to the notebook-level ``domain_keywords``.

    Returns:
        ``0.5`` when the job matches no domain keyword at all (neutral
        score); otherwise a float in ``[0, 1]``.
    """
    if keywords_by_domain is None:
        keywords_by_domain = domain_keywords
    resume_lower = str(resume_text).lower()
    job_lower = str(job_text).lower()

    def count_hits(keywords, text):
        # Number of keywords present in `text` as whole words/phrases.
        return sum(
            1
            for keyword in keywords
            if re.search(r'(?<!\w)' + re.escape(keyword.lower()) + r'(?!\w)', text)
        )

    job_domain = 'general'
    max_domain_score = 0
    for domain, keywords in keywords_by_domain.items():
        domain_score = count_hits(keywords, job_lower)
        # Strict '>' keeps the earliest domain when several tie.
        if domain_score > max_domain_score:
            max_domain_score = domain_score
            job_domain = domain

    if job_domain == 'general':
        return 0.5

    domain_keywords_list = keywords_by_domain[job_domain]
    matches = count_hits(domain_keywords_list, resume_lower)
    return min(matches / len(domain_keywords_list), 1.0)

def calculate_semantic_score_embedding(resume_emb, job_emb):
    """Cosine similarity between one resume embedding and one job embedding."""
    # cosine_similarity compares batches of vectors, so each embedding is
    # wrapped in a one-element list; the result is a 1x1 matrix.
    pairwise = cosine_similarity([resume_emb], [job_emb])
    return pairwise[0][0]

def calculate_composite_score(resume_text, job_text, resume_emb, job_emb):
    """Return ``(final_score, scores)`` for one resume/job pair.

    ``scores`` maps each component name ('skills', 'experience', 'education',
    'domain', 'semantic') to its individual score; ``final_score`` is the sum
    of those scores weighted by the module-level ``weights``.
    """
    scores = {}
    scores['skills'] = calculate_skills_score(resume_text, job_text)
    scores['experience'] = calculate_experience_score(resume_text)
    scores['education'] = calculate_education_score(resume_text)
    scores['domain'] = calculate_domain_score(resume_text, job_text)
    scores['semantic'] = calculate_semantic_score_embedding(resume_emb, job_emb)

    # Accumulate in insertion order, starting from 0.
    final_score = 0
    for component, value in scores.items():
        final_score = final_score + weights[component] * value
    return final_score, scores

print("âœ… Scoring functions defined")


## Step 3: Generate Resume-Job Pairs with Scores and Cluster Features

For efficiency, we score only the first `N_JOBS_SAMPLE` jobs against every resume and keep the 10 highest-scoring resumes per job.
We integrate:
- Jessica's component scores (skills, experience, education, domain, semantic)
- Cluster information (resume cluster, job cluster, domain labels)
- Derived features (same_domain_cluster, cluster_distance)


In [None]:
# Sample jobs for efficiency (use all for production)
N_JOBS_SAMPLE = 100  # Adjust as needed
TOP_K_PER_JOB = 10   # Number of highest-scoring resumes kept for each job
job_sample = jobs_clustered.head(N_JOBS_SAMPLE).reset_index(drop=True)

print(f"Processing {len(job_sample)} jobs...")


def _cluster_of(row):
    """Cluster id of a row: 'cluster' if that column exists, else 'PredictedCluster' (None if absent)."""
    return row['cluster'] if 'cluster' in row else row.get('PredictedCluster')


def _same_known_domain(resume_label, job_label):
    """Return 1 only when both labels are real (not missing / 'Unknown') and equal, else 0.

    Without this guard two rows that both fell back to the 'Unknown' default
    would be flagged as sharing a domain.
    """
    for label in (resume_label, job_label):
        if pd.isna(label) or label == 'Unknown':
            return 0
    return int(resume_label == job_label)


# Resume fields do not depend on the job, so extract them once instead of
# re-running iterrows() over every resume for each job.
resume_records = []
for resume_idx, resume_row in resumes_clustered.iterrows():
    resume_records.append((
        resume_idx,
        resume_row.get('Resume_clean', ''),
        resume_row.get('ID', resume_idx),
        _cluster_of(resume_row),
        resume_row.get('ClusterDomainLabel', 'Unknown'),
        resume_embeddings[resume_idx],  # row label doubles as embedding position
    ))

training_pairs = []

for job_idx, job_row in job_sample.iterrows():
    if job_idx % 20 == 0:
        print(f"Processing job {job_idx}/{len(job_sample)}...")

    job_text = job_row['job_text_clean']
    job_cluster = _cluster_of(job_row)
    job_domain_label = job_row.get('JobClusterDomainLabel', 'Unknown')
    # job_sample is the head of jobs_clustered with a fresh 0..N-1 index, so
    # job_idx is also the row position in the original table.
    job_emb = job_embeddings[job_idx]

    resume_results = []

    for (resume_idx, resume_text, resume_id, resume_cluster,
         resume_domain_label, resume_emb) in resume_records:
        # Calculate scores
        final_score, component_scores = calculate_composite_score(
            resume_text, job_text, resume_emb, job_emb
        )

        # Cluster features
        same_domain_cluster = _same_known_domain(resume_domain_label, job_domain_label)
        # NOTE(review): this is the absolute difference of cluster ids. If the
        # ids are arbitrary labels it is not a geometric distance - confirm
        # that it is a meaningful feature.
        cluster_distance = abs((resume_cluster or 0) - (job_cluster or 0))

        resume_results.append({
            'job_idx': job_idx,
            'resume_idx': resume_idx,
            'resume_id': resume_id,
            'resume_cluster': resume_cluster,
            'resume_domain_label': resume_domain_label,
            'job_cluster': job_cluster,
            'job_domain_label': job_domain_label,
            'same_domain_cluster': same_domain_cluster,
            'cluster_distance': cluster_distance,
            'final_score': final_score,
            **component_scores  # Unpack all component scores
        })

    # Keep only the TOP_K_PER_JOB best matches per job.
    # NOTE(review): every pair kept here is among a job's best candidates, so
    # any "poor match" label derived later is relative to this pre-filtered set.
    resume_results.sort(key=lambda x: x['final_score'], reverse=True)
    training_pairs.extend(resume_results[:TOP_K_PER_JOB])

# Convert to DataFrame
training_data = pd.DataFrame(training_pairs)

print(f"\nâœ… Generated {len(training_data)} training pairs")
print(f"Columns: {list(training_data.columns)}")


## Step 4: Create Labels Based on Score Thresholds

In [None]:
# Create labels using quartiles of the composite score
scores = training_data['final_score']
high_threshold = scores.quantile(0.75)
low_threshold = scores.quantile(0.25)

print(f"Score thresholds: High={high_threshold:.3f}, Low={low_threshold:.3f}")

# Assign labels
def assign_label(score):
    """Map a composite score to 1 (top quartile), 0 (bottom quartile) or -1 (in between)."""
    if score >= high_threshold:
        return 1  # Good match
    if score <= low_threshold:
        return 0  # Poor match
    return -1  # Ambiguous (will be filtered)

# The label depends on final_score alone.
training_data['label'] = scores.apply(assign_label)

# Filter out ambiguous cases
is_labeled = training_data['label'] != -1
labeled_training_data = training_data[is_labeled].copy()

print(f"\nâœ… Labeled dataset created")
print(f"Total pairs: {len(labeled_training_data)}")
print(f"Good matches (1): {(labeled_training_data['label'] == 1).sum()}")
print(f"Poor matches (0): {(labeled_training_data['label'] == 0).sum()}")


## Step 5: Analyze Feature Distributions

In [None]:
# Visualize score distributions: one panel for the final score, one per component
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
panels = axes.ravel()  # row-major order: panels[k] is axes[k // 3, k % 3]

# Final score distribution (first panel)
final_panel = panels[0]
final_panel.hist(labeled_training_data['final_score'], bins=30, color='skyblue', edgecolor='black')
final_panel.set_title('Final Score Distribution')
final_panel.set_xlabel('Score')
final_panel.set_ylabel('Frequency')

# Component scores (remaining five panels)
component_names = ['skills', 'experience', 'education', 'domain', 'semantic']
for panel, component in zip(panels[1:], component_names):
    panel.hist(labeled_training_data[component], bins=20, color='salmon', edgecolor='black')
    panel.set_title(f'{component.capitalize()} Score')
    panel.set_xlabel('Score')
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Analyze cluster features
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Fixed bar order. The group means are reindexed to this order before
# plotting so each bar is paired with the right label by value rather than by
# position, and a label class that happens to be absent is drawn as 0 (the
# same default the printed summary uses) instead of causing a length mismatch.
label_order = [0, 1]
bar_names = ['Poor Match (0)', 'Good Match (1)']
bar_colors = ['coral', 'lightgreen']

# Same domain cluster vs label
same_domain_by_label = labeled_training_data.groupby('label')['same_domain_cluster'].mean()
axes[0].bar(bar_names, same_domain_by_label.reindex(label_order, fill_value=0).values, color=bar_colors)
axes[0].set_title('Same Domain Cluster Rate by Match Quality')
axes[0].set_ylabel('Proportion')
axes[0].set_ylim([0, 1])

# Cluster distance vs label
cluster_dist_by_label = labeled_training_data.groupby('label')['cluster_distance'].mean()
axes[1].bar(bar_names, cluster_dist_by_label.reindex(label_order, fill_value=0).values, color=bar_colors)
axes[1].set_title('Average Cluster Distance by Match Quality')
axes[1].set_ylabel('Distance')

plt.tight_layout()
plt.show()

print("\nðŸ“Š Cluster Feature Analysis:")
print(f"Same domain cluster rate for good matches: {same_domain_by_label.get(1, 0):.2%}")
print(f"Same domain cluster rate for poor matches: {same_domain_by_label.get(0, 0):.2%}")
print(f"Avg cluster distance for good matches: {cluster_dist_by_label.get(1, 0):.2f}")
print(f"Avg cluster distance for poor matches: {cluster_dist_by_label.get(0, 0):.2f}")


## Step 6: Save Training Dataset

In [None]:
# Write the labeled pairs to CSV (without the DataFrame index) and summarize them
output_path = os.path.join(DATA_DIR, "training_data_with_clusters.csv")
labeled_training_data.to_csv(path_or_buf=output_path, index=False)

print(f"âœ… Saved training data to: {output_path}")
print(f"\nðŸ“Š Final Dataset Summary:")
print(f"Total samples: {len(labeled_training_data)}")
# "Features" here is the total column count of the saved frame.
print(f"Features: {len(labeled_training_data.columns)}")
print(f"\nColumn list:")
for column_name in labeled_training_data.columns:
    print(f"  - {column_name}")


## Step 7: Preview Sample Matches

Let's look at some example good and poor matches to validate the scoring system.


In [None]:
# Show the three highest- and three lowest-scoring labeled pairs
preview_columns = ['resume_id', 'resume_domain_label', 'job_domain_label', 'final_score', 'skills', 'semantic']

print("\nTop 3 good matches:")
best_first = labeled_training_data.sort_values('final_score', ascending=False)
print(best_first.head(3)[preview_columns])

print("\nTop 3 poor matches:")
worst_first = labeled_training_data.sort_values('final_score', ascending=True)
print(worst_first.head(3)[preview_columns])
