## Creating the training set 

In [9]:
# 1. Imports & Paths
import importlib.util
import os, sys, re, pandas as pd
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed


# Expose the ``utils`` directory that sits one level above the working
# directory so its modules can be imported from this notebook.
sys.path.append(os.path.join(os.pardir, "utils"))

# Clustered resume / job tables (CSV files).
DATA_DIR = os.path.join(os.pardir, "data/clusters")
resume_path, jobs_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("resumes_clustered.csv", "jobs_clustered.csv")
)

# Embedding arrays stored as .npy files.
EMB_DIR = os.path.join(os.pardir, "data/embeddings")
remb_path, jemb_path = (
    os.path.join(EMB_DIR, npy_name)
    for npy_name in ("resume_embeddings.npy", "job_embeddings.npy")
)

In [10]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell execution.
%load_ext autoreload
%autoreload 2
# NOTE(review): `skills`, `experience_words`, `education_levels` and `weights`
# are used by the scoring functions below but are never defined in this
# notebook — presumably they come from this wildcard import. Confirm, and
# prefer importing those names explicitly so their origin is visible.
from utils import *


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Read the two clustered CSV tables, then the two .npy embedding arrays.
resume_df, job_posts_df = (
    pd.read_csv(csv_path) for csv_path in (resume_path, jobs_path)
)
resume_embeddings, job_embeddings = (
    np.load(npy_path) for npy_path in (remb_path, jemb_path)
)

In [12]:
# List the columns available in each table (resumes first, then jobs).
print(resume_df.columns, job_posts_df.columns, sep="\n")

Index(['ID', 'Resume_str', 'Resume_html', 'Category', 'Resume_clean',
       'DomainCluster', 'PredictedCluster', 'x', 'y', 'KeywordDomain',
       'ClusterDomainLabel'],
      dtype='object')
Index(['jobpost', 'date', 'Title', 'Company', 'Location', 'JobDescription',
       'JobRequirment', 'RequiredQual', 'Salary', 'ApplicationP', 'AboutC',
       'Year', 'Month', 'IT', 'job_text', 'job_text_clean', 'PredictedCluster',
       'KeywordDomain', 'ClusterDomainLabel', 'DomainCluster', 'x', 'y'],
      dtype='object')


In [13]:
# Report row counts of both tables and the width (second axis) of each
# embedding array.
print(
    f"Loaded {len(resume_df)} resumes and {len(job_posts_df)} job postings.",
    f"Embeddings: {resume_embeddings.shape[1]}-D resumes, {job_embeddings.shape[1]}-D jobs\n",
    sep="\n",
)


Loaded 2484 resumes and 5448 job postings.
Embeddings: 384-D resumes, 384-D jobs



In [14]:
# Copy of the resume table without the 'DomainCluster' column.
# errors="ignore" makes the drop a no-op when the column is absent, so
# `resumes` is always bound (the previous `if` guard left the name undefined
# in that case, which only surfaces on a fresh kernel).
# NOTE(review): `resumes` is not referenced by any later cell visible here —
# confirm whether it is still needed.
resumes = resume_df.drop(columns=['DomainCluster'], errors="ignore")

## Scoring Functions

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_semantic_score(resume_emb, job_emb, resume_text=None, job_text=None):
    """
    Return a similarity score between one resume and one job posting.

    When both embeddings are given, the score is their cosine similarity.
    Otherwise, when both texts are non-empty, the score falls back to lexical
    overlap: the fraction of distinct lower-cased, whitespace-separated job
    words that also occur in the resume.

    Parameters
    ----------
    resume_emb, job_emb : array-like of numbers or None
        Embedding vectors of equal length. They are flattened to 1-D.
    resume_text, job_text : str or None
        Raw texts, only used when at least one embedding is missing.

    Returns
    -------
    float
        Cosine similarity in [-1, 1] (0.0 if either vector has zero norm),
        or the lexical overlap in [0, 1], or 0.0 when nothing can be compared.
        The result is always a Python float.

    Raises
    ------
    ValueError
        If the two embeddings have different lengths.
    """
    if resume_emb is not None and job_emb is not None:
        resume_vec = np.asarray(resume_emb, dtype=float).ravel()
        job_vec = np.asarray(job_emb, dtype=float).ravel()
        norm_product = np.linalg.norm(resume_vec) * np.linalg.norm(job_vec)
        # A zero vector has no direction; report "no similarity" instead of
        # dividing by zero.
        if norm_product == 0.0:
            return 0.0
        return float(np.dot(resume_vec, job_vec) / norm_product)

    # Fallback if embeddings are missing: share of job vocabulary found in the resume.
    if resume_text and job_text:
        resume_words = set(resume_text.lower().split())
        job_words = set(job_text.lower().split())
        if not job_words:
            return 0.0
        return len(resume_words & job_words) / len(job_words)
    return 0.0


In [16]:
# Keyword lists per job domain, consumed by calculate_domain_score.
# - Keywords are compared with `in` against lower-cased text, so every entry
#   must be lower-case, and matching is by substring (not whole word).
# - Key order matters: the domain with the most keyword hits in the job text
#   is selected with a strict `>` comparison, so on a tie the domain listed
#   first here wins.
domain_keywords= {
    "Tech & IT": [
        'programming', 'software engineering', 'software development', 'python', 'java', 'c++',
        'sql', 'api', 'database', 'web development', 'frontend', 'backend',
        'data science', 'machine learning', 'artificial intelligence', 'deep learning',
        'devops', 'docker', 'kubernetes', 'cloud computing', 'aws', 'azure', 'gcp',
        'network administration', 'cybersecurity', 'linux', 'git', 'automation', 'debugging'
    ],

    "Finance & Accounting": [
        'finance', 'accounting', 'bookkeeping', 'audit', 'taxation', 'financial reporting',
        'budget forecast', 'cash flow', 'balance sheet', 'ledger', 'valuation',
        'accounts payable', 'accounts receivable', 'profit loss', 'treasury',
        'banking operations', 'credit risk', 'capital markets', 'investment analysis',
        'financial modeling', 'cost analysis', 'invoice', 'reconciliation', 'economics'
    ],

    "Business & Sales": [
        'business development', 'sales strategy', 'b2b', 'b2c', 'client acquisition',
        'cold calling', 'negotiation', 'lead generation', 'pipeline management',
        'revenue growth', 'sales forecasting', 'crm', 'prospecting', 'partnerships',
        'account executive', 'quotas', 'upselling', 'cross-selling', 'closing deals'
    ],

    "Law & Advocacy": [
        'legal', 'attorney', 'lawyer', 'litigation', 'legal research', 'contract law',
        'corporate law', 'civil law', 'criminal law', 'intellectual property',
        'regulatory compliance', 'court', 'case management', 'legal drafting'
    ],

    "Healthcare": [
        'healthcare', 'medical', 'nurse', 'clinic', 'patient care', 'diagnosis',
        'treatment', 'therapy', 'hospital', 'physician', 'pharmacy', 'rehabilitation',
        'fitness', 'nutrition', 'public health', 'wellness', 'surgery', 'medication'
    ],

    "HR & Operations": [
        'human resources', 'recruitment', 'recruiter', 'talent acquisition',
        'employee engagement', 'onboarding', 'offboarding', 'payroll',
        'benefits administration', 'performance appraisal', 'training development',
        'conflict resolution', 'hr policies', 'labor relations', 'workforce planning',
        'compliance', 'organizational development', 'hr analytics', 'diversity inclusion',
        'operations management', 'administrative', 'vendor management', 'logistics',
        'inventory management', 'documentation'
    ],

    "Creative & Design": [
        'graphic design', 'illustrator', 'photoshop', 'branding', 'typography',
        'ux design', 'ui design', 'web design', 'layout', 'content creation',
        'copywriting', 'animation', 'video editing', 'photography', 'creative direction',
        'visual storytelling', 'marketing collateral', 'digital media', 'social media campaign'
    ],

    "Education": [
        'teacher', 'professor', 'lecturer', 'education', 'curriculum development',
        'classroom management', 'student engagement', 'lesson planning',
        'academic advisor', 'training instructor', 'tutoring', 'pedagogy', 'educational leadership'
    ],

    "Manufacturing & Construction": [
        'construction', 'engineering', 'project planning', 'blueprint', 'contractor',
        'civil engineering', 'structural', 'mechanical', 'electrical', 'site management',
        'quality assurance', 'manufacturing', 'production line', 'machinery', 'safety compliance'
    ],

    "Agriculture & Environment": [
        'agriculture', 'farming', 'crop', 'soil', 'sustainability', 'irrigation',
        'environmental management', 'ecology', 'forestry', 'conservation',
        'organic farming', 'livestock', 'rural development', 'water resources'
    ],

    "Hospitality & Food": [
        'chef', 'kitchen', 'menu planning', 'catering', 'hospitality', 'food safety',
        'culinary arts', 'restaurant operations', 'inventory control', 'sanitation',
        'banquet', 'hotel management', 'customer service', 'guest relations'
    ],

    "Other Services": [
        'customer support', 'bpo', 'client service', 'call center', 'aviation',
        'airline', 'flight attendant', 'pilot', 'maintenance', 'fashion', 'apparel',
        'retail', 'merchandising', 'event planning'
    ]
}


In [17]:
def calculate_skills_score(resume_text, job_text):
    """
    Share of the job's skills that the resume also mentions.

    A skill counts as "required" when it occurs as a substring of the
    lower-cased job text, and as "matched" when it also occurs in the
    lower-cased resume text. Candidate skills come from the module-level
    `skills` collection.

    Returns 0 when the job text mentions none of the known skills, otherwise
    matched / required as a float.

    NOTE: a later cell defines `calculate_skills_score` again with an extra
    `same_domain` argument; once that cell has run, it replaces this version.
    """
    resume_norm, job_norm = resume_text.lower(), job_text.lower()

    required = [candidate for candidate in skills if candidate in job_norm]
    if len(required) == 0:
        return 0

    matched = sum(1 for candidate in required if candidate in resume_norm)
    return matched / len(required)


In [18]:
def calculate_experience_score(resume_text, resume_cluster=None):
    """
    Heuristic experience score in [0, 1] derived from the resume text.

    Two signals are added and capped at 1.0:
      * the largest number directly followed by "year(s)" / "yr(s)",
        divided by 10;
      * how many entries of the module-level `experience_words` occur in the
        lower-cased text, divided by 5.

    Resumes whose `resume_cluster` is 2, 5 or 9 receive a 10% boost (still
    capped at 1.0); that cluster list is an example mapping meant to be
    customised.
    """
    lowered = resume_text.lower()

    # Largest "<n> years" / "<n> yrs" figure found anywhere in the text.
    year_figures = [
        int(number)
        for number in re.findall(r'(\d+)\s*(?:years?|yrs?)', lowered)
    ]
    top_years = max(year_figures) if year_figures else 0

    keyword_hits = len([term for term in experience_words if term in lowered])
    score = min((top_years / 10) + (keyword_hits / 5), 1.0)

    # Clusters treated as implying senior roles (example values).
    if resume_cluster in [2, 5, 9]:
        score = min(score * 1.1, 1.0)
    return score


In [19]:
def calculate_education_score(resume_text):
    """
    Education score in [0, 1] based on the highest level mentioned.

    Every key of the module-level `education_levels` mapping that occurs as a
    substring of the lower-cased resume contributes its value; the largest
    such value (or 0 when none is found) is divided by 4 and capped at 1.0.
    """
    lowered = resume_text.lower()
    found_scores = [
        level_score
        for level_name, level_score in education_levels.items()
        if level_name in lowered
    ]
    # The leading 0 is the floor used when no level is mentioned.
    highest = max([0, *found_scores])
    return min(highest / 4, 1.0)


In [20]:
def calculate_skills_score(resume_text, job_text, same_domain=1):
    """
    Share of the job's skills found in the resume, with a domain bonus.

    Skills are taken from the module-level `skills` collection and matched as
    substrings of the lower-cased texts. The ratio matched / required is
    multiplied by 1.1 when `same_domain` is truthy and capped at 1.0.

    Returns 0 when the job text mentions none of the known skills.

    This definition supersedes the two-argument `calculate_skills_score`
    defined in an earlier cell.
    """
    resume_norm = resume_text.lower()
    job_norm = job_text.lower()

    required = [candidate for candidate in skills if candidate in job_norm]
    if len(required) == 0:
        return 0

    overlap = sum(1 for candidate in required if candidate in resume_norm) / len(required)

    # Domain-aligned pairs get a 10% boost.
    boosted = overlap * 1.1 if same_domain else overlap
    return min(boosted, 1.0)


In [21]:
def calculate_domain_score(resume_text, job_text, resume_domain=None, job_domain=None):
    """
    Score how well a resume covers the keyword domain of a job posting.

    Steps:
      1. Detect the job's domain: the `domain_keywords` entry with the most
         keywords occurring (as substrings) in the lower-cased job text.
         Ties go to the entry listed first.
      2. If no keyword of any domain occurs, return a neutral 0.5.
      3. Otherwise the base score is the fraction of that domain's keywords
         occurring in the lower-cased resume text.
      4. If `resume_domain` is provided and equals `job_domain`, the score
         is boosted to min(score * 1.2 + 0.1, 1.0).

    Parameters
    ----------
    resume_text, job_text : str
    resume_domain, job_domain : hashable or None
        Externally assigned domain labels. The boost is only applied when a
        label is actually supplied; two missing labels (None) are not
        treated as a match.

    Returns
    -------
    float
        0.5 for jobs with no detectable domain, otherwise the score rounded
        to 3 decimals.
    """
    resume_lower = resume_text.lower()
    job_lower = job_text.lower()

    # Keyword hits per domain in the job text; max() keeps the first domain
    # on ties, matching a strict ">" scan in definition order.
    hits_by_domain = {
        domain: sum(kw in job_lower for kw in keywords)
        for domain, keywords in domain_keywords.items()
    }
    detected = max(hits_by_domain, key=hits_by_domain.get, default=None)
    if detected is None or hits_by_domain[detected] == 0:
        return 0.5

    detected_keywords = domain_keywords[detected]
    matches = sum(kw in resume_lower for kw in detected_keywords)
    score = matches / len(detected_keywords)

    # Reward a real label match only; `None == None` must not count.
    if resume_domain is not None and resume_domain == job_domain:
        score = min(score * 1.2 + 0.1, 1.0)
    return round(score, 3)


In [22]:
def calculate_composite_score(resume_text, job_text, resume_emb, job_emb,
                              resume_cluster=None, job_cluster=None,
                              resume_domain=None, job_domain=None):
    """
    Combine the individual resume/job scores into one fit score.

    The component scores (skills, experience, education, domain, semantic)
    are computed by the corresponding `calculate_*_score` functions and
    combined as a weighted sum using the module-level `weights` mapping;
    components without an entry get weight 0. A bonus of 0.05 is added for a
    cluster match and another 0.05 for a domain match, and the total is
    capped at 1.0.

    Parameters
    ----------
    resume_text, job_text : str
    resume_emb, job_emb : array-like or None
        Embeddings forwarded to `calculate_semantic_score`.
    resume_cluster, job_cluster : hashable or None
    resume_domain, job_domain : hashable or None
        Labels are only considered matching when they are actually supplied;
        two omitted (None) values do not count as a match.

    Returns
    -------
    tuple[float, dict]
        (final score capped at 1.0, dict of the individual component scores).
    """
    # Guard against `None == None` evaluating to a "match" when the caller
    # did not pass any cluster/domain information.
    same_domain = int(resume_domain is not None and resume_domain == job_domain)
    same_cluster = int(resume_cluster is not None and resume_cluster == job_cluster)

    scores = {
        'skills': calculate_skills_score(resume_text, job_text, same_domain),
        'experience': calculate_experience_score(resume_text, resume_cluster),
        'education': calculate_education_score(resume_text),
        'domain': calculate_domain_score(resume_text, job_text, resume_domain, job_domain),
        'semantic': calculate_semantic_score(resume_emb, job_emb, resume_text, job_text)
    }

    # Weighted sum of the components; `weights` is read from module scope.
    final_score = sum(weights.get(name, 0) * value for name, value in scores.items())

    # Independent small rewards: one for a cluster match, one for a domain match.
    if same_cluster:
        final_score += 0.05
    if same_domain:
        final_score += 0.05

    return min(final_score, 1.0), scores


In [23]:
# Step 1: show the dtypes of the two cleaned-text columns.
print(resume_df["Resume_clean"].dtype, job_posts_df["job_text_clean"].dtype)

# Step 2: collect rows whose cleaned text is not a Python str (e.g. NaN).
bad_resumes = resume_df.loc[
    ~resume_df["Resume_clean"].map(lambda value: isinstance(value, str))
]
bad_jobs = job_posts_df.loc[
    ~job_posts_df["job_text_clean"].map(lambda value: isinstance(value, str))
]

print(f"Bad resume entries: {len(bad_resumes)}")
print(f"Bad job entries: {len(bad_jobs)}")

# Show a small sample of the offending rows, if any.
if not bad_resumes.empty:
    print(bad_resumes.head(3))
if not bad_jobs.empty:
    print(bad_jobs.head(3))

# Step 3: normalise the resume text column — missing values become "",
# everything is cast to str, and surrounding whitespace is removed.
resume_df["Resume_clean"] = (
    resume_df["Resume_clean"].fillna("").astype(str).apply(str.strip)
)


object object
Bad resume entries: 1
Bad job entries: 0
           ID             Resume_str  \
656  12632728                          

                                           Resume_html              Category  \
656  <div class="fontsize fontface vmargins hmargin...  BUSINESS-DEVELOPMENT   

    Resume_clean     DomainCluster  PredictedCluster         x         y  \
656          NaN  Business & Sales                 2 -0.248796  0.026417   

    KeywordDomain ClusterDomainLabel  
656     Tech & IT          Education  


## Training-Pair Generation

In [24]:
# Columns retained from each table (cluster / domain labels, plus the resume ID).
# NOTE(review): the text columns ("Resume_clean", "job_text_clean") are not
# kept here, yet the commented-out pair generator in the next cell reads them
# from these filtered frames — confirm the intended column lists.
resume_keep = ["ID", "DomainCluster", "PredictedCluster", "KeywordDomain", "ClusterDomainLabel"]
job_keep = ["PredictedCluster", "KeywordDomain", "ClusterDomainLabel", "DomainCluster"]

# Independent copies, so later edits do not write back into the source frames.
jobs_filtered = job_posts_df.loc[:, job_keep].copy()
resumes_filtered = resume_df.loc[:, resume_keep].copy()

print("Jobs filtered shape:", jobs_filtered.shape)
print("Resumes filtered shape:", resumes_filtered.shape)


Jobs filtered shape: (5448, 8)
Resumes filtered shape: (2484, 7)


In [25]:
# from joblib import Parallel, delayed
# import numpy as np
# import pandas as pd
# from sklearn.metrics.pairwise import cosine_similarity
# import os
# import matplotlib.pyplot as plt

# # ===============================================================
# # Per-Domain Processing with Retained Columns
# # ===============================================================
# def process_one_domain(domain, resume_df, job_posts_df, resume_embeddings, job_embeddings):
#     domain_resumes = resume_df[resume_df["ClusterDomainLabel"] == domain]
#     domain_jobs = job_posts_df[job_posts_df["ClusterDomainLabel"] == domain]
#     results = []

#     if domain_jobs.empty or domain_resumes.empty:
#         return results

#     print(f"Processing domain: {domain} ({len(domain_jobs)} jobs × {len(domain_resumes)} resumes)")

#     for j_idx, job_row in domain_jobs.iterrows():
#         j_emb = job_embeddings[j_idx]
#         job_cluster = job_row["PredictedCluster"]
#         job_domain = job_row["ClusterDomainLabel"]

#         for r_idx, resume_row in domain_resumes.iterrows():
#             r_emb = resume_embeddings[r_idx]
#             resume_cluster = resume_row["PredictedCluster"]
#             resume_domain = resume_row["ClusterDomainLabel"]

#             # --- Semantic cosine similarity ---
#             cos_sim = cosine_similarity([r_emb], [j_emb])[0][0]

#             # --- Hybrid composite scoring ---
#             final_score, components = calculate_composite_score(
#                 resume_row["Resume_clean"],
#                 job_row["job_text_clean"],
#                 r_emb, j_emb,
#                 resume_cluster, job_cluster,
#                 resume_domain, job_domain
#             )

#             # Prefix original columns to distinguish resume/job features
#             resume_features = {f"resume_{col}": val for col, val in resume_row.items()}
#             job_features = {f"job_{col}": val for col, val in job_row.items()}

#             # Combine everything
#             result = {
#                 **resume_features,
#                 **job_features,
#                 "cosine_similarity": cos_sim,
#                 **components,
#                 "final_score": final_score,
#                 "same_cluster": int(resume_cluster == job_cluster),
#                 "same_domain": 1  # all domain-aligned here
#             }

#             results.append(result)
#     return results

# # ===============================================================
# # Parallel Wrapper for All Domains
# # ===============================================================
# def generate_training_pairs_parallel(resume_df, job_posts_df, resume_embeddings, job_embeddings):
#     domains = sorted(set(resume_df["ClusterDomainLabel"]) & set(job_posts_df["ClusterDomainLabel"]))
    
#     all_results = Parallel(n_jobs=-1, backend="loky")(
#         delayed(process_one_domain)(domain, resume_df, job_posts_df, resume_embeddings, job_embeddings)
#         for domain in domains
#     )

#     pairs_flat = [x for sublist in all_results for x in sublist]
#     training_pairs = pd.DataFrame(pairs_flat)
#     print(f"\n✅ Generated {len(training_pairs):,} resume–job pairs across {len(domains)} domains.")
#     return training_pairs

# # ===============================================================
# # Run the Generator
# # ===============================================================
# training_pairs = generate_training_pairs_parallel(
#     resumes_filtered, jobs_filtered, resume_embeddings, job_embeddings
# )

# # ===============================================================
# # Labeling: 3 Classes (Good / Medium / Poor)
# # ===============================================================
# scores = training_pairs["final_score"]
# high_t = scores.quantile(0.75)
# low_t = scores.quantile(0.25)

# def assign_label(score):
#     if score >= high_t:
#         return 1          # good fit
#     elif score <= low_t:
#         return 0          # poor fit
#     else:
#         return 0.5        # medium fit

# training_pairs["label"] = scores.apply(assign_label)
# print(f"Label thresholds: High ≥ {high_t:.3f}, Low ≤ {low_t:.3f}")

# # ===============================================================
# # Save and Visualize
# # ===============================================================
# os.makedirs("data_outputs", exist_ok=True)
# out_path = os.path.join("data_outputs", "resume_job_training_data.csv")
# training_pairs.to_csv(out_path, index=False)
# print(f"✅ Saved labeled training data → {out_path}")

# plt.figure(figsize=(8,5))
# plt.hist(training_pairs["final_score"], bins=30, color='skyblue', edgecolor='black')
# plt.title("Distribution of Resume–Job Final Fit Scores")
# plt.xlabel("Final Score")
# plt.ylabel("Frequency")
# plt.show()


   resume_idx  job_idx                                   resume_embedding  \
0           0        0  [0.24834046583886438, 0.6028677949857333, 0.40...   
1           1        1  [0.18558372274559476, 0.021485521539090557, 0....   

                                       job_embedding     label  
0  [0.6745862916163523, 0.9764964376127354, 0.666...  0.752134  
1  [0.07788232095608172, 0.7474506018596786, 0.04...  0.762876  


FileNotFoundError: [Errno 2] No such file or directory: 'resumes_clustered.csv'