In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression  
import joblib
import warnings
warnings.filterwarnings('ignore')

def create_features(jd_embs, res_embs):
    """Combine two embedding arrays into element-wise interaction features.

    The result joins, along axis 1, the absolute difference of the inputs
    followed by their element-wise product.
    """
    abs_diff = np.abs(jd_embs - res_embs)
    product = jd_embs * res_embs
    return np.concatenate((abs_diff, product), axis=1)

# Load the pre-computed feature/label arrays for the three splits from the
# .npz archive (keys: X_train/y_train, X_val/y_val, X_test/y_test).
data_path = 'final_best_features.npz'
data = np.load(data_path)
X_train = data['X_train']
y_train = data['y_train']

X_val = data['X_val']
y_val = data['y_val']

X_test = data['X_test']
y_test = data['y_test']

print(f"Loaded - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# Stack train and validation row-wise into one fitting set. This is done
# once; the previous copy-pasted second pass recomputed the same arrays and
# printed both messages twice.
X_train_val = np.vstack([X_train, X_val])
y_train_val = np.concatenate([y_train, y_val])
print(f"Combined train+val: {X_train_val.shape}")



Loaded - Train: (4916, 1024), Val: (519, 1024), Test: (565, 1024)
Combined train+val: (5435, 1024)
Loaded - Train: (4916, 1024), Val: (519, 1024), Test: (565, 1024)
Combined train+val: (5435, 1024)


In [2]:
import os
import random
from collections import defaultdict
import time 

import numpy as np
import pandas as pd
from tqdm import tqdm
import joblib

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import xgboost as xgb

# --- Reproducibility: seed Python, NumPy and torch RNGs with one value ----
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# --- Model / training constants -------------------------------------------
MODEL_NAME = "nomic-ai/nomic-bert-2048"
MAX_LEN = 2048
BATCH_SIZE = 32
LR_DEFAULT = 1e-5
NUM_EPOCHS = 5
PROJ_DIM = 512
ACCUM_STEPS = 4

# Use the GPU when torch can see one, otherwise the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Paths (the output directory is created on the spot if missing) -------
DATA_PATH = "clean.csv"
OUT_DIR = "hc_outputs_adaptive"
os.makedirs(OUT_DIR, exist_ok=True)

# --- Tuning switches and candidate values ---------------------------------
PERFORM_TUNING = True

MARGIN_CANDIDATES = [0.1, 0.3, 0.5, 0.7]
LR_CANDIDATES = [5e-6, 1e-5]
MARGIN_DEFAULT = 0.3

# Read the labelled pairs. When the CSV is absent the cell does not stop:
# it prints a notice and continues with a 100-row synthetic frame, so any
# numbers produced downstream in that case describe mock data only.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: {DATA_PATH} not found. Using mock data.")
    df = pd.DataFrame({
        'jd_text': [f"Job Description {i}" for i in range(100)],
        'resume_text': [f"Resume {i}" for i in range(100)],
        'label': ['good fit'] * 50 + ['no fit'] * 50
    })

# Normalise the label text (string, trimmed, lower-case) before matching.
df['label'] = df['label'].astype(str).str.strip().str.lower()

# Only these two labels are kept; every other label value is dropped.
label_map = {'no fit': 0, 'good fit': 1}

print(f"Original samples: {len(df)}")
df_binary = df.loc[df['label'].isin(list(label_map))].copy()
print(f"After filtering (good/no only): {len(df_binary)}")

df_binary['label_class'] = df_binary['label'].map(label_map)

print("Binary class distribution:")
class_counts = df_binary['label_class'].value_counts().sort_index()
print(class_counts)

# Ratio of class-0 to class-1 rows; falls back to 1.0 when either class is
# missing from the filtered data.
SCALE_POS_WEIGHT = (
    class_counts[0] / class_counts[1]
    if 0 in class_counts and 1 in class_counts and class_counts[1] != 0
    else 1.0
)
print(f"Calculated XGBoost scale_pos_weight: {SCALE_POS_WEIGHT:.2f}")


# Derive a per-job group id when the data does not already carry one.
# Rows whose stripped jd_text is identical share an id, and ids are numbered
# 0, 1, 2, ... in order of first appearance. pd.factorize performs exactly
# this encoding, replacing the hand-rolled dict/counter loop.
if 'job_id' not in df_binary.columns:
    _jd_keys = df_binary['jd_text'].astype(str).str.strip()
    _codes, _unique_jds = pd.factorize(_jd_keys)
    # Kept under their original names in case later cells read them.
    job_to_id = {key: idx for idx, key in enumerate(_unique_jds)}
    next_jid = len(job_to_id)
    job_ids = _codes.tolist()
    df_binary['job_id'] = job_ids

print("Job groups:", df_binary['job_id'].nunique())

# Two-stage stratified split: hold out 20% of the rows, then halve that
# hold-out into validation and test. Both calls stratify on label_class and
# use the same seed.
# NOTE(review): the split is row-wise; df_binary['job_id'] is not passed to
# either call, so rows belonging to one job can end up in different splits.
# Confirm this is intended.
train_df, temp_df = train_test_split(
    df_binary,
    test_size=0.2,
    stratify=df_binary['label_class'],
    random_state=SEED,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label_class'],
    random_state=SEED,
)
print(f"\nSplits: Train {len(train_df)}, Val {len(val_df)}, Test {len(test_df)}")


Error: clean.csv not found. Using mock data.
Original samples: 100
After filtering (good/no only): 100
Binary class distribution:
label_class
0    50
1    50
Name: count, dtype: int64
Calculated XGBoost scale_pos_weight: 1.00
Job groups: 100

Splits: Train 80, Val 10, Test 10


---

In [3]:
import time
import warnings
from collections import Counter

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


# Report the label counts and the shape of the combined train+val matrix.
print(f"dataset shape {Counter(y_train_val)}")
print(f"New Feature Matrix Shape: {X_train_val.shape}")

f1_scorer = make_scorer(f1_score, average='macro')
TARGET_NAMES = ['No Fit (0)', 'Good Fit (1)']

# Baseline classifiers with mostly default hyperparameters. The linear,
# distance-based and neural models are wrapped in a pipeline that
# standardises the features first; the tree-based models take raw features.
baseline_models = {
    'LogisticRegression': make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=2000, n_jobs=-1, random_state=42)
    ),

    'DecisionTree': DecisionTreeClassifier(random_state=42),

    'RandomForest': RandomForestClassifier(n_jobs=-1, random_state=42),

    'KNeighbors': make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(n_jobs=-1)
    ),

    'LinearSVM': make_pipeline(
        StandardScaler(),
        LinearSVC(max_iter=5000, random_state=42)
    ),

    'MLP_1Layer': make_pipeline(
        StandardScaler(),
        MLPClassifier(
            hidden_layer_sizes=(100,), activation='relu', solver='adam',
            max_iter=500, verbose=False, random_state=42
        )
    ),
    'XGBoost': xgb.XGBClassifier(
        objective='binary:logistic',
        # 'logloss' is the binary log-loss that pairs with binary:logistic.
        # The previous 'mlogloss' is the multiclass metric and does not
        # match a binary objective.
        eval_metric='logloss',
        use_label_encoder=False,
        n_jobs=-1,
        random_state=42
    )
}


# ==============================================================================
# Train and Evaluate Baselines
# ==============================================================================
print("\n" + "="*80)
print("BASELINE MODEL TRAINING AND EVALUATION")
print("="*80)

results = []
best_f1 = -1.0
best_model_name = ""

# Fit every baseline on train+val, score it on train+val and on the test
# set, and collect one summary row per model.
# NOTE(review): the "best" model is chosen by its test-set F1, so the test
# score of the winner is also the selection criterion. Confirm this is
# acceptable for how the number is reported.
for name, model in baseline_models.items():
    print(f"\n--- Training {name} ---")
    try:
        # perf_counter is a monotonic clock meant for measuring durations;
        # unlike time.time() it is not affected by system clock changes.
        start_time = time.perf_counter()
        model.fit(X_train_val, y_train_val)
        fit_time = time.perf_counter() - start_time

        train_preds = model.predict(X_train_val)
        train_acc = accuracy_score(y_train_val, train_preds)
        train_f1 = f1_score(y_train_val, train_preds, average='macro')

        test_preds = model.predict(X_test)
        test_acc = accuracy_score(y_test, test_preds)
        test_f1 = f1_score(y_test, test_preds, average='macro')

        results.append({
            'Model': name,
            'Fit_Time_s': f'{fit_time:.2f}',
            'Train_Accuracy': f'{train_acc:.3f}',
            'Train_F1_Macro': f'{train_f1:.3f}',
            'Test_Accuracy': f'{test_acc:.3f}',
            'Test_F1_Macro': f'{test_f1:.3f}'
        })

        if test_f1 > best_f1:
            best_f1 = test_f1
            best_model_name = name

        print(f"  -> Fit Time: {fit_time:.2f}s")
        print(f"  -> Train Accuracy: {train_acc:.3f}, Train F1 (Macro): {train_f1:.3f}")
        print(f"  -> Test Accuracy: {test_acc:.3f}, Test F1 (Macro): {test_f1:.3f}")
        print("\n  Classification Report (Test Set):")
        report_str = classification_report(y_test, test_preds, target_names=TARGET_NAMES)
        # Indent every line of the report by two spaces to match the log.
        print("  " + report_str.replace('\n', '\n  '))

    except Exception as e:
        # Best-effort: a failing model must not stop the comparison. It is
        # logged and recorded as a row whose metric cells all read 'ERROR'.
        print(f"  -> ERROR training {name}: {e}")
        results.append({
            'Model': name,
            **dict.fromkeys(
                ('Fit_Time_s', 'Train_Accuracy', 'Train_F1_Macro',
                 'Test_Accuracy', 'Test_F1_Macro'),
                'ERROR',
            ),
        })

# ============================================================================== 
# Final Comparison and Saving
# ============================================================================== 
print("\n" + "="*80)
print("BASELINE MODEL COMPARISON SUMMARY")
print("="*80)

# Rank the rows by test macro-F1. 'ERROR' cells become NaN under
# errors='coerce', sort_values puts NaN last by default, and the remaining
# NaN cells are then turned back into the 'ERROR' marker.
final_df = (
    pd.DataFrame(results)
    .assign(Test_F1_Macro=lambda table: pd.to_numeric(table['Test_F1_Macro'], errors='coerce'))
    .sort_values(by='Test_F1_Macro', ascending=False)
    .fillna('ERROR')
)

final_df.to_csv("baseline_models_comparison.csv", index=False)
print(final_df.to_string(index=False))
print("\nSaved comparison table to: baseline_models_comparison.csv")

print(f"\n**BEST BASELINE MODEL (F1-Macro): {best_model_name}**")


dataset shape Counter({0.0: 3599, 1.0: 1836})
New Feature Matrix Shape: (5435, 1024)

BASELINE MODEL TRAINING AND EVALUATION

--- Training LogisticRegression ---
  -> Fit Time: 4.12s
  -> Train Accuracy: 0.972, Train F1 (Macro): 0.968
  -> Test Accuracy: 0.699, Test F1 (Macro): 0.668

  Classification Report (Test Set):
                precision    recall  f1-score   support
  
    No Fit (0)       0.84      0.71      0.77       401
  Good Fit (1)       0.49      0.68      0.57       164
  
      accuracy                           0.70       565
     macro avg       0.66      0.69      0.67       565
  weighted avg       0.74      0.70      0.71       565
  

--- Training DecisionTree ---
  -> Fit Time: 5.67s
  -> Train Accuracy: 0.999, Train F1 (Macro): 0.999
  -> Test Accuracy: 0.623, Test F1 (Macro): 0.576

  Classification Report (Test Set):
                precision    recall  f1-score   support
  
    No Fit (0)       0.77      0.67      0.72       401
  Good Fit (1)       0.38  

In [4]:
import numpy as np
import pandas as pd
import time
import joblib
import warnings
import os
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, classification_report
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.metrics import make_scorer, f1_score, classification_report, accuracy_score
warnings.filterwarnings('ignore')

# Directory that receives the tuned models and the results table.
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
OUTPUT_DIR = 'hc_outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)

def create_features(jd_embs, res_embs):
    """Join both embedding arrays with their element-wise interactions.

    Output column blocks along axis 1, in order: ``jd_embs``, ``res_embs``,
    ``|jd_embs - res_embs|`` and ``jd_embs * res_embs``.

    NOTE: this redefines the ``create_features`` from the notebook's first
    cell, which returns only the last two of these blocks.
    """
    parts = (
        jd_embs,
        res_embs,
        np.abs(jd_embs - res_embs),
        jd_embs * res_embs,
    )
    return np.concatenate(parts, axis=1)


# Load the pre-computed feature/label arrays for the three splits from the
# .npz archive (keys: X_train/y_train, X_val/y_val, X_test/y_test).
data_path = 'final_best_features.npz'
data = np.load(data_path)

X_train = data['X_train']
y_train = data['y_train']

X_val = data['X_val']
y_val = data['y_val']

X_test = data['X_test']
y_test = data['y_test']

print(f"Loaded - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# Stack train and validation row-wise into one fitting set. This is done
# once; the previous copy-pasted second pass recomputed the same arrays and
# printed both messages twice.
X_train_val = np.vstack([X_train, X_val])
y_train_val = np.concatenate([y_train, y_val])
print(f"Combined train+val: {X_train_val.shape}")

# Macro-averaged F1 is the selection metric for the searches below.
f1_scorer = make_scorer(f1_score, average='macro')
TARGET_NAMES = ['No Fit (0)', 'Good Fit (1)']

# ----------------------------------------------------------------------
# Hyperparameter Search Spaces 
# ----------------------------------------------------------------------

def _with_scaler(step_name, estimator):
    """Return a two-step pipeline: a 'scaler' StandardScaler, then ``estimator`` as ``step_name``."""
    return Pipeline([
        ('scaler', StandardScaler()),
        (step_name, estimator),
    ])


# --- MLP ---------------------------------------------------------------
# The grid pins max_iter to 500, which replaces the estimator's
# max_iter=1000 for every candidate during the search.
mlp_pipe = _with_scaler(
    'mlp',
    MLPClassifier(random_state=42, max_iter=1000, early_stopping=True),
)
mlp_params = {
    'mlp__hidden_layer_sizes': [(100,), (50,), (64,32)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__alpha': [1e-3, 1e-2, 0.1],
    'mlp__learning_rate_init': [0.001, 0.005],
    'mlp__solver': ['adam'],
    'mlp__learning_rate': ['constant', 'adaptive'],
    'mlp__max_iter': [500],
    'mlp__early_stopping': [True]
}

# --- Linear SVM --------------------------------------------------------
lsvc_pipe = _with_scaler(
    'lsvc',
    LinearSVC(random_state=42, max_iter=5000, dual='auto'),
)
lsvc_params = {
    'lsvc__C': [0.1, 1.0, 10.0],
    'lsvc__class_weight': ['balanced'],
    'lsvc__loss': ['hinge', 'squared_hinge']
}

# --- Logistic regression -----------------------------------------------
logreg_pipe = _with_scaler(
    'logreg',
    LogisticRegression(random_state=42, max_iter=2000, n_jobs=-1),
)
logreg_params = {
    'logreg__C': [0.01, 0.1, 1.0],
    'logreg__solver': ['lbfgs', 'liblinear'],
    'logreg__class_weight': ['balanced'],
    'logreg__penalty': ['l2']
}

# --- Random forest -----------------------------------------------------
rf_pipe = _with_scaler(
    'rf',
    RandomForestClassifier(random_state=42, n_jobs=-1),
)
rf_params = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20],
    'rf__min_samples_leaf': [2, 4],
    'rf__min_samples_split': [5, 10],
    'rf__class_weight': ['balanced'],
    'rf__max_features': ['sqrt', 'log2']
}

# --- XGBoost -----------------------------------------------------------
xgb_pipe = _with_scaler(
    'xgb',
    xgb.XGBClassifier(
        objective='binary:logistic', eval_metric='logloss',
        random_state=42, n_jobs=-1
    ),
)
xgb_params = {
    'xgb__n_estimators': [100, 150],
    'xgb__max_depth': [3, 4],
    'xgb__learning_rate': [0.01, 0.03],
    'xgb__subsample': [0.7, 0.8],
    'xgb__colsample_bytree': [0.7, 0.8],
    'xgb__min_child_weight': [3, 5],
    'xgb__scale_pos_weight': [2],
    'xgb__reg_lambda': [2, 5, 10],
}

# name -> (pipeline, parameter space, requested number of sampled candidates).
# The LinearSVM and LogisticRegression spaces hold only 6 combinations each,
# fewer than the 20 requested.
models_to_tune = {
    'MLP_1Layer': (mlp_pipe, mlp_params, 50),
    'LinearSVM': (lsvc_pipe, lsvc_params, 20),
    'LogisticRegression': (logreg_pipe, logreg_params, 20),
    'RandomForest': (rf_pipe, rf_params, 30),
    'XGBoost': (xgb_pipe, xgb_params, 40),
}

# ----------------------------------------------------------------------
# Tuning Execution Functions
# ----------------------------------------------------------------------

def run_tuning(X_train_val, y_train_val, X_test, y_test, models_to_tune, scorer):
    """Run a randomized hyperparameter search per model and score it on the test set.

    Args:
        X_train_val, y_train_val: features/labels passed to
            ``RandomizedSearchCV.fit`` (5-fold cross-validation).
        X_test, y_test: held-out features/labels used for the reported
            test macro-F1.
        models_to_tune: mapping ``name -> (pipeline, param_distributions, n_iter)``.
        scorer: scoring object handed to ``RandomizedSearchCV``.

    Returns:
        ``(tuning_results, best_estimators)``. ``tuning_results`` is a list
        with one dict per model (keys ``Model``, ``Test_F1_Macro``,
        ``Best_CV_F1_Macro``, ``Best_Params``); ``best_estimators`` maps
        each model name to the refit best estimator of its search.

    Side effects:
        Prints progress and writes ``tuned_<name>_model.joblib`` into the
        module-level ``OUTPUT_DIR`` for every model.
    """
    tuning_results = []
    best_estimators = {}

    # Make sure the dump target exists even if this function is called
    # before (or without) the cell-level directory setup.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for name, (pipeline, params, n_iter) in models_to_tune.items():
        print("="*80)
        print(f"Starting Randomized Search for: {name} (N_iter: {n_iter})")
        print("="*80)

        start_time = time.time()
        search = RandomizedSearchCV(
            estimator=pipeline,
            param_distributions=params,
            n_iter=n_iter,
            scoring=scorer,
            cv=5,
            verbose=1,
            random_state=42,
            n_jobs=-1
        )

        search.fit(X_train_val, y_train_val)
        fit_time = time.time() - start_time

        best_estimator = search.best_estimator_
        best_estimators[name] = best_estimator
        joblib.dump(best_estimator, os.path.join(OUTPUT_DIR, f'tuned_{name}_model.joblib'))

        test_preds = best_estimator.predict(X_test)
        test_f1 = f1_score(y_test, test_preds, average='macro')

        print(f"\n--- {name} Results ---")
        print(f"Total Tuning Time: {fit_time:.2f}s")
        print(f"Best CV F1-Macro: {search.best_score_:.4f}")
        print(f"Test F1-Macro: {test_f1:.4f}")

        tuning_results.append({
            'Model': name,
            'Test_F1_Macro': f'{test_f1:.4f}',
            # Keep the cross-validated score in the returned row as well, so
            # models can be compared without relying on the test set alone.
            'Best_CV_F1_Macro': f'{search.best_score_:.4f}',
            'Best_Params': str(search.best_params_).replace('\n', ' ')
        })

    return tuning_results, best_estimators



# ----------------------------------------------------------------------
# Run All Tuning and Report
# ----------------------------------------------------------------------
# Tune every configured model, then rank the results by test macro-F1.
individual_results, best_estimators = run_tuning(
    X_train_val, y_train_val, X_test, y_test, models_to_tune, f1_scorer
)

all_results = individual_results
final_tuning_df = pd.DataFrame(all_results)
final_tuning_df['Test_F1_Macro'] = pd.to_numeric(final_tuning_df['Test_F1_Macro'], errors='coerce')
final_tuning_df = final_tuning_df.sort_values(by='Test_F1_Macro', ascending=False)

output_filename = os.path.join(OUTPUT_DIR, "hyperparameter_tuning_results.csv")
final_tuning_df.to_csv(output_filename, index=False)


print("\n" + "="*100)
# The table holds only the individually tuned models from run_tuning; no
# stacking model is built here, so the banner no longer claims one.
print("FINAL TUNING COMPARISON SUMMARY (Individual Tuned Models)")
print("="*100)
print(f"Results saved to: {output_filename}")
print(final_tuning_df.to_string(index=False))

# First row after the descending sort = highest test macro-F1.
# NOTE(review): the winner is selected on the test set itself; confirm this
# is acceptable for how the figure is reported.
best_overall_model_name = final_tuning_df.iloc[0]['Model']
best_overall_f1 = final_tuning_df.iloc[0]['Test_F1_Macro']

print("\n" + "="*100)
print(f"**BEST OVERALL MODEL (F1-Macro): {best_overall_model_name} (F1: {best_overall_f1})**")
print("="*100)

Loaded - Train: (4916, 1024), Val: (519, 1024), Test: (565, 1024)
Combined train+val: (5435, 1024)
Loaded - Train: (4916, 1024), Val: (519, 1024), Test: (565, 1024)
Combined train+val: (5435, 1024)
Starting Randomized Search for: MLP_1Layer (N_iter: 50)
Fitting 5 folds for each of 50 candidates, totalling 250 fits

--- MLP_1Layer Results ---
Total Tuning Time: 36.75s
Best CV F1-Macro: 0.8605
Test F1-Macro: 0.6316
Starting Randomized Search for: LinearSVM (N_iter: 20)
Fitting 5 folds for each of 6 candidates, totalling 30 fits

--- LinearSVM Results ---
Total Tuning Time: 62.84s
Best CV F1-Macro: 0.8172
Test F1-Macro: 0.6493
Starting Randomized Search for: LogisticRegression (N_iter: 20)
Fitting 5 folds for each of 6 candidates, totalling 30 fits

--- LogisticRegression Results ---
Total Tuning Time: 13.42s
Best CV F1-Macro: 0.8437
Test F1-Macro: 0.6586
Starting Randomized Search for: RandomForest (N_iter: 30)
Fitting 5 folds for each of 30 candidates, totalling 150 fits

--- RandomFore