In [1]:
import os
import pickle
import time

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# 1. Load labeled data, keeping only rows where both texts and the label are present
labeled_df = (
    pd.read_csv('datasets/labeled_resume_job_pairs.csv')
    .dropna(subset=['resume_text', 'job_text', 'relevancy_score'])
)

# Optional: cap the dataset at 3000 randomly chosen rows (fixed seed) for faster
# experimentation -- remove or raise the cap to train on the full data
if labeled_df.shape[0] > 3000:
    labeled_df = labeled_df.sample(n=3000, random_state=42)

In [3]:
# 2. Load the sentence-transformers model used to embed both text columns
print("Loading sentence-transformers model...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# 3. Encode each text column into a numpy array of embeddings (one row per pair)
print("Embedding resume texts...")
resume_embeddings = embedder.encode(
    labeled_df['resume_text'].to_list(),
    show_progress_bar=True,
    convert_to_numpy=True,
)

print("Embedding job description texts...")
job_embeddings = embedder.encode(
    labeled_df['job_text'].to_list(),
    show_progress_bar=True,
    convert_to_numpy=True,
)

Loading sentence-transformers model...
Embedding resume texts...


Batches: 100%|██████████| 94/94 [00:53<00:00,  1.76it/s]


Embedding job description texts...


Batches: 100%|██████████| 94/94 [00:11<00:00,  8.51it/s]


In [4]:
# 4. Build the feature matrix: resume embedding followed by job embedding, side by side
X = np.concatenate([resume_embeddings, job_embeddings], axis=1)

# 5. Extract the target as a single-column array and min-max scale it
# NOTE(review): the scaler is fit on every label, including rows that the later
# train/test split assigns to the test set -- confirm this is acceptable.
y = labeled_df[['relevancy_score']].to_numpy()
scaler = MinMaxScaler().fit(y)
y_scaled = scaler.transform(y).ravel()

In [5]:
# 6. Hold out 20% of the pairs for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_scaled,
    test_size=0.2,
    random_state=42,
)

# 7. Candidate regressors, keyed by the name used in logs and saved file names
models = dict(
    Ridge=Ridge(),
    RandomForest=RandomForestRegressor(
        n_estimators=50, max_depth=15, n_jobs=-1, random_state=42
    ),
    GradientBoosting=GradientBoostingRegressor(
        n_estimators=50, max_depth=7, random_state=42
    ),
    SVR=SVR(kernel='linear', C=1.0),
)

# Filled in by the training loop: model name -> metrics dict
results = dict()

In [6]:
# 8. Train, evaluate, and save models

# Make sure the output directory exists so that saving a fitted model cannot
# fail with FileNotFoundError on a fresh checkout.
model_dir = "Trained_Models"
os.makedirs(model_dir, exist_ok=True)

# The held-out targets are the same for every model, so map them back to the
# original score scale once instead of on every loop iteration.
y_true = scaler.inverse_transform(y_test.reshape(-1, 1)).ravel()

for name, model in models.items():
    print(f"\nTraining {name}...")
    # perf_counter is a monotonic clock meant for measuring elapsed durations;
    # unlike time.time() it is not affected by system clock adjustments.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    print(f"Predicting with {name}...")
    y_pred_scaled = model.predict(X_test)
    # Undo the target scaling so metrics are reported on the original score scale.
    y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    results[name] = {'mse': mse, 'r2': r2, 'train_time_sec': train_time}

    print(f"{name} Results -> MSE: {mse:.3f}, R²: {r2:.3f}, Training Time: {train_time:.2f} sec")

    # Save model
    with open(os.path.join(model_dir, f"{name}_relevancy_model.pkl"), "wb") as f:
        pickle.dump(model, f)


Training Ridge...
Predicting with Ridge...
Ridge Results -> MSE: 4.744, R²: 0.670, Training Time: 0.19 sec

Training RandomForest...
Predicting with RandomForest...
RandomForest Results -> MSE: 2.215, R²: 0.846, Training Time: 5.30 sec

Training GradientBoosting...
Predicting with GradientBoosting...
GradientBoosting Results -> MSE: 1.867, R²: 0.870, Training Time: 53.26 sec

Training SVR...
Predicting with SVR...
SVR Results -> MSE: 5.315, R²: 0.630, Training Time: 0.66 sec


In [7]:
# 9. Persist the embedding model and the target scaler so inference can reload them
embedder.save('sentence_transformer_model')
with open('Trained_Models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# 10. Report every model's metrics, then the one with the lowest test MSE
print("\n=== Model Performance Summary ===")
for name in results:
    metrics = results[name]
    print(f"{name}: MSE={metrics['mse']:.3f}, R²={metrics['r2']:.3f}, Train Time={metrics['train_time_sec']:.2f}s")

# best_model is a (name, metrics) pair, not the fitted estimator itself
best_model = min(results.items(), key=lambda entry: entry[1]['mse'])
print(f"\nBest model based on MSE: {best_model[0]} with MSE={best_model[1]['mse']:.3f}")



=== Model Performance Summary ===
Ridge: MSE=4.744, R²=0.670, Train Time=0.19s
RandomForest: MSE=2.215, R²=0.846, Train Time=5.30s
GradientBoosting: MSE=1.867, R²=0.870, Train Time=53.26s
SVR: MSE=5.315, R²=0.630, Train Time=0.66s

Best model based on MSE: GradientBoosting with MSE=1.867


In [None]:
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer

# Load the embedding model, scaler, and trained regression model
embedder = SentenceTransformer('sentence_transformer_model')

# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# load artifacts that were produced by this notebook or another trusted source.
# The `with` blocks guarantee the file handles are closed after loading.
with open('Trained_Models/scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Change filename as needed to load a different trained regressor
with open('Trained_Models/GradientBoosting_relevancy_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

def predict_relevancy(resume_text, job_text):
    """Predict a relevancy score for one resume / job-description pair.

    Relies on the module-level ``embedder``, ``model`` and ``scaler`` objects.
    Each text is embedded separately, the two embeddings are concatenated in
    (resume, job) order, the regressor predicts a scaled score, and the scaler
    maps that prediction back to the original score scale.

    Args:
        resume_text: Text of the resume to score.
        job_text: Text of the job description to score the resume against.

    Returns:
        float: The predicted score, clamped to the range [0, 100] and rounded
        to two decimals. Always a built-in ``float``, whether or not the clamp
        was applied.
    """
    # Embed texts separately (each call yields a one-row array)
    resume_emb = embedder.encode([resume_text], convert_to_numpy=True)
    job_emb = embedder.encode([job_text], convert_to_numpy=True)

    # Combine embeddings (concatenate) into a single feature row
    combined_emb = np.hstack([resume_emb, job_emb])

    # Predict scaled score and inverse transform to original scale
    pred_scaled = model.predict(combined_emb)
    pred = scaler.inverse_transform(pred_scaled.reshape(-1, 1))[0][0]

    # Clamp score between 0 and 100 for safety. Converting to a built-in float
    # first keeps the return type the same on every path (previously an int
    # when clamped and a numpy scalar otherwise).
    # NOTE(review): the 0-100 bounds presume the labels use that scale -- confirm.
    pred = max(0.0, min(100.0, float(pred)))

    return round(pred, 2)

# Example usage: score a junior web-developer resume against a data-analysis posting
resume_example = (
    "Skills: Python, HTML, CSS. "
    "Education: Bachelor. "
    "Experience: 1 year as Web Developer."
)
job_example = (
    "Required Skills: Python, SQL, Data Analysis, Machine Learning, Pandas, NumPy. "
    "Required Education: Bachelor. "
    "Required Experience: 2 years."
)

score = predict_relevancy(resume_text=resume_example, job_text=job_example)
print(f"Predicted Relevancy Score: {score}")


FileNotFoundError: [Errno 2] No such file or directory: 'scaler.pkl'