In [None]:
!pip install faiss-cpu  # Use faiss-gpu if you need GPU acceleration

In [None]:
# File management for re-runs: copy previously saved artifacts into the local
# working directory so later cells can load them from data/processed/.
import os
# exist_ok=True makes this cell safe to run more than once.
os.makedirs('data/processed', exist_ok=True)

# Copy the saved job embeddings and the FAISS index from the Drive project folder.
# NOTE(review): the /content/drive/... paths presumably require Google Drive to be
# mounted beforehand -- confirm; no mount step is visible in these cells.
!cp /content/drive/MyDrive/ColabNotebooks/JobRecommenderProject/job_embeddings.npy data/processed/
!cp /content/drive/MyDrive/ColabNotebooks/JobRecommenderProject/faiss_index.index  data/processed/

In [None]:
import numpy as np
import faiss

# Restore the cached artifacts from data/processed/: the embedding matrix
# first, then the FAISS index.
job_embeddings = np.load('data/processed/job_embeddings.npy')
index = faiss.read_index('data/processed/faiss_index.index')

# Optional sanity check: report the array shape and the number of indexed vectors.
print(
    f"Loaded embeddings shape: {job_embeddings.shape}",
    f"FAISS index size: {index.ntotal}",
    sep="\n",
)

In [None]:
# Optional: uncomment the next line to upgrade the libraries used for fine-tuning.
# %pip install --upgrade transformers sentence-transformers

In [None]:
#Avoid Re-run: fine-tuning is skipped when a saved model is already on disk.
# Set FORCE_RETRAIN = True to train again regardless.

import ast
import os

import pandas as pd
from datasets import Dataset
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

FINETUNED_MODEL_DIR = 'models/finetuned_model'
FORCE_RETRAIN = False


def _skills_to_text(skills):
    """Turn one `cleaned_skills` cell into a space-separated string.

    The cell is either an iterable of skill strings or the string form of a
    Python list (which is what a list column becomes after a CSV round-trip).
    """
    if isinstance(skills, str):
        try:
            # literal_eval only accepts Python literals, so unlike eval() it
            # cannot execute code embedded in the CSV.
            skills = ast.literal_eval(skills)
        except (ValueError, SyntaxError):
            # Not a list literal: treat the cell as already-plain text.
            return skills
    return ' '.join(skills)


# Load df (adjust path)
df = pd.read_csv('data/processed/cleaned_jobs_tech_aug.csv')  # Use your augmented tech DF

# SentenceTransformer.save() writes modules.json into the model directory.
# Checking for that file (not just the directory) avoids mistaking a
# half-finished run, which only leaves trainer checkpoints, for a usable model.
_saved_model_exists = os.path.isfile(os.path.join(FINETUNED_MODEL_DIR, 'modules.json'))

if _saved_model_exists and not FORCE_RETRAIN:
    model = SentenceTransformer(FINETUNED_MODEL_DIR)
    print(f"Loaded existing fine-tuned model from {FINETUNED_MODEL_DIR}; skipping training.")
else:
    # Rows with a missing summary, title or skills cannot form a training pair.
    # They are dropped from a separate frame so that `df` keeps its original
    # row order, which other cells use positionally.
    pairs_df = df.dropna(subset=['cleaned_summary', 'job_title', 'cleaned_skills'])

    # Create positive pairs: summary + (title + skills as positive)
    titles_skills = pairs_df['job_title'] + ' ' + pairs_df['cleaned_skills'].apply(_skills_to_text)
    train_data = {
        'sentence1': pairs_df['cleaned_summary'].tolist(),
        'sentence2': titles_skills.tolist()
    }
    train_dataset = Dataset.from_dict(train_data)

    model = SentenceTransformer('all-MiniLM-L6-v2')
    # MultipleNegativesRankingLoss treats the other pairs in a batch as negatives.
    train_loss = losses.MultipleNegativesRankingLoss(model)
    args = SentenceTransformerTrainingArguments(output_dir=FINETUNED_MODEL_DIR, num_train_epochs=1, report_to="none")
    trainer = SentenceTransformerTrainer(model=model, train_dataset=train_dataset, loss=train_loss, args=args)
    trainer.train()
    model.save(FINETUNED_MODEL_DIR)

In [None]:
#File management for saving re-runs

!cp -r /content/drive/MyDrive/ColabNotebooks/JobRecommenderProject/finetuned_model models/

In [None]:
def recommend_jobs(resume_text, job_texts, job_df, top_k=10, index=None):
    """Return the jobs whose indexed embeddings are nearest to a resume.

    Args:
        resume_text: Resume text; embedded with ``generate_embeddings``.
        job_texts: Not used by this function; kept so existing positional
            callers continue to work.
        job_df: DataFrame of jobs. Search results are applied with ``iloc``,
            so row ``i`` is assumed to correspond to vector ``i`` of the
            index -- TODO confirm against the index-building step.
        top_k: Maximum number of recommendations to return.
        index: Optional, already-loaded FAISS index. When ``None`` the index
            is read from ``data/processed/faiss_index.index``.

    Returns:
        A copy of the matching rows with the columns ``job_title``,
        ``company``, ``job_location`` and ``similarity_score``. It has fewer
        than ``top_k`` rows when the index yields fewer neighbours.
    """
    resume_embedding = generate_embeddings([resume_text])
    if index is None:
        index = faiss.read_index('data/processed/faiss_index.index')

    # Asking for more neighbours than the index holds only yields padding.
    k = min(int(top_k), index.ntotal)
    if k > 0:
        distances, indices = index.search(resume_embedding, k)
        # FAISS marks missing neighbours with label -1; passed to ``iloc``
        # that would silently select the last job, so drop those slots.
        found = indices[0] >= 0
        positions = indices[0][found]
        dists = distances[0][found]
    else:
        positions = np.array([], dtype=int)
        dists = np.array([], dtype='float32')

    recs = job_df.iloc[positions].copy()
    # 1 - d/2 equals the cosine similarity when d is the squared L2 distance
    # between unit-length vectors -- assumes the index was built that way;
    # TODO confirm.
    recs['similarity_score'] = 1 - dists / 2
    return recs[['job_title', 'company', 'job_location', 'similarity_score']]

# Demo: recommend jobs for a single sample resume.
if __name__ == "__main__":
    # Sample of already-parsed resume text (taken from parsed_resumes).
    resume_text = "master computer science ml dl pytorch nlp bert"
    recs = recommend_jobs(
        resume_text=resume_text,
        job_texts=job_texts,
        job_df=df,
    )
    print(recs)

In [None]:
# Imports are repeated here so the evaluation cell runs on a fresh kernel.
import faiss
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, ndcg_score
from sklearn.model_selection import train_test_split

# Load data (from Step 2/3)
df = pd.read_csv('/content/drive/MyDrive/cleaned_jobs_tech_aug.csv')
job_embeddings = np.load('data/processed/job_embeddings.npy')  # Adjust path
index = faiss.read_index('data/processed/faiss_index.index')

# The embeddings are subset below using the DataFrame's index labels. That is
# only valid when row i of `df` is the job behind row i of `job_embeddings`.
# read_csv without index_col yields labels 0..n-1, so equal length is the
# minimum requirement.
assert len(df) == len(job_embeddings), (
    f"df has {len(df)} rows but job_embeddings has {len(job_embeddings)}; "
    "they must come from the same job table"
)

# Split: Use resumes as queries; jobs as items (80/20 for train/test jobs)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_embeddings = job_embeddings[train_df.index]  # Subset embeddings
test_embeddings = job_embeddings[test_df.index]

# Rebuild FAISS for train (or use full for simplicity; rebuild for strict split)
train_index = build_faiss_index(train_embeddings)  # From embedding_model.py

# Evaluation queries. The strings below are placeholders: replace them with the
# actual parsed resume texts before interpreting any metric computed from them.
parsed_resumes = [
    "your resume text 1 here from parse_resume",
    "resume text 2",
    "resume text 3",
    "resume text 4",
    "resume text 5"
]  # If the parsed resumes are stored in a dict, use list(parsed_resumes.values()) instead

# ---- Offline evaluation of the retrieval step ----
# NOTE(review): no human relevance labels are available here, so a retrieved job
# is treated as "relevant" when its similarity exceeds RELEVANCE_THRESHOLD.
# Because these pseudo-labels come from the same scores that produce the
# ranking, the numbers below describe score calibration, not true recommendation
# quality. NDCG in particular is optimal whenever a resume has at least one job
# above the threshold. Replace `y_true` with independent labels for a real
# evaluation.
TOP_K = 10                 # recommendations evaluated per resume
RELEVANCE_THRESHOLD = 0.7  # similarity above which a job counts as relevant

y_true_all = []       # pseudo-relevance (1/0) of every retrieved job
y_pred_all = []       # predicted label of every retrieved job (always 1)
ndcg_per_resume = []  # one NDCG value per resume

for resume_text in parsed_resumes:
    resume_embedding = generate_embeddings([resume_text])
    distances, indices = train_index.search(resume_embedding, k=TOP_K)

    # FAISS pads the result with label -1 when fewer than k neighbours exist.
    found = indices[0] >= 0
    # 1 - d/2 is the cosine similarity for squared-L2 distances between
    # unit-length vectors -- assumes the index was built that way; TODO confirm.
    rec_scores = 1 - distances[0][found] / 2
    if rec_scores.size == 0:
        continue

    y_true = (rec_scores > RELEVANCE_THRESHOLD).astype(int)
    # Every job in the top-k list was recommended, i.e. predicted positive.
    # Precision@k is therefore the share of recommended jobs that are relevant.
    y_pred = np.ones_like(y_true)
    y_true_all.extend(y_true)
    y_pred_all.extend(y_pred)

    # NDCG is a per-query ranking metric: compute it per resume instead of
    # pooling all resumes into one list. It needs at least two ranked items.
    if rec_scores.size > 1:
        ndcg_per_resume.append(ndcg_score([y_true], [rec_scores], k=TOP_K))

y_true_all = np.array(y_true_all, dtype=int)
y_pred_all = np.array(y_pred_all, dtype=int)

precision = precision_score(y_true_all, y_pred_all) if y_true_all.size else float('nan')
print(f"Precision@{TOP_K}: {precision}")

ndcg = float(np.mean(ndcg_per_resume)) if ndcg_per_resume else float('nan')
print(f"NDCG@{TOP_K}: {ndcg}")