# JibJob Recommender System - Model Training

This notebook demonstrates how to train the Graph Convolutional Network (GCN) recommendation model using sample data.

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# Add project root to path for imports.
# NOTE(review): the parent of the current working directory is used as the
# project root, i.e. this assumes the notebook is started from a directory
# directly below the project root — confirm for your setup.
project_root = str(Path().absolute().parent)
# Guard the append so re-running this cell does not add duplicate entries
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
from jibjob_recommender_system.config.config_loader import ConfigLoader
from jibjob_recommender_system.data_handling.data_loader import DataLoader
from jibjob_recommender_system.data_handling.preprocessor import DataPreprocessor
from jibjob_recommender_system.feature_engineering.feature_orchestrator import FeatureOrchestrator
from jibjob_recommender_system.graph_construction.graph_builder import GraphBuilder
from jibjob_recommender_system.models.gcn_recommender import GCNRecommender
from jibjob_recommender_system.training.train_gcn import GCNTrainer
from jibjob_recommender_system.evaluation.evaluation import RecommendationEvaluator

# Plot styling: matplotlib style sheet first, then the seaborn style
plt.style.use('ggplot')
sns.set_style('whitegrid')

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Load the settings file at <project_root>/jibjob_recommender_system/config/settings.yaml
config_path = os.path.join(
    project_root,
    'jibjob_recommender_system',
    'config',
    'settings.yaml',
)
config = ConfigLoader.load_config(config_path)

## 1. Generate or Load Sample Data

In [None]:
# Option 1: Generate sample data
# The generator is imported from `jibjob_recommender_system`, the same top-level
# package used by every other project import and by the config path in this
# notebook; the earlier `jibjob_recommender` prefix matched none of them.
# NOTE(review): confirm that `sample_data.generate_sample_data` lives under this package.
from jibjob_recommender_system.sample_data.generate_sample_data import DataGenerator

# Set parameters for sample data
num_users = 100
num_jobs = 500
num_applications = 1000

# Generate data (fixed seed passed to the generator)
data_generator = DataGenerator(seed=42)
sample_data = data_generator.generate_sample_data(
    num_users=num_users,
    num_jobs=num_jobs,
    num_applications=num_applications
)

# Extract dataframes
users_df = sample_data['users']
jobs_df = sample_data['jobs']
job_applications_df = sample_data['job_applications']
categories_df = sample_data['categories']

# Create a dictionary of dataframes (the input handed to the preprocessor)
data_dict = {
    'users': users_df,
    'jobs': jobs_df,
    'job_applications': job_applications_df,
    'categories': categories_df
}

# Print data summary; counting matches on the boolean mask avoids building
# two filtered copies of the users frame just to take their length
num_professionals = int((users_df['user_type'] == 'professional').sum())
num_employers = int((users_df['user_type'] == 'employer').sum())
print(f"Generated {len(users_df)} users ({num_professionals} professionals, "
      f"{num_employers} employers)")
print(f"Generated {len(jobs_df)} jobs")
print(f"Generated {len(job_applications_df)} job applications")

## 2. Preprocess the Data

In [None]:
# Preprocess the raw dataframes with a config-driven DataPreprocessor
preprocessor = DataPreprocessor(config)
processed_data = preprocessor.preprocess(data_dict)

# Row count of every table returned by the preprocessor
print("\nAfter preprocessing:")
for table_name, table in processed_data.items():
    row_count = len(table)
    print(f"{table_name}: {row_count} records")

# Per-column null counts of every table
print("\nMissing values count:")
for table_name, table in processed_data.items():
    null_counts = table.isnull().sum()
    print(f"{table_name}:\n{null_counts}")

## 3. Generate Features

In [None]:
# Build features from the preprocessed tables
feature_orchestrator = FeatureOrchestrator(config)
feature_data = feature_orchestrator.generate_features(processed_data)

# Report the embedding length of the first user row, when that column exists
has_user_embedding = 'users' in feature_data and 'embedding' in feature_data['users'].columns
if has_user_embedding:
    user_embedding = feature_data['users']['embedding'].iloc[0]
    print(f"User embedding dimension: {len(user_embedding)}")

# Same report for the first job row
has_job_embedding = 'jobs' in feature_data and 'embedding' in feature_data['jobs'].columns
if has_job_embedding:
    job_embedding = feature_data['jobs']['embedding'].iloc[0]
    print(f"Job embedding dimension: {len(job_embedding)}")

# List the column names of every feature table
print("\nFeature data summary:")
for table_name, table in feature_data.items():
    column_list = ', '.join(table.columns)
    print(f"{table_name} features: {column_list}")

## 4. Build the Graph

In [None]:
# Build the graph from the feature tables
graph_builder = GraphBuilder(config)
graph_data = graph_builder.build_graph(feature_data)

# Pull the graph and the ID -> node-index mappings out of the builder result
graph = graph_data['graph']
user_mapping = graph_data['user_mapping']
job_mapping = graph_data['job_mapping']

# Basic size report
node_count = graph.number_of_nodes()
edge_count = graph.number_of_edges()
print(f"Graph has {node_count} nodes and {edge_count} edges")
print(f"User nodes: {len(user_mapping)}")
print(f"Job nodes: {len(job_mapping)}")

# Node feature matrix (second axis is later used as the model input size)
node_features = graph_data['node_features']
print(f"Node features shape: {node_features.shape}")

# Collect the 'weight' attribute of every edge and plot its histogram
edge_weights = []
for src, dst in graph.edges():
    edge_weights.append(graph[src][dst]['weight'])

plt.figure(figsize=(10, 6))
plt.hist(edge_weights, bins=20)
plt.title('Edge Weight Distribution')
plt.xlabel('Weight')
plt.ylabel('Count')
plt.show()

## 5. Split Data for Training

In [None]:
# Split the data into training, validation, and test sets
def split_data(job_applications, splits=(0.7, 0.2, 0.1), seed=42):
    """
    Split job applications into train, validation, and test sets.

    The rows are shuffled with a seeded permutation, re-indexed from 0, and
    then cut into three consecutive slices.

    Args:
        job_applications: DataFrame with one row per application.
        splits: Fractions for (train, validation, test). Only the first two
            values size a slice (each truncated to a whole number of rows);
            the test set receives every remaining row, so the third value is
            not read and no row is dropped by rounding.
        seed: Seed for the shuffle; the same seed always gives the same split.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of DataFrame slices of the
        shuffled, re-indexed frame.
    """
    # A private RandomState produces the same permutation that
    # np.random.seed(seed) + np.random.permutation did, but leaves NumPy's
    # global random state untouched for the rest of the notebook
    rng = np.random.RandomState(seed)

    # Shuffle the data
    n = len(job_applications)
    shuffled_idx = rng.permutation(n)
    job_applications_shuffled = job_applications.iloc[shuffled_idx].reset_index(drop=True)

    # Calculate split indices
    train_end = int(splits[0] * n)
    val_end = train_end + int(splits[1] * n)

    # Split the data by position
    train_data = job_applications_shuffled.iloc[:train_end]
    val_data = job_applications_shuffled.iloc[train_end:val_end]
    test_data = job_applications_shuffled.iloc[val_end:]

    return train_data, val_data, test_data

def create_ground_truth(interactions):
    """
    Convert interactions to ground truth format (user_id -> list of job_ids).

    Args:
        interactions: DataFrame with 'user_id' and 'job_id' columns.

    Returns:
        Dict mapping each user_id to the list of its job_ids, in row order.
        Users appear in order of first occurrence.
    """
    ground_truth = {}
    # Read the two ID columns directly rather than via DataFrame.iterrows():
    # iterrows builds a Series per row and does not preserve dtypes across the
    # row, so integer IDs can come back as floats when float columns exist
    for user_id, job_id in zip(interactions['user_id'], interactions['job_id']):
        ground_truth.setdefault(user_id, []).append(job_id)
    return ground_truth


# Create both ground truth and training data
# NOTE(review): the split is taken from the raw `data_dict` frame, not from the
# preprocessed/feature tables — confirm the IDs are the same in both.
if 'job_applications' in data_dict:
    train_data, val_data, test_data = split_data(data_dict['job_applications'])

    print(f"Training set: {len(train_data)} applications")
    print(f"Validation set: {len(val_data)} applications")
    print(f"Test set: {len(test_data)} applications")

    train_ground_truth = create_ground_truth(train_data)
    val_ground_truth = create_ground_truth(val_data)
    test_ground_truth = create_ground_truth(test_data)

    print(f"Training ground truth: {len(train_ground_truth)} users")
    print(f"Validation ground truth: {len(val_ground_truth)} users")
    print(f"Test ground truth: {len(test_ground_truth)} users")
else:
    print("No job applications data available for splitting")
    # Create synthetic splits using similarity data
    # This is a fallback if we don't have actual user-job interactions
    # NOTE: the fallback is not implemented; on this path train_data, val_data,
    # test_data and the *_ground_truth dicts are never assigned, so the
    # training and evaluation cells below cannot run.
    # Create synthetic splits using similarity data
    # This is a fallback if we don't have actual user-job interactions

## 6. Train the GCN Model

In [None]:
# Layer sizes: input width comes from the node feature matrix, the rest are fixed
input_dim = node_features.shape[1]
hidden_dims = [64, 32]
output_dim = 32

# Parameter bundle handed to the model constructor (also stored with the saved model)
model_params = dict(
    input_dim=input_dim,
    hidden_dims=hidden_dims,
    output_dim=output_dim,
    dropout=0.2,
    use_hetero_gnn=True,  # Whether to use heterogeneous GNN
)

# Instantiate the model and move it to the selected device
model = GCNRecommender(model_params).to(device)

summary = (f"Created GCN model with {input_dim} input features, "
           f"{hidden_dims} hidden dimensions, and {output_dim} output dimensions")
print(summary)

In [None]:
# Hyperparameters passed to the trainer
training_params = dict(
    num_epochs=50,
    learning_rate=0.001,
    weight_decay=5e-4,
    early_stopping_patience=10,
    batch_size=64,
    negative_samples=5,  # Number of negative samples per positive sample
)

# Node features as a float tensor on the training device
features_tensor = torch.FloatTensor(node_features).to(device)

# Wire the model, graph, mappings and train/validation interactions into the trainer
trainer = GCNTrainer(
    model=model,
    graph=graph,
    node_features=features_tensor,
    user_mapping=user_mapping,
    job_mapping=job_mapping,
    train_interactions=train_data,
    val_interactions=val_data,
    params=training_params,
)

# Run training; the returned history is indexed by series name below
print("Starting model training...")
history = trainer.train()

# Two side-by-side panels: losses on the left, ranking metrics on the right
plt.figure(figsize=(12, 5))

# Left panel: train vs. validation loss
plt.subplot(1, 2, 1)
for series_key, series_label in (('train_loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history[series_key], label=series_label)
plt.title('Loss During Training')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Right panel: only the metrics that the history actually contains
plt.subplot(1, 2, 2)
for metric in ['hit_rate', 'ndcg', 'precision', 'recall']:
    if metric not in history:
        continue
    plt.plot(history[metric], label=metric)
plt.title('Metrics During Training')
plt.xlabel('Epoch')
plt.ylabel('Value')
plt.legend()

plt.tight_layout()
plt.show()

## 7. Evaluate the Model on Test Data

In [None]:
# Create evaluator
evaluator = RecommendationEvaluator(config)

# Generate recommendations for test users
test_user_ids = list(test_ground_truth.keys())
recommendations = {}
top_k = 10  # Number of recommendations to generate

# The forward pass covers every node and none of its inputs change between
# users, so it runs once here instead of once per test user
model.eval()
with torch.no_grad():
    node_embeddings = model(graph, torch.FloatTensor(node_features).to(device))

    for user_id in test_user_ids:
        # Skip user if not in mapping
        if user_id not in user_mapping:
            continue

        # Embedding row of this user
        user_idx = user_mapping[user_id]
        user_emb = node_embeddings[user_idx]

        # Score every job by the dot product of user and job embeddings
        job_scores = [
            (job_id, torch.dot(user_emb, node_embeddings[job_idx]).item())
            for job_id, job_idx in job_mapping.items()
        ]

        # Sort by score and take top k (stable sort: ties keep mapping order)
        job_scores.sort(key=lambda x: x[1], reverse=True)
        recommendations[user_id] = [js[0] for js in job_scores[:top_k]]

# Calculate evaluation metrics
# NOTE(review): the categories lookup and `all_items` are keyed by the index
# labels of the jobs feature table — presumably job IDs; confirm.
# NOTE(review): users skipped above are still present in `test_ground_truth`;
# check how the evaluator treats users without recommendations.
job_features = feature_data['jobs']
print("\nEvaluation Results:")
results = evaluator.evaluate_all_metrics(
    recommendations=recommendations,
    ground_truth=test_ground_truth,
    # Pair index labels with the column directly instead of iterrows(),
    # which would build a full Series for every row just to read one field
    job_categories=dict(zip(job_features.index, job_features['categories'])),
    all_items=job_features.index.tolist()
)

# Display metrics for each k value
for metric, values in results.items():
    print(f"\n{metric.upper()}:")
    for k, value in values.items():
        print(f"  @{k}: {value:.4f}")

# Visualize key metrics at different k values (one subplot per available metric)
plt.figure(figsize=(14, 8))
metrics = ['hit_rate', 'precision', 'recall', 'ndcg', 'map']

for i, metric in enumerate(metrics):
    if metric in results:
        plt.subplot(2, 3, i+1)
        ks = list(results[metric].keys())
        values = list(results[metric].values())
        plt.plot(ks, values, 'o-')
        plt.title(f'{metric.upper()}')
        plt.xlabel('k')
        plt.ylabel('Score')
        plt.grid(True)

plt.tight_layout()
plt.show()

## 8. Save the Trained Model

In [None]:
# Ensure <project_root>/models exists before writing into it
models_dir = os.path.join(project_root, 'models')
os.makedirs(models_dir, exist_ok=True)

# File name carries the current local time, formatted as YYYYMMDD_HHMMSS
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_filename = f"gcn_model_{timestamp}.pt"
model_path = os.path.join(models_dir, model_filename)

# Everything stored in the checkpoint: weights, constructor parameters,
# ID mappings, input width, timestamp and the loaded config.
# NOTE(review): `config` is stored as-is; if it is a project-defined object,
# loading this file presumably needs that class importable — confirm.
model_data = dict(
    model_state_dict=model.state_dict(),
    model_params=model_params,
    user_mapping=user_mapping,
    job_mapping=job_mapping,
    input_dim=input_dim,
    timestamp=timestamp,
    config=config,
)

torch.save(model_data, model_path)
print(f"Model saved to {model_path}")

## 9. Generate Sample Recommendations

In [None]:
# Select a few sample users (the first five keys of the user mapping)
sample_user_ids = list(user_mapping.keys())[:5]

# Generate recommendations for sample users
sample_recommendations = {}
top_k = 10
job_features = feature_data['jobs']

# The forward pass covers every node and none of its inputs change between
# users, so it runs once here instead of once per sample user
model.eval()
with torch.no_grad():
    node_embeddings = model(graph, torch.FloatTensor(node_features).to(device))

    for user_id in sample_user_ids:
        # Embedding row of this user
        user_idx = user_mapping[user_id]
        user_emb = node_embeddings[user_idx]

        # Score every job by the dot product of user and job embeddings
        job_scores = [
            (job_id, torch.dot(user_emb, node_embeddings[job_idx]).item())
            for job_id, job_idx in job_mapping.items()
        ]

        # Sort by score and take top k (stable sort: ties keep mapping order)
        job_scores.sort(key=lambda x: x[1], reverse=True)
        top_jobs = job_scores[:top_k]

        # Get job details; jobs whose ID is not an index label of the jobs
        # feature table are left out of the displayed list
        recommendations_with_details = []
        for job_id, score in top_jobs:
            if job_id in job_features.index:
                job_info = job_features.loc[job_id]
                recommendations_with_details.append({
                    'job_id': job_id,
                    'score': score,
                    'categories': job_info.get('categories', []),
                    'title': job_info.get('title', 'No title available')
                })

        sample_recommendations[user_id] = recommendations_with_details

# Display recommendations for each sample user
for user_id, recs in sample_recommendations.items():
    # Get user details (an empty dict stands in when the user has no feature row)
    user_info = feature_data['users'].loc[user_id] if user_id in feature_data['users'].index else {}
    user_categories = user_info.get('categories', [])

    print(f"\n=== Recommendations for User {user_id} ===")
    # NOTE(review): join() assumes categories is an iterable of strings — confirm
    print(f"User Categories: {', '.join(user_categories)}")
    print("\nTop Recommended Jobs:")
    for i, rec in enumerate(recs, 1):
        print(f"{i}. {rec['title']} (ID: {rec['job_id']})")
        print(f"   Score: {rec['score']:.4f}")
        print(f"   Categories: {', '.join(rec['categories'])}")

## 10. Conclusion

In this notebook, we demonstrated how to:

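1. Generate or load sample data for the JibJob recommendation system
2. Preprocess the data and generate features
3. Build a graph representation of users and jobs
4. Split the job applications into training, validation, and test sets
5. Train a GCN model on the graph data
6. Evaluate the model on the test set using various recommendation metrics
7. Save the trained model together with its parameters and ID mappings
8. Generate and display sample recommendations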

This approach can be extended to real-world data by replacing the sample data generation with actual data from the JibJob platform. The same model architecture and training process would apply.