In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from pathlib import Path
import math # Import math

# Add project root to sys.path so that project modules resolve as `src.*`.
# The parent of the kernel's current working directory is used as the root,
# so this relies on the notebook being run from a direct child of the project.
project_root = Path.cwd().parent # Should be RECSYS_FINAL
src_path = project_root / "src"
# Only append when missing: re-running this cell would otherwise add the same
# entry to sys.path again on every execution.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root)) # Add project root for imports like 'src.config'

# Import project modules (these must come after the sys.path setup above)
from src import config
from src.data import preprocess # For time_based_split
from src.evaluation.evaluator import RecEvaluator # Import the evaluator class
from src.models.popularity import PopularityRecommender # Import the model

# Set display options
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
sns.set_style("whitegrid")
print("Setup complete. Modules imported.")
print(f"Project Root: {project_root}")
print(f"Processed Data Dir: {config.PROCESSED_DATA_DIR}")

In [None]:
# Read the three processed parquet tables from config.PROCESSED_DATA_DIR.
# Any loading failure is reported and then re-raised so the notebook stops here.
try:
    processed_dir = config.PROCESSED_DATA_DIR
    interactions_df = pd.read_parquet(processed_dir / "interactions_final.parquet")
    users_df = pd.read_parquet(processed_dir / "users_final.parquet")
    # items table is expected to carry presentation_id as a regular column
    items_df = pd.read_parquet(processed_dir / "items_final.parquet")
    print("Processed data loaded successfully.")
    for table_label, table in (
        ("Interactions", interactions_df),
        ("Users", users_df),
        ("Items", items_df),
    ):
        print(f"{table_label} shape: {table.shape}")

    # Index items by presentation_id when that column is present
    # (the evaluation cell later requires this index name).
    if 'presentation_id' in items_df.columns:
        items_df = items_df.set_index('presentation_id')
        print("Set 'presentation_id' as index for items_df.")

except FileNotFoundError as load_err:
    # Missing files: point the reader at the preprocessing step, then stop.
    print(f"Error loading processed files: {load_err}")
    print("Please ensure the preprocessing pipeline (run_preprocessing.py) has been run successfully.")
    raise load_err
except Exception as load_err:
    print(f"An unexpected error occurred during loading: {load_err}")
    raise load_err

# Preview the first rows of each table (items_df is shown with its final index)
for table_label, table in (
    ("Interactions", interactions_df),
    ("Users", users_df),
    ("Items", items_df),
):
    print(f"\n{table_label} Head:\n", table.head())

In [None]:
# Cell [3]: Time-Based Split (Using Threshold)
#
# Splits interactions_df into train_df / test_df by calling
# preprocess.time_based_split with a fixed threshold on the numeric time column,
# then prints basic sanity checks on the result.

time_col = 'last_interaction_date'
user_col_in_df = 'id_student'      # Actual column name in interactions_df
item_col_in_df = 'presentation_id' # Actual column name in interactions_df

# --- Validate Inputs ---
# These checks run before the time column is first accessed, so a missing or
# mistyped column produces the descriptive error below instead of a bare KeyError.
for col_label, col_name in (
    ("Time", time_col),
    ("User", user_col_in_df),
    ("Item", item_col_in_df),
):
    if col_name not in interactions_df.columns:
        raise ValueError(f"{col_label} column '{col_name}' not found in interactions data.")
if not pd.api.types.is_numeric_dtype(interactions_df[time_col]):
    raise TypeError(f"Time column '{time_col}' must be numeric.")

# --- Determine Threshold ---
print("--- Determining Time Threshold ---")
print(interactions_df[time_col].describe(percentiles=[.75, .8, .85, .9, .95]))
# Choose the threshold from the percentiles printed above (e.g. the 80th
# percentile) and update the value below if your data calls for a different one.
TIME_THRESHOLD = 250
print(f"Chosen Time Threshold: {TIME_THRESHOLD}")
print("--- End Threshold Determination ---")


# --- Perform Split ---
train_df, test_df = preprocess.time_based_split(
    interactions_df=interactions_df,
    user_col=user_col_in_df,
    item_col=item_col_in_df,
    time_col=time_col,
    time_unit_threshold=TIME_THRESHOLD # <<< Use the threshold
    # split_ratio=None # Ensure split_ratio is not used
)

# --- Verify Split ---
print(f"\nTrain shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
if not test_df.empty:
    print(f"Min time in Train: {train_df[time_col].min()}, Max time in Train: {train_df[time_col].max()}")
    print(f"Min time in Test: {test_df[time_col].min()}, Max time in Test: {test_df[time_col].max()}")
    # Check user/item overlap between the two splits
    train_users_final = set(train_df[user_col_in_df].unique())
    test_users_final = set(test_df[user_col_in_df].unique())
    print(f"Users in Train: {len(train_users_final)}, Users in Test: {len(test_users_final)}")
    # Expected to be 0 if the split function filters test users to those seen in train
    print(f"Users ONLY in Test: {len(test_users_final - train_users_final)}")

    train_items_final = set(train_df[item_col_in_df].unique())
    test_items_final = set(test_df[item_col_in_df].unique())
    print(f"Items in Train: {len(train_items_final)}, Items in Test: {len(test_items_final)}")
    # Expected to be 0 if the split function filters test items to those seen in train
    print(f"Items ONLY in Test: {len(test_items_final - train_items_final)}")

else:
    print("Warning: Test DataFrame is empty!")

In [None]:
# Cell [4] - Train Popularity Model
#
# Fits the popularity baseline on train_df and runs a small smoke-test
# prediction for one user taken from test_df.

# Initialize and train the Popularity model
# Ensure the item_col matches the column name in train_df and test_df
pop_model = PopularityRecommender(
    user_col='id_student',          # <<< Use the actual user column name
    item_col='presentation_id',     # <<< Use the actual item column name
    score_col='implicit_feedback'
)

# Fit the model using the training data
pop_model.fit(train_df)

# (Optional) Test prediction for a sample user/items
if not test_df.empty:
    sample_user = test_df['id_student'].iloc[0]
    sample_items_all = items_df.index.tolist() # Get all item IDs from items_df index
    # Seeded generator so the sampled items (and printed scores) are the same
    # on every run of the notebook.
    sample_rng = np.random.default_rng(config.RANDOM_SEED)
    # Never request more items than are available
    n_sample_items = min(10, len(sample_items_all))
    sample_items_subset = sample_rng.choice(
        sample_items_all, size=n_sample_items, replace=False
    ).tolist()
    print(f"\nTesting prediction for user {sample_user} on items: {sample_items_subset}")
    scores = pop_model.predict(sample_user, sample_items_subset)
    print("Scores (Popularity):", scores)
else:
    print("\nSkipping prediction test as test_df is empty.")

In [None]:
# Cell [5] - Evaluate Popularity Model
#
# Builds the shared RecEvaluator (reused by the later evaluation cells) and
# evaluates the popularity model with it.

# Reset first: if one of the guard branches below fires, a stale evaluator left
# over from an earlier run of this cell must not be picked up by later cells,
# which check `evaluator is not None` before evaluating.
evaluator = None

# items_df must have presentation_id as its index before being passed on
if test_df.empty:
    print("\nCannot evaluate model: Test data is empty.")
elif items_df.index.name != 'presentation_id':
    print("\nError: items_df must have 'presentation_id' set as index for evaluator.")
else:
    evaluator = RecEvaluator(
        train_df=train_df,
        test_df=test_df,
        item_features_df=items_df, # Pass items_df with index set
        user_col='id_student',     # <<< Use the actual user column name
        item_col='presentation_id',# <<< Use the actual item column name
        k=config.TOP_K             # Use K from config
    )

    # Evaluate the popularity model
    # Using n_neg_samples can speed things up significantly for evaluation if needed
    # Set n_neg_samples=100 for faster (approximate) evaluation, or None for full evaluation
    print("\n--- Starting Evaluation of Popularity Model ---")
    pop_results = evaluator.evaluate_model(pop_model, n_neg_samples=100)

    print("\nPopularity Model Evaluation Results:")
    print(pop_results)

In [None]:
# Cell [6] - Train ItemCF Model
#
# Fits the ItemCF model on train_df and prints a small prediction check for
# one user taken from test_df.

# Import the model
from src.models.item_cf import ItemCFRecommender

# Initialize and train the ItemCF model
itemcf_model = ItemCFRecommender(
    user_col='id_student',          # Use the actual user column name
    item_col='presentation_id',     # Use the actual item column name
    score_col='implicit_feedback'
)

# Fit the model using the training data
itemcf_model.fit(train_df)

# (Optional) Test prediction for a sample user/items
if not test_df.empty:
    # Use the first user that appears in the test set
    sample_user_id = test_df['id_student'].iloc[0]
    # Ensure the user exists in the model's mapping
    if sample_user_id in itemcf_model.user_id_to_idx:
        # Get items the user interacted with in train and test for context
        user_train_interactions = train_df[train_df['id_student'] == sample_user_id]['presentation_id'].tolist()
        user_test_interactions = test_df[test_df['id_student'] == sample_user_id]['presentation_id'].tolist()
        print(f"\n--- ItemCF Prediction Test ---")
        print(f"Sample User ID: {sample_user_id}")
        print(f" User's Training Items: {user_train_interactions}")
        print(f" User's Test Items (Ground Truth): {user_test_interactions}")

        # Predict scores for the test items and a few randomly drawn others.
        sample_items_all = items_df.index.tolist()
        # Seeded generator keeps the extra items identical across runs; the
        # size is capped because sampling without replacement raises when
        # more items are requested than exist.
        sample_rng = np.random.default_rng(config.RANDOM_SEED)
        n_extra_items = min(5, len(sample_items_all))
        extra_items = sample_rng.choice(sample_items_all, size=n_extra_items, replace=False).tolist()
        # De-duplicate while keeping a stable order (ground-truth items first)
        items_to_predict = list(dict.fromkeys(user_test_interactions + extra_items))

        print(f" Predicting for Items: {items_to_predict}")
        scores = itemcf_model.predict(sample_user_id, items_to_predict)
        print(" Predicted Scores:", scores)
        print("--- End Prediction Test ---")

    else:
        print(f"Sample user {sample_user_id} not found in ItemCF model training data.")

else:
    print("\nSkipping ItemCF prediction test as test_df is empty.")

In [None]:
# Cell [7] - Evaluate ItemCF Model

# Reuse the evaluator created in the popularity evaluation cell; when it is
# unavailable, report why instead of evaluating.
if 'evaluator' not in locals() or evaluator is None:
    if test_df.empty:
        print("\nCannot evaluate model: Test data is empty.")
    else:
        print("\nError: Evaluator not initialized. Please run Cell [5] successfully first.")
else:
    print("\n--- Starting Evaluation of ItemCF Model ---")
    # n_neg_samples=100 -> evaluate with negative sampling
    itemcf_results = evaluator.evaluate_model(itemcf_model, n_neg_samples=100)

    print("\nItemCF Model Evaluation Results:", itemcf_results, sep="\n")

In [None]:
# Cell [10] - Train ItemCF Model
#
# Fits the ItemCF model on train_df and prints a small prediction check
# (including scores sorted from highest to lowest) for one test user.
# NOTE(review): this repeats the ItemCF training done in Cell [6] and rebinds
# `itemcf_model`; confirm whether both cells are still needed.

# Import the model
from src.models.item_cf import ItemCFRecommender

print("\n--- Training ItemCF Model ---")

# Initialize and train the ItemCF model
itemcf_model = ItemCFRecommender(
    user_col='id_student',          # Use the actual user column name from interactions_df
    item_col='presentation_id',     # Use the actual item column name from interactions_df
    score_col='implicit_feedback'
)

# Fit the model using the training data
# This might take a moment as it calculates the similarity matrix
itemcf_model.fit(train_df)

# (Optional) Test prediction for a sample user/items
if not test_df.empty:
    # Use the first user that appears in the test set
    sample_user_id = test_df['id_student'].iloc[0]

    # Ensure the user exists in the model's mapping
    if sample_user_id in itemcf_model.user_id_to_idx:
        # Get items the user interacted with in train and test for context
        user_train_interactions = train_df[train_df['id_student'] == sample_user_id]['presentation_id'].tolist()
        user_test_interactions = test_df[test_df['id_student'] == sample_user_id]['presentation_id'].tolist()
        print(f"\n--- ItemCF Prediction Test ---")
        print(f"Sample User ID: {sample_user_id}")
        print(f" User's Training Items: {user_train_interactions}")
        print(f" User's Test Items (Ground Truth): {user_test_interactions}")

        # Predict scores for the test items and a few randomly drawn others.
        sample_items_all = items_df.index.tolist()
        # Seeded generator keeps the extra items identical across runs; the
        # size is capped because sampling without replacement raises when
        # more items are requested than exist.
        sample_rng = np.random.default_rng(config.RANDOM_SEED)
        n_extra_items = min(5, len(sample_items_all))
        extra_items = sample_rng.choice(sample_items_all, size=n_extra_items, replace=False).tolist()
        # De-duplicate while keeping a stable order (ground-truth items first)
        items_to_predict = list(dict.fromkeys(user_test_interactions + extra_items))

        print(f" Predicting for Items: {items_to_predict}")
        scores = itemcf_model.predict(sample_user_id, items_to_predict)
        print(" Predicted Scores:", scores)
        # Display scores alongside item IDs, highest score first, for readability
        scored_preds = sorted(zip(items_to_predict, scores), key=lambda pair: pair[1], reverse=True)
        print(" Predicted Scores (Sorted):", scored_preds)
        print("--- End Prediction Test ---")
    else:
        print(f"\nSample user {sample_user_id} not found in ItemCF model training data (this shouldn't happen if test set was filtered correctly).")
else:
    print("\nSkipping ItemCF prediction test as test_df is empty.")

print("\n--- Finished Training ItemCF Model ---")

In [None]:
# Cell [11] - Evaluate ItemCF Model

# Evaluation needs the shared evaluator instance; work out up front whether
# one is available in this session.
evaluator_available = 'evaluator' in locals() and evaluator is not None

if evaluator_available:
    print("\n--- Starting Evaluation of ItemCF Model ---")
    # Negative sampling with 100 samples, matching the other evaluation cells
    itemcf_results = evaluator.evaluate_model(itemcf_model, n_neg_samples=100)

    print("\nItemCF Model Evaluation Results:")
    print(itemcf_results)
else:
    # No evaluator: distinguish an empty test set from a cell that was not run
    if test_df.empty:
        print("\nCannot evaluate ItemCF model: Test data is empty.")
    else:
        print("\nError: Evaluator not initialized. Please run the cell that initializes 'evaluator' successfully first.")

In [None]:
# Cell [12] - Train ALS Model
#
# Fits the implicit ALS wrapper on train_df and prints a small prediction
# check (scores sorted from highest to lowest) for one test user.

# Import the model
from src.models.matrix_factorization import ImplicitALSWrapper

print("\n--- Training Implicit ALS Model ---")

# Initialize and train the ALS model
# Adjust hyperparameters as needed (these are examples)
als_model = ImplicitALSWrapper(
    user_col='id_student',
    item_col='presentation_id',
    score_col='implicit_feedback',
    factors=50,           # Latent factors
    regularization=0.05,  # Regularization
    iterations=25,        # Iterations
    random_state=config.RANDOM_SEED
)

# Fit the model using the training data
# This will take longer than Popularity or ItemCF
als_model.fit(train_df)

# (Optional) Test prediction for a sample user/items
if not test_df.empty:
    # Use the first user that appears in the test set
    sample_user_id = test_df['id_student'].iloc[0]
    if sample_user_id in als_model.user_id_to_idx:
        # Get items the user interacted with in train and test for context
        user_train_interactions = train_df[train_df['id_student'] == sample_user_id]['presentation_id'].tolist()
        user_test_interactions = test_df[test_df['id_student'] == sample_user_id]['presentation_id'].tolist()
        print(f"\n--- ALS Prediction Test ---")
        print(f"Sample User ID: {sample_user_id}")
        print(f" User's Training Items: {user_train_interactions}")
        print(f" User's Test Items (Ground Truth): {user_test_interactions}")

        # Predict scores for the test items and a few randomly drawn others.
        sample_items_all = items_df.index.tolist()
        # Seeded generator keeps the extra items identical across runs; the
        # size is capped because sampling without replacement raises when
        # more items are requested than exist.
        sample_rng = np.random.default_rng(config.RANDOM_SEED)
        n_extra_items = min(5, len(sample_items_all))
        extra_items = sample_rng.choice(sample_items_all, size=n_extra_items, replace=False).tolist()
        # De-duplicate while keeping a stable order (ground-truth items first)
        items_to_predict = list(dict.fromkeys(user_test_interactions + extra_items))

        print(f" Predicting for Items: {items_to_predict}")
        scores = als_model.predict(sample_user_id, items_to_predict)
        # Pair each item with its score, highest score first
        scored_preds = sorted(zip(items_to_predict, scores), key=lambda pair: pair[1], reverse=True)
        print(" Predicted Scores (Sorted):", scored_preds)
        print("--- End Prediction Test ---")
    else:
        print(f"\nSample user {sample_user_id} not found in ALS model training data.")
else:
    print("\nSkipping ALS prediction test as test_df is empty.")

print("\n--- Finished Training Implicit ALS Model ---")

In [None]:
# Cell [13] - Evaluate ALS Model

# Score the ALS model with the shared evaluator when one exists; otherwise
# explain why evaluation is being skipped.
if 'evaluator' not in locals() or evaluator is None:
    if not test_df.empty:
        print("\nError: Evaluator not initialized. Please run the cell that initializes 'evaluator' successfully first.")
    else:
        print("\nCannot evaluate ALS model: Test data is empty.")
else:
    print("\n--- Starting Evaluation of Implicit ALS Model ---")
    # n_neg_samples=100 -> evaluate with negative sampling
    als_results = evaluator.evaluate_model(als_model, n_neg_samples=100)

    print("\nImplicit ALS Model Evaluation Results:")
    print(als_results)