In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from pathlib import Path
import math # Import math

# Add project root to sys.path
project_root = Path.cwd().parent # Should be RECSYS_FINAL
src_path = project_root / "src"
sys.path.append(str(project_root)) # Add project root for imports like 'src.config'

# Import project modules
from src import config
from src.data import preprocess # For time_based_split
from src.evaluation.evaluator import RecEvaluator # Import the evaluator class
from src.models.popularity import PopularityRecommender # Import the model

# Set display options
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
sns.set_style("whitegrid")
print("Setup complete. Modules imported.")
print(f"Project Root: {project_root}")
print(f"Processed Data Dir: {config.PROCESSED_DATA_DIR}")

Loading .env from: /Users/mohit/Desktop/everything/ATLAS/Semester 4/Pinnacle/recsys_final/.env
Database URI configured: Yes
Setup complete. Modules imported.
Project Root: /Users/mohit/Desktop/everything/ATLAS/Semester 4/Pinnacle/recsys_final
Processed Data Dir: /Users/mohit/Desktop/everything/ATLAS/Semester 4/Pinnacle/recsys_final/data/processed


In [2]:
# Load the processed parquet files
try:
    interactions_df = pd.read_parquet(config.PROCESSED_DATA_DIR / "interactions_final.parquet")
    users_df = pd.read_parquet(config.PROCESSED_DATA_DIR / "users_final.parquet")
    items_df = pd.read_parquet(config.PROCESSED_DATA_DIR / "items_final.parquet") # Contains presentation_id as column
    print("Processed data loaded successfully.")
    print(f"Interactions shape: {interactions_df.shape}")
    print(f"Users shape: {users_df.shape}")
    print(f"Items shape: {items_df.shape}")

    # Set presentation_id as index for items_df if needed later (evaluator uses it)
    if 'presentation_id' in items_df.columns:
        items_df = items_df.set_index('presentation_id')
        print("Set 'presentation_id' as index for items_df.")

except FileNotFoundError as e:
    print(f"Error loading processed files: {e}")
    print("Please ensure the preprocessing pipeline (run_preprocessing.py) has been run successfully.")
    # Stop execution or handle error
    raise e
except Exception as e:
    print(f"An unexpected error occurred during loading: {e}")
    raise e

# Display heads
print("\nInteractions Head:\n", interactions_df.head())
print("\nUsers Head:\n", users_df.head())
print("\nItems Head:\n", items_df.head())

Processed data loaded successfully.
Interactions shape: (28466, 7)
Users shape: (25364, 9)
Items shape: (22, 22)
Set 'presentation_id' as index for items_df.

Interactions Head:
    id_student presentation_id  total_clicks  interaction_days  \
0        6516       AAA_2014J          2791               159   
1        8462       DDD_2013J           646                56   
2        8462       DDD_2014J            10                 1   
3       11391       AAA_2013J           934                40   
4       23629       BBB_2013B           161                16   

   first_interaction_date  last_interaction_date  implicit_feedback  
0                     -23                    269           7.934513  
1                      -6                    118           6.472346  
2                      10                     10           2.397895  
3                      -5                    253           6.840547  
4                      -6                     87           5.087596  

Users Hea

In [4]:
# Cell 3: Time-Based Split (Corrected)

# Perform time-based split on the aggregated interactions data
time_col = 'last_interaction_date'
user_col_in_df = 'id_student'      # <<< Actual column name in interactions_df
item_col_in_df = 'presentation_id' # <<< Actual column name in interactions_df

if time_col not in interactions_df.columns:
    raise ValueError(f"Time column '{time_col}' not found in interactions data.")
if user_col_in_df not in interactions_df.columns:
    raise ValueError(f"User column '{user_col_in_df}' not found in interactions data.")
if item_col_in_df not in interactions_df.columns:
    raise ValueError(f"Item column '{item_col_in_df}' not found in interactions data.")
if not pd.api.types.is_numeric_dtype(interactions_df[time_col]):
     raise TypeError(f"Time column '{time_col}' must be numeric.")

# Using split ratio for demonstration (adjust ratio as needed)
train_df, test_df = preprocess.time_based_split(
    interactions_df=interactions_df,
    user_col=user_col_in_df,         # <<< Pass the correct user column name
    item_col=item_col_in_df,         # <<< Pass the correct item column name
    split_ratio=0.8,
    time_col=time_col
)

# Verify the split
print(f"\nTrain shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
if not test_df.empty:
    print(f"Min time in Train: {train_df[time_col].min()}, Max time in Train: {train_df[time_col].max()}")
    print(f"Min time in Test: {test_df[time_col].min()}, Max time in Test: {test_df[time_col].max()}")
else:
    print("Warning: Test DataFrame is empty!")

Performing time-based split...
Original interactions shape: (28466, 7)
Splitting based on ratio per user: 80.0% train
 Initial train size: 28465, Initial test size: 1
Final Training set shape: (28465, 7)
Final Test set shape: (1, 7)
Users in Train: 25364, Users in Test: 1
Items in Train: 22, Items in Test: 1

Train shape: (28465, 7)
Test shape: (1, 7)
Min time in Train: -25, Max time in Train: 269
Min time in Test: 88, Max time in Test: 88


In [6]:
# Cell [4] - Train Popularity Model

# Initialize and train the Popularity model
# Ensure the item_col matches the column name in train_df and test_df
pop_model = PopularityRecommender(
    user_col='id_student',          # <<< Use the actual user column name
    item_col='presentation_id',     # <<< Use the actual item column name
    score_col='implicit_feedback'
)

# Fit the model using the training data
pop_model.fit(train_df)

# (Optional) Test prediction for a sample user/items
if not test_df.empty:
    sample_user = test_df['id_student'].iloc[0]
    sample_items_all = items_df.index.tolist() # Get all unique item IDs from items_df index
    sample_items_subset = np.random.choice(sample_items_all, min(10, len(sample_items_all)), replace=False).tolist() # Ensure not sampling more than available
    print(f"\nTesting prediction for user {sample_user} on items: {sample_items_subset}")
    scores = pop_model.predict(sample_user, sample_items_subset)
    print("Scores (Popularity):", scores)
else:
    print("\nSkipping prediction test as test_df is empty.")

Fitting PopularityRecommender...
Fit complete. Calculated popularity for 22 items.
Top 5 most popular items: ['FFF_2013J', 'FFF_2014J', 'CCC_2014J', 'BBB_2014J', 'FFF_2013B']

Testing prediction for user 584077 on items: ['CCC_2014B', 'BBB_2014J', 'GGG_2013J', 'DDD_2013J', 'FFF_2013B', 'BBB_2013J', 'DDD_2014J', 'GGG_2014J', 'DDD_2013B', 'AAA_2014J']
Scores (Popularity): [10208.63075689 11428.69371655  5186.41032883 10853.63605915
 11041.97961388 10596.41647959  9711.1514301   3994.78316198
  7720.85371512  2440.3889407 ]


In [7]:
# Cell [5] - Evaluate Popularity Model

# Initialize the evaluator
# Ensure items_df has presentation_id as index before passing
if test_df.empty:
     print("\nCannot evaluate model: Test data is empty.")
elif items_df.index.name != 'presentation_id':
     print("\nError: items_df must have 'presentation_id' set as index for evaluator.")
else:
    evaluator = RecEvaluator(
        train_df=train_df,
        test_df=test_df,
        item_features_df=items_df, # Pass items_df with index set
        user_col='id_student',     # <<< Use the actual user column name
        item_col='presentation_id',# <<< Use the actual item column name
        k=config.TOP_K             # Use K from config
    )

    # Evaluate the popularity model
    # Using n_neg_samples can speed things up significantly for evaluation if needed
    # Set n_neg_samples=100 for faster (approximate) evaluation, or None for full evaluation
    print("\n--- Starting Evaluation of Popularity Model ---")
    pop_results = evaluator.evaluate_model(pop_model, n_neg_samples=100)

    print("\nPopularity Model Evaluation Results:")
    print(pop_results)

Evaluator initialized with 22 unique candidate items.
Stored 28465 training interactions for filtering.
Prepared test data for 1 users.

--- Starting Evaluation of Popularity Model ---

--- Evaluating Model: PopularityRecommender ---


Evaluating users:   0%|          | 0/1 [00:00<?, ?it/s]

 User 584077: Scoring 1 positives + 17 negatives.

--- Evaluation Results (K=10) ---
Precision@10: 0.0000
Recall@10: 0.0000
NDCG@10: 0.0000
n_users_evaluated: 1.0000
------------------------------

Popularity Model Evaluation Results:
{'Precision@10': 0.0, 'Recall@10': 0.0, 'NDCG@10': 0.0, 'n_users_evaluated': 1}
