In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from pathlib import Path

# Make the project root (not `src` itself) importable so that the
# `from src import ...` statements below can resolve.
# NOTE(review): assumes the kernel's working directory is a direct child of the
# project root (e.g. RECSYS_FINAL/notebooks) — confirm before relying on it.
project_root = Path.cwd().parent
# Only append when missing, so re-running this cell does not pile up duplicate
# entries on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import specific modules/functions we want to test
from src import config
from src.data import load_raw
from src.data import utils
from src.data import preprocess  # Import the main preprocessing module

# Notebook-wide display configuration: let pandas show up to 100 columns and
# 100 rows, and switch seaborn to its "whitegrid" style.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)
sns.set_style("whitegrid")
print("Setup complete. Modules imported.")

In [None]:
# Test loading all raw data.
# `raw_data` is treated below as a mapping of table name -> object with a
# `.shape` attribute; every later cell in this notebook indexes into it.
try:
    raw_data = load_raw.load_all_raw_data()
except Exception as e:
    print(f"Error loading raw data: {e}")
    # Re-raise instead of swallowing: the following cells cannot run without
    # `raw_data`, and continuing would either fail later with an unrelated
    # NameError or silently reuse a stale `raw_data` left in the kernel.
    raise
else:
    print("\nRaw data loaded successfully into 'raw_data' dictionary.")
    # Display shapes
    for name, df in raw_data.items():
        print(f"- {name}: {df.shape}")

In [None]:
# Test cleaning functions (one by one)
# Each raw table in `raw_data` is passed to its matching `preprocess.clean_*`
# function and the result is bound to its own `*_clean` name. This cell needs
# the raw-data loading cell to have run first, since every call indexes
# `raw_data` by table name.
print("--- Testing Cleaning Functions ---")
# Used later by the user-feature generation cell.
student_info_clean = preprocess.clean_student_info(raw_data['student_info'])
# Used later by the registration-filtering cell.
registrations_clean = preprocess.clean_registrations(raw_data['student_registration'])
# The two assessment results are not referenced by any later cell visible in
# this notebook; they are produced here only to exercise the cleaners.
assessments_clean = preprocess.clean_assessments(raw_data['assessments'])
student_assessment_clean = preprocess.clean_student_assessment(raw_data['student_assessment'])
# Used later by the item-feature generation cell.
vle_clean = preprocess.clean_vle(raw_data['vle'])
# Used later by the registration-filtering cell as the interaction source.
student_vle_clean = preprocess.clean_student_vle(raw_data['student_vle'])
print("--- Finished Testing Cleaning Functions ---")
# Optional: Print heads/info if needed for debugging
# (uncomment any of the lines below to preview the corresponding cleaned table)
# print("\nCleaned studentInfo Head:\n", student_info_clean.head())
# print("\nCleaned registrations Head:\n", registrations_clean.head())
# print("\nCleaned assessments Head:\n", assessments_clean.head())
# print("\nCleaned studentAssessment Head:\n", student_assessment_clean.head())
# print("\nCleaned vle Head:\n", vle_clean.head())
# print("\nCleaned studentVle Head:\n", student_vle_clean.head())

In [None]:
# Exercise the registration-based filter: the cleaned student-VLE interactions
# are filtered against the cleaned registrations table.
print("\n--- Testing Registration Filtering ---")
interactions_filtered = preprocess.filter_interactions_by_registration(student_vle_clean, registrations_clean)
# Preview the first rows, then report the resulting shape.
filtered_preview = interactions_filtered.head()
print("\nFiltered Interactions Head:\n", filtered_preview)
filtered_shape = interactions_filtered.shape
print(f"\nShape after filtering by registration: {filtered_shape}")
print("--- Finished Testing Registration Filtering ---")

In [None]:
# Exercise the interaction-count filters on the registration-filtered
# interactions (this runs BEFORE aggregation).
print("\n--- Testing Interaction Count Filtering ---")
# No thresholds are passed, so the function's own defaults apply
# (per the original note, these come from config.py).
interactions_count_filtered = preprocess.apply_interaction_count_filters(interactions_filtered)
count_filtered_preview = interactions_count_filtered.head()
print("\nInteractions after Interaction Count Filters Head:\n", count_filtered_preview)
print(f"\nShape after interaction count filters: {interactions_count_filtered.shape}")
# Distinct users / items that survived the filters.
users_remaining = interactions_count_filtered['id_student'].nunique()
print(f"\nUnique users remaining: {users_remaining}")
items_remaining = interactions_count_filtered['presentation_id'].nunique()
print(f"Unique items remaining: {items_remaining}")
print("--- Finished Testing Interaction Count Filtering ---")

In [None]:
# Exercise interaction aggregation on the count-filtered interactions
# (this runs AFTER count filtering).
print("\n--- Testing Interaction Aggregation ---")
aggregated_interactions = preprocess.create_interaction_features(interactions_count_filtered)
print("\nAggregated Interactions Head:\n", aggregated_interactions.head())
print(f"\nShape of aggregated interactions: {aggregated_interactions.shape}")

# Histogram of the implicit feedback score; nothing to draw when the
# aggregation produced no rows.
if aggregated_interactions.empty:
    print("\nAggregated interactions DataFrame is empty, skipping plot.")
else:
    plt.figure(figsize=(10, 5))
    feedback_scores = aggregated_interactions['implicit_feedback']
    sns.histplot(feedback_scores, bins=50, kde=True)
    plt.title('Distribution of Implicit Feedback Score (log1p(total_clicks))')
    plt.xlabel('Implicit Feedback Score')
    plt.ylabel('Count')
    plt.show()
print("--- Finished Testing Interaction Aggregation ---")

In [None]:
print("\n--- Testing User Feature Generation ---")
# Generate user features only for the users still present *after filtering and
# aggregation*; with no aggregated interactions there is nothing to generate.
if not aggregated_interactions.empty:
    valid_user_ids_test = aggregated_interactions['id_student'].unique()
    print(f"Number of valid users for feature generation: {len(valid_user_ids_test)}")
    users_features_test = preprocess.generate_user_features(
        student_info_clean, # Pass the cleaned student info
        valid_user_ids_test # Pass the array of valid IDs
    )
    print("\nUser Features Head:\n", users_features_test.head())
    print(f"\nShape of user features: {users_features_test.shape}")
    # Expect exactly one feature row per valid user. The message reports both
    # counts so a failure is diagnosable without re-running the cell.
    assert users_features_test.shape[0] == len(valid_user_ids_test), (
        f"User feature rows ({users_features_test.shape[0]}) do not match "
        f"the number of valid users ({len(valid_user_ids_test)})"
    )
else:
    print("Skipping user feature generation as aggregated_interactions is empty.")
    users_features_test = pd.DataFrame() # Keep the name defined for later cells
print("--- Finished Testing User Feature Generation ---")


In [None]:
print("\n--- Testing Item Feature Generation ---")
# Generate item features only for the items still present *after filtering and
# aggregation*; with no aggregated interactions there is nothing to generate.
if not aggregated_interactions.empty:
    valid_item_ids_test = aggregated_interactions['presentation_id'].unique()
    print(f"Number of valid items for feature generation: {len(valid_item_ids_test)}")
    # The item features need a courses table carrying presentation_id. It is
    # built here from the RAW courses table (no clean_* step is applied to it).
    courses_with_pres_id = utils.create_presentation_id(raw_data['courses'])
    items_features_test = preprocess.generate_item_features(
        courses_with_pres_id, # Raw courses with presentation_id added
        vle_clean, # Pass cleaned VLE info
        valid_item_ids_test # Pass the array of valid IDs
    )
    print("\nItem Features Head:\n", items_features_test.head())
    print(f"\nShape of item features: {items_features_test.shape}")
    # Expect exactly one feature row per valid item. The message reports both
    # counts so a failure is diagnosable without re-running the cell.
    assert items_features_test.shape[0] == len(valid_item_ids_test), (
        f"Item feature rows ({items_features_test.shape[0]}) do not match "
        f"the number of valid items ({len(valid_item_ids_test)})"
    )
else:
    print("Skipping item feature generation as aggregated_interactions is empty.")
    items_features_test = pd.DataFrame() # Keep the name defined for later cells
print("--- Finished Testing Item Feature Generation ---")

In [9]:
# Optional: test the full pipeline end-to-end (uncomment the lines below to run)
# print("\n--- Testing full preprocess_all_data() function ---")
# processed_data_test = preprocess.preprocess_all_data()
# print("\n--- Full pipeline test finished ---")
# print(f"Final Users shape: {processed_data_test['users'].shape}")
# print(f"Final Items shape: {processed_data_test['items'].shape}")
# print(f"Final Interactions shape: {processed_data_test['interactions'].shape}")