In [1]:
# ===== 1) Imports =====
import pandas as pd
import numpy as np
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score, recall_at_k
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
import pickle
import time
import warnings
warnings.filterwarnings('ignore')



In [2]:
# ===== 2) Load Data =====
# Load the MovieLens ratings file containing user‚Äìmovie interactions and ratings
ratings = pd.read_csv("ratings.csv")

# Load the movies file containing movie metadata such as titles and genres
movies = pd.read_csv("movies.csv")

# Load the tags file containing user-generated tags describing movie content
tags = pd.read_csv("tags.csv")

# Display the dimensions of each dataset to verify successful loading
# and understand the scale of the data
print(f"Ratings data: {ratings.shape}")
print(f"Movies data: {movies.shape}")
print(f"Tags data: {tags.shape}")

Ratings data: (100836, 4)
Movies data: (9742, 3)
Tags data: (3683, 4)


In [6]:
# ===== 3) Detect Bias =====
# This function detects genre imbalance by comparing the most frequent
# genre (major) to the least frequent genre (minor) in the dataset
def major_minor_ratio_genres(movies_df):
    # Create a copy of the dataframe to avoid modifying the original data
    movies_df = movies_df.copy()
    
    # Split the genre string into a list of individual genres
    movies_df["genres"] = movies_df["genres"].str.split("|")
    
    # Count the occurrence of each genre across all movies
    genre_counts = {}
    for genre_list in movies_df["genres"]:
        if isinstance(genre_list, list):
            for g in genre_list:
                # Exclude placeholder entries that do not represent real genres
                if g != "(no genres listed)":
                    genre_counts[g] = genre_counts.get(g, 0) + 1
    
    # Convert the genre counts dictionary into a Pandas Series for easier analysis
    genre_counts = pd.Series(genre_counts)
    
    # Identify the most common (major) and least common (minor) genres
    major = genre_counts.max()
    minor = genre_counts.min()
    
    # Compute the major‚Äìminor ratio as an indicator of genre imbalance
    ratio = major / minor
    
    # Print detailed diagnostic information for bias analysis
    print("\n" + "="*60)
    print("üìä BIAS DETECTION - Major-Minor Ratio")
    print("="*60)
    
    # Display the most frequent genres (top 10)
    print("üé≠ Genre Counts (Top 10):")
    print(genre_counts.sort_values(ascending=False).head(10))
    
    # Display the least frequent genres (bottom 10)
    print("\nüìâ Genre Counts (Bottom 10):")
    print(genre_counts.sort_values().head(10))
    
    # Report the imbalance statistics
    print(f"\nüü¶ Major‚ÄìMinor Ratio: {ratio:.2f}")
    print(f"Most common genre has: {major} movies")
    print(f"Least common genre has: {minor} movies")
    
    # Interpret the imbalance severity using a simple threshold
    if ratio > 10:
        print("‚ö†Ô∏è Warning: Strong genre imbalance detected!")
    else:
        print("‚úÖ Genre distribution is reasonably balanced.")
    
    # Return the imbalance ratio and full genre count distribution
    return ratio, genre_counts

# Run the bias detection function on the movies dataset
ratio, genre_counts = major_minor_ratio_genres(movies)



üìä BIAS DETECTION - Major-Minor Ratio
üé≠ Genre Counts (Top 10):
Drama        4361
Comedy       3756
Thriller     1894
Action       1828
Romance      1596
Adventure    1263
Crime        1199
Sci-Fi        980
Horror        978
Fantasy       779
dtype: int64

üìâ Genre Counts (Bottom 10):
Film-Noir       87
IMAX           158
Western        167
Musical        334
War            382
Documentary    440
Mystery        573
Animation      611
Children       664
Fantasy        779
dtype: int64

üü¶ Major‚ÄìMinor Ratio: 50.13
Most common genre has: 4361 movies
Least common genre has: 87 movies


In [7]:
# ===== 4) Calculate IDF Weights =====
# This section applies IDF-based reweighting to reduce genre imbalance
# by assigning higher weights to rare genres and lower weights to common ones

print("\n" + "="*60)
print("‚öñÔ∏è  APPLYING IDF-BASED REWEIGHTING")
print("="*60)

# Compute the total number of movies with valid genre information
total_movies = len(movies[movies['genres'] != "(no genres listed)"])

# Dictionary to store IDF weights for each genre
genre_weights = {}

# Calculate the IDF weight for each genre based on its frequency
for genre, count in genre_counts.items():
    # IDF formula: log(total_movies / number_of_movies_with_genre)
    idf_weight = np.log(total_movies / count)
    genre_weights[genre] = idf_weight

# Display the computed IDF weights for interpretability
print("\nGenre Weights (IDF):")

# Sort genres by IDF weight in descending order
sorted_weights = sorted(genre_weights.items(), key=lambda x: x[1], reverse=True)

# Show genres with the highest IDF weights (rare genres that are boosted)
print("\nTop 5 Highest Weights (rare genres - boosted):")
for genre, weight in sorted_weights[:5]:
    count = genre_counts[genre]
    print(f"   {genre:20s}: weight={weight:.3f} (appears in {count} movies)")

# Show genres with the lowest IDF weights (common genres that are down-weighted)
print("\nTop 5 Lowest Weights (common genres - reduced):")
for genre, weight in sorted_weights[-5:]:
    count = genre_counts[genre]
    print(f"   {genre:20s}: weight={weight:.3f} (appears in {count} movies)")


‚öñÔ∏è  APPLYING IDF-BASED REWEIGHTING

Genre Weights (IDF):

Top 5 Highest Weights (rare genres - boosted):
   Film-Noir           : weight=4.715 (appears in 87 movies)
   IMAX                : weight=4.118 (appears in 158 movies)
   Western             : weight=4.063 (appears in 167 movies)
   Musical             : weight=3.370 (appears in 334 movies)
   War                 : weight=3.235 (appears in 382 movies)

Top 5 Lowest Weights (common genres - reduced):
   Romance             : weight=1.805 (appears in 1596 movies)
   Action              : weight=1.670 (appears in 1828 movies)
   Thriller            : weight=1.634 (appears in 1894 movies)
   Comedy              : weight=0.950 (appears in 3756 movies)
   Drama               : weight=0.800 (appears in 4361 movies)


In [8]:
# ===== 5) Prepare Content Features =====
# Split the genre string into lists so each movie can have multiple genres
movies['genres'] = movies['genres'].str.split('|')

# Collect all unique genres across the dataset
all_genres = set()
for genre_list in movies['genres']:
    if isinstance(genre_list, list):
        for genre in genre_list:
            # Exclude placeholder values that do not represent real genres
            if genre != "(no genres listed)":
                all_genres.add(genre)

# Select the top 50 most frequent user-generated tags
# These tags act as additional content features for the hybrid model
top_tags = tags['tag'].value_counts().head(50).index.tolist()

# Display summary statistics for content features
print(f"\nTotal unique genres: {len(all_genres)}")
print(f"Top 50 tags: {top_tags[:10]}...")

# ===== 6) Prepare Dataset =====
# Define the threshold for converting explicit ratings into positive implicit feedback
RATING_THRESHOLD = 4.0

# Filter ratings to keep only positive interactions
positive = ratings[ratings["rating"] >= RATING_THRESHOLD].copy()

# Display dataset preparation summary
print(f"\n{'='*60}")
print(f"DATASET PREPARATION")
print(f"{'='*60}")
print(f"Total ratings: {len(ratings)}")
print(f"Positive ratings (>= {RATING_THRESHOLD}): {len(positive)}")

# Extract unique users and movies from the ratings data
all_users = ratings["userId"].unique()
all_items = ratings["movieId"].unique()

# Display dataset cardinality
print(f"Unique users: {len(all_users)}")
print(f"Unique movies in ratings: {len(all_items)}")

# Identify movies that appear in both the ratings and movies metadata
movies_in_ratings = movies[movies['movieId'].isin(all_items)]
print(f"Movies common to both ratings and movies: {len(movies_in_ratings)}")

# Create and initialize the LightFM Dataset object
dataset = Dataset()

# Fit the dataset with users, items, and combined content features
# Content features include both genres and top user-generated tags
dataset.fit(
    users=all_users,
    items=all_items,
    item_features=list(all_genres) + top_tags
)


Total unique genres: 19
Top 50 tags: ['In Netflix queue', 'atmospheric', 'thought-provoking', 'superhero', 'funny', 'surreal', 'Disney', 'religion', 'sci-fi', 'quirky']...

DATASET PREPARATION
Total ratings: 100836
Positive ratings (>= 4.0): 48580
Unique users: 610
Unique movies in ratings: 9724
Movies common to both ratings and movies: 9724


In [9]:
# ===== 7) Build Interactions with Weighted Features =====
# This function builds the user‚Äìitem interaction matrix and applies
# genre-based reweighting to item features to mitigate genre imbalance
def prepare_features_with_reweighting(genre_weights, reweight_strength=1.0):
    """
    Build item features with genre reweighting
    
    Args:
        genre_weights: Dictionary containing IDF-based weights for each genre
        reweight_strength: Controls how strongly the reweighting is applied
                            (0 = no reweighting, 1 = full reweighting)
    """
    
    # Build the user‚Äìitem interaction matrix using positive implicit feedback
    interactions, _ = dataset.build_interactions(
        [(row.userId, row.movieId) for row in positive.itertuples(index=False)]
    )
    
    # Prepare a list to store weighted item features for each movie
    item_features_list = []
    
    # Iterate over all movies in the dataset
    for movie_id in all_items:
        # Retrieve the genre list associated with the current movie
        movie_genres = movies[movies['movieId'] == movie_id]['genres']
        
        # Dictionary to store weighted genre features for the current movie
        weighted_features = {}
        
        if len(movie_genres) > 0:
            genres_str = movie_genres.iloc[0]
            if isinstance(genres_str, list):
                for genre in genres_str:
                    # Ignore placeholder values that do not represent real genres
                    if genre != "(no genres listed)":
                        # Retrieve the base IDF weight for the genre
                        base_weight = genre_weights.get(genre, 1.0)
                        
                        # Adjust the weight based on the chosen reweighting strength
                        weight = 1.0 + (base_weight - 1.0) * reweight_strength
                        
                        # Store the weighted genre feature using the required dictionary format
                        weighted_features[genre] = weight
        
        # Append the movie ID and its weighted features to the feature list
        item_features_list.append((movie_id, weighted_features))
    
    # Report successful construction of weighted features
    print(f"‚úÖ Built weighted features for {len(item_features_list)} movies")
    
    # Convert the weighted feature list into a sparse item feature matrix
    item_features_matrix = dataset.build_item_features(item_features_list)
    
    # Return the interaction matrix and the weighted item feature matrix
    return interactions, item_features_matrix

# Build interactions and item features using controlled genre reweighting
interactions, item_features = prepare_features_with_reweighting(
    genre_weights, 
    reweight_strength=0.7  # Controls how strongly genre imbalance is corrected
)


‚úÖ Built weighted features for 9724 movies


In [10]:
# ===== 8) Final check =====
# Perform a final consistency check before training the hybrid recommendation model

# Print a summary header for the final validation step
print(f"\nüîç Final check:")

# Display the total number of unique movies used in the dataset
print(f"   all_items: {len(all_items)} movies")

# Display the number of rows in the item feature matrix
# Each row should correspond to exactly one movie
print(f"   item_features: {item_features.shape[0]} rows")

# Verify that the number of movies matches the number of item feature rows
print(f"   Match: {len(all_items) == item_features.shape[0]}")

# If the counts match, the dataset is correctly prepared for hybrid modeling
if len(all_items) == item_features.shape[0]:
    print("‚úÖ READY FOR HYBRID RECOMMENDATIONS!")
else:
    # If the counts do not match, there is a misalignment that must be fixed
    print("‚ùå NEED TO FIX ITEM FEATURES!")


üîç Final check:
   all_items: 9724 movies
   item_features: 9724 rows
   Match: True
‚úÖ READY FOR HYBRID RECOMMENDATIONS!


In [11]:
# ===== 9) Split Data =====
# Split the positive implicit interactions into training and testing sets
# using a fixed random seed to ensure reproducibility
train_df, test_df = train_test_split(
    positive,
    test_size=0.2,
    random_state=42
)

# Display the number of interactions in each split
print(f"\nTraining data: {len(train_df)} interactions")
print(f"Testing data: {len(test_df)} interactions")

# Ensure that all users and items in the test set
# also appear in the training set
# This avoids cold-start issues during evaluation
train_users = set(train_df["userId"].unique())
train_items = set(train_df["movieId"].unique())

# Filter the test set to keep only valid user‚Äìitem pairs
test_df = test_df[
    test_df["userId"].isin(train_users) & 
    test_df["movieId"].isin(train_items)
].copy()

# Display the size of the filtered test set
print(f"Testing data after filtering: {len(test_df)} interactions")

# Helper function to convert a dataframe of interactions
# into a LightFM-compatible interaction matrix
def prepare_interactions(df):
    return dataset.build_interactions(
        [(row.userId, row.movieId) for row in df.itertuples(index=False)]
    )[0]

# Build the final interaction matrices for training and testing
train = prepare_interactions(train_df)
test = prepare_interactions(test_df)

# Display the shapes of the interaction matrices and item feature matrix
print(f"Train interactions shape: {train.shape}")
print(f"Test interactions shape: {test.shape}")
print(f"Item features shape: {item_features.shape}")


Training data: 38864 interactions
Testing data: 9716 interactions
Testing data after filtering: 9142 interactions
Train interactions shape: (610, 9724)
Test interactions shape: (610, 9724)
Item features shape: (9724, 9793)


In [12]:
# ===== 10) Evaluation Function =====
# This function evaluates the recommendation model using ranking-based metrics
# on both training and testing interaction matrices
def evaluate_model(model, train_interactions, test_interactions, item_features, k=10):
    
    # -------- Evaluation on training data --------
    # Compute Precision@k on the training set
    prec_tr = precision_at_k(
        model, 
        train_interactions, 
        item_features=item_features, 
        k=k, 
        num_threads=1
    ).mean()
    
    # Compute AUC on the training set to measure overall ranking quality
    auc_tr = auc_score(
        model, 
        train_interactions, 
        item_features=item_features, 
        num_threads=1
    ).mean()
    
    # Compute Recall@k on the training set to measure retrieval completeness
    rec_tr = recall_at_k(
        model, 
        train_interactions, 
        item_features=item_features,
        k=k, 
        num_threads=1
    ).mean()   # Recall@k on training data

    # -------- Evaluation on test data --------
    # Compute Precision@k on the test set using the training interactions
    # as the reference for known positives
    prec_te = precision_at_k(
        model, 
        test_interactions, 
        train_interactions=train_interactions,
        item_features=item_features, 
        k=k, 
        num_threads=1
    ).mean()
    
    # Compute AUC on the test set to assess generalization performance
    auc_te = auc_score(
        model, 
        test_interactions, 
        train_interactions=train_interactions,
        item_features=item_features, 
        num_threads=1
    ).mean()
    
    # Compute Recall@k on the test set
    rec_te = recall_at_k(
        model, 
        test_interactions, 
        train_interactions=train_interactions,
        item_features=item_features,
        k=k, 
        num_threads=1
    ).mean()  # Recall@k on test data

    # -------- Print evaluation results --------
    # Display Precision@k for both training and testing sets
    print(
        f"Precision@{k}: train {prec_tr:.4f} ({prec_tr*100:.2f}%), "
        f"test {prec_te:.4f} ({prec_te*100:.2f}%)"
    )
    
    # Display Recall@k for both training and testing sets
    print(
        f"Recall@{k}:    train {rec_tr:.4f} ({rec_tr*100:.2f}%), "
        f"test {rec_te:.4f} ({rec_te*100:.2f}%)  (Recommendation Accuracy)"
    )
    
    # Display AUC for both training and testing sets
    print(f"AUC:           train {auc_tr:.4f}, test {auc_te:.4f}")

In [13]:
# ===== 11) Train Hybrid Model with Epoch-level Checkpointing =====
# This section trains the hybrid LightFM model using epoch-level training
# and saves a checkpoint after each epoch to ensure recoverability

print("\n" + "="*50)
print("Hybrid Model Training (WARP + IDF-Weighted Genres)")
print("Checkpoint saved after each epoch")
print("="*50)

import pickle
from lightfm import LightFM

# Define the total number of training epochs
num_epochs = 15

# Initialize the LightFM hybrid model with WARP loss
# WARP is chosen to optimize ranking quality in top-N recommendations
model_hybrid = LightFM(
    loss="warp",
    learning_rate=0.05,
    random_state=42
)

# Train the model incrementally, one epoch at a time
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # Perform one epoch of training using user‚Äìitem interactions
    # and IDF-weighted item features
    model_hybrid.fit_partial(
        train,
        item_features=item_features,
        epochs=1,
        num_threads=1
    )

    # Save a model checkpoint after each epoch
    # This allows recovery in case of interruption
    with open("lightfm_hybrid_checkpoint.pkl", "wb") as f:
        pickle.dump(model_hybrid, f)

    print("üíæ Checkpoint saved")

# ===== 11a) Final Evaluation =====
# Evaluate the final trained model on both training and testing data
# using ranking-based metrics (Precision@k, Recall@k, and AUC)
evaluate_model(model_hybrid, train, test, item_features, k=10)


Hybrid Model Training (WARP + IDF-Weighted Genres)
Checkpoint saved after each epoch

Epoch 1/15
üíæ Checkpoint saved

Epoch 2/15
üíæ Checkpoint saved

Epoch 3/15
üíæ Checkpoint saved

Epoch 4/15
üíæ Checkpoint saved

Epoch 5/15
üíæ Checkpoint saved

Epoch 6/15
üíæ Checkpoint saved

Epoch 7/15
üíæ Checkpoint saved

Epoch 8/15
üíæ Checkpoint saved

Epoch 9/15
üíæ Checkpoint saved

Epoch 10/15
üíæ Checkpoint saved

Epoch 11/15
üíæ Checkpoint saved

Epoch 12/15
üíæ Checkpoint saved

Epoch 13/15
üíæ Checkpoint saved

Epoch 14/15
üíæ Checkpoint saved

Epoch 15/15
üíæ Checkpoint saved
Precision@10: train 0.2993 (29.93%), test 0.1173 (11.73%)
Recall@10:    train 0.0828 (8.28%), test 0.1000 (10.00%)  (Recommendation Accuracy)
AUC:           train 0.9486, test 0.9305


In [14]:
# ===== 12) Generate Sample Recommendations =====
# This function generates sample movie recommendations for selected users
# using the trained hybrid LightFM model and proper ID mappings
def sample_recommendations(model, user_ids, item_features, dataset, n_items=5):
    """Generate movie recommendations using hybrid model with proper ID mapping"""
    
    # Retrieve internal-to-original ID mappings from the LightFM dataset
    user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
    
    # Get the list of all available internal movie IDs
    available_movies = list(item_id_map.values())
    print(f"üìä Hybrid predictions for {len(available_movies)} movies")
    
    # Generate recommendations for each specified user
    for user_id in user_ids:
        print(f"\nüîç Generating recommendations for user {user_id}...")
        user_start = time.time()
        
        # Convert external user ID to internal LightFM user ID
        user_internal_id = user_id_map.get(user_id)
        if user_internal_id is None:
            print(f"‚ùå User {user_id} not found in dataset")
            continue
        
        scores = []
        original_movie_ids = []
        
        # Predict scores for all movies for the given user
        for movie_internal_id in available_movies:
            score = model.predict(
                np.array([user_internal_id], dtype=np.int32), 
                np.array([movie_internal_id], dtype=np.int32),
                item_features=item_features,
                num_threads=1
            )[0]
            scores.append(score)
            
            # Map internal movie ID back to the original movieId
            original_id = [k for k, v in item_id_map.items() if v == movie_internal_id][0]
            original_movie_ids.append(original_id)
        
        # Convert scores list to NumPy array for sorting
        scores = np.array(scores)
        
        # Select the indices of the top-N highest scoring movies
        top_indices = np.argsort(-scores)[:n_items]
        top_movies = []
        
        # Retrieve metadata (title and genres) for the top recommendations
        for idx in top_indices:
            original_movie_id = original_movie_ids[idx]
            movie_data = movies[movies['movieId'] == original_movie_id]
            if len(movie_data) > 0:
                title = movie_data['title'].values[0]
                genres = movie_data['genres'].values[0]
                top_movies.append((title, genres, scores[idx]))
        
        # Display the top-N recommendations for the current user
        print(f"üé¨ User {user_id} - Top {n_items} Hybrid Recommendations:")
        for i, (title, genres, score) in enumerate(top_movies, 1):
            print(f"   {i}. {title}")
            print(f"      ‚≠ê Score: {score:.3f} | üé≠ {genres}")
        
        # Measure and display the response time for generating recommendations
        user_elapsed = time.time() - user_start
        print(f"‚è± Response time for user {user_id}: {user_elapsed:.3f} seconds")


# Display sample recommendations for the first three users
print("\n" + "="*60)
print("üéØ HYBRID RECOMMENDATIONS (IDF-Weighted)")
print("="*60)
sample_users = list(all_users)[:3]
sample_recommendations(model_hybrid, sample_users, item_features, dataset)



üéØ HYBRID RECOMMENDATIONS (IDF-Weighted)
üìä Hybrid predictions for 9724 movies

üîç Generating recommendations for user 1...
üé¨ User 1 - Top 5 Hybrid Recommendations:
   1. Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
      ‚≠ê Score: -0.730 | üé≠ ['Action', 'Adventure']
   2. Shawshank Redemption, The (1994)
      ‚≠ê Score: -0.825 | üé≠ ['Crime', 'Drama']
   3. Matrix, The (1999)
      ‚≠ê Score: -0.846 | üé≠ ['Action', 'Sci-Fi', 'Thriller']
   4. Terminator 2: Judgment Day (1991)
      ‚≠ê Score: -0.850 | üé≠ ['Action', 'Sci-Fi']
   5. Star Wars: Episode IV - A New Hope (1977)
      ‚≠ê Score: -0.874 | üé≠ ['Action', 'Adventure', 'Sci-Fi']
‚è± Response time for user 1: 2.455 seconds

üîç Generating recommendations for user 2...
üé¨ User 2 - Top 5 Hybrid Recommendations:
   1. Silence of the Lambs, The (1991)
      ‚≠ê Score: 1.579 | üé≠ ['Crime', 'Horror', 'Thriller']
   2. Shining, The (1980)
      ‚≠ê Score: 1.522 | üé≠ ['Horror']