In [None]:
# ===== 1) Imports =====
import pandas as pd
import numpy as np
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score, recall_at_k
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
import pickle
import time
import warnings
warnings.filterwarnings('ignore')


In [46]:
# ===== 2) Load Data =====
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")
tags = pd.read_csv("tags.csv")

print(f"Ratings data: {ratings.shape}")
print(f"Movies data: {movies.shape}")
print(f"Tags data: {tags.shape}")

Ratings data: (100836, 4)
Movies data: (9742, 3)
Tags data: (3683, 4)


In [47]:
# ===== 3) Detect Bias =====
def major_minor_ratio_genres(movies_df):
    movies_df = movies_df.copy()
    movies_df["genres"] = movies_df["genres"].str.split("|")
    
    genre_counts = {}
    for genre_list in movies_df["genres"]:
        if isinstance(genre_list, list):
            for g in genre_list:
                if g != "(no genres listed)":
                    genre_counts[g] = genre_counts.get(g, 0) + 1
    
    genre_counts = pd.Series(genre_counts)
    
    major = genre_counts.max()
    minor = genre_counts.min()
    ratio = major / minor
    
    print("\n" + "="*60)
    print("üìä BIAS DETECTION - Major-Minor Ratio")
    print("="*60)
    print("üé≠ Genre Counts (Top 10):")
    print(genre_counts.sort_values(ascending=False).head(10))
    print("\nüìâ Genre Counts (Bottom 10):")
    print(genre_counts.sort_values().head(10))
    
    print(f"\nüü¶ Major‚ÄìMinor Ratio: {ratio:.2f}")
    print(f"Most common genre has: {major} movies")
    print(f"Least common genre has: {minor} movies")
    
    if ratio > 10:
        print("‚ö†Ô∏è Warning: Strong genre imbalance detected!")
    else:
        print("‚úÖ Genre distribution is reasonably balanced.")
    
    return ratio, genre_counts

ratio, genre_counts = major_minor_ratio_genres(movies)


üìä BIAS DETECTION - Major-Minor Ratio
üé≠ Genre Counts (Top 10):
Drama        4361
Comedy       3756
Thriller     1894
Action       1828
Romance      1596
Adventure    1263
Crime        1199
Sci-Fi        980
Horror        978
Fantasy       779
dtype: int64

üìâ Genre Counts (Bottom 10):
Film-Noir       87
IMAX           158
Western        167
Musical        334
War            382
Documentary    440
Mystery        573
Animation      611
Children       664
Fantasy        779
dtype: int64

üü¶ Major‚ÄìMinor Ratio: 50.13
Most common genre has: 4361 movies
Least common genre has: 87 movies


In [48]:
# ===== 4) Calculate IDF Weights =====
print("\n" + "="*60)
print("‚öñÔ∏è  APPLYING IDF-BASED REWEIGHTING")
print("="*60)

total_movies = len(movies[movies['genres'] != "(no genres listed)"])
genre_weights = {}

for genre, count in genre_counts.items():
    idf_weight = np.log(total_movies / count)
    genre_weights[genre] = idf_weight

print("\nGenre Weights (IDF):")
sorted_weights = sorted(genre_weights.items(), key=lambda x: x[1], reverse=True)
print("\nTop 5 Highest Weights (rare genres - boosted):")
for genre, weight in sorted_weights[:5]:
    count = genre_counts[genre]
    print(f"   {genre:20s}: weight={weight:.3f} (appears in {count} movies)")

print("\nTop 5 Lowest Weights (common genres - reduced):")
for genre, weight in sorted_weights[-5:]:
    count = genre_counts[genre]
    print(f"   {genre:20s}: weight={weight:.3f} (appears in {count} movies)")



‚öñÔ∏è  APPLYING IDF-BASED REWEIGHTING

Genre Weights (IDF):

Top 5 Highest Weights (rare genres - boosted):
   Film-Noir           : weight=4.715 (appears in 87 movies)
   IMAX                : weight=4.118 (appears in 158 movies)
   Western             : weight=4.063 (appears in 167 movies)
   Musical             : weight=3.370 (appears in 334 movies)
   War                 : weight=3.235 (appears in 382 movies)

Top 5 Lowest Weights (common genres - reduced):
   Romance             : weight=1.805 (appears in 1596 movies)
   Action              : weight=1.670 (appears in 1828 movies)
   Thriller            : weight=1.634 (appears in 1894 movies)
   Comedy              : weight=0.950 (appears in 3756 movies)
   Drama               : weight=0.800 (appears in 4361 movies)


In [49]:
# ===== 5) Prepare Content Features =====
movies['genres'] = movies['genres'].str.split('|')
all_genres = set()
for genre_list in movies['genres']:
    if isinstance(genre_list, list):
        for genre in genre_list:
            if genre != "(no genres listed)":
                all_genres.add(genre)

top_tags = tags['tag'].value_counts().head(50).index.tolist()
print(f"\nTotal unique genres: {len(all_genres)}")
print(f"Top 50 tags: {top_tags[:10]}...")

# ===== 6) Prepare Dataset =====
RATING_THRESHOLD = 4.0
positive = ratings[ratings["rating"] >= RATING_THRESHOLD].copy()

print(f"\n{'='*60}")
print(f"DATASET PREPARATION")
print(f"{'='*60}")
print(f"Total ratings: {len(ratings)}")
print(f"Positive ratings (>= {RATING_THRESHOLD}): {len(positive)}")

all_users = ratings["userId"].unique()
all_items = ratings["movieId"].unique()

print(f"Unique users: {len(all_users)}")
print(f"Unique movies in ratings: {len(all_items)}")

movies_in_ratings = movies[movies['movieId'].isin(all_items)]
print(f"Movies common to both ratings and movies: {len(movies_in_ratings)}")

# Create Dataset
dataset = Dataset()
dataset.fit(
    users=all_users,
    items=all_items,
    item_features=list(all_genres) + top_tags
)



Total unique genres: 19
Top 50 tags: ['In Netflix queue', 'atmospheric', 'thought-provoking', 'superhero', 'funny', 'surreal', 'Disney', 'religion', 'sci-fi', 'quirky']...

DATASET PREPARATION
Total ratings: 100836
Positive ratings (>= 4.0): 48580
Unique users: 610
Unique movies in ratings: 9724
Movies common to both ratings and movies: 9724


In [50]:
# ===== 7) Build Interactions with Weighted Features =====
def prepare_features_with_reweighting(genre_weights, reweight_strength=1.0):
    """
    Build item features with genre reweighting
    
    Args:
        genre_weights: Dictionary of genre weights
        reweight_strength: Controls the strength of reweighting (0=no reweight, 1=full reweight)
    """
    # Build interactions
    interactions, _ = dataset.build_interactions(
        [(row.userId, row.movieId) for row in positive.itertuples(index=False)]
    )
    
    # Build weighted item features - USE DICTIONARY FORMAT
    item_features_list = []
    
    for movie_id in all_items:
        movie_genres = movies[movies['movieId'] == movie_id]['genres']
        
        weighted_features = {}  # Dictionary for weighted features
        if len(movie_genres) > 0:
            genres_str = movie_genres.iloc[0]
            if isinstance(genres_str, list):
                for genre in genres_str:
                    if genre != "(no genres listed)":
                        # Apply genre weight
                        base_weight = genre_weights.get(genre, 1.0)
                        # Apply reweight strength
                        weight = 1.0 + (base_weight - 1.0) * reweight_strength
                        # CORRECT FORMAT: dictionary {feature_name: weight}
                        weighted_features[genre] = weight
        
        item_features_list.append((movie_id, weighted_features))
    
    print(f"‚úÖ Built weighted features for {len(item_features_list)} movies")
    
    item_features_matrix = dataset.build_item_features(item_features_list)
    
    return interactions, item_features_matrix

# Build features with reweighting
interactions, item_features = prepare_features_with_reweighting(
    genre_weights, 
    reweight_strength=0.7  # Adjust this value: 0=no reweight, 1=full reweight
)


‚úÖ Built weighted features for 9724 movies


In [51]:
# ===== 8) Final check =====
print(f"\nüîç Final check:")
print(f"   all_items: {len(all_items)} movies")
print(f"   item_features: {item_features.shape[0]} rows")
print(f"   Match: {len(all_items) == item_features.shape[0]}")

if len(all_items) == item_features.shape[0]:
    print("‚úÖ READY FOR HYBRID RECOMMENDATIONS!")
else:
    print("‚ùå NEED TO FIX ITEM FEATURES!")



üîç Final check:
   all_items: 9724 movies
   item_features: 9724 rows
   Match: True
‚úÖ READY FOR HYBRID RECOMMENDATIONS!


In [52]:
# ===== 9) Split Data =====
train_df, test_df = train_test_split(
    positive,
    test_size=0.2,
    random_state=42
)

print(f"\nTraining data: {len(train_df)} interactions")
print(f"Testing data: {len(test_df)} interactions")

# Ensure test data exists in training
train_users = set(train_df["userId"].unique())
train_items = set(train_df["movieId"].unique())
test_df = test_df[
    test_df["userId"].isin(train_users) & 
    test_df["movieId"].isin(train_items)
].copy()

print(f"Testing data after filtering: {len(test_df)} interactions")

def prepare_interactions(df):
    return dataset.build_interactions(
        [(row.userId, row.movieId) for row in df.itertuples(index=False)]
    )[0]

train = prepare_interactions(train_df)
test = prepare_interactions(test_df)

print(f"Train interactions shape: {train.shape}")
print(f"Test interactions shape: {test.shape}")
print(f"Item features shape: {item_features.shape}")


Training data: 38864 interactions
Testing data: 9716 interactions
Testing data after filtering: 9142 interactions
Train interactions shape: (610, 9724)
Test interactions shape: (610, 9724)
Item features shape: (9724, 9793)


In [53]:
# ===== 10) Evaluation Function =====
def evaluate_model(model, train_interactions, test_interactions, item_features, k=10):
    # On training data
    prec_tr = precision_at_k(model, train_interactions, item_features=item_features, 
                             k=k, num_threads=1).mean()
    auc_tr = auc_score(model, train_interactions, item_features=item_features, 
                       num_threads=1).mean()
    rec_tr = recall_at_k(model, train_interactions, item_features=item_features,
                         k=k, num_threads=1).mean()   # ‚¨ÖÔ∏è (ÿ≥ÿ∑ÿ± ÿ¨ÿØŸäÿØ) Recall@k ÿπŸÑŸâ ÿßŸÑÿ™ÿØÿ±Ÿäÿ®

    # On test data
    prec_te = precision_at_k(model, test_interactions, train_interactions=train_interactions,
                             item_features=item_features, k=k, num_threads=1).mean()
    auc_te = auc_score(model, test_interactions, train_interactions=train_interactions,
                       item_features=item_features, num_threads=1).mean()
    rec_te = recall_at_k(model, test_interactions, train_interactions=train_interactions,
                         item_features=item_features, k=k, num_threads=1).mean()  # ‚¨ÖÔ∏è (ÿ≥ÿ∑ÿ± ÿ¨ÿØŸäÿØ) Recall@k ÿπŸÑŸâ ÿßŸÑÿßÿÆÿ™ÿ®ÿßÿ±

    print(f"Precision@{k}: train {prec_tr:.4f} ({prec_tr*100:.2f}%), "
          f"test {prec_te:.4f} ({prec_te*100:.2f}%)")
    print(f"Recall@{k}:    train {rec_tr:.4f} ({rec_tr*100:.2f}%), "
          f"test {rec_te:.4f} ({rec_te*100:.2f}%)  (Recommendation Accuracy)")  # ‚¨ÖÔ∏è (ÿ≥ÿ∑ÿ± ÿ¨ÿØŸäÿØ ŸÑŸÑÿ∑ÿ®ÿßÿπÿ©)
    print(f"AUC:           train {auc_tr:.4f}, test {auc_te:.4f}")


In [54]:
# ===== 11) Train Hybrid Model =====
print("\n" + "="*50)
print("Hybrid Model Training (WARP + IDF-Weighted Genres)")
print("="*50)

model_hybrid = LightFM(
    loss="warp",
    learning_rate=0.05,
    random_state=42
)

# Train with IDF-weighted content features
model_hybrid.fit(
    train,
    item_features=item_features,
    epochs=15,
    num_threads=1
)

# Evaluate hybrid model
evaluate_model(model_hybrid, train, test, item_features, k=10)

# ===== 11a) Save Model Checkpoint =====
with open("lightfm_hybrid_checkpoint.pkl", "wb") as f:
    pickle.dump(model_hybrid, f)

print("üíæ Saved model checkpoint to 'lightfm_hybrid_checkpoint.pkl'")



Hybrid Model Training (WARP + IDF-Weighted Genres)
Precision@10: train 0.2995 (29.95%), test 0.1166 (11.66%)
Recall@10:    train 0.0850 (8.50%), test 0.0963 (9.63%)  (Recommendation Accuracy)
AUC:           train 0.9489, test 0.9310
üíæ Saved model checkpoint to 'lightfm_hybrid_checkpoint.pkl'


In [55]:
# ===== 12) Generate Sample Recommendations =====
def sample_recommendations(model, user_ids, item_features, dataset, n_items=5):
    """Generate movie recommendations using hybrid model with proper ID mapping"""
    
    user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
    
    available_movies = list(item_id_map.values())
    print(f"üìä Hybrid predictions for {len(available_movies)} movies")
    
    for user_id in user_ids:
        print(f"\nüîç Generating recommendations for user {user_id}...")
        user_start = time.time()
        
        user_internal_id = user_id_map.get(user_id)
        if user_internal_id is None:
            print(f"‚ùå User {user_id} not found in dataset")
            continue
        
        scores = []
        original_movie_ids = []
        
        for movie_internal_id in available_movies:
            score = model.predict(
                np.array([user_internal_id], dtype=np.int32), 
                np.array([movie_internal_id], dtype=np.int32),
                item_features=item_features,
                num_threads=1
            )[0]
            scores.append(score)
            
            original_id = [k for k, v in item_id_map.items() if v == movie_internal_id][0]
            original_movie_ids.append(original_id)
        
        scores = np.array(scores)
        
        # Get top recommendations
        top_indices = np.argsort(-scores)[:n_items]
        top_movies = []
        
        for idx in top_indices:
            original_movie_id = original_movie_ids[idx]
            movie_data = movies[movies['movieId'] == original_movie_id]
            if len(movie_data) > 0:
                title = movie_data['title'].values[0]
                genres = movie_data['genres'].values[0]
                top_movies.append((title, genres, scores[idx]))
        
        print(f"üé¨ User {user_id} - Top {n_items} Hybrid Recommendations:")
        for i, (title, genres, score) in enumerate(top_movies, 1):
            print(f"   {i}. {title}")
            print(f"      ‚≠ê Score: {score:.3f} | üé≠ {genres}")
        user_elapsed = time.time() - user_start
        print(f"‚è± Response time for user {user_id}: {user_elapsed:.3f} seconds")


# Show recommendations for first 3 users
print("\n" + "="*60)
print("üéØ HYBRID RECOMMENDATIONS (IDF-Weighted)")
print("="*60)
sample_users = list(all_users)[:3]
sample_recommendations(model_hybrid, sample_users, item_features, dataset)


üéØ HYBRID RECOMMENDATIONS (IDF-Weighted)
üìä Hybrid predictions for 9724 movies

üîç Generating recommendations for user 1...
üé¨ User 1 - Top 5 Hybrid Recommendations:
   1. Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
      ‚≠ê Score: -0.484 | üé≠ ['Action', 'Adventure']
   2. Shawshank Redemption, The (1994)
      ‚≠ê Score: -0.569 | üé≠ ['Crime', 'Drama']
   3. Lord of the Rings: The Fellowship of the Ring, The (2001)
      ‚≠ê Score: -0.674 | üé≠ ['Adventure', 'Fantasy']
   4. Matrix, The (1999)
      ‚≠ê Score: -0.773 | üé≠ ['Action', 'Sci-Fi', 'Thriller']
   5. Lord of the Rings: The Two Towers, The (2002)
      ‚≠ê Score: -0.785 | üé≠ ['Adventure', 'Fantasy']
‚è± Response time for user 1: 2.247 seconds

üîç Generating recommendations for user 2...
üé¨ User 2 - Top 5 Hybrid Recommendations:
   1. Shawshank Redemption, The (1994)
      ‚≠ê Score: 1.597 | üé≠ ['Crime', 'Drama']
   2. Pulp Fiction (1994)
      ‚≠ê Score: 1.401 | üé≠ 