# Data Prep

In [None]:
import kagglehub

# Fetch the latest published version of the dataset; the call returns the
# local directory that holds the extracted files.
path = kagglehub.dataset_download(
    "rounakbanik/the-movies-dataset",
)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/rounakbanik/the-movies-dataset?dataset_version_number=7...


100%|██████████| 228M/228M [00:11<00:00, 21.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/rounakbanik/the-movies-dataset/versions/7


In [None]:
!pip uninstall -y numpy scikit-surprise  # Remove all traces
!pip install "numpy==1.26.4"            # Install last stable NumPy 1.x
!pip install scikit-surprise --no-cache-dir  # Force rebuild without cached binaries

# RESTART RUNTIME: Runtime → Restart runtime


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
[0mCollecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.26.4
Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata 

In [None]:
import numpy
import surprise

# Sanity check after the runtime restart:
# NumPy should report 1.x.x and Surprise should report 1.1.x.
print("NumPy: " + numpy.__version__)
print("Surprise: " + surprise.__version__)


NumPy: 1.26.4
Surprise: 1.1.4


In [None]:
# List the extracted dataset files to confirm the download location and the
# available CSV names before loading anything.
!ls /root/.cache/kagglehub/datasets/rounakbanik/the-movies-dataset/versions/7

credits.csv   links.csv        movies_metadata.csv  ratings_small.csv
keywords.csv  links_small.csv  ratings.csv


In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse, mae

# Columns needed by Surprise, in the order Dataset.load_from_df reads them:
# user id, item id, rating.
rating_columns = ['userId', 'movieId', 'rating']

# Load ratings.csv. `usecols` keeps pandas from parsing columns that would be
# discarded right away; the trailing selection pins the column order, because
# `usecols` on its own does not guarantee it.
ratings = pd.read_csv(
    '/root/.cache/kagglehub/datasets/rounakbanik/the-movies-dataset/versions/7/ratings.csv',
    usecols=rating_columns,
)[rating_columns]

# Define rating scale (e.g., 0.5-5.0)
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings, reader)

# Split data into 80% train, 20% test (seeded so the split is repeatable)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Matrix Factorization

In [None]:
# Train SVD model with 100 latent factors.
# random_state seeds the RNG used to initialise the model, matching the seeded
# train/test split so the reported metrics can be reproduced on a re-run.
model = SVD(
    n_factors=100,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42,
)
model.fit(trainset)

# Generate predictions for the held-out ratings
predictions = model.test(testset)

In [None]:
# Score the held-out predictions. Each helper prints its metric (verbose is
# spelled out here; it is also the default) and returns the value.
rmse_score = rmse(predictions, verbose=True)
mae_score = mae(predictions, verbose=True)

RMSE: 0.7966
MAE:  0.6021


In [None]:
from surprise.model_selection import cross_validate

# 5-fold cross-validation of an SVD with default hyper-parameters, reporting
# RMSE and MAE per fold. Requires `SVD` and `data` from the data-prep cell.
cross_validate(
    algo=SVD(),
    data=data,
    measures=['rmse', 'mae'],
    cv=5,
    verbose=True,
)

NameError: name 'SVD' is not defined

In [None]:
from surprise.model_selection import GridSearchCV

# Candidate values per hyper-parameter; every combination is evaluated.
param_grid = dict(
    n_factors=[50, 100, 150],
    lr_all=[0.002, 0.005, 0.01],
    reg_all=[0.02, 0.05, 0.1],
)

# 3-fold grid search scored on RMSE only.
gs = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse'],
    cv=3,
)
gs.fit(data)

print(f'Best RMSE: {gs.best_score["rmse"]}')
print(f'Optimal parameters: {gs.best_params["rmse"]}')

# Mixture Model (Baseline + KNN + SVD)

In [None]:
from surprise import Dataset, Reader, SVD, KNNBaseline, BaselineOnly
from surprise.model_selection import cross_validate
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Concatenate, Dense
import numpy as np

# Baseline-estimate predictor with library defaults.
baseline = BaselineOnly()

# User-user collaborative filtering with the pearson_baseline similarity.
# NOTE(review): a user-based similarity matrix grows with the square of the
# number of users — confirm it fits in memory for the full ratings file.
sim_options = dict(name='pearson_baseline', user_based=True)
knn = KNNBaseline(sim_options=sim_options)

# Matrix factorization: 128 factors trained for 25 epochs.
svd = SVD(
    n_factors=128,
    n_epochs=25,
    lr_all=0.005,
    reg_all=0.02,
)

In [None]:
# Neural network for blending
def build_blender(n_users, n_items):
    """Build and compile the embedding network used as the neural component.

    The network embeds a user index and an item index (64 dimensions each),
    concatenates the two vectors, and maps them through one hidden ReLU layer
    to a single unbounded output.

    Args:
        n_users: Number of rows in the user embedding table. Every user index
            fed to the model must lie in ``[0, n_users)``.
        n_items: Number of rows in the item embedding table. Every item index
            fed to the model must lie in ``[0, n_items)``.

    Returns:
        A ``tf.keras.Model`` taking ``[user_index, item_index]`` and producing
        one value per row, compiled with the Adam optimizer and mean-squared
        error so that ``fit`` can be called on it directly.
    """
    user_input = tf.keras.Input(shape=(1,), name='user_index')
    item_input = tf.keras.Input(shape=(1,), name='item_index')

    user_embed = Embedding(n_users, 64)(user_input)
    item_embed = Embedding(n_items, 64)(item_input)

    merged = Concatenate()([Flatten()(user_embed), Flatten()(item_embed)])
    dense = Dense(128, activation='relu')(merged)
    output = Dense(1)(dense)

    model = tf.keras.Model(inputs=[user_input, item_input], outputs=output)
    # Keras refuses to train an uncompiled model; ratings are a regression
    # target, hence MSE.
    model.compile(optimizer='adam', loss='mse')
    return model

In [None]:
class HybridModel:
    """Average the estimates of several base models and one neural network.

    The base models are expected to follow the Surprise algorithm interface
    (``fit(trainset)``, ``predict(uid, iid).est``, ``test(testset)``). The
    neural component is a compiled Keras-style model that takes
    ``[user_index, item_index]`` arrays and returns one value per row.

    The network is trained on the trainset's *inner* ids, so raw ids are
    translated before it is queried. Users or items that are absent from the
    trainset cannot be looked up by the network; for those the estimate is the
    mean of the base models only.

    The class exposes ``fit`` (returning ``self``) and ``test`` so that it can
    be passed to helpers that only rely on those two methods.
    """

    def __init__(self, models, blender):
        """Store the components.

        Args:
            models: Iterable of base models, fitted in place by ``fit``.
            blender: Compiled neural model providing ``fit``, ``predict``,
                ``get_weights`` and ``set_weights``.
        """
        self.models = models
        self.blender = blender
        # Set by fit(); used to translate raw ids into inner ids.
        self.trainset = None
        # Weights the blender had at construction time. fit() restores them so
        # repeated fits (e.g. one per cross-validation fold) do not continue
        # training from a network that has already seen other folds.
        self._initial_weights = blender.get_weights()

    def fit(self, trainset):
        """Fit every base model and the neural component on ``trainset``.

        Args:
            trainset: Object providing ``all_ratings()`` (yielding
                ``(inner_uid, inner_iid, rating)``), ``to_inner_uid`` and
                ``to_inner_iid``.

        Returns:
            ``self``, so calls can be chained.
        """
        self.trainset = trainset

        # Train base models
        for model in self.models:
            model.fit(trainset)

        # all_ratings() yields inner ids, which are exactly the indices the
        # embedding tables need.
        user_ids = np.fromiter(
            (uid for uid, _, _ in trainset.all_ratings()), dtype=np.int64
        )
        item_ids = np.fromiter(
            (iid for _, iid, _ in trainset.all_ratings()), dtype=np.int64
        )
        targets = np.fromiter(
            (rating for _, _, rating in trainset.all_ratings()), dtype=np.float64
        )

        # Train blender from its initial state
        self.blender.set_weights(self._initial_weights)
        self.blender.fit(
            x=[user_ids, item_ids],
            y=targets,
            epochs=10,
            batch_size=1024
        )
        return self

    def _inner_ids(self, uid, iid):
        """Return ``(inner_uid, inner_iid)`` or ``None`` if either id is unknown."""
        try:
            return self.trainset.to_inner_uid(uid), self.trainset.to_inner_iid(iid)
        except ValueError:
            # Raised by the trainset for ids it has never seen.
            return None

    def predict(self, uid, iid):
        """Estimate the rating of raw item ``iid`` by raw user ``uid``.

        Returns:
            The mean of all base-model estimates and, when both ids are known
            to the trainset, the neural estimate.
        """
        base_preds = [model.predict(uid, iid).est for model in self.models]
        inner = self._inner_ids(uid, iid)
        if inner is None:
            return np.mean(base_preds)

        nn_pred = self.blender.predict(
            [np.array([inner[0]]), np.array([inner[1]])],
            verbose=0
        )[0][0]
        return np.mean([*base_preds, nn_pred])

    def test(self, testset, verbose=False):
        """Estimate every rating in ``testset`` in one batch.

        Args:
            testset: Iterable of ``(raw_uid, raw_iid, true_rating)``.
            verbose: When true, print each prediction tuple.

        Returns:
            A list of ``(uid, iid, true_rating, estimate, details)`` tuples in
            the order of ``testset``.
        """
        testset = list(testset)
        if not testset:
            return []

        # Running sum of estimates and number of contributors per row.
        totals = np.zeros(len(testset), dtype=float)
        counts = np.full(len(testset), float(len(self.models)))
        for model in self.models:
            totals += np.array([pred.est for pred in model.test(testset)], dtype=float)

        # Rows whose user and item are both known can also use the network;
        # they are sent through it in a single batched call.
        positions, inner_users, inner_items = [], [], []
        for pos, (uid, iid, _) in enumerate(testset):
            inner = self._inner_ids(uid, iid)
            if inner is not None:
                positions.append(pos)
                inner_users.append(inner[0])
                inner_items.append(inner[1])

        if positions:
            nn_preds = np.asarray(
                self.blender.predict(
                    [np.array(inner_users), np.array(inner_items)],
                    batch_size=1024,
                    verbose=0
                ),
                dtype=float
            ).reshape(-1)
            totals[positions] += nn_preds
            counts[positions] += 1.0

        estimates = totals / counts
        results = [
            (uid, iid, r_ui, float(est), {'was_impossible': False})
            for (uid, iid, r_ui), est in zip(testset, estimates)
        ]
        if verbose:
            for row in results:
                print(row)
        return results

In [None]:
# Size the embedding tables from the number of distinct users and movies.
n_users = ratings['userId'].nunique()
n_items = ratings['movieId'].nunique()

# Assemble the hybrid from the three Surprise models and the neural component.
blender = build_blender(n_users=n_users, n_items=n_items)
hybrid = HybridModel(models=[baseline, knn, svd], blender=blender)

# 3-fold cross-validation reporting RMSE and MAE.
cross_validate(
    algo=hybrid,
    data=data,
    measures=['rmse', 'mae'],
    cv=3,
    verbose=True,
)

In [None]:
from surprise.model_selection import GridSearchCV

# Tune SVD component: 3-fold grid search over factors, learning rate and
# regularisation, scored on RMSE.
svd_params = dict(
    n_factors=[64, 128],
    lr_all=[0.003, 0.005],
    reg_all=[0.02, 0.04],
)
gs_svd = GridSearchCV(
    algo_class=SVD,
    param_grid=svd_params,
    measures=['rmse'],
    cv=3,
)
gs_svd.fit(data)

# Tune KNN component: neighbourhood size only, similarity fixed to
# pearson_baseline.
knn_params = dict(
    k=[20, 40],
    sim_options=dict(name=['pearson_baseline']),
)
gs_knn = GridSearchCV(
    algo_class=KNNBaseline,
    param_grid=knn_params,
    measures=['rmse'],
    cv=3,
)
gs_knn.fit(data)

In [None]:
# Estimators configured with the best RMSE parameters found by each search.
best_svd = gs_svd.best_estimator['rmse']
best_knn = gs_knn.best_estimator['rmse']

# Give the optimized hybrid its own baseline and its own network. Reusing the
# `baseline` and `blender` objects held by `hybrid` would share mutable state:
# fitting one hybrid would silently change the other, and the network could
# start from weights that were already trained.
optimized_hybrid = HybridModel(
    [BaselineOnly(), best_knn, best_svd],
    build_blender(n_users, n_items),
)