# Imports & Initializations


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

## Create DF

In [7]:
# Load the cleaned ratings table from disk into the notebook-wide frame `df`.
df = pd.read_csv('../data/cleaned_movies_full.csv')

# Sanity check: row count, then a preview of the first rows, one item per line.
# NOTE(review): the "First entry:" label is printed on its own; nothing is shown after it.
print(len(df), df.head(), "First entry:", sep="\n")

4669820
     User_ID   Movie_ID  Rating        Date
0  ur4592644  tt0120884      10  2005-01-16
1  ur3174947  tt0118688       3  2005-01-16
2  ur3780035  tt0387887       8  2005-01-16
3  ur4592628  tt0346491       1  2005-01-16
4  ur3174947  tt0094721       8  2005-01-16
First entry:


# Feature Engineering

## Pre - clustering

### Dimensionality Analysis

In [None]:
# Size of the rating problem: distinct users, distinct movies, observed ratings.
n_ratings = df.shape[0]
n_users = df['User_ID'].nunique()
n_movies = df['Movie_ID'].nunique()

# A dense user x movie grid would hold this many cells; sparsity is the
# fraction of those cells that have no rating.
possible_ratings = n_users * n_movies
filled_fraction = n_ratings / possible_ratings
sparsity = 1 - filled_fraction

print(f"Users: {n_users} | Movies: {n_movies}")
print(f"Sparsity: {sparsity * 100:.2f}%")

### Label encoding

In [8]:
# One encoder per ID column so each mapping can be inverted independently later.
user_enc, movie_enc = LabelEncoder(), LabelEncoder()

# Replace the string IDs with integer codes, stored as new columns on `df`.
df['user_idx'], df['movie_idx'] = (
    user_enc.fit_transform(df['User_ID']),
    movie_enc.fit_transform(df['Movie_ID']),
)

### Creating the "User Vectors" for Clustering

In [9]:
# Build the user x movie matrix in Compressed Sparse Row form:
#   rows    -> user index
#   columns -> movie index
#   values  -> raw rating
# NOTE(review): csr_matrix sums entries that share the same (row, col) pair;
# confirm each user/movie pair appears only once in `df`.
user_movie_matrix = csr_matrix(
    (
        df['Rating'],
        (df['user_idx'], df['movie_idx']),
    )
)

## Normalization

### 1) Prevent the clustering from simply grouping users who only ever give 1 or 10

In [10]:
# Step 1: mean rating per user (Series indexed by User_ID).
user_means = df['Rating'].groupby(df['User_ID']).mean()

# Step 2: centre every rating on its author's mean.
# 'Rating_Norm' > 0 means the user liked the movie more than their average.
df['Rating_Norm'] = df['Rating'].sub(df['User_ID'].map(user_means))

print("✅ Normalization complete.")
df.head()[['User_ID', 'Rating', 'Rating_Norm']]

✅ Normalization complete.


Unnamed: 0,User_ID,Rating,Rating_Norm
0,ur4592644,10,0.0
1,ur3174947,3,-4.355556
2,ur3780035,8,0.125
3,ur4592628,1,0.0
4,ur3174947,8,0.644444


### 2) Mean-centering: normalize ratings because some users are "easy graders" (average 9/10) and some are "harsh critics" (average 4/10). We want to know whether a user liked a movie **more than their own average**.

In [11]:
# Step 1: mean rating per user (Series indexed by User_ID).
user_means = df['Rating'].groupby(df['User_ID']).mean()

# Step 2: attach each row's user mean as its own column so it can be inspected.
df['User_Mean'] = df['User_ID'].map(user_means)

# Step 3: relative preference = rating minus the user's own average.
df['Rating_Norm'] = df['Rating'].sub(df['User_Mean'])

print("✅ Normalization complete.")
print(df.head()[['User_ID', 'Rating', 'User_Mean', 'Rating_Norm']])

✅ Normalization complete.
     User_ID  Rating  User_Mean  Rating_Norm
0  ur4592644      10  10.000000     0.000000
1  ur3174947       3   7.355556    -4.355556
2  ur3780035       8   7.875000     0.125000
3  ur4592628       1   1.000000     0.000000
4  ur3174947       8   7.355556     0.644444


### 3) Filter out users with fewer than 5 ratings. They carry too little signal and only add noise.

In [12]:
# Minimum number of ratings a user must have to be kept.
# Defined once so the filter and the printed message cannot drift apart.
MIN_RATINGS_PER_USER = 5

# Number of ratings per user (Series indexed by User_ID).
user_counts = df.groupby('User_ID').size()

# Keep only users that reach the threshold; .copy() gives an independent frame
# so later column assignments do not touch (or warn about) `df`.
active_user_ids = user_counts[user_counts >= MIN_RATINGS_PER_USER].index
df_filtered = df[df['User_ID'].isin(active_user_ids)].copy()

print(f"Original users: {len(user_counts):,}")
print(f"Active users ({MIN_RATINGS_PER_USER}+ ratings): {len(active_user_ids):,}")
print(f"Data remaining: {len(df_filtered):,} rows.")

Original users: 1,499,238
Active users (5+ ratings): 120,250
Data remaining: 2,822,422 rows.


### Now the code is able to predict the Rating_Norm. The result is read as follows: above 2.5, the **user will likely love the current movie**; below -3.0, the **user will likely hate the movie**

### Build **User - Item Matrix**

In [None]:
# Fresh encoders: the filtered frame has fewer users/movies, so the integer
# codes are rebuilt from the rows that remain.
user_encoder, movie_encoder = LabelEncoder(), LabelEncoder()

# Overwrite the index columns with codes fitted on the filtered data only.
df_filtered['user_idx'], df_filtered['movie_idx'] = (
    user_encoder.fit_transform(df_filtered['User_ID']),
    movie_encoder.fit_transform(df_filtered['Movie_ID']),
)

# Distinct codes present after filtering (used later as the matrix shape).
n_users = df_filtered['user_idx'].nunique()
n_movies = df_filtered['movie_idx'].nunique()

print("✅ Label Encoding Complete.")
print(f"Total Unique Users: {n_users:,}")
print(f"Total Unique Movies: {n_movies:,}")
print(df_filtered.head()[['User_ID', 'user_idx', 'Movie_ID', 'movie_idx']])

#### Because the dataset is too large, the **User - Item Matrix** is stored in a compressed sparse format. This way K-means won't struggle

In [None]:
# Build the normalized user x movie matrix: (data, (row_indices, col_indices)).
#
# csr_matrix SUMS entries that share the same (row, col) pair, so a user who
# rated the same movie more than once would end up with an inflated value.
# Collapse such repeats to their mean first; when every pair is already unique
# the resulting matrix is the same as building it directly from df_filtered.
pair_ratings = (
    df_filtered
    .groupby(['user_idx', 'movie_idx'], sort=False)['Rating_Norm']
    .mean()
)

user_movie_sparse = csr_matrix(
    (
        pair_ratings.to_numpy(),
        (
            pair_ratings.index.get_level_values('user_idx').to_numpy(),
            pair_ratings.index.get_level_values('movie_idx').to_numpy(),
        ),
    ),
    # Explicit shape keeps the dimensions tied to the encoder output rather
    # than to the largest index that happens to appear in the data.
    shape=(n_users, n_movies)
)

print(f"✅ Sparse Matrix created with shape: {user_movie_sparse.shape}")