In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder

# Load the movie table and keep only the two columns the simulation needs.
df = pd.read_csv('IMDB-Movie-Data.csv')

df_clean = df[['Title', 'Rating']].dropna()

# Seed NumPy's global RNG so the simulated ratings (and every later
# np.random call in this cell) are reproducible under Restart & Run All.
np.random.seed(42)

num_users = 500
# One entry per row of df_clean, in row order: how many synthetic ratings
# that movie receives (5..49, upper bound exclusive).
num_ratings_per_movie = np.random.randint(5, 50, size=len(df_clean))

# Simulate individual user ratings as the movie's published rating plus
# Gaussian noise (sigma = 0.5), clipped to the 1-10 scale.
#
# The counts are paired with the rows *positionally* via zip(). Indexing the
# counts array with the DataFrame index label is wrong as soon as dropna()
# removes a row, because labels then no longer match positions (wrong count
# or IndexError at the tail).
user_movie_ratings = []
for movie_title, true_rating, n_ratings in zip(
    df_clean['Title'], df_clean['Rating'], num_ratings_per_movie
):
    for _ in range(n_ratings):
        simulated_user_id = np.random.randint(1, num_users + 1)
        noise = np.random.normal(0, 0.5)
        user_rating = np.clip(true_rating + noise, 1, 10)
        user_movie_ratings.append({
            'user_id': simulated_user_id,
            'movie_title': movie_title,
            'rating': user_rating
        })

# Long-format ratings table: one row per simulated (user, movie, rating).
ratings_df = pd.DataFrame(user_movie_ratings)

# Map raw user ids and movie titles onto contiguous integer codes so they can
# be used directly as row / column positions of a rating matrix.
le_user, le_movie = LabelEncoder(), LabelEncoder()
ratings_df['user_encoded'] = le_user.fit_transform(ratings_df['user_id'])
ratings_df['movie_encoded'] = le_movie.fit_transform(ratings_df['movie_title'])

# Matrix dimensions: one class per distinct user / distinct title seen by the
# encoders (fitted on the full table, so held-out rows are covered too).
n_users = len(le_user.classes_)
n_movies = len(le_movie.classes_)

# Hold out 20% of the individual ratings for evaluation (fixed seed).
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Dense user x movie matrix of training ratings; 0 means "no rating".
# Simulated ratings are clipped to [1, 10], so a real rating is never 0.
user_movie_matrix = np.zeros((n_users, n_movies))
train_user_idx = train_df['user_encoded'].to_numpy()
train_movie_idx = train_df['movie_encoded'].to_numpy()
# Single vectorised scatter instead of a Python-level iterrows() loop. When a
# (user, movie) pair occurs more than once the last occurrence is kept, which
# matches the sequential overwrite of the row-by-row version.
user_movie_matrix[train_user_idx, train_movie_idx] = train_df['rating'].to_numpy()

# Per-user mean over observed ratings only.
observed = user_movie_matrix != 0
rating_counts = observed.sum(axis=1)
rating_sums = user_movie_matrix.sum(axis=1)

# Users with no training ratings would otherwise hit a 0/0 division
# (RuntimeWarning + NaN). They fall back to the global mean rating instead of
# 0, so their reconstructed predictions start from a sensible baseline rather
# than from the bottom of the scale.
global_mean = user_movie_matrix[observed].mean() if observed.any() else 0.0
user_means = np.full(n_users, global_mean, dtype=float)
has_ratings = rating_counts > 0
user_means[has_ratings] = rating_sums[has_ratings] / rating_counts[has_ratings]

# Mean-centre the observed entries only; unobserved cells stay at 0.
user_movie_matrix_normalized = np.where(
    observed, user_movie_matrix - user_means[:, np.newaxis], 0.0
)

# Low-rank factorisation of the mean-centred rating matrix.
# The number of components cannot exceed the number of columns (movies);
# cap it so the cell also runs on small catalogues instead of raising.
n_components = min(50, user_movie_matrix_normalized.shape[1])
svd = TruncatedSVD(n_components=n_components, random_state=42)
user_factors = svd.fit_transform(user_movie_matrix_normalized)  # (n_users, k)
movie_factors = svd.components_.T                               # (n_movies, k)

# Reconstruct every (user, movie) score, add each user's mean back
# (broadcast down the rows), and clamp to the valid 1-10 rating scale.
predictions_matrix = user_factors @ movie_factors.T
predictions_matrix += user_means[:, np.newaxis]
predictions_matrix = np.clip(predictions_matrix, 1, 10)

# Gather the held-out ratings together with the model's prediction for the
# same (user, movie) cell, in test_df row order.
test_user_idx = test_df['user_encoded'].to_numpy().astype(int)
test_movie_idx = test_df['movie_encoded'].to_numpy().astype(int)

actual_ratings = test_df['rating'].tolist()
predicted_ratings = list(predictions_matrix[test_user_idx, test_movie_idx])

# Error metrics on the held-out ratings.
mae = mean_absolute_error(actual_ratings, predicted_ratings)
rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

# ---- Run summary -----------------------------------------------------------
# Each entry is printed on its own line; the bare "\n" entries produce the
# blank spacer lines between sections.
report_lines = [
    "USER-BASED MOVIE RATING PREDICTION MODEL",
    "\nModel: Collaborative Filtering (Matrix Factorization with SVD)",
    f"Total users simulated: {num_users}",
    f"Total movies: {len(df_clean)}",
    f"Total ratings: {len(ratings_df)}",
    f"Training ratings: {len(train_df)}",
    f"Testing ratings: {len(test_df)}",
    "\n",
    "MODEL PERFORMANCE ON TEST SET:",
    f"RMSE (Root Mean Squared Error): {rmse:.4f}",
    f"MAE (Mean Absolute Error):      {mae:.4f}",
    "\n",
    "SAMPLE PREDICTIONS - How users might rate unseen movies:",
    f"{'User ID':<10} {'Movie Title':<35} {'Actual':<10} {'Predicted':<10}",
]
for report_line in report_lines:
    print(report_line)

# ---- First (up to) 15 held-out ratings next to their predictions -----------
for sample in test_df.head(15).itertuples(index=False):
    user_idx = int(sample.user_encoded)
    movie_idx = int(sample.movie_encoded)
    actual = sample.rating
    predicted = predictions_matrix[user_idx, movie_idx]

    # Titles longer than the 35-character column are cut to 32 chars + "...".
    if len(sample.movie_title) > 35:
        movie_title_short = sample.movie_title[:32] + "..."
    else:
        movie_title_short = sample.movie_title

    print(f"{sample.user_id:<10} {movie_title_short:<35} {actual:<10.2f} {predicted:<10.2f}")

# Demo: predicted ratings for one user on movies that user has not rated.
test_user_id = 42
# le_user was fitted on every simulated rating, so this membership test asks
# whether the user appears anywhere in ratings_df.
if test_user_id in le_user.classes_:
    test_user_encoded = le_user.transform([test_user_id])[0]

    # Build the candidate pool from movies this user never rated. Sampling
    # from all movie indices would let already-rated titles show up under the
    # "UNSEEN MOVIES" heading.
    rated_by_user = ratings_df.loc[
        ratings_df['user_encoded'] == test_user_encoded, 'movie_encoded'
    ].unique()
    unseen_pool = np.setdiff1d(np.arange(n_movies), rated_by_user)
    # size is capped by the pool, so a user who rated everything yields an
    # empty sample rather than an error.
    unseen_movie_indices = np.random.choice(
        unseen_pool, size=min(5, len(unseen_pool)), replace=False
    )

    print(f"\nPREDICTING RATINGS FOR USER {test_user_id} ON UNSEEN MOVIES:")
    print(f"{'Movie Title':<50} {'Predicted Rating':<20}")
    for movie_idx in unseen_movie_indices:
        movie_title = le_movie.inverse_transform([movie_idx])[0]
        predicted_rating = predictions_matrix[test_user_encoded, movie_idx]
        # Titles wider than the 50-character column are cut to 47 chars + "...".
        movie_short = movie_title[:47] + "..." if len(movie_title) > 50 else movie_title
        print(f"{movie_short:<50} {predicted_rating:<20.2f}")
else:
    print(f"\nUser {test_user_id} not found in training data.")

USER-BASED MOVIE RATING PREDICTION MODEL

Model: Collaborative Filtering (Matrix Factorization with SVD)
Total users simulated: 500
Total movies: 1000
Total ratings: 27142
Training ratings: 21713
Testing ratings: 5429


MODEL PERFORMANCE ON TEST SET:
RMSE (Root Mean Squared Error): 1.0275
MAE (Mean Absolute Error):      0.8007


SAMPLE PREDICTIONS - How users might rate unseen movies:
User ID    Movie Title                         Actual     Predicted 
395        Toy Story 3                         8.29       6.56      
270        The Levelling                       5.71       6.53      
374        The Dictator                        6.51       6.39      
53         The Family                          5.72       6.59      
230        Django Unchained                    8.95       6.76      
359        1408                                6.83       6.75      
274        The Heat                            6.36       6.55      
214        Legion                              4.40       6.