In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds



In [2]:
# Step 2: Load the data
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv("u.data", sep='\t', names=column_names)

# Optional: Drop timestamp
df.drop('timestamp', axis=1, inplace=True)

print("Dataset preview:")
print(df.head())



Dataset preview:
   user_id  item_id  rating
0      196      242       3
1      186      302       3
2       22      377       1
3      244       51       2
4      166      346       1


In [3]:
# Step 3: Create user-item ratings matrix
ratings_matrix = df.pivot(index='user_id', columns='item_id', values='rating').fillna(0)
ratings_matrix.head()


item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Step 4: Matrix Factorization using SVD
R = ratings_matrix.values
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

# Compute SVD
U, sigma, Vt = svds(R_demeaned, k=50)
sigma = np.diag(sigma)

# Reconstruct the predicted ratings
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns=ratings_matrix.columns)




In [5]:
# Step 5: Recommend Top 5 Movies for a Specific User

def recommend_movies(predictions_df, user_id, original_ratings, num_recommendations=5):
    user_row_number = user_id - 1
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)

    user_data = original_ratings[original_ratings.user_id == user_id]
    already_rated = user_data['item_id'].tolist()

    recommendations = sorted_user_predictions[~sorted_user_predictions.index.isin(already_rated)].head(num_recommendations)
    
    print(f"Top {num_recommendations} recommendations for User {user_id}:")
    for item_id, score in recommendations.items():
        print(f"Movie ID: {item_id}, Predicted Rating: {score:.2f}")

recommend_movies(preds_df, user_id=10, original_ratings=df, num_recommendations=5)


Top 5 recommendations for User 10:
Movie ID: 507, Predicted Rating: 3.38
Movie ID: 288, Predicted Rating: 3.19
Movie ID: 514, Predicted Rating: 3.15
Movie ID: 204, Predicted Rating: 2.85
Movie ID: 492, Predicted Rating: 2.85


In [6]:
# Step 6: Evaluation Metrics (RMSE and MAE)

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Split the ratings dataset into train and test sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Create training ratings matrix
train_ratings_matrix = train_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

# Matrix factorization on training data
R_train = train_ratings_matrix.values
user_means_train = np.mean(R_train, axis=1)
R_demeaned_train = R_train - user_means_train.reshape(-1, 1)

# Perform SVD
U, sigma, Vt = svds(R_demeaned_train, k=50)
sigma = np.diag(sigma)

# Reconstruct predicted ratings
R_predicted = np.dot(np.dot(U, sigma), Vt) + user_means_train.reshape(-1, 1)
predicted_ratings_df = pd.DataFrame(R_predicted, columns=train_ratings_matrix.columns)

# Prepare predictions and ground truth for evaluation
y_true = []
y_pred = []

for index, row in test_data.iterrows():
    user_id = int(row['user_id'])
    item_id = int(row['item_id'])
    true_rating = row['rating']
    
    try:
        predicted_rating = predicted_ratings_df.loc[user_id, item_id]
        y_true.append(true_rating)
        y_pred.append(predicted_rating)
    except KeyError:
        # If item_id was not present in training set
        continue

# Compute RMSE and MAE
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)

print(f"Evaluation Metrics on Test Set:")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")


Evaluation Metrics on Test Set:
Root Mean Squared Error (RMSE): 3.3251
Mean Absolute Error (MAE): 3.0506
