In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD


In [2]:

# Load the dataset
df = pd.read_csv('indian_movies_ratings.csv')

# Create a pivot table
pivot_table = df.pivot_table(index='user_id', columns='movie_id', values='rating')

# Fill NaN values with 0 (implicit feedback approach)
pivot_table_filled = pivot_table.fillna(0)

# Convert to numpy array
ratings_matrix = pivot_table_filled.to_numpy()


In [3]:
# Train-test split (Here we split indices instead of the matrix directly)
train_indices, test_indices = train_test_split(range(ratings_matrix.shape[0]), test_size=0.2, random_state=42)
train_data = ratings_matrix[train_indices, :]
test_data = ratings_matrix[test_indices, :]

# Apply matrix factorization using SVD
n_factors = min(10, ratings_matrix.shape[1])  # Ensure n_factors <= number of features (movies)
svd = TruncatedSVD(n_components=n_factors)
train_svd = svd.fit_transform(train_data)
test_svd = svd.transform(test_data)

# Reconstruct the ratings matrix
predicted_ratings = np.dot(train_svd, svd.components_)


In [4]:
# Evaluate the model
def get_rmse(actual, predicted):
    # Only consider non-zero values in actual ratings
    mask = actual != 0
    return np.sqrt(mean_squared_error(actual[mask], predicted[mask]))

# Compute RMSE
rmse = get_rmse(train_data, predicted_ratings)
print(f'RMSE: {rmse}')


RMSE: 1.018746842170023e-15


In [5]:
# Function to get top N recommendations for each user
def get_top_n_recommendations(predicted_ratings, user_id, movie_titles, movie_ids, n=10):
    # Get and sort the user's predicted ratings
    user_ratings = predicted_ratings[user_id - 1]
    sorted_ratings_idx = np.argsort(user_ratings)[::-1]

    # Get the top N movie IDs
    top_n_movie_ids = sorted_ratings_idx[:n]
    top_n_movie_ratings = user_ratings[top_n_movie_ids]

    # Create a list of (movie_id, predicted_rating, title)
    recommendations = [
        (movie_ids[movie_id], rating, movie_titles[movie_ids[movie_id]]) 
        for movie_id, rating in zip(top_n_movie_ids, top_n_movie_ratings)
    ]

    return recommendations

# Create a dictionary to map movie IDs to movie titles
movie_titles = df[['movie_id', 'title']].drop_duplicates().set_index('movie_id')['title'].to_dict()

# Create a list of movie IDs
movie_ids = pivot_table.columns.to_list()

# Get the top 10 movie recommendations for the first user
user_id = 1
top_n_recommendations = get_top_n_recommendations(predicted_ratings, user_id, movie_titles, movie_ids, n=10)

# Print the recommendations
print(f"User {user_id} top recommendations:")
for movie_id, rating, title in top_n_recommendations:
    print(f"\tMovie: {title}, Predicted Rating: {rating}")


User 1 top recommendations:
	Movie: Dangal, Predicted Rating: 5.000000000000001
	Movie: 3 Idiots, Predicted Rating: 5.000000000000001
	Movie: Barfi!, Predicted Rating: 5.0
	Movie: Dil Chahta Hai, Predicted Rating: 5.0
	Movie: Pather Panchali, Predicted Rating: 3.9999999999999996
	Movie: Sholay, Predicted Rating: 3.9999999999999996
	Movie: Gully Boy, Predicted Rating: 3.0000000000000004
	Movie: Lagaan, Predicted Rating: 2.0000000000000018
	Movie: Bajrangi Bhaijaan, Predicted Rating: 1.9999999999999993
	Movie: Kabhi Khushi Kabhie Gham, Predicted Rating: 1.000000000000001
