In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

# --- STEP 1: LOAD AND PREPARE DATA (Do this once) ---
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')

i_cols = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

# Create the user-item matrix
data_matrix = ratings.pivot_table(index='user_id', columns='movie_id', values='rating').fillna(0)

# Calculate Similarity and Predictions
user_similarity = pairwise_distances(data_matrix, metric='cosine')
mean_user_rating = data_matrix.values.mean(axis=1)
ratings_diff = (data_matrix.values - mean_user_rating[:, np.newaxis])
user_prediction = mean_user_rating[:, np.newaxis] + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T

# --- STEP 2: THE RECOMMENDATION FUNCTION ---
def get_recommendations(user_id, top_n=5):
    """
    Takes a real-world User ID (e.g., 34), 
    handles the index shift internally, 
    and returns movie titles.
    """
    # Subtract 1 to match Python's 0-based indexing (User ID 1 is index 0)
    user_idx = user_id - 1
    
    # Get the prediction vector for this user
    user_ratings = user_prediction[user_idx]
    
    # Identify movies the user has ALREADY watched (rating > 0)
    watched_indices = np.where(data_matrix.iloc[user_idx] > 0)[0]
    
    # Create a copy of predictions and set watched movies to -infinity
    # This ensures we only recommend NEW movies
    recommend_ratings = user_ratings.copy()
    recommend_ratings[watched_indices] = -1e9 
    
    # Get indices of the top N highest predicted scores
    top_indices = np.argsort(recommend_ratings)[::-1][:top_n]
    
    # Map indices back to Movie IDs (column names)
    recommended_movie_ids = data_matrix.columns[top_indices]
    
    # Get titles from the items table
    recommendations = items[items['movie_id'].isin(recommended_movie_ids)]['movie_title'].tolist()
    
    return recommendations



In [2]:
Input_userid = 34
results = get_recommendations(Input_userid)

print(f"Top recommendations for User Input_userid:")
for i, title in enumerate(results, 1):
    print(f"{i}. {title}")

Top recommendations for User Input_userid:
1. Star Wars (1977)
2. Fargo (1996)
3. Raiders of the Lost Ark (1981)
4. Return of the Jedi (1983)
5. Contact (1997)


In [3]:
#Why the result is not matching with our legacy code results?
#The function looks at every movie in the entire database that the user hasn't seen yet instead of top five most similar users.
#It sorts the predictions and picks the Top 5 (or N) absolute highest predicted ratings. It doesn't use a fixed cutoff like 1.
#Ensured that the mapping between the matrix index and the Movie ID is preserved