In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# --- 1. Load Data ---

# Load the ratings data (u.data)
# The file is read as tab-separated with no header row; the four columns are
# named here as: user id | item id | rating | timestamp
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
try:
    df_ratings = pd.read_csv('u.data', sep='\t', names=ratings_cols, encoding='latin-1')
except FileNotFoundError as err:
    # Raise instead of calling exit(): the run should stop with a traceback that
    # names the missing file rather than ending the whole session.
    raise FileNotFoundError(
        "Error: 'u.data' file not found. Please upload the data."
    ) from err

# Load the movie titles (u.item)
# Names are supplied for every column, but usecols keeps only the first two
# (movie_id and title).
movie_cols = ['movie_id', 'title'] + [str(i) for i in range(22)]  # Title is the 2nd column
try:
    df_movies = pd.read_csv('u.item', sep='|', names=movie_cols, encoding='latin-1', usecols=[0, 1])
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: 'u.item' file not found. Please upload the data."
    ) from err

# --- 2. Merge Data ---

# Attach the movie title to every rating row by joining on the shared
# movie_id key, so later output can show titles instead of numeric ids.
df_combined = df_ratings.merge(df_movies, on='movie_id')

print("--- Combined Data Head (Ratings + Titles) ---")
print(df_combined.head(5))


# --- 3. Create the User-Item Matrix (The Pivot Table) ---

# Rows are users, columns are movie titles, cells hold the rating.
# A (user, title) pair with no rating stays NaN in this matrix.
# NOTE(review): pivot_table aggregates with its default (mean) when several
# rows share the same (user_id, title) pair, e.g. if two movie_ids carry the
# same title -- confirm that merging such movies is intended.
user_movie_matrix = pd.pivot_table(
    df_combined,
    values='rating',
    index='user_id',
    columns='title',
)

# Zero-filled copy used as numeric input for the similarity computation;
# the NaN version above is kept so "not rated" stays distinguishable.
user_movie_matrix_filled = user_movie_matrix.fillna(value=0)

print("\n--- User-Item Matrix Shape and Head ---")
print(f"Matrix Shape: {user_movie_matrix_filled.shape} (Rows: Users, Columns: Movies)")
print(user_movie_matrix_filled.head(5))

--- Combined Data Head (Ratings + Titles) ---
   user_id  movie_id  rating  timestamp                       title
0      196       242       3  881250949                Kolya (1996)
1      186       302       3  891717742    L.A. Confidential (1997)
2       22       377       1  878887116         Heavyweights (1994)
3      244        51       2  880606923  Legends of the Fall (1994)
4      166       346       1  886397596         Jackie Brown (1997)

--- User-Item Matrix Shape and Head ---
Matrix Shape: (943, 1664) (Rows: Users, Columns: Movies)
title    'Til There Was You (1997)  1-900 (1994)  101 Dalmatians (1996)  \
user_id                                                                   
1                              0.0           0.0                    2.0   
2                              0.0           0.0                    0.0   
3                              0.0           0.0                    0.0   
4                              0.0           0.0                    0.0  

In [2]:
# --- 1. Calculate Cosine Similarity ---

# The output is a (Users x Users) matrix where each cell [i, j]
# represents the similarity score between User i and User j.
user_similarity = cosine_similarity(user_movie_matrix_filled)

# Convert the NumPy array back to a DataFrame for easier handling.
# Labels come from the same matrix that was passed to cosine_similarity so
# rows/columns are guaranteed to line up with the scores.
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_movie_matrix_filled.index,
    columns=user_movie_matrix_filled.index,
)

print("\n--- User Similarity Matrix Head ---")
print(user_similarity_df.head())

# --- 2. Verify Your "Movie Twin" ---

# Let's check the similarity of User ID 1 with everyone else
# (The highest score, 1.0, should be with themselves)
target_user_id = 1
similarity_to_target = user_similarity_df[target_user_id].sort_values(ascending=False)

print(f"\n--- Top 5 'Movie Twins' for User {target_user_id} ---")
# Exclude the user's own entry by label rather than by position: if another
# user ties at the top score, the target is not guaranteed to sort first, and
# a positional [1:6] slice would then list the target as their own twin.
print(similarity_to_target.drop(target_user_id).head(5))


--- User Similarity Matrix Head ---
user_id       1         2         3         4         5         6         7    \
user_id                                                                         
1        1.000000  0.168937  0.048388  0.064561  0.379670  0.429682  0.443097   
2        0.168937  1.000000  0.113393  0.179694  0.073623  0.242106  0.108604   
3        0.048388  0.113393  1.000000  0.349781  0.021592  0.074018  0.067423   
4        0.064561  0.179694  0.349781  1.000000  0.031804  0.068431  0.091507   
5        0.379670  0.073623  0.021592  0.031804  1.000000  0.238636  0.374733   

user_id       8         9         10   ...       934       935       936  \
user_id                                ...                                 
1        0.320079  0.078385  0.377733  ...  0.372213  0.119860  0.269860   
2        0.104257  0.162470  0.161273  ...  0.147095  0.310661  0.363328   
3        0.084419  0.062039  0.066217  ...  0.033885  0.043453  0.167140   
4        0.1880

In [3]:
# --- Select a Target User ---
TARGET_USER_ID = 1
N_NEIGHBORS = 10  # number of most-similar users whose ratings feed each prediction

# Similarity of every user to the target, highest first (still includes the target).
similarity_scores = user_similarity_df[TARGET_USER_ID].sort_values(ascending=False)
# Remove the target by label before taking the top N; relying on the target
# being at position 0 breaks if another user ties at the top score.
top_neighbors = similarity_scores.drop(TARGET_USER_ID).index[:N_NEIGHBORS]

# --- 1. Identify Rated vs. Unrated Movies ---

# The target user's full row: a rating where they rated the movie, NaN otherwise.
target_user_ratings = user_movie_matrix.loc[TARGET_USER_ID]
rated_movies = target_user_ratings[target_user_ratings.notna()].index.tolist()

# Get all movie titles
all_movies = user_movie_matrix.columns

# Candidates for recommendation: titles with no rating from the target user.
# A NaN mask keeps the original column order without scanning a list per title.
unrated_movies = target_user_ratings[target_user_ratings.isna()].index.tolist()

# --- 2. Predict Ratings for Unrated Movies ---

# Slice the neighbors' rows once instead of on every loop iteration.
neighbor_matrix = user_movie_matrix.loc[top_neighbors]

predictions = {}

for movie in unrated_movies:
    # Ratings for this movie from the neighbors who actually rated it
    neighbor_ratings = neighbor_matrix[movie].dropna()
    if neighbor_ratings.empty:
        continue

    # Similarity scores for only those neighbors
    neighbor_similarity = similarity_scores.loc[neighbor_ratings.index]
    similarity_total = neighbor_similarity.sum()
    if similarity_total <= 0:
        # All contributing neighbors have zero similarity; the weighted
        # average would divide by zero, so no prediction is made.
        continue

    # Weighted average rating: Sum(Rating * Similarity) / Sum(Similarity)
    # NOTE(review): with a single contributing neighbor this reduces to that
    # neighbor's rating; consider requiring a minimum number of raters.
    predicted_rating = (neighbor_ratings * neighbor_similarity).sum() / similarity_total
    predictions[movie] = predicted_rating

# --- 3. Generate Final Recommendations ---

# Convert predictions to a Series and sort them (explicit dtype keeps an
# empty result well-defined).
predicted_ratings_series = pd.Series(predictions, dtype=float).sort_values(ascending=False)

# Select the Top 5 recommended movies
top_recommendations = predicted_ratings_series.head(5)

# Report the user actually analysed rather than a hard-coded id.
print(f"\n--- Top 5 Movie Recommendations for User ID {TARGET_USER_ID} ---")
print(top_recommendations.to_string())


--- Top 5 Movie Recommendations for User ID 1 ---
Casablanca (1942)               5.0
Waiting for Guffman (1996)      5.0
Secrets & Lies (1996)           5.0
Little Buddha (1993)            5.0
Walk in the Clouds, A (1995)    5.0


In [4]:
# --- 1. Create the Movie-User Matrix (Transpose) ---
# Transpose the original User-Item Matrix to get a matrix where:
# Index (Rows): Movies
# Columns: Users
# Missing ratings are filled with 0 so every movie is a numeric vector.
movie_user_matrix = user_movie_matrix.fillna(0).T

print("--- Movie-User Matrix Shape and Head ---")
print(f"Matrix Shape: {movie_user_matrix.shape} (Rows: Movies, Columns: Users)")
print(movie_user_matrix.head())


# --- 2. Calculate Cosine Similarity on Movies ---

# Calculate the similarity between every movie and every other movie.
# This matrix will be (Movies x Movies)
item_similarity = cosine_similarity(movie_user_matrix)

# Convert the NumPy array back to a DataFrame, labelled by title on both axes
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=movie_user_matrix.index,
    columns=movie_user_matrix.index,
)

print("\n--- Item Similarity Matrix Head (Movie-Movie Scores) ---")
print(item_similarity_df.head())

# --- 3. Verify Movie Twins ---
# Check which movies are most similar to 'Star Wars (1977)'
target_movie_title = 'Star Wars (1977)'
similar_movies = item_similarity_df[target_movie_title].sort_values(ascending=False)

print(f"\n--- Top 5 'Movie Twins' for '{target_movie_title}' ---")
# Exclude the movie itself by label rather than by position: another movie
# with an identical rating vector also scores 1.0, so the target is not
# guaranteed to be the first entry after sorting.
print(similar_movies.drop(target_movie_title).head(5).to_string())

--- Movie-User Matrix Shape and Head ---
Matrix Shape: (1664, 943) (Rows: Movies, Columns: Users)
user_id                    1    2    3    4    5    6    7    8    9    10   \
title                                                                         
'Til There Was You (1997)  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
1-900 (1994)               0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
101 Dalmatians (1996)      2.0  0.0  0.0  0.0  2.0  0.0  0.0  0.0  0.0  0.0   
12 Angry Men (1957)        5.0  0.0  0.0  0.0  0.0  4.0  4.0  0.0  0.0  5.0   
187 (1997)                 0.0  0.0  2.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

user_id                    ...  934  935  936  937  938  939  940  941  942  \
title                      ...                                                
'Til There Was You (1997)  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
1-900 (1994)               ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
101 Dalmatians (1996)      ...  

In [5]:
TARGET_USER_ID = 1

# The target user's row of the user-item matrix, looked up once.
target_user_row = user_movie_matrix.loc[TARGET_USER_ID]

# Get the target user's ratings (movies they HAVE seen/rated)
target_user_ratings = target_user_row.dropna()

# Identify movies the target user HAS NOT rated
unrated_movies = target_user_row[target_user_row.isna()].index.tolist()

# Similarity block: rows are the movies the user rated, columns are the
# unrated candidates. Each column holds the weights for one prediction.
similarity_to_rated = item_similarity_df.loc[target_user_ratings.index, unrated_movies]

# Weighted average per candidate, computed for all candidates at once:
#   Sum(user rating * similarity) / Sum(similarity)
similarity_totals = similarity_to_rated.sum(axis=0)
weighted_rating_sums = similarity_to_rated.mul(target_user_ratings, axis=0).sum(axis=0)

# Only candidates with a positive similarity total get a prediction;
# the rest would divide by zero.
has_similarity = similarity_totals > 0
predicted_item_ratings = weighted_rating_sums[has_similarity] / similarity_totals[has_similarity]

# Kept as a plain dict (title -> predicted rating) for any later cell that
# expects the original container type.
predictions_item_based = predicted_item_ratings.to_dict()

# Sort the predictions
predicted_ratings_series_item = predicted_item_ratings.sort_values(ascending=False)

# Select the Top 5 recommended movies
top_recommendations_item_based = predicted_ratings_series_item.head(5)

# Report the user actually analysed rather than a hard-coded id.
print(f"\n--- Top 5 Item-Based Recommendations for User ID {TARGET_USER_ID} ---")
print(top_recommendations_item_based.to_string())


--- Top 5 Item-Based Recommendations for User ID 1 ---
Cyclo (1995)                 4.382759
Little City (1998)           4.234926
Office Killer (1997)         4.229406
Death in Brunswick (1991)    4.226813
Mamma Roma (1962)            4.178130
