In [1]:
import pandas as pd

# Load your CSV
df = pd.read_csv("data/ml-latest-small/ratings.csv")  # update filename if different

# Preview
print(df.head(25))


    userId  movieId  rating  timestamp
0        1        1     4.0  964982703
1        1        3     4.0  964981247
2        1        6     4.0  964982224
3        1       47     5.0  964983815
4        1       50     5.0  964982931
5        1       70     3.0  964982400
6        1      101     5.0  964980868
7        1      110     4.0  964982176
8        1      151     5.0  964984041
9        1      157     5.0  964984100
10       1      163     5.0  964983650
11       1      216     5.0  964981208
12       1      223     3.0  964980985
13       1      231     5.0  964981179
14       1      235     4.0  964980908
15       1      260     5.0  964981680
16       1      296     3.0  964982967
17       1      316     3.0  964982310
18       1      333     5.0  964981179
19       1      349     4.0  964982563
20       1      356     4.0  964980962
21       1      362     5.0  964982588
22       1      367     4.0  964981710
23       1      423     3.0  964982363
24       1      441     4

In [2]:
# Pivot the data: rows -> users, columns -> movies, values -> ratings
user_item_matrix = df.pivot_table(index='userId', columns='movieId', values='rating')
# Fill NaN values with 0 (or you can choose to leave them as NaN)
user_item_matrix.head(25)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,3.0,5.0,4.0,4.0,3.0,,3.0,...,,,,,,,,,,
7,4.5,,,,,,,,,,...,,,,,,,,,,
8,,4.0,,,,,,,,2.0,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [3]:
from sklearn.metrics.pairwise import cosine_similarity

# Fill NaN with 0s for similarity computation
user_item_matrix_filled = user_item_matrix.fillna(0)

# Compute user similarity matrix
user_similarity = cosine_similarity(user_item_matrix_filled)

# Convert it to a DataFrame for readability
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)


In [4]:
import numpy as np

def predict_rating(user_id, movie_id, k=5):
    if movie_id not in user_item_matrix.columns:
        return np.nan
    
    # Get similarity scores and the ratings for the given movie
    similarities = user_similarity_df.loc[user_id]
    ratings = user_item_matrix[movie_id]
    
    # Combine them, drop NaNs (users who didn't rate the movie)
    data = pd.concat([similarities, ratings], axis=1)
    data.columns = ['similarity', 'rating']
    data = data.dropna()
    
    # Take top-k similar users
    top_k = data.sort_values('similarity', ascending=False).head(k)
    
    if top_k['similarity'].sum() == 0:
        return np.nan  # avoid division by zero
    
    # Weighted average
    pred = np.dot(top_k['similarity'], top_k['rating']) / top_k['similarity'].sum()
    return pred


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Predict for test set
predictions = []
truth = []

for _, row in test_df.iterrows():
    pred = predict_rating(row['userId'], row['movieId'], k=2)
    if not np.isnan(pred):
        predictions.append(pred)
        truth.append(row['rating'])

# RMSE
rmse = np.sqrt(mean_squared_error(truth, predictions))
print(f"User-User CF RMSE: {rmse:.4f}")
### lower K results in lower RMSE which is expected and better, we will use K=2 or K=3

User-User CF RMSE: 0.3293


In [6]:
import random

# Step 1: Pick a random user
user_ids = user_item_matrix.index.tolist()
random_user = random.choice(user_ids)
print(f"Randomly selected user: {random_user}")

# Step 2: Get movies the user hasn't rated yet
rated_movies = user_item_matrix.loc[random_user].dropna().index
all_movies = user_item_matrix.columns
unrated_movies = list(set(all_movies) - set(rated_movies))

# Step 3: Predict ratings for unrated movies
recommendations = []

for movie_id in unrated_movies:
    pred = predict_rating(random_user, movie_id, k=2)
    if not np.isnan(pred):
        recommendations.append((movie_id, pred))

# Step 4: Sort by predicted rating and pick top 20
top_20_recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:20]

# Step 5: Display
print("\nTop 20 movie recommendations with predicted ratings:")
for movie_id, pred_rating in top_20_recommendations:
    print(f"Movie ID: {movie_id}, Predicted Rating: {pred_rating:.1f}")


Randomly selected user: 431

Top 20 movie recommendations with predicted ratings:
Movie ID: 940, Predicted Rating: 5.0
Movie ID: 954, Predicted Rating: 5.0
Movie ID: 1206, Predicted Rating: 5.0
Movie ID: 1387, Predicted Rating: 5.0
Movie ID: 2288, Predicted Rating: 5.0
Movie ID: 2761, Predicted Rating: 5.0
Movie ID: 3074, Predicted Rating: 5.0
Movie ID: 131098, Predicted Rating: 5.0
Movie ID: 53, Predicted Rating: 5.0
Movie ID: 163925, Predicted Rating: 5.0
Movie ID: 99, Predicted Rating: 5.0
Movie ID: 101, Predicted Rating: 5.0
Movie ID: 111, Predicted Rating: 5.0
Movie ID: 148, Predicted Rating: 5.0
Movie ID: 131237, Predicted Rating: 5.0
Movie ID: 175, Predicted Rating: 5.0
Movie ID: 281, Predicted Rating: 5.0
Movie ID: 296, Predicted Rating: 5.0
Movie ID: 334, Predicted Rating: 5.0
Movie ID: 33138, Predicted Rating: 5.0


In [7]:
print(f"User {random_user} rated {user_item_matrix.loc[random_user].count()} movies.")


User 431 rated 20 movies.


In [8]:
rated_movies = user_item_matrix.loc[random_user].dropna().index
unrated_movies = list(set(user_item_matrix.columns) - set(rated_movies))
print(f"User {random_user} hasn't rated {len(unrated_movies)} movies.")

User 431 hasn't rated 9704 movies.


In [9]:
movies_df = pd.read_csv("data/ml-latest-small/movies.csv")
top_20_df = pd.DataFrame(top_20_recommendations, columns=['movieId', 'predicted_rating'])
merged = top_20_df.merge(movies_df, on='movieId')
print(merged[['title', 'predicted_rating']])


                                   title  predicted_rating
0   Adventures of Robin Hood, The (1938)               5.0
1    Mr. Smith Goes to Washington (1939)               5.0
2             Clockwork Orange, A (1971)               5.0
3                            Jaws (1975)               5.0
4                      Thing, The (1982)               5.0
5                 Iron Giant, The (1999)               5.0
6                Jeremiah Johnson (1972)               5.0
7                    Saving Santa (2013)               5.0
8                        Lamerica (1994)               5.0
9           Wings, Legs and Tails (1986)               5.0
10  Heidi Fleiss: Hollywood Madam (1995)               5.0
11                  Bottle Rocket (1996)               5.0
12                    Taxi Driver (1976)               5.0
13      Awfully Big Adventure, An (1995)               5.0
14            What Men Talk About (2010)               5.0
15                           Kids (1995)               5