In [12]:
import pandas as pd

# Load the data
ratings = pd.read_csv(r"C:\Users\Chrizel\Downloads\ml-latest-small\ml-latest-small\ratings.csv")
movies = pd.read_csv(r"C:\Users\Chrizel\Downloads\ml-latest-small\ml-latest-small\movies.csv")

# Preview the data
print("Ratings:\n", ratings.head())
print("\nMovies:\n", movies.head())


Ratings:
    userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Movies:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [13]:
# Create pivot table: rows = users, columns = movies, values = ratings
ratings_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

# Fill missing values with 0 (or NaN depending on the method used)
ratings_matrix = ratings_matrix.fillna(0)

print(ratings_matrix.shape)
ratings_matrix.head()


(610, 9724)


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between movies (items)
item_similarity = cosine_similarity(ratings_matrix.T)

# Convert similarity matrix to DataFrame with movie IDs as index and columns
item_similarity_df = pd.DataFrame(item_similarity, index=ratings_matrix.columns, columns=ratings_matrix.columns)

print(item_similarity_df.shape)
item_similarity_df.head()


(9724, 9724)


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.410562,0.296917,0.035573,0.308762,0.376316,0.277491,0.131629,0.232586,0.395573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.410562,1.0,0.282438,0.106415,0.287795,0.297009,0.228576,0.172498,0.044835,0.417693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.296917,0.282438,1.0,0.092406,0.417802,0.284257,0.402831,0.313434,0.30484,0.242954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.035573,0.106415,0.092406,1.0,0.188376,0.089685,0.275035,0.158022,0.0,0.095598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.308762,0.287795,0.417802,0.188376,1.0,0.298969,0.474002,0.283523,0.335058,0.218061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
def recommend_movies(user_id, ratings_matrix, item_similarity_df, movies_df, n_recommendations=5):
    # Get the user's ratings
    user_ratings = ratings_matrix.loc[user_id]
    
    # Find movies the user has already rated
    rated_movies = user_ratings[user_ratings > 0].index.tolist()
    
    # Calculate weighted sum of similarities for all movies (fixing index alignment)
    sim_scores = item_similarity_df.loc[rated_movies, user_ratings[rated_movies].index].dot(user_ratings[rated_movies])
    
    # Normalize by sum of similarities
    sim_sums = item_similarity_df.loc[rated_movies].sum(axis=0)
    recommendation_scores = sim_scores / sim_sums
    
    # Remove movies already rated by user
    recommendation_scores = recommendation_scores.drop(rated_movies)
    
    # Sort the scores and get top recommendations
    top_recommendations = recommendation_scores.sort_values(ascending=False).head(n_recommendations)
    
    # Map movie IDs to movie titles
    recommended_movie_titles = movies_df.loc[movies_df['movieId'].isin(top_recommendations.index), 'title']
    
    return recommended_movie_titles


In [16]:
for user_id in [1, 5, 10]:
    print(f"Top 5 movie recommendations for User {user_id}:")
    recommendations = recommend_movies(user_id, ratings_matrix, item_similarity_df, movies, 5)
    print(recommendations.to_list())
    print()


Top 5 movie recommendations for User 1:
['Jumanji (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)', 'Sabrina (1995)', 'Tom and Huck (1995)']

Top 5 movie recommendations for User 5:
['Jumanji (1995)', 'Grumpier Old Men (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)', 'Heat (1995)']

Top 5 movie recommendations for User 10:
['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)']



In [17]:
from sklearn.model_selection import train_test_split

# Split ratings data into train and test sets (80% train, 20% test)
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)


In [18]:
# Create user-item matrix for training data
train_matrix = train_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Compute cosine similarity matrix for items based on train data
from sklearn.metrics.pairwise import cosine_similarity

item_similarity_train = cosine_similarity(train_matrix.T)
item_similarity_df_train = pd.DataFrame(item_similarity_train, index=train_matrix.columns, columns=train_matrix.columns)


In [19]:
import numpy as np
from sklearn.metrics import mean_squared_error

def predict_rating(user_id, movie_id, train_matrix, item_similarity_df):
    if movie_id not in item_similarity_df.columns or user_id not in train_matrix.index:
        return np.nan  # Cannot predict if movie or user unknown
    
    # Get user's ratings from train data
    user_ratings = train_matrix.loc[user_id]
    
    # Get similarity scores for the target movie with all other movies
    sim_scores = item_similarity_df[movie_id]
    
    # Weighted sum of ratings
    numerator = (sim_scores * user_ratings).sum()
    denominator = sim_scores[user_ratings > 0].sum()
    
    if denominator == 0:
        return 0  # No similar movies rated
    else:
        return numerator / denominator

# Predict ratings for all user-movie pairs in test set
test_ratings['predicted_rating'] = test_ratings.apply(
    lambda row: predict_rating(row['userId'], row['movieId'], train_matrix, item_similarity_df_train), axis=1
)

# Drop NaNs and calculate RMSE
valid_predictions = test_ratings.dropna(subset=['predicted_rating'])
rmse = np.sqrt(mean_squared_error(valid_predictions['rating'], valid_predictions['predicted_rating']))

print(f"RMSE on test set: {rmse:.4f}")


RMSE on test set: 0.9219


1. Objective

The goal of this task was to develop a Movie Recommendation System that suggests movies to users based on their preferences or past ratings. The system should utilize collaborative filtering or content-based filtering techniques and be built using Python with libraries such as pandas, NumPy, and optionally scikit-learn or Surprise. The MovieLens dataset was used for model training and evaluation.

2. Tools & Technologies Used

Programming Language: Python

Libraries: pandas, NumPy, scikit-learn

Dataset: MovieLens (latest-small version)

3. Methodology

Step 1: Data Loading and Preprocessing

Loaded ratings.csv and movies.csv from the MovieLens dataset.

Merged data to form a user-item matrix.

Filled missing ratings with zeros to prepare for similarity computation.

Step 2: Collaborative Filtering

Implemented Item-Based Collaborative Filtering.

Used Cosine Similarity (from scikit-learn) to compute similarity between movies.

Predicted user ratings based on weighted averages of similar items.

Step 3: Generating Recommendations

For a given user, identified movies not yet rated.

Ranked movies based on predicted scores.

Recommended the top N movies.

Step 4: Model Evaluation

Performed an 80-20 split of the dataset into training and test sets.

Predicted ratings on the test set.

Evaluated performance using Root Mean Squared Error (RMSE).

RMSE on Test Set: 0.9219

4. Sample Output

Top 5 movie recommendations for User 1:
['Jumanji (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)', 'Sabrina (1995)', 'Tom and Huck (1995)']

Top 5 movie recommendations for User 5:
['Jumanji (1995)', 'Grumpier Old Men (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)', 'Heat (1995)']

Top 5 movie recommendations for User 10:
['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)']



5. Conclusion

The system effectively recommends movies using item-based collaborative filtering. An RMSE of 0.9219 on the test set indicates the model's predictions are fairly accurate on unseen data, particularly for a simple model built from scratch without advanced tuning.

The project fulfills the objective of building a functional movie recommendation system using Python, pandas, NumPy, and scikit-learn, utilizing the MovieLens dataset.

