In [1]:
# -----------------------------
# MODULE 0: Unzip Dataset
# -----------------------------

import zipfile
import os

zip_path = "/content/archive_4.zip"     # your zip file
extract_path = "/content/archive_4"     # where to extract

# Unzip the file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(" File unzipped successfully!")

# Show extracted contents
print("\nExtracted folders and files:")
for root, dirs, files in os.walk(extract_path):
    print(f"Folder: {root}")
    for name in files:
        print(f"  └── {name}")


 File unzipped successfully!

Extracted folders and files:
Folder: /content/archive_4
Folder: /content/archive_4/ml-100k
  └── ua.test
  └── u.info
  └── u1.test
  └── u5.base
  └── u.user
  └── u4.base
  └── u.occupation
  └── mku.sh
  └── u.data
  └── u3.test
  └── ub.test
  └── README
  └── ub.base
  └── ua.base
  └── u3.base
  └── u.item
  └── u2.test
  └── u4.test
  └── u1.base
  └── allbut.pl
  └── u5.test
  └── u.genre
  └── u2.base


In [2]:
# -----------------------------
# MODULE 1: Import Libraries
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score
import pickle


In [19]:
# Load the ratings file (u.data)
ratings = pd.read_csv(
    "/content/archive_4/ml-100k/u.data",
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp']
)

# Load the movies file (u.item)
movies = pd.read_csv(
    "/content/archive_4/ml-100k/u.item",
    sep='|',
    encoding='latin-1',
    usecols=[0, 1],
    names=['movie_id', 'title']
)

# Merge both DataFrames on movie_id
data = pd.merge(ratings, movies, on='movie_id')

print(" Dataset loaded successfully!")
print("\n First 5 rows:")
data.head()


 Dataset loaded successfully!

 First 5 rows:


Unnamed: 0,user_id,movie_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [10]:
# -----------------------------
# MODULE 3: Create User-Item Matrix
# -----------------------------
user_movie_matrix = data.pivot_table(index='user_id', columns='title', values='rating')
print(" User-Item Matrix created!")


 User-Item Matrix created!


In [13]:
# -----------------------------
# MODULE 4: Compute User Similarity
# -----------------------------
# Replace NaN with 0 for similarity calculation
user_movie_matrix_filled = user_movie_matrix.fillna(0)

# Compute cosine similarity between users
user_similarity = cosine_similarity(user_movie_matrix_filled)
user_similarity_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)
print(" User Similarity Matrix computed!")


 User Similarity Matrix computed!


In [14]:
# -----------------------------
# MODULE 5: Recommend Movies
# -----------------------------
def recommend_movies(user_id, num_recommendations=5):
    # Find similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:6]

    # Get movies watched by similar users
    similar_users_ratings = user_movie_matrix.loc[similar_users.index]

    # Weighted average ratings
    weighted_ratings = similar_users_ratings.T.dot(similar_users) / similar_users.sum()

    # Remove movies the user has already rated
    user_rated = user_movie_matrix.loc[user_id].dropna().index
    recommendations = weighted_ratings.drop(user_rated, errors='ignore').sort_values(ascending=False)

    return recommendations.head(num_recommendations)


In [15]:
# -----------------------------
# MODULE 6: Test the Recommendation
# -----------------------------
user_id = 1
print(f"\n Recommended movies for User {user_id}:")
print(recommend_movies(user_id, num_recommendations=5))



 Recommended movies for User 1:
title
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)    4.394938
Stand by Me (1986)                                                             3.990886
Heathers (1989)                                                                3.987280
Speed (1994)                                                                   3.605775
E.T. the Extra-Terrestrial (1982)                                              3.600460
dtype: float64


In [16]:
# -----------------------------
# MODULE 7: Evaluate Model (Precision@K)
# -----------------------------
def precision_at_k(user_id, k=5):
    recommendations = recommend_movies(user_id, num_recommendations=k)
    user_rated_movies = user_movie_matrix.loc[user_id].dropna().index

    # Convert to binary (1 = relevant, 0 = not relevant)
    y_true = [1 if movie in user_rated_movies else 0 for movie in recommendations.index]
    y_pred = [1]*len(y_true)

    if len(y_true) == 0:
        return 0
    return precision_score(y_true, y_pred)

precision = precision_at_k(1, k=5)
print(f"\nPrecision@5 for User 1: {precision:.2f}")



Precision@5 for User 1: 0.00


In [17]:
# -----------------------------
# MODULE 8: Save the Model
# -----------------------------
with open("user_similarity_model.pkl", "wb") as file:
    pickle.dump(user_similarity_df, file)

print("\n Model saved successfully as 'user_similarity_model.pkl'")



 Model saved successfully as 'user_similarity_model.pkl'


In [18]:
# -----------------------------
# MODULE 9: Load Saved Model
# -----------------------------
with open("user_similarity_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)

print("\n Model loaded successfully!")



 Model loaded successfully!


In [20]:
# MODULE 10: Item-Based Collaborative Filtering
# -----------------------------
# Compute item-item similarity using cosine similarity on the transposed user-item matrix
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

item_similarity = cosine_similarity(user_movie_matrix_filled.T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_movie_matrix.columns, columns=user_movie_matrix.columns)
print(" Item Similarity Matrix computed!")

def recommend_movies_item_based(user_id, num_recommendations=5):
    # Get movies rated by the user
    user_ratings = user_movie_matrix.loc[user_id].dropna()

    # Initialize recommendation scores
    scores = pd.Series(0.0, index=user_movie_matrix.columns)

    # For each movie rated by the user, add weighted similarity scores
    for movie, rating in user_ratings.items():
        similar_scores = item_similarity_df[movie]
        scores += similar_scores * rating

    # Remove movies the user has already rated
    scores = scores.drop(user_ratings.index, errors='ignore')

    # Return top N recommendations
    return scores.sort_values(ascending=False).head(num_recommendations)

# Test item-based recommendations
user_id = 1
print(f"\n Item-based recommended movies for User {user_id}:")
print(recommend_movies_item_based(user_id, num_recommendations=5))

 Item Similarity Matrix computed!

 Item-based recommended movies for User 1:
title
E.T. the Extra-Terrestrial (1982)    392.868044
Stand by Me (1986)                   366.958476
Speed (1994)                         366.715440
Batman (1989)                        363.956465
True Lies (1994)                     362.610997
dtype: float64


In [21]:
# MODULE 11: Matrix Factorization with SVD
# -----------------------------
from scipy.sparse.linalg import svds

# Perform SVD on the user-item matrix
# Fill NaN with 0 for SVD
matrix = user_movie_matrix.fillna(0).values
U, sigma, Vt = svds(matrix, k=50)  # k is the number of latent factors
sigma = np.diag(sigma)

# Reconstruct the predicted ratings matrix
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_movie_matrix.index, columns=user_movie_matrix.columns)
print(" SVD-based predicted ratings matrix computed!")

def recommend_movies_svd(user_id, num_recommendations=5):
    # Get predicted ratings for the user
    user_pred_ratings = predicted_ratings_df.loc[user_id]

    # Remove movies the user has already rated
    user_rated = user_movie_matrix.loc[user_id].dropna().index
    recommendations = user_pred_ratings.drop(user_rated, errors='ignore').sort_values(ascending=False)

    # Return top N recommendations
    return recommendations.head(num_recommendations)

# Test SVD-based recommendations
user_id = 1
print(f"\n SVD-based recommended movies for User {user_id}:")
print(recommend_movies_svd(user_id, num_recommendations=5))

 SVD-based predicted ratings matrix computed!

 SVD-based recommended movies for User 1:
title
E.T. the Extra-Terrestrial (1982)         3.512692
Batman (1989)                             3.268929
Dave (1993)                               2.968897
Ulee's Gold (1997)                        2.906573
One Flew Over the Cuckoo's Nest (1975)    2.759012
Name: 1, dtype: float64


In [22]:
# MODULE 12: Save Item-Based and SVD Models
# -----------------------------
import pickle

with open("item_similarity_model.pkl", "wb") as file:
    pickle.dump(item_similarity_df, file)

with open("svd_model.pkl", "wb") as file:
    pickle.dump({'U': U, 'sigma': sigma, 'Vt': Vt}, file)

print("\n Item-based and SVD models saved successfully!")


 Item-based and SVD models saved successfully!
