In [86]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three raw tables. Because `names` is supplied, pandas treats the
# first line of every file as data rather than as a header row.
rating_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../data/u.data', sep='\t', names=rating_columns)

# Column names for the pipe-delimited movie file, listed in file order.
movie_columns = [
    'movie_id', 'title', 'release_date', 'video_release', 'imdb_url',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's",
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]
# The movie file is decoded as latin-1 rather than the default UTF-8.
movies = pd.read_csv('../data/u.item', sep='|', encoding='latin-1', names=movie_columns)

user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('../data/u.user', sep='|', names=user_columns)


# Discard the columns that are not needed further down. Rebinding the names
# (instead of dropping in place) keeps each statement a plain expression.
movies = movies.drop(columns=['video_release', 'imdb_url', 'unknown'])
ratings = ratings.drop(columns=['timestamp'])


# Attach the movie title to every rating row (default inner join on movie_id).
title_lookup = movies[['movie_id', 'title']]
ratings_with_titles = ratings.merge(title_lookup, on='movie_id')
ratings_with_titles.head()

Unnamed: 0,user_id,movie_id,rating,title
0,196,242,3,Kolya (1996)
1,186,302,3,L.A. Confidential (1997)
2,22,377,1,Heavyweights (1994)
3,244,51,2,Legends of the Fall (1994)
4,166,346,1,Jackie Brown (1997)


In [88]:
# Report the (rows, columns) shape of each table.
shape_report = {
    "Ratings shape:": ratings.shape,
    "Movies shape:": movies.shape,
    "Users shape:": users.shape,
}
for label, shape in shape_report.items():
    print(label, shape)

# Preview the first five rows of the movies table.
movies.head()

Ratings shape: (100000, 3)
Movies shape: (1682, 21)
Users shape: (943, 5)


Unnamed: 0,movie_id,title,release_date,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [90]:
# Count the missing cells in each table (summed over all columns).
tables_to_check = {
    "Missing values in ratings:": ratings,
    "Missing values in movies:": movies,
    "Missing values in users:": users,
}
for label, table in tables_to_check.items():
    print(label, table.isnull().sum().sum())

Missing values in ratings: 0
Missing values in movies: 1
Missing values in users: 0


In [92]:
# Verify which rating values occur; the expected output is [1, 2, 3, 4, 5].
# .tolist() converts the NumPy scalars to plain Python ints so the printed
# list stays readable under NumPy >= 2, where np.int64 values would otherwise
# be shown as "np.int64(1)".
print("Unique ratings:", sorted(ratings['rating'].unique().tolist()))

Unique ratings: [1, 2, 3, 4, 5]


In [94]:
# Sanity check on two of the genre columns: every index level in the printed
# counts should be either 0 or 1.
action_drama_counts = movies[['Action', 'Drama']].value_counts()
print(action_drama_counts)

Action  Drama
0       0        752
        1        679
1       0        205
        1         46
Name: count, dtype: int64


In [96]:
# Referential-integrity checks: every rating row should point at a user and a
# movie that exist in the corresponding lookup table.

# Rating rows whose user_id does not appear in `users`.
has_known_user = ratings['user_id'].isin(users['user_id'])
orphaned_users = ratings[~has_known_user]
print("Orphaned users in ratings:", len(orphaned_users))

# Rating rows whose movie_id does not appear in `movies`.
has_known_movie = ratings['movie_id'].isin(movies['movie_id'])
orphaned_movies = ratings[~has_known_movie]
print("Orphaned movies in ratings:", len(orphaned_movies))

Orphaned users in ratings: 0
Orphaned movies in ratings: 0


In [None]:
# Now let's start building the user-based collaborative filtering model.

In [None]:
# User-based filtering

In [100]:
# Build the user x movie rating grid: one row per user_id, one column per
# movie_id, cell = rating (pivot_table averages if a pair occurs more than once).
rating_grid = pd.pivot_table(ratings, values='rating', index='user_id', columns='movie_id')

# Movies a user has not rated come out as NaN; replace them with 0.
user_item_matrix = rating_grid.fillna(0)
user_item_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the users' rating vectors.
# Entry [i, j] compares the i-th and j-th *rows* of user_item_matrix, so the
# result is indexed by row position, not by user_id.
user_similarity = cosine_similarity(X=user_item_matrix)

# Predict ratings for a user
def predict_rating(user_id, movie_id, k=10, *, ratings_matrix=None, similarity=None):
    """Predict a user's rating for a movie from the k most similar users.

    The prediction is the plain mean of the ratings that the user's ``k``
    nearest neighbours actually gave to ``movie_id``. Cells equal to 0 are
    treated as "not rated" (the user-item matrix is built with ``fillna(0)``)
    and are left out of the mean instead of dragging it towards zero.

    Parameters
    ----------
    user_id : label
        Row label of the target user in ``ratings_matrix``.
    movie_id : label
        Column label of the target movie in ``ratings_matrix``.
    k : int, default 10
        Number of most-similar users to consult.
    ratings_matrix : pandas.DataFrame, optional
        User x movie matrix with 0 for unrated cells. Defaults to the
        notebook-level ``user_item_matrix``.
    similarity : array-like of shape (n_users, n_users), optional
        User-user similarity, indexed by row position of ``ratings_matrix``.
        Defaults to the notebook-level ``user_similarity``.

    Returns
    -------
    float
        Mean rating among the neighbours who rated the movie, or ``nan`` when
        none of the k neighbours rated it.

    Raises
    ------
    KeyError
        If ``user_id`` or ``movie_id`` is not present in ``ratings_matrix``.
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if ratings_matrix is None:
        ratings_matrix = user_item_matrix
    if similarity is None:
        similarity = user_similarity

    # Translate the user label into a row position rather than assuming that
    # user ids are contiguous and start at 1.
    user_pos = ratings_matrix.index.get_loc(user_id)

    # Work on a copy of the similarity row so the shared matrix is not mutated.
    sims = np.array(np.asarray(similarity)[user_pos], dtype=float)
    # Exclude the user explicitly: with tied similarities (e.g. an identical
    # twin) the user is not guaranteed to sort first.
    sims[user_pos] = -np.inf

    neighbours = np.argsort(sims)[::-1][:k]
    # If there are fewer than k other users, the masked self entry can still
    # slip into the top k; drop it.
    neighbours = neighbours[neighbours != user_pos]

    neighbour_ratings = ratings_matrix.iloc[neighbours][movie_id]
    observed = neighbour_ratings[neighbour_ratings > 0]  # 0 == not rated
    if observed.empty:
        return float("nan")
    return float(observed.mean())

# Demo call: estimate how user 1 would rate movie 50.
predicted_rating = predict_rating(user_id=1, movie_id=50)
print(f"Predicted rating for User 1 on Movie 50: {predicted_rating:.2f}")

Predicted rating for User 1 on Movie 50: 5.00


In [127]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split

# NOTE: this cell needs the `surprise` package (distributed on PyPI as
# scikit-surprise); without it the imports above raise ModuleNotFoundError.

# One seed for both the split and the model, so that a fresh
# "Restart & Run All" reproduces the same prediction.
RANDOM_SEED = 42

# Load data into Surprise format; ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

# Hold out 20% of the ratings as a test set.
# TODO: `testset` is not used yet - evaluate the model on it
# (e.g. with model.test(testset)) before trusting the predictions.
trainset, testset = train_test_split(data, test_size=0.2, random_state=RANDOM_SEED)

# Train SVD model (seeded: its factor initialisation is random).
model = SVD(random_state=RANDOM_SEED)
model.fit(trainset)

# Predict a rating for raw user id 1 on raw movie id 50.
prediction = model.predict(uid=1, iid=50)
print(f"Predicted rating (SVD): {prediction.est:.2f}")

ModuleNotFoundError: No module named 'surprise'