In [86]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the ratings, movies, and users tables from ../data.
# Column names are passed explicitly via `names=`, so pandas treats the first
# line of each file as data rather than as a header row.
ratings = pd.read_csv(
    '../data/u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

# u.item is pipe-separated and decoded as latin-1; after the five descriptive
# columns, every remaining column is a per-genre flag.
movies = pd.read_csv(
    '../data/u.item',
    sep='|',
    encoding='latin-1',
    names=[
        'movie_id', 'title', 'release_date', 'video_release', 'imdb_url',
        'unknown', 'Action', 'Adventure', 'Animation', "Children's",
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
        'Sci-Fi', 'Thriller', 'War', 'Western',
    ],
)
users = pd.read_csv(
    '../data/u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

# Drop the columns this notebook does not use. The result is reassigned rather
# than dropped with inplace=True, so each statement clearly produces a new frame.
movies = movies.drop(columns=['video_release', 'imdb_url', 'unknown'])
ratings = ratings.drop(columns=['timestamp'])

# Attach the movie title to every rating (inner join on movie_id).
# validate='many_to_one' makes pandas raise MergeError if a movie_id is repeated
# in `movies`, which would otherwise silently duplicate rating rows.
ratings_with_titles = pd.merge(
    ratings,
    movies[['movie_id', 'title']],
    on='movie_id',
    validate='many_to_one',
)
ratings_with_titles.head()

Unnamed: 0,user_id,movie_id,rating,title
0,196,242,3,Kolya (1996)
1,186,302,3,L.A. Confidential (1997)
2,22,377,1,Heavyweights (1994)
3,244,51,2,Legends of the Fall (1994)
4,166,346,1,Jackie Brown (1997)


In [88]:
# Report the (rows, columns) shape of each loaded table.
print(f"Ratings shape: {ratings.shape}")
print(f"Movies shape: {movies.shape}")
print(f"Users shape: {users.shape}")

# Preview the first five rows of the movies table.
movies.head(5)

Ratings shape: (100000, 3)
Movies shape: (1682, 21)
Users shape: (943, 5)


Unnamed: 0,movie_id,title,release_date,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [90]:
# Total number of missing cells per table (null counts summed over all columns).
print(f"Missing values in ratings: {ratings.isna().sum().sum()}")
print(f"Missing values in movies: {movies.isna().sum().sum()}")
print(f"Missing values in users: {users.isna().sum().sum()}")

Missing values in ratings: 0
Missing values in movies: 1
Missing values in users: 0


In [92]:
# Sanity check: the distinct rating values are expected to be [1, 2, 3, 4, 5].
distinct_ratings = sorted(ratings['rating'].unique())
print(f"Unique ratings: {distinct_ratings}")

Unique ratings: [1, 2, 3, 4, 5]


In [94]:
# Count each (Action, Drama) flag combination; the index levels should
# contain only 0s and 1s if the genre columns are clean binary flags.
action_drama_counts = movies.loc[:, ['Action', 'Drama']].value_counts()
print(action_drama_counts)

Action  Drama
0       0        752
        1        679
1       0        205
        1         46
Name: count, dtype: int64


In [96]:
# Ratings whose user_id does not appear in the users table.
has_known_user = ratings['user_id'].isin(users['user_id'])
orphaned_users = ratings[~has_known_user]
print(f"Orphaned users in ratings: {len(orphaned_users)}")

# Ratings whose movie_id does not appear in the movies table.
has_known_movie = ratings['movie_id'].isin(movies['movie_id'])
orphaned_movies = ratings[~has_known_movie]
print(f"Orphaned movies in ratings: {len(orphaned_movies)}")

Orphaned users in ratings: 0
Orphaned movies in ratings: 0


In [None]:
# Now let's start building the user-based filtering model.

In [None]:
#user based filtering

In [6]:
import pandas as pd
import numpy as np

# Column layout of u.item: five descriptive fields followed by per-genre flags.
movie_columns = [
    'movie_id', 'title', 'release_date', 'video_release', 'imdb_url',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Read the tab-separated ratings file (adjust the path if needed) and discard
# the timestamp column, which is not used afterwards in this notebook section.
ratings = pd.read_csv(
    '../data/u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
).drop(columns=['timestamp'])

# Read the pipe-separated, latin-1 encoded movie file and discard the columns
# that are not needed here.
movies = pd.read_csv(
    '../data/u.item',
    sep='|',
    encoding='latin-1',
    names=movie_columns,
).drop(columns=['video_release', 'imdb_url', 'unknown'])

In [10]:
# Splitting the data

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ratings as a test set. The split is stratified on
# user_id, so each user's ratings are divided between the two sets in roughly
# the same 80/20 proportion; random_state fixes the shuffle for reproducibility.
split_options = dict(
    test_size=0.2,
    stratify=ratings['user_id'],
    random_state=42,
)
train_data, test_data = train_test_split(ratings, **split_options)

In [None]:
#User based collaborative filtering

In [14]:
# Reshape the training ratings into a user x movie grid: one row per user_id,
# one column per movie_id, cells holding the rating.
rating_grid = train_data.pivot_table(
    values='rating',
    index='user_id',
    columns='movie_id',
)

# User/movie pairs with no training rating come out as NaN; store them as 0.
user_item_matrix = rating_grid.fillna(0)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (users) of the user-item matrix.
# With Y=None, every row of X is compared against every row of X itself.
user_similarity = cosine_similarity(X=user_item_matrix, Y=None)