In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Objective:
Use a content-based filtering algorithm to build a **Movie Recommender System**.

# Loading the Data:

# 1. Loading the users' ratings of individual movies:

In [12]:
# Read the tab-separated ratings file; column labels are supplied via names=.
ratings = pd.read_csv(
    "u.data",
    sep="\t",
    names=["user_id", "movie_id", "rating", "timestamp"],
)

In [13]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [27]:
# Remove the timestamp column. Reassigning (instead of inplace=True) and
# ignoring an already-missing column keeps this cell safe to re-run.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')

# Loading the movie details:

In [14]:
# Read the pipe-delimited movie catalogue. names= supplies the column labels:
# five metadata fields followed by one indicator column per genre.
movies = pd.read_csv(
    "u.item",
    sep="|",
    encoding="latin-1",
    names=[
        # metadata
        "movie_id", "title", "release_date", "video_release_date", "IMDb_URL",
        # genre indicator columns
        "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
        "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
        "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
    ],
)

In [15]:
# Remove the video_release_date column. Reassigning (instead of inplace=True)
# and ignoring an already-missing column keeps this cell safe to re-run.
movies = movies.drop(columns=['video_release_date'], errors='ignore')

In [16]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movie_id,title,release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Mapping the Genres:

In [17]:
# Read the genre lookup file. The encoding is stated explicitly (matching the
# latin-1 used for u.item) rather than relying on the platform default.
with open('u.genre', 'r', encoding='latin-1') as f:
    genres = f.readlines()

# Convert "name|index" lines to a {name: index} dictionary; lines without a
# pipe (e.g. blank lines) are skipped. Each line is split only once.
genre_map = {}
for line in genres:
    if '|' not in line:
        continue
    fields = line.split('|')
    # int() tolerates the trailing newline left on the index field.
    genre_map[fields[0]] = int(fields[1])
print(genre_map)

{'unknown': 0, 'Action': 1, 'Adventure': 2, 'Animation': 3, "Children's": 4, 'Comedy': 5, 'Crime': 6, 'Documentary': 7, 'Drama': 8, 'Fantasy': 9, 'Film-Noir': 10, 'Horror': 11, 'Musical': 12, 'Mystery': 13, 'Romance': 14, 'Sci-Fi': 15, 'Thriller': 16, 'War': 17, 'Western': 18}


# Loading the user Information:

In [18]:
# Read the pipe-delimited user profiles; column labels are supplied via names=.
# zip_code is an identifier, not a quantity, so it is pinned to str instead of
# being left to type inference (which would strip leading zeros from an
# all-numeric column).
users = pd.read_csv(
    'u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
    dtype={'zip_code': str},
)

In [19]:
# Preview the first rows of the user profile table.
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


# Checking for Missing Values:

In [21]:
# Report the number of null cells in each column of ratings.
print("Missing values in ratings:", ratings.isnull().sum(), sep="\n")

Missing values in ratings:
user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64


In [22]:
# Report the number of null cells in each column of movies.
print("\nMissing values in movies:", movies.isnull().sum(), sep="\n")


Missing values in movies:
movie_id        0
title           0
release_date    1
IMDb_URL        3
unknown         0
Action          0
Adventure       0
Animation       0
Children's      0
Comedy          0
Crime           0
Documentary     0
Drama           0
Fantasy         0
Film-Noir       0
Horror          0
Musical         0
Mystery         0
Romance         0
Sci-Fi          0
Thriller        0
War             0
Western         0
dtype: int64


In [23]:
# Report the number of null cells in each column of users.
print("\nMissing values in users:", users.isnull().sum(), sep="\n")


Missing values in users:
user_id       0
age           0
gender        0
occupation    0
zip_code      0
dtype: int64


In [24]:
# Count cells equal to the empty string in each column of ratings.
print("Empty strings in ratings:", (ratings == '').sum(), sep="\n")

Empty strings in ratings:
user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64


In [25]:
# Count cells equal to the empty string in each column of movies.
print("\nEmpty strings in movies:", (movies == '').sum(), sep="\n")


Empty strings in movies:
movie_id        0
title           0
release_date    0
IMDb_URL        0
unknown         0
Action          0
Adventure       0
Animation       0
Children's      0
Comedy          0
Crime           0
Documentary     0
Drama           0
Fantasy         0
Film-Noir       0
Horror          0
Musical         0
Mystery         0
Romance         0
Sci-Fi          0
Thriller        0
War             0
Western         0
dtype: int64


In [26]:
# Count cells equal to the empty string in each column of users.
print("\nEmpty strings in users:", (users == '').sum(), sep="\n")


Empty strings in users:
user_id       0
age           0
gender        0
occupation    0
zip_code      0
dtype: int64


# Feature Engineering:

1. **Finding the average rating for each Genre for each User**

In [28]:
# Attach every movie's columns (including its genre flags) to each rating row.
ratings_with_genres = ratings.merge(movies, on='movie_id')

# Labels of the genre indicator columns in the movies table.
genre_columns = [
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Reshape wide -> long: one row per (rating, genre) pair, where is_genre
# carries the flag value for that genre.
melted = pd.melt(
    ratings_with_genres,
    id_vars=['user_id', 'rating'],
    value_vars=genre_columns,
    var_name='genre',
    value_name='is_genre',
)

# Retain only the pairs whose flag is set (is_genre == 1).
melted = melted.loc[melted['is_genre'] == 1]

# Mean rating for each (user, genre) combination, as a flat three-column frame.
user_genre_avg = (
    melted
    .groupby(['user_id', 'genre'])['rating']
    .mean()
    .reset_index()
)

# Back to wide format: one row per user, one column per genre.
user_genre_matrix = user_genre_avg.pivot(
    index='user_id',
    columns='genre',
    values='rating',
)

In [29]:
# Attach the per-genre averages to the user profiles (left join keeps every user).
# Genre columns left over from a previous run of this cell are dropped first;
# otherwise re-running would create suffixed duplicates (e.g. Action_x / Action_y).
users = (
    users
    .drop(columns=user_genre_matrix.columns, errors='ignore')
    .merge(user_genre_matrix, on='user_id', how='left')
)

In [30]:
# Preview users with the per-genre average columns attached.
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code,Action,Adventure,Animation,Children's,Comedy,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,unknown
0,1,24,M,technician,85711,3.333333,2.928571,3.333333,2.2,3.472527,...,5.0,3.461538,2.923077,3.6,3.931818,4.0,3.615385,3.68,3.666667,4.0
1,2,53,F,other,94043,3.8,4.333333,4.0,3.0,3.8125,...,4.5,3.0,3.0,3.5,4.125,3.75,3.583333,3.666667,,
2,3,23,M,writer,32067,2.785714,3.5,,,2.583333,...,2.5,2.4,2.0,3.181818,3.4,2.75,2.52381,2.8,,
3,4,24,M,technician,43537,3.875,3.5,,,5.0,...,,4.0,5.0,4.0,4.333333,3.833333,3.909091,4.5,,
4,5,33,F,other,15213,3.142857,3.242424,3.785714,2.448276,3.0,...,5.0,2.535714,3.333333,3.0,2.315789,3.515152,2.947368,3.214286,2.5,4.0


In [31]:
# Each user's mean over all of their ratings, indexed by user_id.
user_avg = ratings.groupby('user_id')['rating'].mean()

# Fallback value for every row of users, aligned on users' index. It does not
# depend on the genre, so it is computed once instead of once per column.
user_avg_by_row = users['user_id'].map(user_avg)

# Fill NaNs in each genre column with that row's user average.
for genre in user_genre_matrix.columns:
    users[genre] = users[genre].fillna(user_avg_by_row)

In [32]:
# Preview users after the missing genre averages were filled.
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code,Action,Adventure,Animation,Children's,Comedy,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,unknown
0,1,24,M,technician,85711,3.333333,2.928571,3.333333,2.2,3.472527,...,5.0,3.461538,2.923077,3.6,3.931818,4.0,3.615385,3.68,3.666667,4.0
1,2,53,F,other,94043,3.8,4.333333,4.0,3.0,3.8125,...,4.5,3.0,3.0,3.5,4.125,3.75,3.583333,3.666667,3.709677,3.709677
2,3,23,M,writer,32067,2.785714,3.5,2.796296,2.796296,2.583333,...,2.5,2.4,2.0,3.181818,3.4,2.75,2.52381,2.8,2.796296,2.796296
3,4,24,M,technician,43537,3.875,3.5,4.333333,4.333333,5.0,...,4.333333,4.0,5.0,4.0,4.333333,3.833333,3.909091,4.5,4.333333,4.333333
4,5,33,F,other,15213,3.142857,3.242424,3.785714,2.448276,3.0,...,5.0,2.535714,3.333333,3.0,2.315789,3.515152,2.947368,3.214286,2.5,4.0


In [34]:
# Re-check the null count per column of users after the fill step.
print("\nMissing values in users:", users.isnull().sum(), sep="\n")


Missing values in users:
user_id        0
age            0
gender         0
occupation     0
zip_code       0
Action         0
Adventure      0
Animation      0
Children's     0
Comedy         0
Crime          0
Documentary    0
Drama          0
Fantasy        0
Film-Noir      0
Horror         0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
unknown        0
dtype: int64


**Adding the average rating for each movie**

In [36]:
# Mean rating per movie as a two-column frame: movie_id, avg_rating.
movie_avg_rating = (
    ratings
    .groupby('movie_id')['rating']
    .mean()
    .reset_index(name='avg_rating')
)

# Left-join onto movies so every movie row is kept. An avg_rating column left
# by a previous run of this cell is dropped first; otherwise a re-run would
# produce avg_rating_x / avg_rating_y and the selection below would fail.
movies = (
    movies
    .drop(columns=['avg_rating'], errors='ignore')
    .merge(movie_avg_rating, on='movie_id', how='left')
)

# Check result
print(movies[['movie_id', 'title', 'avg_rating']].head())

   movie_id              title  avg_rating
0         1   Toy Story (1995)    3.878319
1         2   GoldenEye (1995)    3.206107
2         3  Four Rooms (1995)    3.033333
3         4  Get Shorty (1995)    3.550239
4         5     Copycat (1995)    3.302326


In [37]:
# Re-check the null count per column of movies after adding avg_rating.
print("\nMissing values in movies:", movies.isnull().sum(), sep="\n")


Missing values in movies:
movie_id        0
title           0
release_date    1
IMDb_URL        3
unknown         0
Action          0
Adventure       0
Animation       0
Children's      0
Comedy          0
Crime           0
Documentary     0
Drama           0
Fantasy         0
Film-Noir       0
Horror          0
Musical         0
Mystery         0
Romance         0
Sci-Fi          0
Thriller        0
War             0
Western         0
avg_rating      0
dtype: int64


In [38]:
# Show the column labels of users.
users.columns

Index(['user_id', 'age', 'gender', 'occupation', 'zip_code', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'unknown'],
      dtype='object')

In [39]:
# Show the column labels of movies.
movies.columns

Index(['movie_id', 'title', 'release_date', 'IMDb_URL', 'unknown', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'avg_rating'],
      dtype='object')

# Copy DataFrames for Reference:

In [40]:
# Independent copy of the profile columns of users, kept as a reference table.
user_info = users.loc[
    :, ['user_id', 'age', 'gender', 'occupation', 'zip_code']
].copy()

In [41]:
# Independent copy of the descriptive movie columns plus every genre flag column.
movie_info = movies.loc[
    :, ['movie_id', 'title', 'release_date', 'IMDb_URL', *genre_columns]
].copy()

In [44]:
# Generate a genres column
def extract_genres(row, genres=None):
    """Return the names of the genres whose flag equals 1 in ``row``.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row[name]`` for each genre name checked, such as
        the Series passed by ``DataFrame.apply(..., axis=1)``.
    genres : iterable of str, optional
        Genre names to check, in the order they should appear in the result.
        When omitted, the notebook-level ``genre_columns`` list is used.

    Returns
    -------
    list of str
        The genre names whose value in ``row`` equals 1.
    """
    if genres is None:
        # Resolved at call time so the function still works as a one-argument
        # callback for DataFrame.apply.
        genres = genre_columns
    return [genre for genre in genres if row[genre] == 1]

# Collapse the per-genre flag columns into a single list-valued 'genres' column,
# then drop the flags. The guard makes the cell safe to re-run: once the flag
# columns are gone there is nothing left to convert, so the step is skipped
# instead of raising a KeyError.
if set(genre_columns).issubset(movie_info.columns):
    movie_info['genres'] = movie_info.apply(extract_genres, axis=1)
    movie_info = movie_info.drop(columns=genre_columns)

In [42]:
# Preview the first three rows of the user reference copy.
user_info.head(3)

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067


In [45]:
# Preview the first three rows of the movie reference copy.
movie_info.head(3)

Unnamed: 0,movie_id,title,release_date,IMDb_URL,genres
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,"[Animation, Children's, Comedy]"
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,"[Action, Adventure, Thriller]"
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,[Thriller]
