**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

**Read CSV**

In [2]:
df1 = pd.read_csv('/kaggle/input/movie-recommendation-system/ratings.csv')
df2 = pd.read_csv('/kaggle/input/movie-recommendation-system/movies.csv')
merged_df = df1.merge(df2, on='movieId')



In [3]:
# drop df1, df2
del df1 
del df2 

**Data Eploration**

In [4]:
merged_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,169,2.5,1204927694,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
1,13,169,1.0,974868393,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
2,14,169,3.0,845470321,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
3,17,169,1.0,944991371,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
4,68,169,1.0,1011092044,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama


**Exploratory Data analysis**

In [5]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22884377 entries, 0 to 22884376
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
 4   title      object 
 5   genres     object 
dtypes: float64(1), int64(3), object(2)
memory usage: 1.0+ GB


In [6]:
#count nulls
merged_df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

In [7]:
#count duplicates
merged_df[merged_df.duplicated()]


Unnamed: 0,userId,movieId,rating,timestamp,title,genres


In [8]:
#count duplicates
merged_df.duplicated().sum()

0

In [9]:
#count no of users
merged_df["userId"].nunique()


247753

In [10]:
#counts no of movies
merged_df["movieId"].nunique()

33670

In [11]:
#No of users rating each movie
movie_ratings_count = merged_df["title"].value_counts()
values = np.array(movie_ratings_count.values)
sorted_values = np.sort(values)
print(sorted_values)
print(np.median(sorted_values))
q1 = np.percentile(sorted_values,25)
q3 = np.percentile(sorted_values,75)
iqr = q3 - q1
print (q1)
print (q3)
print (iqr)


[    1     1     1 ... 77887 79091 81296]
11.0
2.0
128.0
126.0


In [12]:
#counts no of users
user_rating_count = merged_df["userId"].value_counts()
values = np.array(user_rating_count.values)
sorted_values = np.sort(values)
print(sorted_values)
print(np.median(sorted_values))
q1 = np.percentile(sorted_values,25)
q3 = np.percentile(sorted_values,75)
iqr = q3 - q1
print (q1)
print (q3)
print (iqr)
print (np.sum(sorted_values<=5))    

[   1    1    1 ... 7057 7515 9281]
29.0
15.0
89.0
74.0
19439


In [13]:
# Create a bar plot to visualize the counts
# plt.figure(figsize=(12, 6))
# sns.barplot(x=movie_ratings_count.index, y=movie_ratings_count.values, palette="viridis")
# plt.title("Number of Users Rating Each Movie")
# plt.xticks(rotation=90)  # Rotate x-axis labels for better visibility
# plt.xlabel("Movie Title")
# plt.ylabel("Number of Users")
# plt.show()

In [14]:
# df_genres = merged_df['genres'].str.get_dummies('|')
# df_genres.head(5)

In [15]:
# df_encoded = pd.concat([merged_df, df_genres], axis=1)
# df_encoded.head(5)

In [16]:
# filtered_df = df_encoded[df_encoded['(no genres listed)'] > 0]
# filtered_df.head(5)

In [17]:
# df_clean = df_encoded.drop(filtered_df.index)
# df_clean.head(5)
df_clean = merged_df

In [18]:
# drop merged_df , df_genres , df_endoded , filtered_df
del merged_df 
# del df_genres 
# del df_encoded
# del filtered_df

In [19]:
# seperate the year from movie title to new column (year)
# df_clean[['title', 'year']] = df_clean['title'].str.extract(r'(.+) \((\d{4})\)')
# df_clean.head(5)


In [20]:
# df_clean.drop('(no genres listed)', axis=1)
# df_clean.head(5)

In [21]:
#Remove users who rated less than 5 movies and movies rated by less than 2 users
# Count the number of ratings per user and movie
user_ratings_count = df_clean['userId'].value_counts()
movie_ratings_count = df_clean['movieId'].value_counts()

# Filter out users who rated less than 5 movies
users_to_keep = user_ratings_count[user_ratings_count >= 35].index
df_filtered_users = df_clean[df_clean['userId'].isin(users_to_keep)]

# Filter out movies rated by less than 2 users
movies_to_keep = movie_ratings_count[movie_ratings_count >= 12].index
df_filtered = df_filtered_users[df_filtered_users['movieId'].isin(movies_to_keep)]

# del df_filtered_users , users_to_keep , movies_to_keep , user_ratings_count , movie_ratings_count
del movie_ratings_count
del user_ratings_count
del df_filtered_users
del users_to_keep
del movies_to_keep


# df_filtered now contains only the data for users who rated at least 5 movies and movies rated by at least 2 users.
df_filtered.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
1,13,169,1.0,974868393,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
2,14,169,3.0,845470321,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
3,17,169,1.0,944991371,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
4,68,169,1.0,1011092044,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
5,178,169,2.5,1140216232,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama


**Collabrative filter item-based**

In [22]:
final_dataset = df_filtered.pivot(index='movieId',columns='userId',values='rating')
final_dataset


userId,4,11,12,13,14,15,17,18,20,21,...,247734,247735,247736,247737,247738,247742,247746,247748,247750,247751
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,4.0,5.0,,,,...,,4.5,,,,,,,,4.0
2,,,,2.0,,,3.0,,,,...,,,,,,,,,,
3,,1.5,,,,,2.0,,,,...,,,,,,,,3.0,,
4,,,,,,,3.0,,,,...,,,,,,,,,,
5,,,,,3.0,,4.0,,,,...,,,,,,,,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150294,,,,,,,,,,,...,,,,,,,,,,
150548,,,,,,,,,,,...,,,,,,,,,,
150596,,,,,,,,,,,...,,,,,,,,,,
150856,,,,,,,,,,,...,,,,,,,,,,


In [23]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16655 entries, 1 to 151459
Columns: 113018 entries, 4 to 247751
dtypes: float64(113018)
memory usage: 14.0 GB


In [24]:
final_dataset

userId,4,11,12,13,14,15,17,18,20,21,...,247734,247735,247736,247737,247738,247742,247746,247748,247750,247751
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,4.0,5.0,,,,...,,4.5,,,,,,,,4.0
2,,,,2.0,,,3.0,,,,...,,,,,,,,,,
3,,1.5,,,,,2.0,,,,...,,,,,,,,3.0,,
4,,,,,,,3.0,,,,...,,,,,,,,,,
5,,,,,3.0,,4.0,,,,...,,,,,,,,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150294,,,,,,,,,,,...,,,,,,,,,,
150548,,,,,,,,,,,...,,,,,,,,,,
150596,,,,,,,,,,,...,,,,,,,,,,
150856,,,,,,,,,,,...,,,,,,,,,,


In [25]:
# replace NaN with 0
final_dataset.fillna(0,inplace=True)
final_dataset.head()

userId,4,11,12,13,14,15,17,18,20,21,...,247734,247735,247736,247737,247738,247742,247746,247748,247750,247751
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,...,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.5,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0


In [26]:
csr_data = csr_matrix(final_dataset.values)
final_dataset.reset_index(inplace=True)

**Making the movie recommendation system model**

In [27]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

In [28]:
df_filtered.head(8000)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
1,13,169,1.0,974868393,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
2,14,169,3.0,845470321,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
3,17,169,1.0,944991371,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
4,68,169,1.0,1011092044,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
5,178,169,2.5,1140216232,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
...,...,...,...,...,...,...
9485,232411,2471,2.0,942062587,Crocodile Dundee II (1988),Action|Adventure|Comedy
9486,232499,2471,3.5,1121438094,Crocodile Dundee II (1988),Action|Adventure|Comedy
9487,232531,2471,2.0,1007362823,Crocodile Dundee II (1988),Action|Adventure|Comedy
9488,232575,2471,2.0,944345155,Crocodile Dundee II (1988),Action|Adventure|Comedy


In [29]:
def get_movie_recommendation(movie_name):
    n_movies_to_reccomend = 10
    movie_list = df_filtered[df_filtered['title'].str.contains(movie_name)] 
    if len(movie_list): 
            movie_idx= movie_list.iloc[0]['movieId'] #get the movieID
            movie_idx = final_dataset[final_dataset['movieId'] == movie_idx].index[0] #get the coresponding movieID in the final datast
            distances , indices = knn.kneighbors(csr_data[movie_idx],n_neighbors=n_movies_to_reccomend+1) # get the distance and index of the nearest top 10 movies
            rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1] #sort movies according to distance desc
            recommend_frame = []
            for val in rec_movie_indices:
                movie_idx = final_dataset.iloc[val[0]]['movieId']
                idx = df_filtered[df_filtered['movieId'] == movie_idx].index
                recommend_frame.append({'Title':df_filtered.iloc[idx]['title'].values[0],'Distance':val[1]})
            df = pd.DataFrame(recommend_frame,index=range(1,n_movies_to_reccomend+1))
            return df
    else:
        return "No movies found. Please check your input"
            
    

In [30]:
get_movie_recommendation('Rambo')

Unnamed: 0,Title,Distance
1,Back to School (1986),0.660645
2,Lone Star (1996),0.660586
3,Finding Forrester (2000),0.660269
4,Shrek 2 (2004),0.648064
5,Stalker (1979),0.630584
6,Wolf (1994),0.592689
7,Kiss Kiss Bang Bang (2005),0.571582
8,"Brothers Grimm, The (2005)",0.553944
9,Sin City (2005),0.464141
10,Crash (2004),0.373839
