In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the two input tables from CSV files located in the working directory.
ratings_df = pd.read_csv('ratings.csv')   # one row per (userId, movieId) rating
movies_df = pd.read_csv('movies.csv')     # one row per movie

In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [7]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [10]:
# The genres column is not used in the rest of this notebook, so remove it.
# NOTE: re-running this cell without reloading movies_df raises KeyError,
# because the column is already gone.
movies_df = movies_df.drop(columns=['genres'])

In [11]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [13]:
# Likewise, the timestamp column is not used below, so remove it from ratings_df.
# NOTE: re-running this cell without reloading ratings_df raises KeyError,
# because the column is already gone.
ratings_df = ratings_df.drop(columns=['timestamp'])

In [14]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [17]:
# Attach each rating to its movie title by joining the two frames on movieId.
# An inner join (the pandas default) keeps only movieIds present in both frames.
df = movies_df.merge(ratings_df, on='movieId', how='inner')

In [18]:
df.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   movieId  100836 non-null  int64  
 1   title    100836 non-null  object 
 2   userId   100836 non-null  int64  
 3   rating   100836 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 3.8+ MB


In [24]:
df.shape

(100836, 4)

In [22]:
# Count how many ratings each title received.
# Result: one row per title with columns ['title', 'TotalRatingCount'].
movie_rating_count = (
    df
    .groupby('title')['rating']
    .count()                                         # number of ratings per title
    .reset_index()                                   # turn the title index back into a regular column
    .rename(columns={'rating': 'TotalRatingCount'})  # the count column is still called 'rating' here
)

In [23]:
movie_rating_count.head()

Unnamed: 0,title,TotalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [25]:
# Add the per-title rating count to every rating row.
# A left join keeps all rows of df.
df_new = df.merge(movie_rating_count, on='title', how='left')

In [26]:
df_new.head()

Unnamed: 0,movieId,title,userId,rating,TotalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [27]:
df_new.describe()

Unnamed: 0,movieId,userId,rating,TotalRatingCount
count,100836.0,100836.0,100836.0,100836.0
mean,19435.295718,326.127564,3.501557,58.758777
std,35530.987199,182.618491,1.042529,61.965384
min,1.0,1.0,0.5,1.0
25%,1199.0,177.0,3.0,13.0
50%,2991.0,325.0,3.5,39.0
75%,8122.0,477.0,4.0,84.0
max,193609.0,610.0,5.0,329.0


In [28]:
# Keep only the rating rows that belong to movies whose total rating count
# reaches a popularity threshold.

# Threshold: the mean of TotalRatingCount over the rows of df_new, truncated to an
# integer. Derived from the data instead of being hard-coded; df_new.describe()
# reports a mean of about 58.76, so this evaluates to 58 for the current data.
popularity_threshold = int(df_new['TotalRatingCount'].mean())

popular_movie = df_new[df_new['TotalRatingCount'] >= popularity_threshold]

In [30]:
# Let's check the result of the popularity filter
popular_movie.describe()

Unnamed: 0,movieId,userId,rating,TotalRatingCount
count,36717.0,36717.0,36717.0,36717.0
mean,7565.41011,312.536155,3.751178,122.759457
std,17991.998705,179.246769,0.98042,60.243974
min,1.0,1.0,0.5,58.0
25%,527.0,160.0,3.0,77.0
50%,1374.0,310.0,4.0,104.0
75%,3996.0,468.0,4.5,146.0
max,112852.0,610.0,5.0,329.0
