In [1]:
# importing required libraries

import pandas as pd
import numpy as np

In [2]:
# Load the raw ratings file, keeping only the three columns used below,
# then print the frame's shape and its per-column maxima and minima.

ratings = pd.read_csv(
    'Datasets/ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
)
print('Shape of ratings dataset is: ', ratings.shape, '\n')
print('Max values in dataset are \n', ratings.max(), '\n')
print('Min values in dataset are \n', ratings.min(), '\n')

Shape of ratings dataset is:  (26024289, 3) 

Max values in dataset are 
 userId     270896.0
movieId    176275.0
rating          5.0
dtype: float64 

Min values in dataset are 
 userId     1.0
movieId    1.0
rating     0.5
dtype: float64 



In [3]:
# Keep only the rows whose rating is 4.0 or higher (the comparison is
# inclusive), then print the shape and per-column max/min of what remains.

ratings = ratings.loc[ratings['rating'] >= 4.0]
print('Shape of ratings dataset is: ', ratings.shape, '\n')
print('Max values in dataset are \n', ratings.max(), '\n')
print('Min values in dataset are \n', ratings.min(), '\n')

Shape of ratings dataset is:  (12981742, 3) 

Max values in dataset are 
 userId     270896.0
movieId    176271.0
rating          5.0
dtype: float64 

Min values in dataset are 
 userId     1.0
movieId    1.0
rating     4.0
dtype: float64 



In [4]:
# np.unique returns the distinct movie IDs in ascending order, so the slice
# keeps the 1000 smallest IDs still present; rows for any other movie are
# dropped before the shape and per-column max/min are printed.

movies_list = np.unique(ratings['movieId'])[:1000]
ratings = ratings[ratings['movieId'].isin(movies_list)]
print('Shape of ratings dataset is: ', ratings.shape, '\n')
print('Max values in dataset are \n', ratings.max(), '\n')
print('Min values in dataset are \n', ratings.min(), '\n')

Shape of ratings dataset is:  (3165985, 3) 

Max values in dataset are 
 userId     270896.0
movieId      1019.0
rating          5.0
dtype: float64 

Min values in dataset are 
 userId     1.0
movieId    1.0
rating     4.0
dtype: float64 



In [5]:
# Select the 1000 smallest user IDs that still have rows at this point
# (np.unique returns the distinct values sorted ascending) and keep only
# their rows, then print the shape, per-column max/min and distinct counts.

users_list = np.unique(ratings['userId'])[:1000]
ratings = ratings.loc[ratings['userId'].isin(users_list)]
print('Shape of ratings dataset is: ', ratings.shape, '\n')
print('Max values in dataset are \n', ratings.max(), '\n')
print('Min values in dataset are \n', ratings.min(), '\n')
print('Total Users: ', ratings['userId'].nunique())
# This count covers every selected user (up to 1000), not 100 users.
print('Total Movies which are rated by these users: ', ratings['movieId'].nunique())

Shape of ratings dataset is:  (13903, 3) 

Max values in dataset are 
 userId     1171.0
movieId    1019.0
rating        5.0
dtype: float64 

Min values in dataset are 
 userId     1.0
movieId    1.0
rating     4.0
dtype: float64 

Total Users:  1000
Total Movies which are rated by 100 users:  676


In [6]:
# Project the ratings down to (userId, movieId) pairs and preview the first rows.
# NOTE(review): the following cell reassigns users_fav_movies from `ratings`,
# so this two-column frame is only used for the preview here - confirm intended.

users_fav_movies = ratings[['userId', 'movieId']]
users_fav_movies.head()

Unnamed: 0,userId,movieId
1,1,147
2,1,858
31,2,64
32,2,79
34,2,260


In [7]:
# Resetting the index: build a copy of `ratings` with a fresh default integer
# index (the old row labels are dropped) and preview the first rows.
# NOTE(review): this starts from `ratings`, not from the two-column
# users_fav_movies assigned in the previous cell, so the `rating` column is
# kept - confirm this is intended.

users_fav_movies = ratings.reset_index(drop=True)
users_fav_movies.head()

Unnamed: 0,userId,movieId,rating
0,1,147,4.5
1,1,858,5.0
2,2,64,4.0
3,2,79,4.0
4,2,260,4.0


In [8]:
# Display the final frame transposed: one row per field, one column per record.
users_fav_movies.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13893,13894,13895,13896,13897,13898,13899,13900,13901,13902
userId,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,1170.0,1170.0,1170.0,1170.0,1170.0,1170.0,1170.0,1171.0,1171.0,1171.0
movieId,147.0,858.0,64.0,79.0,260.0,339.0,377.0,605.0,628.0,648.0,...,597.0,608.0,648.0,785.0,805.0,858.0,919.0,368.0,497.0,594.0
rating,4.5,5.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,4.0,...,4.0,4.5,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0


In [9]:
# Persist the preprocessed frame as CSV; index=False leaves the row index
# out of the written file.

users_fav_movies.to_csv(
    './Datasets/filtered_ratings.csv',
    index=False,
)