In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Copy of combined_data_1.txt.zip',header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1])

In [4]:
print("Top 5 rows: \n",dataset.head())
print("**"*20)
print("Dataset Information: \n",dataset.info())
print("**"*20)
print("Shape of the dataset: \n",dataset.shape)

Top 5 rows: 
    Cust_Id  Rating
0       1:     NaN
1  1488844     3.0
2   822109     5.0
3   885013     4.0
4    30878     4.0
****************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24058263 entries, 0 to 24058262
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   Cust_Id  object 
 1   Rating   float64
dtypes: float64(1), object(1)
memory usage: 367.1+ MB
Dataset Information: 
 None
****************************************
Shape of the dataset: 
 (24058263, 2)


In [5]:
dataset['Rating'] = dataset['Rating'].astype(float)

In [6]:
rating_count_by_rating = dataset.groupby('Rating')['Rating'].agg(['count'])
rating_count_by_rating

Unnamed: 0_level_0,count
Rating,Unnamed: 1_level_1
1.0,1118186
2.0,2439073
3.0,6904181
4.0,8085741
5.0,5506583


In [7]:
movie_count = dataset.isnull().sum()[1]
movie_count

4499

In [8]:
rating_count = dataset['Cust_Id'].count() - movie_count
rating_count

cust_count = dataset['Cust_Id'].nunique()-movie_count
cust_count

470758

In [9]:
fig = px.bar(rating_count_by_rating, x=rating_count_by_rating.iloc[:, 0], y=rating_count_by_rating.index, orientation='h', text=rating_count_by_rating.iloc[:, 0],
             labels={'index': 'Rating', 'variable': 'Percentage'},
             title=f'Total pool: {movie_count} Movies, {cust_count} customers, {rating_count} ratings given',
             width=800, height=600)

fig.update_traces(marker_color='firebrick')

fig.show()


In [10]:
df_nan = pd.DataFrame(pd.isnull(dataset.Rating), )
df_nan = df_nan[df_nan['Rating'] == True].reset_index()
df_nan

Unnamed: 0,index,Rating
0,0,True
1,548,True
2,694,True
3,2707,True
4,2850,True
...,...,...
4494,24046714,True
4495,24047329,True
4496,24056849,True
4497,24057564,True


In [11]:
#To create a numpy array containing movie ids according the 'ratings' dataset

movie_np = []
movie_id = 1

for i,j in zip(df_nan['index'][1:],df_nan['index'][:-1]):
    temp = np.full((1,i-j-1), movie_id)
    movie_np = np.append(movie_np, temp)
    movie_id += 1

# Account for last record and corresponding length

last_record = np.full((1,len(dataset) - df_nan.iloc[-1, 0] - 1),movie_id)
movie_np = np.append(movie_np, last_record)

print(f'Movie numpy: {movie_np}')
print(f'Length: {len(movie_np)}')

Movie numpy: [1.000e+00 1.000e+00 1.000e+00 ... 4.499e+03 4.499e+03 4.499e+03]
Length: 24053764


In [12]:
dataset = dataset[pd.notnull(dataset['Rating'])]

dataset['Movie_Id'] = movie_np.astype(int)
dataset['Cust_Id'] =dataset['Cust_Id'].astype(int)
print('-Dataset examples-')
dataset.head()

-Dataset examples-


Unnamed: 0,Cust_Id,Rating,Movie_Id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1


In [13]:
movie_ratings_count = dataset.groupby('Movie_Id')['Rating'].count().reset_index(name='RatingCount')

fig = px.scatter(movie_ratings_count, x='Movie_Id', y='RatingCount', title='Number of Ratings by Movie',
                 labels={'RatingCount': 'Number of Ratings', 'Movie_Id': 'Movie ID'},
                 color_discrete_sequence=['Firebrick'])

fig.update_layout(
    xaxis_title='Movie ID',
    yaxis_title='Number of Ratings'
)

fig.show()


In [14]:
f = ['count','mean']

#To create a list of all the movies rated less often(only include top 30% rated movies)
dataset_movie_summary = dataset.groupby('Movie_Id')['Rating'].agg(f)

dataset_movie_summary.index = dataset_movie_summary.index.map(int)

movie_benchmark = round(dataset_movie_summary['count'].quantile(0.7),0)

drop_movie_list = dataset_movie_summary[dataset_movie_summary['count'] < movie_benchmark].index

print('Movie minimum times of review: {}'.format(movie_benchmark))

Movie minimum times of review: 1799.0


In [15]:
drop_movie_list

Int64Index([   1,    2,    4,    5,    6,    7,    9,   10,   11,   12,
            ...
            4484, 4486, 4487, 4489, 4491, 4494, 4495, 4497, 4498, 4499],
           dtype='int64', name='Movie_Id', length=3149)

In [16]:
dataset_cust_summary = dataset.groupby('Cust_Id')['Rating'].agg(f)
dataset_cust_summary.index = dataset_cust_summary.index.map(int)
cust_benchmark = round(dataset_cust_summary['count'].quantile(0.7),0)
drop_cust_list = dataset_cust_summary[dataset_cust_summary['count'] < cust_benchmark].index

print(f'Customer minimum times of review: {cust_benchmark}')

Customer minimum times of review: 52.0


In [17]:
dataset = dataset[~dataset['Movie_Id'].isin(drop_movie_list)]
dataset = dataset[~dataset['Cust_Id'].isin(drop_cust_list)]
print('After Trim Shape: {}'.format(dataset.shape))

After Trim Shape: (17337458, 3)


In [18]:
df_p = pd.pivot_table(dataset,values='Rating',index='Cust_Id',columns='Movie_Id')

In [19]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Copy of movie_titles.csv', sep='\t', lineterminator='\n', encoding="ISO-8859-1", header=None, names=['Movie_Id', 'Year', 'Name'])
df[['Movie_Id', 'Year', 'Name']] = df['Movie_Id'].str.split(',', n=2, expand=True)

df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df.set_index('Movie_Id', inplace=True)
df.head()

Unnamed: 0_level_0,Year,Name
Movie_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


In [20]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162668 sha256=12a33fd2b7e2f44a9edd55bec51423166a36e1f8f115ab925d04d7ef5740de30
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [21]:
# Import required libraries
import math
import re
import matplotlib.pyplot as plt

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [22]:
# Load Reader library
reader = Reader()

# get just top 100K rows for faster run time
data = Dataset.load_from_df(dataset[['Cust_Id', 'Movie_Id', 'Rating']][:100000], reader)

# Use the SVD algorithm.
svd = SVD()

# Compute the RMSE of the SVD algorithm
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9962  1.0024  0.9981  0.9989  0.0026  
MAE (testset)     0.7806  0.7913  0.8035  0.7918  0.0093  
Fit time          2.25    1.53    2.33    2.03    0.36    
Test time         0.43    0.20    0.41    0.35    0.11    


{'test_rmse': array([0.99618846, 1.00237228, 0.99806841]),
 'test_mae': array([0.78064012, 0.79126181, 0.80351003]),
 'fit_time': (2.2465097904205322, 1.5308904647827148, 2.3259947299957275),
 'test_time': (0.43217039108276367, 0.19557499885559082, 0.4134340286254883)}

In [23]:
dataset_712664 = dataset[(dataset['Cust_Id'] == 712664) & (dataset['Rating'] == 5)]
dataset_712664 = dataset_712664.set_index('Movie_Id')
dataset_712664 = dataset_712664.join(df)['Name']
dataset_712664.head(10)

Movie_Id
3      NaN
79     NaN
175    NaN
199    NaN
241    NaN
256    NaN
348    NaN
357    NaN
416    NaN
442    NaN
Name: Name, dtype: object

In [24]:
# Create a shallow copy for the movies dataset
user_1581300 = df.copy()

user_1581300 = user_1581300.reset_index()

#To remove all the movies rated less often
user_1581300 = user_1581300[~user_1581300['Movie_Id'].isin(drop_movie_list)]

# getting full dataset
data = Dataset.load_from_df(dataset[['Cust_Id', 'Movie_Id', 'Rating']], reader)

#create a training set for svd
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7a795734e5f0>

In [25]:
user_1581300['Estimate_Score'] = user_1581300['Movie_Id'].apply(lambda x: svd.predict(1581300, x).est)

user_1581300 = user_1581300.drop('Movie_Id', axis = 1)

user_1581300 = user_1581300.sort_values('Estimate_Score', ascending=False)

#Print top 10 recommendations
print(user_1581300.head(10))

         Year                                     Name  Estimate_Score
0      2003.0                          Dinosaur Planet        4.074808
11844  1955.0                               Summertime        4.074808
11850  1946.0                             The Yearling        4.074808
11849  2003.0  Dumb and Dumberer: When Harry Met Lloyd        4.074808
11848  1998.0                                    Earth        4.074808
11847  1997.0                     For Richer or Poorer        4.074808
11846  2005.0                            Dust to Glory        4.074808
11845  1985.0                           Prizzi's Honor        4.074808
11843  1967.0                     Barefoot in the Park        4.074808
11835  1952.0                           Victory at Sea        4.074808
