### Importing Libraries 

In [1]:
# Plotly is needed for the charts below. If it is missing, install it once
# from a shell:  conda install -c plotly plotly
import plotly

In [2]:
import pandas as pd
import numpy as np
import surprise

### Read Dataset


In [3]:
# Load metadata.csv from the working directory (the frame is not referenced
# again in the cells shown here).
meta = pd.read_csv("metadata.csv")

In [4]:
# Load only the first one million interaction rows to keep the notebook light.
user = pd.read_csv("user-interactions.csv", nrows=1_000_000)

### Rename Columns


In [5]:
# Preview call kept from the original cell; its result is not displayed
# because it is not the last expression of the cell.
user.head()

# Use the generic name 'content_id' for the item column from here on.
user = user.rename(columns={'pratilipi_id': 'content_id'})


In [6]:
# Display the interactions frame after the column rename.
user

Unnamed: 0,user_id,content_id,read_percent,updated_at
0,5506791963854965,1377786220672965,100.0,2022-03-23 00:08:26.227
1,5506791979071996,1377786219742624,29.0,2022-03-23 00:08:26.220
2,5506791980256358,1377786217096334,22.0,2022-03-23 00:08:26.020
3,5506791988747277,1377786224767880,100.0,2022-03-23 00:08:25.306
4,5506791992372558,1377786218111595,100.0,2022-03-23 00:08:25.250
...,...,...,...,...
999995,5506791950553476,1377786226110654,100.0,2022-03-22 14:35:22.708
999996,5506791993432153,1377786223316284,100.0,2022-03-22 14:35:22.705
999997,5506791954771043,1377786226611326,100.0,2022-03-22 14:35:22.705
999998,5506791964503903,1377786224101244,100.0,2022-03-22 14:35:22.685


### Make new Ratings column


In [7]:
# Bucket read_percent into ten rating levels. Each bucket is the interval
# (lower, upper]; rows that match no bucket (e.g. read_percent == 0) receive
# np.select's default value of 0.
read_pct = user['read_percent']
upper_edges = [9, 19, 29, 39, 49, 59, 69, 79, 89, 100]
lower_edges = [0] + upper_edges[:-1]

conditions = [
    (read_pct > low) & (read_pct <= high)
    for low, high in zip(lower_edges, upper_edges)
]

# rating assigned to each bucket, in the same order as `conditions` (1..10)
values = list(range(1, len(conditions) + 1))

# first matching condition wins; unmatched rows stay at 0
user['rating'] = np.select(conditions, values)




### Plotting a bar graph to visualize the dataset distribution w.r.t. rating


In [8]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go

# Initialise Plotly's notebook mode before drawing.
init_notebook_mode(connected=True)

# Row count per rating value, ordered by rating.
data = user['rating'].value_counts().sort_index()
n_rows = len(user)

# Share of all rows per rating, shown as the bar label.
share_pct = data.values / n_rows * 100
bar_labels = ['{:.1f} %'.format(pct) for pct in share_pct]

trace = go.Bar(
    x=data.index,
    y=data.values,
    text=bar_labels,
    textposition='auto',
    textfont={'color': '#000000'},
)

layout = {
    'title': 'Distribution Of {} Audio-ratings'.format(n_rows),
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Count'},
}

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

### As the data is too sparse, we need to change the rating parameters

In [9]:
# Re-bucket read_percent into five rating levels. Each bucket is the interval
# (lower, upper]; rows that match no bucket (e.g. read_percent == 0) receive
# np.select's default value of 0.
read_pct = user['read_percent']
upper_edges = [19, 39, 59, 79, 100]
lower_edges = [0] + upper_edges[:-1]

conditions = [
    (read_pct > low) & (read_pct <= high)
    for low, high in zip(lower_edges, upper_edges)
]

# rating assigned to each bucket, in the same order as `conditions` (1..5)
values = list(range(1, len(conditions) + 1))

# overwrite the previous 'rating' column with the coarser scale
user['rating'] = np.select(conditions, values)


In [10]:
# Display the frame including the re-computed 'rating' column.
user

Unnamed: 0,user_id,content_id,read_percent,updated_at,rating
0,5506791963854965,1377786220672965,100.0,2022-03-23 00:08:26.227,5
1,5506791979071996,1377786219742624,29.0,2022-03-23 00:08:26.220,2
2,5506791980256358,1377786217096334,22.0,2022-03-23 00:08:26.020,2
3,5506791988747277,1377786224767880,100.0,2022-03-23 00:08:25.306,5
4,5506791992372558,1377786218111595,100.0,2022-03-23 00:08:25.250,5
...,...,...,...,...,...
999995,5506791950553476,1377786226110654,100.0,2022-03-22 14:35:22.708,5
999996,5506791993432153,1377786223316284,100.0,2022-03-22 14:35:22.705,5
999997,5506791954771043,1377786226611326,100.0,2022-03-22 14:35:22.705,5
999998,5506791964503903,1377786224101244,100.0,2022-03-22 14:35:22.685,5


In [11]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go

# Initialise Plotly's notebook mode before drawing.
init_notebook_mode(connected=True)

# Row count per rating value on the current 'rating' column, ordered by rating.
data = user['rating'].value_counts().sort_index()
total_rows = len(user)

# Percentage of all rows that each rating accounts for (bar label text).
percentages = data.values / total_rows * 100
labels = ['{:.1f} %'.format(p) for p in percentages]

trace = go.Bar(
    x=data.index,
    y=data.values,
    text=labels,
    textposition='auto',
    textfont={'color': '#000000'},
)

layout = {
    'title': 'Distribution Of {} Audio-ratings'.format(total_rows),
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Count'},
}

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [12]:
# Most interactions have a high rating; a rating of 5 means the content is good enough.

In [13]:
# Number of ratings per content item. Counts above `clip_at` are clipped to
# it, and the same value drives the bin range and the title so the chart
# label cannot drift from the data (the title previously said 100).
clip_at = 50
data = user.groupby('content_id')['rating'].count().clip(upper=clip_at)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = clip_at,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Audio (Clipped at {})'.format(clip_at),
                   xaxis = dict(title = 'Number of Ratings Per Audio'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

### To see the number of ratings for each content_id


In [14]:
# Ten content items with the most ratings, most-rated first.
(
    user.groupby('content_id')['rating']
    .count()
    .reset_index()
    .sort_values('rating', ascending=False)
    .head(10)
)

Unnamed: 0,content_id,rating
9553,1377786217073107,2560
164424,1377786228287206,2211
13020,1377786217546718,2032
163702,1377786228271510,1864
160893,1377786228202998,1808
164044,1377786228278968,1770
164015,1377786228278010,1489
163715,1377786228271736,1401
163813,1377786228273636,1370
160963,1377786228205426,1351


In [15]:
# Ratings submitted per user; counts above 50 are clipped to 50.
ratings_per_user = user.groupby('user_id')['rating'].count()
data = ratings_per_user.clip(upper=50)

# Histogram trace with bins of width 2 over the range 0..50.
trace = go.Histogram(
    x=data.values,
    name='Ratings',
    xbins={'start': 0, 'end': 50, 'size': 2},
)

# Titles and spacing.
layout = go.Layout(
    title='Distribution Of Number of Ratings Per User (Clipped at 50)',
    xaxis={'title': 'Ratings Per User'},
    yaxis={'title': 'Count'},
    bargap=0.2,
)

# Render the figure.
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [16]:
# Ten users with the most ratings, most active first.
(
    user.groupby('user_id')['rating']
    .count()
    .reset_index()
    .sort_values('rating', ascending=False)
    .head(10)
)

Unnamed: 0,user_id,rating
88839,5506791985603052,347
97527,5506791991347424,347
90951,5506791987154463,347
92377,5506791988316056,300
43861,5506791964300156,247
79524,5506791979646770,246
85502,5506791983099162,232
90642,5506791986864046,229
82722,5506791981317281,228
34500,5506791962253517,227


In [17]:
# Keep only content items that received more than `min_content_ratings` ratings.
min_content_ratings = 50
content_counts = user['content_id'].value_counts()
filter_content = content_counts[content_counts > min_content_ratings].index.tolist()

# Keep only users who submitted more than `min_user_ratings` ratings.
min_user_ratings = 50
user_counts = user['user_id'].value_counts()
filter_users = user_counts[user_counts > min_user_ratings].index.tolist()

# A row survives only if both its content item and its user pass the thresholds.
keep_rows = user['content_id'].isin(filter_content) & user['user_id'].isin(filter_users)
df_new = user[keep_rows]

print('The original data frame shape:\t{}'.format(user.shape))
print('The new data frame shape:\t{}'.format(df_new.shape))

The original data frame shape:	(1000000, 5)
The new data frame shape:	(17678, 5)


### Now we use Surprise package for building the model



In [18]:
# !python --version
from surprise.reader import Reader

In [19]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
# Declare the rating scale as 0..5 for Surprise.
reader = Reader(rating_scale=(0, 5))

# load_from_df expects the columns in the order: user id, item id, rating.
rating_columns = ['user_id', 'content_id', 'rating']
data = Dataset.load_from_df(df_new[rating_columns], reader)

In [20]:
from surprise import SVD,SVDpp
benchmark = []
# Cross-validate each candidate algorithm and collect its fold-averaged metrics.
for algorithm in [SVD(), SVDpp()]:
    # 3-fold cross validation, scored by RMSE
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    # Average every reported metric (test_rmse, fit_time, test_time) over the folds
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # Label the row with the algorithm's class name. Series.append was removed
    # in pandas 2.0, so the label is attached with pd.concat instead.
    algorithm_label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
    tmp = pd.concat([tmp, algorithm_label])
    benchmark.append(tmp)

# One row per algorithm, best (lowest) test RMSE first
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.849807,2.944133,0.114737
SVD,0.850808,0.521455,0.028203


In [22]:
from surprise.model_selection import train_test_split
from surprise import accuracy

# Fixed seed so that the train/test split and the SVD initialisation, and
# therefore the reported RMSE, are reproducible on Restart & Run All.
SEED = 42

# Hold out 25% of the ratings for evaluation.
trainset, testset = train_test_split(data, test_size=0.25, random_state=SEED)

# Train the algorithm on the trainset, and predict ratings for the testset
algo = SVD(random_state=SEED)
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE (printed, and returned as the cell's output)
accuracy.rmse(predictions)

RMSE: 0.8511


0.8511017193562317