## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import warnings

warnings.filterwarnings("ignore")

## Data Preprocessing

In [2]:
# Reading data
# The dataset location is machine-specific; it is kept in a single constant so
# that it only has to be changed in one place when running elsewhere.

DATA_DIR = r"D:\Data\Pratilipi\ds-assignment"

meta_data = pd.read_csv(DATA_DIR + r"\metadata.csv")
# the interactions file carries its row index in the "Unnamed: 0" column
user_data = pd.read_csv(DATA_DIR + r"\user-interactions.csv", index_col = "Unnamed: 0")

In [3]:
meta_data

Unnamed: 0,author_id,pratilipi_id,category_name,reading_time,updated_at,published_at
0,-3418949279741297,1025741862639304,translation,0,2020-08-19 15:26:13,2016-09-30 10:37:04
1,-2270332351871840,1377786215601277,translation,171,2021-01-21 16:27:07,2018-06-11 13:17:48
2,-2270332352037261,1377786215601962,translation,92,2020-09-29 12:33:57,2018-06-12 04:19:12
3,-2270332352521845,1377786215640994,translation,0,2019-10-17 09:03:37,2019-09-26 14:58:53
4,-2270332349665658,1377786215931338,translation,47,2020-05-05 11:33:41,2018-11-25 12:28:23
...,...,...,...,...,...,...
954496,-2270332337845247,1377786228358627,Horror-Marathon,304,2022-03-22 17:40:22,2022-03-22 17:40:22
954497,-2270332334263077,1377786228362002,Horror-Marathon,588,2022-03-22 11:44:39,2022-03-22 11:44:39
954498,-2270332350350076,1377786228362682,Horror-Marathon,359,2022-03-22 12:39:41,2022-03-22 12:38:40
954499,-2270332337845247,1377786228375726,Horror-Marathon,310,2022-03-23 15:55:11,2022-03-23 15:55:11


In [4]:
user_data

Unnamed: 0,user_id,pratilipi_id,read_percent,updated_at
0,5506791963854965,1377786220672965,100.0,2022-03-23 00:08:26.227
1,5506791979071996,1377786219742624,29.0,2022-03-23 00:08:26.220
2,5506791980256358,1377786217096334,22.0,2022-03-23 00:08:26.020
3,5506791988747277,1377786224767880,100.0,2022-03-23 00:08:25.306
4,5506791992372558,1377786218111595,100.0,2022-03-23 00:08:25.250
...,...,...,...,...
9999995,5506791962779331,1377786223645432,38.0,2022-03-18 15:14:41.973
9999996,5506791970542005,1377786228178674,100.0,2022-03-18 15:14:41.924
9999997,5506791954036110,1377786225804654,100.0,2022-03-18 15:14:41.827
9999998,5506791960526245,1377786216689875,100.0,2022-03-18 15:14:41.777


In [5]:
meta_data.isnull().sum() # checking for null values

author_id        0
pratilipi_id     0
category_name    0
reading_time     0
updated_at       0
published_at     7
dtype: int64

In [6]:
# since there are only a few null values, we can drop them

meta_data = meta_data.dropna()

meta_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 954494 entries, 0 to 954500
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   author_id      954494 non-null  int64 
 1   pratilipi_id   954494 non-null  int64 
 2   category_name  954494 non-null  object
 3   reading_time   954494 non-null  int64 
 4   updated_at     954494 non-null  object
 5   published_at   954494 non-null  object
dtypes: int64(3), object(3)
memory usage: 51.0+ MB


In [7]:
user_data.isnull().sum()

user_id         0
pratilipi_id    0
read_percent    0
updated_at      0
dtype: int64

In [8]:
# looking at the most common genres 
meta_data["category_name"].value_counts()

romance                      193218
shortstories                 102096
social                        73919
suspense                      64041
family                        58515
novels                        53584
life                          51356
moral-inspiring               47421
women                         37167
horror                        28506
entertainment                 23362
action-and-adventure          21589
experiences-and-memories      20275
webseries                     16227
relegion-and-spiritual        15476
comedy                        14893
Pratilipi-kalamkar-samman     13237
mythology                     12323
children                      10196
Indiawale                      9674
Pratilipi-Awards-Hindi         9342
fantasy                        9324
swahindi2                      7853
drama                          7387
crime                          7151
politics                       5810
translation                    4250
short-story-challenge       

In [9]:
# converting strings into date-time objects

meta_data["updated_at"] = pd.to_datetime(meta_data["updated_at"], format = "%Y-%m-%d %H:%M:%S")
meta_data["published_at"] = pd.to_datetime(meta_data["published_at"], format = "%Y-%m-%d %H:%M:%S")

user_data["updated_at"] = pd.to_datetime(user_data["updated_at"], format = "%Y-%m-%d %H:%M:%S.%f")

In [10]:
# converting the date-time objects into integer Unix timestamps (whole seconds)
# Subtracting the epoch and floor-dividing by one second does not depend on the
# datetime column's internal resolution, unlike casting the raw values to int64
# and dividing by 10 ** 9 (which is only correct for nanosecond resolution).

UNIX_EPOCH = pd.Timestamp("1970-01-01")
ONE_SECOND = pd.Timedelta(seconds=1)

meta_data["updated_ts"] = (meta_data["updated_at"] - UNIX_EPOCH) // ONE_SECOND
meta_data["published_ts"] = (meta_data["published_at"] - UNIX_EPOCH) // ONE_SECOND

user_data["user_update"] = (user_data["updated_at"] - UNIX_EPOCH) // ONE_SECOND

In [11]:
meta_data

Unnamed: 0,author_id,pratilipi_id,category_name,reading_time,updated_at,published_at,updated_ts,published_ts
0,-3418949279741297,1025741862639304,translation,0,2020-08-19 15:26:13,2016-09-30 10:37:04,1597850773,1475231824
1,-2270332351871840,1377786215601277,translation,171,2021-01-21 16:27:07,2018-06-11 13:17:48,1611246427,1528723068
2,-2270332352037261,1377786215601962,translation,92,2020-09-29 12:33:57,2018-06-12 04:19:12,1601382837,1528777152
3,-2270332352521845,1377786215640994,translation,0,2019-10-17 09:03:37,2019-09-26 14:58:53,1571303017,1569509933
4,-2270332349665658,1377786215931338,translation,47,2020-05-05 11:33:41,2018-11-25 12:28:23,1588678421,1543148903
...,...,...,...,...,...,...,...,...
954496,-2270332337845247,1377786228358627,Horror-Marathon,304,2022-03-22 17:40:22,2022-03-22 17:40:22,1647970822,1647970822
954497,-2270332334263077,1377786228362002,Horror-Marathon,588,2022-03-22 11:44:39,2022-03-22 11:44:39,1647949479,1647949479
954498,-2270332350350076,1377786228362682,Horror-Marathon,359,2022-03-22 12:39:41,2022-03-22 12:38:40,1647952781,1647952720
954499,-2270332337845247,1377786228375726,Horror-Marathon,310,2022-03-23 15:55:11,2022-03-23 15:55:11,1648050911,1648050911


In [12]:
user_data

Unnamed: 0,user_id,pratilipi_id,read_percent,updated_at,user_update
0,5506791963854965,1377786220672965,100.0,2022-03-23 00:08:26.227,1647994106
1,5506791979071996,1377786219742624,29.0,2022-03-23 00:08:26.220,1647994106
2,5506791980256358,1377786217096334,22.0,2022-03-23 00:08:26.020,1647994106
3,5506791988747277,1377786224767880,100.0,2022-03-23 00:08:25.306,1647994105
4,5506791992372558,1377786218111595,100.0,2022-03-23 00:08:25.250,1647994105
...,...,...,...,...,...
9999995,5506791962779331,1377786223645432,38.0,2022-03-18 15:14:41.973,1647616481
9999996,5506791970542005,1377786228178674,100.0,2022-03-18 15:14:41.924,1647616481
9999997,5506791954036110,1377786225804654,100.0,2022-03-18 15:14:41.827,1647616481
9999998,5506791960526245,1377786216689875,100.0,2022-03-18 15:14:41.777,1647616481


In [13]:
# combining both the dataframes (inner join: interactions whose pratilipi_id has
# no metadata row are dropped)
# NOTE(review): the displayed result repeats the same (user_id, pratilipi_id,
# updated_at) interaction several times, differing only in the category dummy
# columns, and the merged frame has more rows than user_data. This suggests
# metadata holds one row per (pratilipi_id, category), so every per-book /
# per-user "read" count computed from df is inflated - confirm and consider
# collapsing metadata to one row per pratilipi_id before merging.

df = user_data.merge(meta_data, how = "inner",  on = "pratilipi_id")

In [14]:
# One-hot encoding the categorical variables

df = pd.get_dummies(df)
df

Unnamed: 0,user_id,pratilipi_id,read_percent,updated_at_x,user_update,author_id,reading_time,updated_at_y,published_at,updated_ts,...,category_name_romance,category_name_science-fiction,category_name_short-story-challenge,category_name_shortstories,category_name_social,category_name_suspense,category_name_swahindi2,category_name_translation,category_name_webseries,category_name_women
0,5506791963854965,1377786220672965,100.0,2022-03-23 00:08:26.227,1647994106,-2270332344732237,330,2020-10-12 09:17:49,2020-08-27 12:18:06,1602494269,...,0,0,0,0,0,0,0,0,0,0
1,5506791963854965,1377786220672965,100.0,2022-03-23 00:08:26.227,1647994106,-2270332344732237,330,2020-10-12 09:17:49,2020-08-27 12:18:06,1602494269,...,1,0,0,0,0,0,0,0,0,0
2,5506791963854965,1377786220672965,100.0,2022-03-23 00:08:26.227,1647994106,-2270332344732237,330,2020-10-12 09:17:49,2020-08-27 12:18:06,1602494269,...,0,0,0,0,0,0,0,0,0,0
3,5506791990547519,1377786220672965,100.0,2022-03-22 22:32:11.828,1647988331,-2270332344732237,330,2020-10-12 09:17:49,2020-08-27 12:18:06,1602494269,...,0,0,0,0,0,0,0,0,0,0
4,5506791990547519,1377786220672965,100.0,2022-03-22 22:32:11.828,1647988331,-2270332344732237,330,2020-10-12 09:17:49,2020-08-27 12:18:06,1602494269,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15892128,5506791986693589,1377786216364465,100.0,2022-03-18 15:15:22.194,1647616522,-2270332351898997,1553,2021-03-18 23:24:59,2019-03-06 16:59:28,1616109899,...,1,0,0,0,0,0,0,0,0,0
15892129,5506791954761436,1377786225176487,100.0,2022-03-18 15:15:21.427,1647616521,-2270332346798840,494,2021-08-16 22:34:23,2021-08-16 22:34:23,1629153263,...,0,0,0,0,0,0,0,0,0,0
15892130,5506791977932165,1377786225373700,100.0,2022-03-18 15:15:03.820,1647616503,-2270332334456371,1239,2021-08-31 14:41:49,2021-08-31 14:40:49,1630420909,...,0,0,0,0,0,0,0,0,0,0
15892131,5506791977932165,1377786225373700,100.0,2022-03-18 15:15:03.820,1647616503,-2270332334456371,1239,2021-08-31 14:41:49,2021-08-31 14:40:49,1630420909,...,1,0,0,0,0,0,0,0,0,0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15892133 entries, 0 to 15892132
Data columns (total 56 columns):
 #   Column                                   Dtype         
---  ------                                   -----         
 0   user_id                                  int64         
 1   pratilipi_id                             int64         
 2   read_percent                             float64       
 3   updated_at_x                             datetime64[ns]
 4   user_update                              int64         
 5   author_id                                int64         
 6   reading_time                             int64         
 7   updated_at_y                             datetime64[ns]
 8   published_at                             datetime64[ns]
 9   updated_ts                               int64         
 10  published_ts                             int64         
 11  category_name_Horror-Marathon            uint8         
 12  category_name_Indiawale   

In [16]:
df = df.drop(columns = ["updated_at_x", "updated_at_y", "published_at"])
df

Unnamed: 0,user_id,pratilipi_id,read_percent,user_update,author_id,reading_time,updated_ts,published_ts,category_name_Horror-Marathon,category_name_Indiawale,...,category_name_romance,category_name_science-fiction,category_name_short-story-challenge,category_name_shortstories,category_name_social,category_name_suspense,category_name_swahindi2,category_name_translation,category_name_webseries,category_name_women
0,5506791963854965,1377786220672965,100.0,1647994106,-2270332344732237,330,1602494269,1598530686,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5506791963854965,1377786220672965,100.0,1647994106,-2270332344732237,330,1602494269,1598530686,0,0,...,1,0,0,0,0,0,0,0,0,0
2,5506791963854965,1377786220672965,100.0,1647994106,-2270332344732237,330,1602494269,1598530686,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5506791990547519,1377786220672965,100.0,1647988331,-2270332344732237,330,1602494269,1598530686,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5506791990547519,1377786220672965,100.0,1647988331,-2270332344732237,330,1602494269,1598530686,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15892128,5506791986693589,1377786216364465,100.0,1647616522,-2270332351898997,1553,1616109899,1551891568,0,0,...,1,0,0,0,0,0,0,0,0,0
15892129,5506791954761436,1377786225176487,100.0,1647616521,-2270332346798840,494,1629153263,1629153263,0,0,...,0,0,0,0,0,0,0,0,0,0
15892130,5506791977932165,1377786225373700,100.0,1647616503,-2270332334456371,1239,1630420909,1630420849,0,0,...,0,0,0,0,0,0,0,0,0,0
15892131,5506791977932165,1377786225373700,100.0,1647616503,-2270332334456371,1239,1630420909,1630420849,0,0,...,1,0,0,0,0,0,0,0,0,0


## Popularity-based recommender system

### Creating a custom popularity metric might help us rank the books

In [17]:
# creating the popularity metric column:
#   (seconds of the book actually read) / (seconds between publication and the read)

# A read logged in the same second as publication would make the denominator 0
# and the metric infinite; such gaps are turned into NaN so they are ignored by
# the later mean/count aggregations instead of poisoning them.
elapsed_seconds = (df["user_update"] - df["published_ts"]).replace(0, np.nan)
seconds_read = df["read_percent"] * df["reading_time"] / 100

# abs() keeps the metric non-negative even when a read is timestamped before publication
df["popularity_metric"] = (seconds_read / elapsed_seconds).abs()

In [18]:
df["popularity_metric"]

0           0.000007
1           0.000007
2           0.000007
3           0.000007
4           0.000007
              ...   
15892128    0.000016
15892129    0.000027
15892130    0.000072
15892131    0.000072
15892132    0.000072
Name: popularity_metric, Length: 15892133, dtype: float64

### Instead of using the raw metric as it is, it might be a better idea to average it per book, considering only books with at least 200 reads

In [19]:
# averaging every column per pratilipi_id, then keeping only the books that
# have at least 200 (non-null) popularity_metric rows
reads_by_book = df.groupby("pratilipi_id")
grouped_df = reads_by_book.mean()
has_enough_reads = reads_by_book["popularity_metric"].count() >= 200
most_read = grouped_df[has_enough_reads]

In [20]:
most_read.shape

(19110, 53)

In [21]:
# Taking the top 10 most popular books
most_read["popularity_metric"].sort_values(ascending=False)[:10]

pratilipi_id
1377786228354080    0.668709
1377786228363493    0.621213
1377786228287143    0.366785
1377786228319847    0.329137
1377786228311918    0.325423
1377786228368115    0.304442
1377786228363372    0.299725
1377786228289417    0.295593
1377786228356929    0.292811
1377786228331460    0.291643
Name: popularity_metric, dtype: float64

### The above books are the 10 most popular books in our database, so it might be a good idea to recommend these to our users.

##  Collaborative Filtering Based Recommender

In [22]:
# splitting the dataframe into train-test: the 75% of rows with the earliest
# publication timestamp form the training set, the remainder the test set

# despite its name, train_ratio is the number of rows in the training split
train_ratio = int(df.shape[0]*3/4)

# sort once and slice twice instead of sorting the full frame for each split
df_by_publication = df.sort_values(by = "published_ts")
train_data = df_by_publication[:train_ratio]
test_data = df_by_publication[train_ratio:]

In [23]:
# keeping only the users with at least 200 reads in the training split
reads_per_user = train_data.groupby("user_id")["popularity_metric"].count()
x = reads_per_user >= 200
most_active_users = x[x].index

In [24]:
filtered_rating = train_data[train_data["user_id"].isin(most_active_users)]

In [25]:
# keeping only the books with at least 50 reads among the most active users

reads_per_book = filtered_rating.groupby("pratilipi_id")["popularity_metric"].count()
y = reads_per_book >= 50

popular_books = y[y].index

In [26]:
popular_books

Int64Index([-823882324207928, -487510917343544, -253947406543160,
             375107047560904,  733361919923912,  979877554722504,
            1114522134943432, 1377786215438472, 1377786215455039,
            1377786215467659,
            ...
            1377786227424563, 1377786227424654, 1377786227425281,
            1377786227425429, 1377786227425853, 1377786227426623,
            1377786227427554, 1377786227431447, 1377786227432272,
            1377786227432880],
           dtype='int64', name='pratilipi_id', length=23930)

In [27]:
final_rating = filtered_rating[filtered_rating["pratilipi_id"].isin(popular_books)]

In [28]:
# pivoting the dataframe into a pratilipi_id (rows) x user_id (columns) matrix
# of popularity_metric values; pivot_table's default aggregation averages
# repeated (pratilipi_id, user_id) pairs, and pairs without any row come out as NaN

pt = final_rating.pivot_table(index = "pratilipi_id", columns = "user_id", values = "popularity_metric")

In [29]:
# Filling null with 0
pt.fillna(0, inplace=True)

In [30]:
pt

user_id,3263998672675492,3267248272614052,3277958297854628,3292327750574756,3300962971067044,3308162227544740,3311404754313892,3312021925175972,3314890925580964,3324330042630820,...,5506791996543315,5506791996545531,5506791996549504,5506791996549531,5506791996556138,5506791996564684,5506791996570476,5506791996588116,5506791996588709,5506791996591960
pratilipi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-823882324207928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-487510917343544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-253947406543160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
375107047560904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
733361919923912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1377786227426623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1377786227427554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1377786227431447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1377786227432272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
similarity_score = cosine_similarity(pt)

In [33]:
similarity_score.shape

(23930, 23930)

In [34]:
# Function to give the nearest neighbours (10 by default) of a book

def recommend(book_id, top_n=10):
    """Return the books most similar to ``book_id`` by cosine similarity.

    Relies on three notebook-level objects: ``pt`` (the book x user pivot
    table), ``similarity_score`` (the square similarity matrix whose rows and
    columns follow ``pt.index``) and ``meta_data`` (the book catalogue).

    Parameters
    ----------
    book_id : int
        ``pratilipi_id`` of the query book; must be present in ``pt.index``.
    top_n : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.DataFrame
        Columns "Pratilipi ID", "Author ID" and "Genre", one row per
        neighbour, most similar first. When a book has several metadata rows
        the first one is used. The query book itself is never returned.

    Raises
    ------
    IndexError
        If ``book_id`` is not part of the similarity matrix.
    """
    positions = np.where(pt.index == book_id)[0]
    if positions.size == 0:
        raise IndexError(f"pratilipi_id {book_id!r} is not in the similarity matrix")
    index = positions[0]
    top_n = max(int(top_n), 0)

    scores = np.asarray(similarity_score[index])
    # Stable descending order (ties keep their original position). The query
    # book is removed explicitly rather than by skipping the first entry: on a
    # tie for the top score it is not guaranteed to be ranked first.
    ranked = np.argsort(-scores, kind="stable")
    ranked = ranked[ranked != index][:top_n]
    neighbour_ids = pt.index[ranked]

    # One pass over the catalogue for all neighbours, first metadata row per
    # book, re-ordered to follow the similarity ranking.
    details = (
        meta_data.loc[
            meta_data["pratilipi_id"].isin(neighbour_ids),
            ["pratilipi_id", "author_id", "category_name"],
        ]
        .drop_duplicates("pratilipi_id")
        .set_index("pratilipi_id")
        .reindex(neighbour_ids)
        .reset_index()
    )
    details.columns = ["Pratilipi ID", "Author ID", "Genre"]
    return details

In [35]:
# Let's try an example
recommend(375107047560904)

Unnamed: 0,Pratilipi ID,Author ID,Genre
0,1377786224955871,-2270332347525508,romance
1,1377786215574857,-4144357778322801,moral-inspiring
2,1377786218561193,-2270332352154662,family
3,1377786223564024,-2270332345997468,romance
4,1377786217339446,-2270332350277755,family
5,1377786220822984,-2270332351050933,family
6,1377786226010473,-2270332348842760,family
7,1114522134943432,-2730229226468721,novels
8,1377786226268538,-2270332348842760,family
9,1377786217069765,-3923764082570609,romance
