In [69]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances

In [70]:
df=pd.read_csv('book.csv',encoding='latin-1')
df.head()

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6


In [71]:
# removing 'Unnamed: 0' column as we dont need them
df.drop('Unnamed: 0',axis=1,inplace=True)

In [72]:
# Checking for duplicate rows
df.duplicated().sum()

2

In [73]:
# removing duplicate rows
df.drop_duplicates(inplace=True)

In [74]:
df.head()

Unnamed: 0,User.ID,Book.Title,Book.Rating
0,276726,Classical Mythology,5
1,276729,Clara Callan,3
2,276729,Decision in Normandy,6
3,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,276737,The Mummies of Urumchi,6


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9998 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   User.ID      9998 non-null   int64 
 1   Book.Title   9998 non-null   object
 2   Book.Rating  9998 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 312.4+ KB


In [76]:
# Shape of our data
df.shape

(9998, 3)

In [77]:
# Checking how many users are there
df['User.ID'].nunique()

2182

We have 2182 no of unique user from 10000 datas

In [78]:
# Checking distribution of Rating
fig=go.Figure(go.Pie(labels=df['Book.Rating'].value_counts().index,values=df['Book.Rating'].value_counts().values))
fig.update_layout(autosize=False,title_text='Distribution of User Rating')

In [79]:
# Lets see how many unique books are there 
df['Book.Title'].nunique()

9659

In [80]:
# Incase different case (upper/lower) are there, so our unique values will be
df['Book.Title'].apply(lambda x: x.lower().strip()).nunique()

9638

In [81]:
# Lets see what are those mistakes
unique_df=pd.DataFrame(df['Book.Title'].unique(),columns=['uniques'])
unique_df['lower_case']=unique_df['uniques'].apply(lambda x:x.lower().strip())
unique_df=unique_df.sort_values('lower_case')
stack=[]
for i in range(len(unique_df)-1):
    
    if unique_df.iloc[i,1]==unique_df.iloc[i+1,1]:
        stack.append(unique_df.iloc[i,0])
    else:
        if stack:
            stack.append(unique_df.iloc[i,0])
            print(stack)
            stack=[]


["Bachelor brothers' bed &amp; breakfast", "Bachelor Brothers' Bed &amp; Breakfast"]
['Cider With Rosie', 'Cider with Rosie']
['Coyote Blue', 'COYOTE BLUE']
['Flesh and Blood', 'Flesh And Blood']
['Flood : Mississippi 1927', 'FLOOD : Mississippi 1927']
['Girl with a Pearl Earring', 'Girl With a Pearl Earring']
['House of Mirth', 'HOUSE OF MIRTH']
['Jacob the Baker: Gentle Wisdom for a Complicated World', 'Jacob the Baker: Gentle Wisdom For a Complicated World']
['JITTERBUG PERFUME', 'Jitterbug Perfume']
["L'Oeil de la Sibylle", "L'oeil de la sibylle"]
['Little House on the Prairie', 'Little House On the Prairie']
['LONESOME DOVE', 'Lonesome Dove']
['LUCKY', 'Lucky']
['Moonlight Becomes You', 'MOONLIGHT BECOMES YOU']
['Pleading Guilty', 'PLEADING GUILTY']
['Random Winds', 'Random winds']
['Readings in social psychology: General, classic, and contemporary selections', 'Readings in Social Psychology: General, Classic, and Contemporary Selections']
['SHIPPING NEWS', 'Shipping News']
['Star

Above pairs are same books but written with different cases

In [82]:
# Lower casing all book title
df['Book.Title']=df['Book.Title'].apply(lambda x:x.lower().strip())

# Distribution of books
df['Book.Title'].value_counts()

fahrenheit 451                                                                                                              5
stardust                                                                                                                    5
the amber spyglass (his dark materials, book 3)                                                                             4
vanished                                                                                                                    4
ender's game (ender wiggins saga (paperback))                                                                               4
                                                                                                                           ..
babinski reflex: and 70 other useful and amazing metaphors from science, psychology, business, sports, and everyday life    1
lore of the unicorn                                                                                                   

In [83]:
# Now making pivot columns

pivot_table=df.pivot_table(index='User.ID',values='Book.Rating',columns='Book.Title').fillna(0)
pivot_table.head()

Book.Title,'48,'o au no keia: voices from hawai'i's mahu and transgender communities,...and the horse he rode in on : the people v. kenneth starr,01-01-00: a novel of the millennium,"1,401 more things that p*ss me off",10 commandments of dating,"100 great fantasy short, short stories",1001 brilliant ways to checkmate,101 bright ideas: esl activities for all ages,101 dalmatians,...,zen meditations on being a mother (zen meditations),zen: a way of life (teach yourself books),"zits unzipped : sketchbook #5 (scott, jerry, zits collection sketchbook, no. 5.)",zlateh the goat and other stories,zodiac: the eco-thriller,zombie!,zombies of the gene pool,zoot suit and other plays,zora hurston and the chinaberry tree (reading rainbow book),ã?â?bermorgen.
User.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
# Pairwise distances
user_sim=1-pairwise_distances(pivot_table.values,metric='cosine')
user_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [85]:
# making a dataframe
user_sim_df=pd.DataFrame(user_sim,index=df['User.ID'].unique(),columns=df['User.ID'].unique())
user_sim_df

Unnamed: 0,276726,276729,276736,276737,276744,276745,276747,276748,276751,276754,...,162085,162091,162092,162095,162103,162107,162109,162113,162121,162129
276726,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276729,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276736,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276737,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276744,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
162109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
162113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
162121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [86]:
# replacing diagonal element with 0
# np.fill_diagonal(user_sim_df,0)

for i in range (len(user_sim_df)):
    user_sim_df.iloc[i,i]=0

user_sim_df.head()

Unnamed: 0,276726,276729,276736,276737,276744,276745,276747,276748,276751,276754,...,162085,162091,162092,162095,162103,162107,162109,162113,162121,162129
276726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
# Checking most similar users
user_sim_df.idxmax(axis=1)[:5]


The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



276726    276726
276729    276726
276736    276726
276737    276726
276744    276726
dtype: int64

In [88]:
df[(df['User.ID'] ==276726) | (df['User.ID'] ==276729) ]

Unnamed: 0,User.ID,Book.Title,Book.Rating
0,276726,classical mythology,5
1,276729,clara callan,3
2,276729,decision in normandy,6


In [89]:
# popular books
popular_book_df=df[['Book.Title','Book.Rating']].groupby('Book.Title').mean()
popular_book_df.loc[:,'count']=(df['Book.Title'].value_counts().sort_index().values.reshape(-1,1))
popular_book_df.loc[:,'multiply']=popular_book_df['Book.Rating']*popular_book_df['count']
popular_book_df.sort_values('multiply',ascending=False,inplace=True)

In [90]:
# defining a function to recommend books based on userID
def recommend_the_user(id,n=5):
    similar_user_id=user_sim_df[user_sim_df[id]!=0][id].index
    books=[]
    if len(similar_user_id)!=0:
        for i in similar_user_id:
            books=np.append(books,df[df['User.ID']==i]['Book.Title'].values)
        unique, counts = np.unique(books, return_counts=True)
        books=dict(zip(unique, counts))
        print('Recommendation\n','-'*20)
        print(*[k for k, v in sorted(books.items(), key=lambda item: item[1])][:n],sep='\n')

    else :
        print('Most Purchased Books\n','-'*20)
        print(*df['Book.Title'].value_counts()[:n].index,sep='\n')

        print('\nMost Popular Books\n','-'*20)
        print(*popular_book_df.iloc[:n].index,sep='\n')
        
        
    

In [91]:
recommend_the_user(276748)

Recommendation
 --------------------
best little word book ever! (little golden book)
busiest firefighters ever! (little golden book)
cookie monster/cookie tree
do you know?
grover's own alphabet


In [92]:
recommend_the_user(3613)

Recommendation
 --------------------
best little word book ever! (little golden book)
busiest firefighters ever! (little golden book)
children of the night
cold mountain (mti) (vintage contemporaries)
cookie monster/cookie tree


In [93]:
recommend_the_user(276754)

Most Purchased Books
 --------------------
fahrenheit 451
stardust
the amber spyglass (his dark materials, book 3)
vanished
ender's game (ender wiggins saga (paperback))

Most Popular Books
 --------------------
stardust
fahrenheit 451
the subtle knife (his dark materials, book 2)
the amber spyglass (his dark materials, book 3)
flesh and blood
