### Assignment-Recommendation System on Books Dataset

In [162]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation

In [163]:
# Import Dataset
books=pd.read_csv('book.csv',encoding='Latin1')
books

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6
...,...,...,...,...
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7
9998,9999,162121,Twilight,8


In [164]:
#Drop the Unnmed 0 column
books=books.iloc[:,1:]
books

Unnamed: 0,User.ID,Book.Title,Book.Rating
0,276726,Classical Mythology,5
1,276729,Clara Callan,3
2,276729,Decision in Normandy,6
3,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,276737,The Mummies of Urumchi,6
...,...,...,...
9995,162121,American Fried: Adventures of a Happy Eater.,7
9996,162121,Cannibal In Manhattan,9
9997,162121,How to Flirt: A Practical Guide,7
9998,162121,Twilight,8


In [165]:
#Check for Duplicates if any
books.loc[books.duplicated(),:]

Unnamed: 0,User.ID,Book.Title,Book.Rating
5051,2152,Le nouveau soleil de Teur,7
7439,3757,The Magician's Tale,7


In [166]:
#Remove Duplicates
books.drop_duplicates(inplace=True)

In [167]:
books.shape

(9998, 3)

In [168]:
# Number of unique users in the dataset
len(books['User.ID'].unique())

2182

In [169]:
# Number of unique books in the dataset
books['Book.Title'].nunique()

9659

In [170]:
#Analyzing the data
books.groupby(['Book.Title'])['User.ID'].count().sort_values(ascending=False)

Book.Title
Fahrenheit 451                                                            5
Vanished                                                                  4
Stardust                                                                  4
Ender's Game (Ender Wiggins Saga (Paperback))                             4
The Subtle Knife (His Dark Materials, Book 2)                             4
                                                                         ..
His-And-Hers Twins (Harlequin American Romance, No. 820)                  1
Histoires Extraordinaires                                                 1
Historia de la vida del BuscÃ³n (Selecciones Austral ; 24 : ClÃ¡sicos)    1
Historical Whodunits                                                      1
Ã?Â?bermorgen.                                                            1
Name: User.ID, Length: 9659, dtype: int64

In [171]:
books.groupby(['User.ID'])['Book.Title'].count().sort_values(ascending=False)

User.ID
3757      522
162052    214
2276      212
4017      156
277427    150
         ... 
3800        1
3798        1
739         1
3789        1
3934        1
Name: Book.Title, Length: 2182, dtype: int64

In [172]:
books.loc[books['Book.Title']=='Vanished']#'Fahrenheit 451']

Unnamed: 0,User.ID,Book.Title,Book.Rating
617,277284,Vanished,7
5449,2355,Vanished,8
5596,2442,Vanished,5
8728,161234,Vanished,6


In [173]:
books.loc[books['User.ID']==277284]


Unnamed: 0,User.ID,Book.Title,Book.Rating
617,277284,Vanished,7


In [174]:
books.loc[books['User.ID']==2355]

Unnamed: 0,User.ID,Book.Title,Book.Rating
5449,2355,Vanished,8


In [175]:
books.describe()

Unnamed: 0,User.ID,Book.Rating
count,9998.0,9998.0
mean,95339.726845,7.566413
std,117650.21549,1.821685
min,8.0,1.0
25%,2103.0,7.0
50%,3757.0,8.0
75%,162052.0,9.0
max,278854.0,10.0


In [176]:
# Converting the tablular data into wide data using pivot table
books3=books.pivot_table(index='User.ID',columns='Book.Title',values='Book.Rating')

In [177]:
# Impute those NaNs with 0 values
books3.fillna(0,inplace=True)
books3

Book.Title,"Jason, Madison &amp",Other Stories;Merril;1985;McClelland &amp,Repairing PC Drives &amp,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,Ã?Â?bermorgen.
User.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
278852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [178]:
books3['Fahrenheit 451']

User.ID
8         0.0
9         0.0
10        0.0
12        0.0
14        0.0
         ... 
278846    0.0
278849    0.0
278851    0.0
278852    0.0
278854    0.0
Name: Fahrenheit 451, Length: 2182, dtype: float64

In [179]:
# Calculating Cosine Similarity between Users on array data
user_sim=1-pairwise_distances(books3.values,metric='cosine')
user_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [180]:
user_sim.shape

(2182, 2182)

In [181]:
# Store the results in a dataframe format
user_sim2=pd.DataFrame(user_sim)
user_sim2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2172,2173,2174,2175,2176,2177,2178,2179,2180,2181
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [182]:
# Set the index and column names to user ids
user_sim2.index=books['User.ID'].unique()
user_sim2.columns=books['User.ID'].unique()
user_sim2

Unnamed: 0,276726,276729,276736,276737,276744,276745,276747,276748,276751,276754,...,162085,162091,162092,162095,162103,162107,162109,162113,162121,162129
276726,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276729,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276736,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276737,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276744,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
162109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
162113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
162121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [183]:
# Nullifying diagonal values
np.fill_diagonal(user_sim,0)
user_sim2

Unnamed: 0,276726,276729,276736,276737,276744,276745,276747,276748,276751,276754,...,162085,162091,162092,162095,162103,162107,162109,162113,162121,162129
276726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [184]:
# Most Similar Users
user_sim2.idxmax(axis=1)

276726    276726
276729    276726
276736    276726
276737    276726
276744    276726
           ...  
162107    276726
162109    276726
162113    161453
162121    276726
162129    276726
Length: 2182, dtype: int64

In [185]:
user_sim2[278633][276875]
user_sim2[277284][2355]

0.0

In [186]:
#Running a Loop to get all the similar users in a dataframe
col_names = user_sim2.columns
lst1=[]
lst2=[]
lst3=[]
for idx, row in user_sim2.iterrows():
     i1=-1
     for col in row:
         i1= i1+1
         if (idx == user_sim2.columns[i1]):
            break
         if round(col,2) !=0:
            lst1.append(idx)
            lst2.append(round(col,2))
            lst3.append(user_sim2.columns[i1])
print (lst1, lst2, lst3)
df = pd.DataFrame (list (zip(lst1, lst2, lst3)), columns= ['User1', 'Similarity_score', 'User2'])
print (df)

[277196, 277269, 277533, 277556, 277556, 277710, 277767, 277795, 277807, 277828, 278044, 278160, 278246, 278270, 278270, 278270, 278398, 278401, 278543, 278561, 278561, 278565, 278565, 278565, 278645, 278681, 16, 81, 102, 217, 221, 337, 367, 367, 431, 512, 592, 595, 695, 695, 705, 777, 903, 924, 965, 1093, 1093, 1108, 1108, 1108, 1108, 1113, 1263, 1263, 1297, 1359, 1399, 1436, 1473, 1475, 1491, 1491, 1491, 1491, 1491, 1545, 1583, 1596, 1597, 1650, 1719, 1782, 1792, 1830, 1859, 1891, 1915, 1954, 1984, 1984, 1987, 1987, 2036, 2036, 2036, 2045, 2045, 2051, 2074, 2163, 2203, 2203, 2203, 2311, 2312, 2312, 2334, 2340, 2355, 2363, 2363, 2414, 2437, 2437, 2437, 2470, 2470, 2480, 2499, 2578, 2583, 2687, 2734, 2759, 2782, 2782, 2782, 2850, 2850, 2850, 2850, 2883, 2939, 2939, 2973, 2983, 2991, 2991, 3023, 3048, 3089, 3099, 3114, 3175, 3175, 3175, 3251, 3251, 3251, 3251, 3261, 3293, 3328, 3346, 3346, 3346, 3350, 3435, 3462, 3472, 3473, 3481, 3481, 3508, 3534, 3534, 3546, 3550, 3550, 3564, 3564, 35

In [187]:
df.sort_values('Similarity_score', ascending = False).head(40)

Unnamed: 0,User1,Similarity_score,User2
141,3293,1.0,277601
170,3619,1.0,1782
207,3944,1.0,92
92,2203,1.0,367
146,3350,1.0,277353
121,2883,1.0,277351
299,161458,1.0,3538
169,3619,1.0,1775
289,161390,1.0,278620
11,278160,1.0,277945


In [188]:
df.loc[df['User1']==2074]

Unnamed: 0,User1,Similarity_score,User2
88,2074,0.46,291


In [189]:
# Extract the books which userId 162113 & 161453 have watched
books[((books['User.ID']==161390) | (books['User.ID']==278620))]

Unnamed: 0,User.ID,Book.Title,Book.Rating
2207,278620,April Fool Dead: A Death on Demand Mystery,4
8923,161390,Pour Your Heart into It: How Starbucks Built a...,10


In [190]:
# Extract the books which userId 162113 & 161453 have watched
books[(books['User.ID']==162113) | (books['User.ID']==161453)]

Unnamed: 0,User.ID,Book.Title,Book.Rating
8959,161453,"Bread, Tomato, Garlic: Quick Cooking With 3 Ma...",9
8960,161453,"The Ubiquitous Shrimp: From Simple to Exotic, ...",8
9989,162113,The Cape Ann (Contemporary American Fiction),8


In [191]:
user_1=books[(books['User.ID']==162113)]
user_2=books[(books['User.ID']==161453)]

In [192]:
user_1['Book.Title']

9989    The Cape Ann (Contemporary American Fiction)
Name: Book.Title, dtype: object

In [193]:
user_2['Book.Title']

8959    Bread, Tomato, Garlic: Quick Cooking With 3 Ma...
8960    The Ubiquitous Shrimp: From Simple to Exotic, ...
Name: Book.Title, dtype: object

In [194]:
pd.merge(user_1,user_2,on='Book.Title',how='outer')

Unnamed: 0,User.ID_x,Book.Title,Book.Rating_x,User.ID_y,Book.Rating_y
0,162113.0,The Cape Ann (Contemporary American Fiction),8.0,,
1,,"Bread, Tomato, Garlic: Quick Cooking With 3 Ma...",,161453.0,9.0
2,,"The Ubiquitous Shrimp: From Simple to Exotic, ...",,161453.0,8.0
