# Importing libraries

In [1]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

import warnings
warnings.filterwarnings('ignore')

# Loading Dataset & Preprocessing

In [2]:
# Location of the ratings CSV. This is a machine-specific absolute path:
# point DATA_PATH at your own copy of book1.csv before running the notebook.
DATA_PATH = 'C:/Users/HP/Downloads/Assignment/book1.csv'
dataset = pd.read_csv(DATA_PATH)

In [3]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6


In [4]:
# Drop the leftover 'Unnamed: 0' column (it appears to be a saved row counter).
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.

dataset = dataset.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
dataset

Unnamed: 0,User.ID,Book.Title,Book.Rating
0,276726,Classical Mythology,5
1,276729,Clara Callan,3
2,276729,Decision in Normandy,6
3,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,276737,The Mummies of Urumchi,6
...,...,...,...
9995,162121,American Fried: Adventures of a Happy Eater.,7
9996,162121,Cannibal In Manhattan,9
9997,162121,How to Flirt: A Practical Guide,7
9998,162121,Twilight,8


In [6]:
# Rename the dotted CSV headers to underscore-style names.
# An explicit mapping (instead of assigning a positional list) cannot mislabel
# columns if their order changes, and the cell stays re-runnable after further
# columns such as Book_ID have been added.

dataset = dataset.rename(columns={
    'User.ID': 'User_ID',
    'Book.Title': 'Book_Title',
    'Book.Rating': 'Book_Rating',
})

In [7]:
dataset.Book_Title.nunique()

9659

In [8]:
# Give every distinct book title an integer category code and store it as Book_ID.

dataset['Book_ID'] = dataset['Book_Title'].astype('category').cat.codes

In [9]:
dataset.head()

Unnamed: 0,User_ID,Book_Title,Book_Rating,Book_ID
0,276726,Classical Mythology,5,1443
1,276729,Clara Callan,3,1440
2,276729,Decision in Normandy,6,1883
3,276736,Flu: The Story of the Great Influenza Pandemic...,8,2722
4,276737,The Mummies of Urumchi,6,7969


In [10]:
# Average rating per user: one row per User_ID, with the mean in 'Book_Rating'.

Mean = dataset.groupby('User_ID', as_index=False)['Book_Rating'].mean()

# Preview: the five largest User_IDs together with their mean rating.

Mean.sort_values(by='User_ID', ascending=False).head(5)

Unnamed: 0,User_ID,Book_Rating
2181,278854,7.0
2180,278852,8.0
2179,278851,6.5
2178,278849,9.0
2177,278846,8.0


In [11]:
# displaying 5 rows of dataset in descending order of User id.

dataset.sort_values(by=['User_ID'],ascending=False).head()

Unnamed: 0,User_ID,Book_Title,Book_Rating,Book_ID
2398,278854,Celtic Mythology (Library of the World's Myths...,8,1291
2397,278854,"A Treasury of Irish Myth, Legend, and Folklore",6,299
2396,278854,Blast From the Past,7,978
2395,278854,La crónica del Perú (Crónicas de América),7,4006
2394,278854,As valkírias,7,679


In [12]:
# Attach each user's mean rating to every rating row of that user
# (inner join of `dataset` and `Mean` on User_ID).
# User_ID is the join key, so it appears once in the result. Both frames also
# contain a 'Book_Rating' column; pandas keeps both and tells them apart with
# its default suffixes: Book_Rating_x is the individual rating from `dataset`,
# Book_Rating_y is the user's mean rating from `Mean`.

Rating_avg = dataset.merge(Mean, on='User_ID', how='inner', suffixes=('_x', '_y'))

In [13]:
Rating_avg.sort_values(by=['User_ID'],ascending=False).head()

Unnamed: 0,User_ID,Book_Title,Book_Rating_x,Book_ID,Book_Rating_y
2398,278854,Celtic Mythology (Library of the World's Myths...,8,1291,7.0
2397,278854,"A Treasury of Irish Myth, Legend, and Folklore",6,299,7.0
2396,278854,Blast From the Past,7,978,7.0
2395,278854,La crónica del Perú (Crónicas de América),7,4006,7.0
2394,278854,As valkírias,7,679,7.0


In [14]:
# Mean-centred rating: the user's actual rating minus that user's average rating.

Rating_avg['adg_rating'] = Rating_avg['Book_Rating_x'].sub(Rating_avg['Book_Rating_y'])

In [15]:
Rating_avg.head(5)

Unnamed: 0,User_ID,Book_Title,Book_Rating_x,Book_ID,Book_Rating_y,adg_rating
0,276726,Classical Mythology,5,1443,5.0,0.0
1,276729,Clara Callan,3,1440,4.5,-1.5
2,276729,Decision in Normandy,6,1883,4.5,1.5
3,276736,Flu: The Story of the Great Influenza Pandemic...,8,2722,8.0,0.0
4,276737,The Mummies of Urumchi,6,7969,6.0,0.0


In [16]:
# User x Book matrix of the raw ratings (Book_Rating_x); cells without a rating are NaN.
check = Rating_avg.pivot_table(index='User_ID', columns='Book_ID', values='Book_Rating_x')
check.head()

Book_ID,0,1,2,3,4,5,6,7,8,9,...,9649,9650,9651,9652,9653,9654,9655,9656,9657,9658
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,


# Collaborative Filtering (user-based)





Building Matrix Dataframes with rating values.

In [17]:
# User x Book matrix of the mean-centred ratings (adg_rating); cells without a rating are NaN.
final = Rating_avg.pivot_table(index='User_ID', columns='Book_ID', values='adg_rating')
final.head()

Book_ID,0,1,2,3,4,5,6,7,8,9,...,9649,9650,9651,9652,9653,9654,9655,9656,9657,9658
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,


In [18]:
# Replacing NaN by the book (column) average of the mean-centred ratings
final_book = final.fillna(final.mean(axis=0))

# Replacing NaN by the user (row) average of the mean-centred ratings
final_user = final.apply(lambda row: row.fillna(row.mean()), axis=1)

In [19]:
final_user.head()

Book_ID,0,1,2,3,4,5,6,7,8,9,...,9649,9650,9651,9652,9653,9654,9655,9656,9657,9658
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,...,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16,2.537653e-16
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,...,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16,2.960595e-16


In [20]:
# User-user cosine similarity computed on the user-average-filled matrix.
b = cosine_similarity(final_user)
# Zero out each user's similarity with themselves (the diagonal).
np.fill_diagonal(b, 0)
# Label rows and columns with the User_IDs of final_user.
similarity_with_user = pd.DataFrame(b, index=final_user.index, columns=final_user.index)
similarity_with_user.head()

User_ID,8,9,10,12,14,16,17,19,22,26,...,278831,278832,278836,278843,278844,278846,278849,278851,278852,278854
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,4.629693e-28,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-4.2167080000000004e-29,4.630244000000001e-28,0.0,0.0,-1.38667e-32,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,4.629693e-28,0.0,0.0,0.0,0.0,0.0,-1.2325950000000001e-32,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.243935e-28,1.2671080000000001e-27,0.0,0.0,-3.081488e-32,0.0,0.0


In [21]:
# User-user cosine similarity computed on the book-average-filled matrix.
# NOTE(review): the displayed values are almost all ~1.0 - presumably the
# column-mean fill makes every user's row nearly identical. Confirm that this
# matrix is informative enough to rank neighbours.
cosine = cosine_similarity(final_book)
# Zero out each user's similarity with themselves (the diagonal).
np.fill_diagonal(cosine, 0)
# Both axes are labelled from final_book, the matrix the similarities came from.
similarity_with_book = pd.DataFrame(cosine, index=final_book.index, columns=final_book.index)
similarity_with_book.head()

User_ID,8,9,10,12,14,16,17,19,22,26,...,278831,278832,278836,278843,278844,278846,278849,278851,278852,278854
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.999983,0.99999,1.0,1.0,0.999999,1.0,1.0
9,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.999983,0.99999,1.0,1.0,0.999999,1.0,1.0
10,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.999983,0.99999,1.0,1.0,0.999999,1.0,1.0
12,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.999983,0.99999,1.0,1.0,0.999999,1.0,1.0
14,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.999983,0.99999,1.0,1.0,0.999999,1.0,1.0


In [22]:
def find_n_neighbours(df,n):
    """Return, for every row of `df`, the labels of its `n` largest values.

    `df` is a similarity matrix (rows = subjects, columns = candidates).
    The result has the same row index as `df` and columns 'top1' ... 'top<n>'
    holding the column labels ordered from most to least similar.
    """
    rank_labels = ['top{}'.format(rank) for rank in range(1, n+1)]

    def _best_labels(row):
        # Sort this row's similarities from high to low and keep the first n labels.
        ranked = row.sort_values(ascending=False)
        return pd.Series(ranked.iloc[:n].index, index=rank_labels)

    return df.apply(_best_labels, axis=1)

In [23]:
# 30 nearest neighbours (User_IDs) of every user, ranked on similarity_with_user
sim_user_30_u = find_n_neighbours(df=similarity_with_user, n=30)
sim_user_30_u.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top21,top22,top23,top24,top25,top26,top27,top28,top29,top30
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,2152,2033,162052,276911,161993,277921,1838,278545,277923,276984,...,161173,1830,3601,3099,2545,277018,161391,4090,3467,2197
9,8,161755,161744,161749,161750,161751,161752,161753,161757,161407,...,161726,161715,161708,161706,161704,161696,161695,161694,161689,161685
10,8,161755,161744,161749,161750,161751,161752,161753,161757,161407,...,161726,161715,161708,161706,161704,161696,161695,161694,161689,161685
12,8,161755,161744,161749,161750,161751,161752,161753,161757,161407,...,161726,161715,161708,161706,161704,161696,161695,161694,161689,161685
14,2152,2033,162052,277921,278545,1838,161993,276911,277923,276984,...,277996,2545,3537,3099,278844,161391,3467,4090,2197,3247


In [24]:
# 30 nearest neighbours of every user, ranked on similarity_with_book.
# The entries are User_IDs (neighbouring users), not Book_IDs.
sim_user_30_b = find_n_neighbours(df=similarity_with_book, n=30)
sim_user_30_b.head(10)

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top21,top22,top23,top24,top25,top26,top27,top28,top29,top30
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,3934,161466,161460,161459,161458,161455,161453,161448,161443,161441,...,161390,161386,161385,161377,161370,161368,161367,161366,161354,161462
9,8,161467,161462,161460,161459,161458,161455,161453,161448,161443,...,161391,161390,161386,161385,161377,161370,161368,161367,161366,161466
10,8,161466,161460,161459,161458,161455,161453,161448,161443,161441,...,161390,161386,161385,161377,161370,161368,161367,161366,161354,161462
12,8,161466,161460,161459,161458,161455,161453,161448,161443,161441,...,161390,161386,161385,161377,161370,161368,161367,161366,161354,161462
14,8,161466,161460,161459,161458,161455,161453,161448,161443,161441,...,161390,161386,161385,161377,161370,161368,161367,161366,161354,161462
16,8,161466,161460,161459,161458,161455,161453,161448,161443,161441,...,161390,161386,161385,161377,161370,161368,161367,161366,161354,161462
17,8,161466,161460,161459,161458,161455,161453,161448,161443,161441,...,161390,161386,161385,161377,161370,161368,161367,161366,161354,161462
19,8,161466,161460,161459,161458,161455,161453,161448,161443,161441,...,161390,161386,161385,161377,161370,161368,161367,161366,161354,161462
22,8,161466,161460,161459,161458,161455,161453,161448,161443,161441,...,161390,161386,161385,161377,161370,161368,161367,161366,161354,161462
26,8,161466,161460,161459,161458,161455,161453,161448,161443,161441,...,161390,161386,161385,161377,161370,161368,161367,161366,161354,161462


In [25]:
def get_user_similar_books( user1, user2 ):
    """Join the ratings of the books that both `user1` and `user2` rated.

    The Rating_avg rows of the two users are inner-joined on Book_ID, and that
    result is joined once more with the full Rating_avg on Book_ID.
    The returned DataFrame is empty when the two users share no book.
    """
    first_user_rows = Rating_avg[Rating_avg.User_ID == user1]
    second_user_rows = Rating_avg[Rating_avg.User_ID == user2]
    common_books = first_user_rows.merge(second_user_rows, on="Book_ID", how="inner")
    return common_books.merge(Rating_avg, on='Book_ID')

In [26]:
# Books rated by both user 8 and user 26, reduced to the rating and title columns.
a = get_user_similar_books(8, 26)[['Book_Rating_x', 'Book_Rating_y', 'Book_Title']]
a.head()

Unnamed: 0,Book_Rating_x,Book_Rating_y,Book_Title


In [27]:
# Function that predicts the rating a particular user would give to a particular book:

def User_item_score(user,item):
    """Predict the rating of book `item` for `user` from the user's nearest neighbours.

    The prediction is the user's own mean rating plus the similarity-weighted
    average of the neighbours' mean-centred ratings of the book:

        score = mean(user) + sum(similarity * adg_rating) / sum(similarity)

    Parameters
    ----------
    user : User_ID to predict for. Raises IndexError if it is not in `Mean`.
    item : Book_ID to score. Raises KeyError if it is not a column of `final_book`.

    Returns
    -------
    The predicted rating. When the neighbours' similarities sum to zero
    (including the case of no usable neighbour) the user's mean rating is
    returned instead of NaN/inf.

    Reads the notebook-level frames sim_user_30_b, final_book, Mean and
    similarity_with_book.
    """
    #-----------------------------------------------------------------------------------------------
    # sim_user_30_b holds, for every user, the User_IDs of that user's 30 most similar
    # users (the neighbours) - not book ids.
    # ravel() always gives a flat list, even if the frame had a single 'top' column.
    neighbours = sim_user_30_b[sim_user_30_b.index==user].values.ravel().tolist()

    #------------------------------------------------------------------------------------------------
    # Mean-centred ratings of the requested book (one value per user) ...
    book_ratings = final_book.loc[:,item]

    # ... restricted to the neighbours, dropping neighbours that have no value for this book.
    neighbour_ratings = book_ratings[book_ratings.index.isin(neighbours)]
    neighbour_ratings = neighbour_ratings[neighbour_ratings.notnull()]

    #------------------------------------------------------------------------------------------------
    # Average rating of the target user ([0] picks the single matching value).
    avg_user = Mean.loc[Mean['User_ID'] == user,'Book_Rating'].values[0]

    # index.tolist() always returns a list. (squeeze() would turn a single
    # neighbour into a scalar, and the .loc lookup below would then return a
    # scalar that pd.concat cannot handle.)
    index = neighbour_ratings.index.tolist()
    corr = similarity_with_book.loc[user,index]
    fin = pd.concat([neighbour_ratings, corr], axis=1)
    fin.columns = ['adg_score','correlation']
    fin['score'] = fin['adg_score'] * fin['correlation']
    nume = fin['score'].sum()
    # NOTE(review): the weights are summed with their sign; many formulations
    # divide by the sum of absolute similarities - confirm which is intended.
    deno = fin['correlation'].sum()
    if deno == 0:
        # No usable neighbour information: fall back to the user's own average.
        return avg_user
    final_score = avg_user + (nume/deno)
    return final_score

Quick notes on the code in the cell above:

In [28]:
# squeeze() demo
a = np.array([0, 2, 4]).reshape(1, 3, 1)
print(a.shape)

# (1, 3, 1): one layer, three rows, one column.


# squeeze() removes every axis of length 1. Here the layer axis and the column
# axis both have length 1, so only the three row entries remain.
c = a.squeeze().tolist()
c
# Output : [0, 2, 4]
#-----------------------------------------------------------------

(1, 3, 1)


[0, 2, 4]

In [29]:
# Predicted rating of book 1291 for user 278854.
score = User_item_score(user=278854, item=1291)
print("score (u,i) is", score)

score (u,i) is 8.0


In [30]:
# Per user, a comma-separated string of the Book_IDs that user rated.
# The ids are converted to text inside the aggregation, so Rating_avg keeps its
# original Book_ID dtype and earlier cells behave the same when re-run.
Book_user = Rating_avg.groupby(by='User_ID')['Book_ID'].apply(lambda ids: ','.join(ids.astype(str)))

In [31]:
def User_item_score1(user):
    """Return the titles of up to five books recommended for `user`.

    Candidates are the books rated by the user's nearest neighbours
    (sim_user_30_b) that the user has not rated. Each candidate is scored with
    User_item_score and the five highest-scoring titles are returned, best first.

    Parameters
    ----------
    user : User_ID to recommend for.

    Returns
    -------
    list of str
        Book titles, at most five, without repeats.

    Raises
    ------
    ValueError
        If `user` has no row in sim_user_30_b (unknown User_ID).

    Reads the notebook-level objects check, sim_user_30_b, Book_user and dataset.
    """
    if user not in sim_user_30_b.index:
        # Fail with a clear message instead of a later int('') conversion error.
        raise ValueError('User_ID {} not found'.format(user))

    # Books the target user has already rated (non-NaN cells of the user's row in `check`).
    Book_read_by_user = check.columns[check[check.index==user].notna().any()].tolist()
    already_read = set(map(str, Book_read_by_user))

    # User_IDs of the nearest neighbours; ravel() always yields a flat list.
    neighbours = sim_user_30_b[sim_user_30_b.index==user].values.ravel().tolist()

    # Book_user stores each user's Book_IDs as one comma-separated string.
    neighbour_books = Book_user[Book_user.index.isin(neighbours)]
    Book_read_by_similar_users = ','.join(neighbour_books.values).split(',')

    # Sorting the candidates removes the dependence on set iteration order of
    # strings, which can differ from one kernel session to the next.
    Book_under_consideration = sorted(
        int(book_id) for book_id in set(Book_read_by_similar_users) - already_read
    )

    # One predicted rating per candidate, using the shared scoring function.
    score = [User_item_score(user, item) for item in Book_under_consideration]

    data = pd.DataFrame({'Book_ID':Book_under_consideration,'score':score})
    # A stable sort keeps tied scores in candidate order, so the result is reproducible.
    top_5_recommendation = data.sort_values(by='score',ascending=False,kind='mergesort').head(5)

    # `dataset` has one row per rating, so reduce it to one title per Book_ID
    # before the merge; otherwise a book rated by several users would be repeated.
    titles = dataset[['Book_ID','Book_Title']].drop_duplicates(subset='Book_ID')
    Book_Name = top_5_recommendation.merge(titles, how='inner', on='Book_ID')
    Book_Names = Book_Name.Book_Title.values.tolist()
    return Book_Names

In [None]:
def main():
    """Prompt for a User_ID and print the recommended book titles for that user.

    Prints 'Incorrect User_ID!!!' when the input is not an integer or the id
    does not occur in `dataset`. Errors raised while computing recommendations
    for a valid user are not suppressed.
    """
    raw_user = input("Enter the user id to whom you want to recommend : ")
    try:
        user = int(raw_user)
    except ValueError:
        # Non-numeric input can never be a valid id.
        print('Incorrect User_ID!!!')
        return

    # Validate the id up front rather than guessing from whatever exception
    # the scoring code happens to raise.
    if not (dataset.User_ID == user).any():
        print('Incorrect User_ID!!!')
        return

    predicted_books = User_item_score1(user)
    print(" ")
    print("The Recommendations for User Id :", user)
    print("   ")
    for i in predicted_books:
        print(i)
        print()
main()

In [None]:
Enter the user id to whom you want to recommend : 8
 
The Recommendations for User Id : 8
   
How the Irish Saved Civilization: The Untold Story of Ireland's Heroic Role from the Fall of Rome to the Rise of Medieval Europe (Hinges of History, Vol 1)

Swan River

The Elephant Tree

Blue Highways : A Journey into America

The Cholera Years: The United States in 1832, 1849, and 1866