# Book Recommendation

## Library

In [66]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate

## Dataset Overview


1. **Books data** : Giving the detail overview about the book information including the book title, publication year, as well as the author of the book
2. **Users data** : Give the detailed overview about each user such as the user's location as well as the user's age
3. **Ratings data** : Give the detailed overview about the rating that each user give to the book

## Preprocessing

### Books Data

In [67]:
df_books1 = pd.read_csv('data/Books.csv', sep=',', error_bad_lines=False, usecols = [0,1,2,3,4])



  df_books1 = pd.read_csv('data/Books.csv', sep=',', error_bad_lines=False, usecols = [0,1,2,3,4])
  df_books1 = pd.read_csv('data/Books.csv', sep=',', error_bad_lines=False, usecols = [0,1,2,3,4])


In [68]:
df_books1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
dtypes: object(5)
memory usage: 10.4+ MB


In [69]:
try:
    df_books1['Year-Of-Publication']  = df_books1['Year-Of-Publication'].astype(int)
except Exception as e:
    print(e)


invalid literal for int() with base 10: 'DK Publishing Inc'


In [70]:
df_books1[df_books1['Year-Of-Publication'] == 'DK Publishing Inc']

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...


In [71]:
df_books1['Year-Of-Publication'] = pd.to_numeric(df_books1['Year-Of-Publication'],errors='coerce')

In [72]:
df_books1 = df_books.dropna()
df_books1['Year-Of-Publication'] = df_books1['Year-Of-Publication'].astype(int)

In [73]:
df_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [74]:
df_books2 = pd.read_csv('data/BooksWithCategory.csv')

In [75]:
df_books2.set_axis(['ASIN', 'Filename', 'Image Url', 'Book-Title', 'Author', 'Category ID', 'Category'],
                    axis=1,inplace=True)

  df_books2.set_axis(['ASIN', 'Filename', 'Image Url', 'Book-Title', 'Author', 'Category ID', 'Category'],


In [76]:
df_books2.head()

Unnamed: 0,ASIN,Filename,Image Url,Book-Title,Author,Category ID,Category
0,1623439671,1623439671.jpg,http://ecx.images-amazon.com/images/I/61t-hrSw...,Doug the Pug 2016 Wall Calendar,Doug the Pug,3,Calendars
1,B00O80WC6I,B00O80WC6I.jpg,http://ecx.images-amazon.com/images/I/41X-KQqs...,"Moleskine 2016 Weekly Notebook, 12M, Large, Bl...",Moleskine,3,Calendars
2,761182187,0761182187.jpg,http://ecx.images-amazon.com/images/I/61j-4gxJ...,365 Cats Color Page-A-Day Calendar 2016,Workman Publishing,3,Calendars
3,1578052084,1578052084.jpg,http://ecx.images-amazon.com/images/I/51Ry4Tsq...,Sierra Club Engagement Calendar 2016,Sierra Club,3,Calendars
4,1578052076,1578052076.jpg,http://ecx.images-amazon.com/images/I/619KxYEq...,Sierra Club Wilderness Calendar 2016,Sierra Club,3,Calendars


In [77]:
df_books2 = df_books2.drop(['ASIN', 'Filename', 'Image Url', 'Author'], axis=1)

In [78]:
df_books2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207571 entries, 0 to 207570
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Book-Title   207571 non-null  object
 1   Category ID  207571 non-null  int64 
 2   Category     207571 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.8+ MB


In [79]:
df_books = pd.merge(df_books1, df_books2, on='Book-Title')

### Users Data

In [80]:
df_users = pd.read_csv('data/Users.csv', sep=',', error_bad_lines=False, usecols = [0,1,2])



  df_users = pd.read_csv('data/Users.csv', sep=',', error_bad_lines=False, usecols = [0,1,2])


In [81]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [82]:
df_users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


### Ratings Data

In [83]:
df_ratings = pd.read_csv('data/Ratings.csv', sep=',', error_bad_lines=False, usecols = [0,1,2])    



  df_ratings = pd.read_csv('data/Ratings.csv', sep=',', error_bad_lines=False, usecols = [0,1,2])


In [84]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [85]:
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


### Data Cleaning

In [86]:
print('Number of data before cleaning : {}'.format(len(df_ratings)))
df_ratings = df_ratings[df_ratings['ISBN'].isin(df_books['ISBN'])]
print('Number of data after cleaning : {}'.format(len(df_ratings)))

Number of data before cleaning : 1149780
Number of data after cleaning : 96495


In [87]:
f = ['count','mean']

df_books_summary = df_ratings.groupby('ISBN')['Book-Rating'].agg(f)
df_books_summary.index = df_books_summary.index.map(str)

drop_book_list = df_books_summary[df_books_summary['count'] < 10].index

df_cust_summary = df_ratings.groupby('User-ID')['Book-Rating'].agg(f)
df_cust_summary.index = df_cust_summary.index.map(int)

drop_cust_list = df_cust_summary[df_cust_summary['count'] < 10].index

In [88]:
print('Before Filtering: {}'.format(df_ratings.shape))
df_ratings = df_ratings[~df_ratings['ISBN'].isin(drop_book_list)]
df_ratings = df_ratings[~df_ratings['User-ID'].isin(drop_cust_list)]
print('After Filtering: {}'.format(df_ratings.shape))


Before Filtering: (96495, 3)
After Filtering: (38472, 3)


In [89]:
pivot_rating = df_ratings.pivot(index='ISBN',columns='User-ID',values='Book-Rating')
pivot_rating.head()

User-ID,243,254,507,638,805,882,1424,1435,1848,1903,...,275281,275922,275970,276165,276641,276680,277195,277427,277639,278418
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6480764,,,,,,,,,,,...,,,,,,,,,,
6485936,,,,,,,,,,,...,,,,,,,,,,
20198906,,,,,,,,,,,...,,,,,,,,,,
20446500,,,,,,,,,,,...,,,,,,,,,,
20545509,,,,,,,,,,,...,,,,,,,,,,
