In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # 개집
import seaborn
import plotly.express as px # 63빌딩
from IPython.display import Image, HTML
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the book metadata table from CSV and preview its last rows.
books = pd.read_csv('./datasets/books.csv')
books.tail()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
9995,9996,7130616,7130616,7392860,19,441019455,9780441000000.0,Ilona Andrews,2010.0,Bayou Moon,...,17204,18856,1180,105,575,3538,7860,6778,https://images.gr-assets.com/books/1307445460m...,https://images.gr-assets.com/books/1307445460s...
9996,9997,208324,208324,1084709,19,067973371X,9780680000000.0,Robert A. Caro,1990.0,Means of Ascent,...,12582,12952,395,303,551,1737,3389,6972,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
9997,9998,77431,77431,2393986,60,039330762X,9780393000000.0,Patrick O'Brian,1977.0,The Mauritius Command,...,9421,10733,374,11,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...
9998,9999,8565083,8565083,13433613,7,61711527,9780062000000.0,Peggy Orenstein,2011.0,Cinderella Ate My Daughter: Dispatches from th...,...,11279,11994,1988,275,1002,3765,4577,2375,https://images.gr-assets.com/books/1279214118m...,https://images.gr-assets.com/books/1279214118s...
9999,10000,8914,8914,11817,31,375700455,9780376000000.0,John Keegan,1998.0,The First World War,...,9162,9700,364,117,345,2031,4138,3069,https://images.gr-assets.com/books/1403194704m...,https://images.gr-assets.com/books/1403194704s...


In [4]:
books.shape

(10000, 23)

In [5]:
# Load the per-user book ratings (book_id, user_id, rating) and preview the last rows.
ratings = pd.read_csv('./datasets/ratings.csv')
ratings.tail()

Unnamed: 0,book_id,user_id,rating
981751,10000,48386,5
981752,10000,49007,4
981753,10000,49383,5
981754,10000,50124,5
981755,10000,51328,1


In [6]:
ratings['rating'].value_counts()

rating
4    357366
5    292961
3    248623
2     63231
1     19575
Name: count, dtype: int64

In [7]:
# Load the book-to-tag table (goodreads_book_id, tag_id, count) and preview the last rows.
btags = pd.read_csv('./datasets/book_tags.csv')
btags.tail()

Unnamed: 0,goodreads_book_id,tag_id,count
999907,33288638,21303,7
999908,33288638,17271,7
999909,33288638,1126,7
999910,33288638,11478,7
999911,33288638,27939,7


In [8]:
# Load the tag lookup table (tag_id -> tag_name) and preview the last rows.
tags = pd.read_csv('./datasets/tags.csv')
tags.tail()

Unnamed: 0,tag_id,tag_name
34247,34247,Ｃhildrens
34248,34248,Ｆａｖｏｒｉｔｅｓ
34249,34249,Ｍａｎｇａ
34250,34250,ＳＥＲＩＥＳ
34251,34251,ｆａｖｏｕｒｉｔｅｓ


In [9]:
# data preprocessing
# Show the ratings ordered by user_id. The sorted frame is only displayed,
# not assigned, so `ratings` itself keeps its original row order.
ratings.sort_values('user_id')

Unnamed: 0,book_id,user_id,rating
117889,1180,1,4
488112,4893,1,3
625717,6285,1,4
796318,8034,2,4
875008,8855,2,5
...,...,...,...
912886,9255,53424,4
818162,8262,53424,4
777143,7833,53424,4
716608,7212,53424,4


In [10]:
ratings.shape

(981756, 3)

In [11]:
# Remove ambiguous ratings: keep=False discards *every* row of a
# (user_id, book_id) pair that occurs more than once, not just the extras.
ratings = ratings.drop_duplicates(subset=['user_id', 'book_id'], keep=False)
ratings.shape

(977269, 3)

In [12]:
books.columns

Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url'],
      dtype='object')

In [13]:
# Shape before de-duplication, for comparison with the shape shown below.
print(books.shape)
# keep=False removes every row whose original_title occurs more than once
# (no representative row is kept), which leaves the remaining titles unique.
# NOTE(review): pandas treats missing values as equal when detecting
# duplicates, so rows without an original_title are presumably all removed
# here as well -- confirm that this is intended.
books.drop_duplicates(subset=['original_title'], keep = False, inplace=True)
books.shape

(10000, 23)


(9151, 23)

In [14]:
# Shape before de-duplication, for comparison with the shape shown below.
print(btags.shape)
# Drop every row of a (tag_id, goodreads_book_id) pair that appears more than
# once; keep=False keeps none of the repeated rows.
btags = btags.drop_duplicates(subset=['tag_id', 'goodreads_book_id'], keep=False)
btags.shape

(999912, 3)


(999896, 3)

In [15]:
# Shape before de-duplication, for comparison with the shape shown below.
print(tags.shape)
# Drop every tag_id that appears more than once (keep=False keeps none of them).
tags = tags.drop_duplicates(subset=['tag_id'], keep=False)
tags.shape

(34252, 2)


(34252, 2)

In [16]:
# Data visualization
# Attach the readable tag_name to every (book, tag) row through the shared
# tag_id key; the inner join keeps only tag ids present in both tables.
joint_tags = btags.merge(tags, on='tag_id', how='inner')

In [17]:
joint_tags

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read
...,...,...,...,...
999891,31538635,14690,6,hogwarts
999892,32848471,16149,21,jan-2017
999893,33288638,27821,9,single-mom
999894,33288638,11478,7,fave-author


In [18]:
# Top 10 rated books
top_rated = books.sort_values(by='average_rating', ascending=False)
top10 = top_rated.iloc[:10]
# Title-indexed table holding only the thumbnail URL of each top book.
# NOTE(review): the name `display` shadows IPython's built-in display()
# helper for the rest of the session -- consider renaming it together with
# the cell that renders it.
display = top10[['title', 'small_image_url']].set_index('title')

In [19]:
def path_to_image_html(path):
    """Return an HTML ``<img>`` tag whose ``src`` attribute is ``path``.

    The value comes from a data file and is rendered with HTML escaping
    turned off, so it is escaped here (including quotes) to keep it from
    breaking out of the ``src`` attribute.

    Parameters
    ----------
    path : object
        Image URL or path; converted with ``str()`` before escaping.

    Returns
    -------
    str
        Markup of the form ``<img src="..." />``.
    """
    from html import escape

    return f'<img src="{escape(str(path), quote=True)}" />'
# Render the table as raw HTML. escape=False stops pandas from escaping the
# <img> markup that the formatter produces for the small_image_url column.
HTML(display.to_html(escape=False, formatters={'small_image_url':path_to_image_html}))

Unnamed: 0_level_0,small_image_url
title,Unnamed: 1_level_1
The Complete Calvin and Hobbes,
"Words of Radiance (The Stormlight Archive, #2)",
Mark of the Lion Trilogy,
It's a Magical World: A Calvin and Hobbes Collection,
There's Treasure Everywhere: A Calvin and Hobbes Collection,
"Harry Potter Boxset (Harry Potter, #1-7)",
"Harry Potter Collection (Harry Potter, #1-6)",
The Indispensable Calvin and Hobbes,
The Authoritative Calvin and Hobbes: A Calvin and Hobbes Treasury,
Attack of the Deranged Mutant Killer Monster Snow Goons,


#### Content-Based Filtering Recommender System

In [20]:
# Copy of `books` with every missing value replaced by an empty string
# (presumably so the later string handling never meets NaN); `books` itself
# is not modified by this call.
fillnabooks = books.fillna('')
fillnabooks

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439023480.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780439554930.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
3,4,2657,2657,3275794,487,61120081,9780061120080.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743273560.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...
5,6,11870085,11870085,16827462,226,525478817,9780525478810.0,John Green,2012.0,The Fault in Our Stars,...,2346404,2478609,140739,47994,92723,327550,698471,1311871,https://images.gr-assets.com/books/1360206420m...,https://images.gr-assets.com/books/1360206420s...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,7130616,7130616,7392860,19,441019455,9780441019460.0,Ilona Andrews,2010.0,Bayou Moon,...,17204,18856,1180,105,575,3538,7860,6778,https://images.gr-assets.com/books/1307445460m...,https://images.gr-assets.com/books/1307445460s...
9996,9997,208324,208324,1084709,19,067973371X,9780679733710.0,Robert A. Caro,1990.0,Means of Ascent,...,12582,12952,395,303,551,1737,3389,6972,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
9997,9998,77431,77431,2393986,60,039330762X,9780393307630.0,Patrick O'Brian,1977.0,The Mauritius Command,...,9421,10733,374,11,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...
9998,9999,8565083,8565083,13433613,7,61711527,9780061711530.0,Peggy Orenstein,2011.0,Cinderella Ate My Daughter: Dispatches from th...,...,11279,11994,1988,275,1002,3765,4577,2375,https://images.gr-assets.com/books/1279214118m...,https://images.gr-assets.com/books/1279214118s...


In [21]:
# Keep only the three columns that are later combined into the similarity text.
fillednabooks=fillnabooks[['original_title', 'authors', 'average_rating']]

In [22]:
# Cast every column (including the numeric rating) to str so the same string
# helpers can be applied to all of them, then check the resulting dtypes.
fillednabooks = fillednabooks.astype(str)
fillednabooks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9151 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   original_title  9151 non-null   object
 1   authors         9151 non-null   object
 2   average_rating  9151 non-null   object
dtypes: object(3)
memory usage: 286.0+ KB


In [23]:
def clean_data(x):
    """Return ``x`` with all space characters removed and lower-cased.

    Only the plain space character (" ") is stripped; other whitespace and
    punctuation are left in place.
    """
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()

In [24]:
# Normalise each text column with clean_data (spaces removed, lower-cased).
for column in ('original_title', 'authors', 'average_rating'):
    fillednabooks[column] = fillednabooks[column].apply(clean_data)

In [25]:
def create_soup(x):
    """Join a record's title, authors and rating into one space-separated string.

    ``x`` is any mapping-like record (for example a DataFrame row) exposing the
    keys ``original_title``, ``authors`` and ``average_rating``.

    NOTE(review): the rating is embedded as text, so downstream text matching
    can pair books on their rating rather than on their content -- confirm
    that this is intended.
    """
    title = x['original_title']
    authors = x['authors']
    rating = x['average_rating']
    return title + ' ' + authors + ' ' + rating

In [26]:
# Build the per-book text "soup"; axis=1 hands each row to create_soup.
fillednabooks['soup'] = fillednabooks.apply(create_soup, axis=1)

In [27]:
fillednabooks

Unnamed: 0,original_title,authors,average_rating,soup
0,thehungergames,suzannecollins,4.34,thehungergames suzannecollins 4.34
1,harrypotterandthephilosopher'sstone,"j.k.rowling,marygrandpré",4.44,harrypotterandthephilosopher'sstone j.k.rowlin...
3,tokillamockingbird,harperlee,4.25,tokillamockingbird harperlee 4.25
4,thegreatgatsby,f.scottfitzgerald,3.89,thegreatgatsby f.scottfitzgerald 3.89
5,thefaultinourstars,johngreen,4.26,thefaultinourstars johngreen 4.26
...,...,...,...,...
9995,bayoumoon,ilonaandrews,4.09,bayoumoon ilonaandrews 4.09
9996,meansofascent,roberta.caro,4.25,meansofascent roberta.caro 4.25
9997,themauritiuscommand,patricko'brian,4.35,themauritiuscommand patricko'brian 4.35
9998,cinderellaatemydaughter:dispatchesfromthefront...,peggyorenstein,3.65,cinderellaatemydaughter:dispatchesfromthefront...


In [28]:
# Turn every soup string into a sparse bag-of-words count vector
# (one row per book, one column per distinct token).
# NOTE(review): titles and authors were collapsed into single space-free
# strings earlier, so English stop-word removal probably has little effect
# here -- confirm it is still wanted.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(fillednabooks['soup'])
count_matrix

<9151x17302 sparse matrix of type '<class 'numpy.int64'>'
	with 33809 stored elements in Compressed Sparse Row format>

In [29]:
# Pairwise cosine similarity between all book vectors. The result is a dense
# square array with one row and one column per book, so its memory footprint
# grows quadratically with the number of books.
cosine_sim = cosine_similarity(count_matrix)
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [30]:
set(cosine_sim[99])

{0.0,
 0.2357022603955159,
 0.25819888974716115,
 0.2886751345948129,
 0.3333333333333334,
 0.408248290463863,
 1.0000000000000002}

In [31]:
# Give the frame a fresh 0..n-1 index so index values equal row positions;
# the previous row labels are kept as an extra column.
# NOTE(review): this cell is not idempotent -- each re-run inserts another
# index column -- so run it once per kernel session.
fillnabooks = fillnabooks.reset_index()

In [32]:
# Assign a number to each book: map original_title -> row position.
indices = pd.Series(data=fillnabooks.index, index=fillnabooks['original_title'])
indices.head()

original_title
The Hunger Games                            0
Harry Potter and the Philosopher's Stone    1
To Kill a Mockingbird                       2
The Great Gatsby                            3
The Fault in Our Stars                      4
dtype: int64

In [33]:
def get_rocommandations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the books most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact ``original_title`` of the query book. It is looked up in the
        module-level ``indices`` Series, so an unknown title raises
        ``KeyError``.
    cosine_sim : array-like, optional
        Square similarity matrix; row ``i`` is assumed to describe the book
        at row position ``i`` of ``books``. Defaults to the matrix that
        existed when this function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``original_title`` of the ``top_n`` most similar books, most similar
        first; ties keep ascending row order.
    """
    idx = indices[title]
    scores = np.asarray(cosine_sim[idx])
    # Stable sort on the negated scores: descending similarity, with equal
    # scores left in ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query book by position instead of assuming it sorts first;
    # another row with an equal top score could otherwise push the query
    # itself into the recommendations.
    book_indicies = ranked[ranked != idx][:top_n]
    return books['original_title'].iloc[book_indicies]

In [34]:
result = get_rocommandations('The Fault in Our Stars')
print(result)

10                                  The Kite Runner 
73                                Looking for Alaska
87                                       Paper Towns
274                       An Abundance of Katherines
408    Fried Green Tomatoes at the Whistle Stop Cafe
439                                   Fall of Giants
672                                       Americanah
722                                        Shantaram
748                                  The Storyteller
857                                       Red Rising
Name: original_title, dtype: object


In [35]:
# C: mean of average_rating over all books (the prior a rating is pulled toward).
# m: 60th percentile of ratings_count (the vote count at which a book's own
#    rating and the prior C receive equal weight).
C = books['average_rating'].mean()
m = books['ratings_count'].quantile(0.6)
def weighted_vote_average(record):
    """Blend a book's own rating with the global mean, weighted by vote count.

    Computes ``(v / (v + m)) * R + (m / (m + v)) * C`` where ``v`` is
    ``record['ratings_count']``, ``R`` is ``record['average_rating']`` and
    ``m`` / ``C`` are the module-level constants defined above. The more
    ratings a book has relative to ``m``, the closer the result is to ``R``.

    ``record`` may be a single row or a whole DataFrame; only ``[]`` lookups
    and arithmetic are used, so the formula applies element-wise either way.
    """
    v = record['ratings_count']
    R = record['average_rating']
    return ( (v/(v+m)) * R) + ( (m/(m+v)) * C)

# Pass the whole frame so the formula is evaluated column-wise in a single
# vectorised step instead of one Python call per row.
books['weighted_vote'] = weighted_vote_average(books)

In [36]:
books['weighted_vote']

0       4.338089
1       4.437432
3       4.247904
4       3.891087
5       4.257037
          ...   
9995    4.034602
9996    4.079026
9997    4.090063
9998    3.896271
9999    3.999450
Name: weighted_vote, Length: 9151, dtype: float64

In [37]:
def get_rocommandations(title, cosine_sim=cosine_sim, top_n=10):
    """Return similar books for ``title``, ordered by their weighted vote.

    This definition replaces the earlier function of the same name: the
    ``top_n`` most similar books are selected first and then re-ordered by
    the ``weighted_vote`` column of ``books`` (highest first).

    Parameters
    ----------
    title : str
        Exact ``original_title`` of the query book. It is looked up in the
        module-level ``indices`` Series, so an unknown title raises
        ``KeyError``.
    cosine_sim : array-like, optional
        Square similarity matrix; row ``i`` is assumed to describe the book
        at row position ``i`` of ``books``. Defaults to the matrix that
        existed when this function was defined.
    top_n : int, optional
        Number of similar books to select before re-ordering (default 10).

    Returns
    -------
    pandas.Series
        ``original_title`` of the selected books, sorted by ``weighted_vote``
        in descending order.
    """
    idx = indices[title]
    scores = np.asarray(cosine_sim[idx])
    # Stable sort on the negated scores: descending similarity, with equal
    # scores left in ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query book by position instead of assuming it sorts first.
    book_indicies = ranked[ranked != idx][:top_n]
    candidates = books.iloc[book_indicies]
    return candidates.sort_values(by=['weighted_vote'], ascending=False)['original_title']

In [38]:
result = get_rocommandations('The Fault in Our Stars')
print(result)

10                                  The Kite Runner 
408    Fried Green Tomatoes at the Whistle Stop Cafe
439                                   Fall of Giants
672                                       Americanah
748                                  The Storyteller
857                                       Red Rising
722                                        Shantaram
73                                Looking for Alaska
87                                       Paper Towns
274                       An Abundance of Katherines
Name: original_title, dtype: object
