In [1]:
import pandas as pd
import numpy as np
from plotly.graph_objects import *

In [3]:
ratings = pd.read_csv('Ratings.csv', sep=';', on_bad_lines='skip')
ratings.head()

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
books = pd.read_csv('Books.csv', on_bad_lines='skip', sep=';')
books.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


**EDA**

In [5]:
booksRatings = pd.merge(books, ratings, on='ISBN', how='inner')
booksRatings.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,User-ID,Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,41385,0


In [6]:
booksRatings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031187 entries, 0 to 1031186
Data columns (total 7 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   ISBN       1031187 non-null  object
 1   Title      1031187 non-null  object
 2   Author     1031185 non-null  object
 3   Year       1031187 non-null  int64 
 4   Publisher  1031185 non-null  object
 5   User-ID    1031187 non-null  int64 
 6   Rating     1031187 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 55.1+ MB


In [9]:
booksRatings.dropna(inplace=True)
booksRatings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1031183 entries, 0 to 1031186
Data columns (total 7 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   ISBN       1031183 non-null  object
 1   Title      1031183 non-null  object
 2   Author     1031183 non-null  object
 3   Year       1031183 non-null  int64 
 4   Publisher  1031183 non-null  object
 5   User-ID    1031183 non-null  int64 
 6   Rating     1031183 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 62.9+ MB


In [10]:
# кількість оцінок
booksRatings['Rating'].value_counts()

Rating
0     647331
8      91806
10     71226
7      66405
9      60779
5      45355
6      31690
4       7617
3       5118
2       2375
1       1481
Name: count, dtype: int64

In [11]:
# усуваємо книги без оцінки
booksRatings = booksRatings[booksRatings['Rating'] > 0.0]
booksRatings['Rating'].value_counts()

Rating
8     91806
10    71226
7     66405
9     60779
5     45355
6     31690
4      7617
3      5118
2      2375
1      1481
Name: count, dtype: int64

In [12]:
# розподіл рейтингових оцінок
data = booksRatings['Rating'].value_counts().sort_index(ascending=False)

trace = Bar(x = data.index,
            text = [f'{np.round(val,1)}%' for val in (data.values / data.values.sum() * 100)],
            y = data.values)
layout = Layout(title = 'Distribution of rating marks',
               xaxis = {'title':'Rating mark'},
               yaxis = {'title':'Amount of books'})
fig = Figure(data=trace, layout=layout)
fig.show()

In [13]:
# середні рейтинги книг
booksAvgRatings = pd.DataFrame(data=booksRatings.groupby('ISBN')['Rating'].mean()).reset_index()
booksAvgRatings

Unnamed: 0,ISBN,Rating
0,0000913154,8.0
1,0001046438,9.0
2,000104687X,6.0
3,0001047213,9.0
4,0001047973,9.0
...,...,...
149833,B0001FZGPI,7.0
149834,B0001FZGRQ,9.0
149835,B0001GMSV2,8.0
149836,B0001I1KOG,10.0


In [14]:
booksAvgRatings.rename(columns={"Rating": "AvgRating"}, inplace = 'True')
booksAvgRatings

Unnamed: 0,ISBN,AvgRating
0,0000913154,8.0
1,0001046438,9.0
2,000104687X,6.0
3,0001047213,9.0
4,0001047973,9.0
...,...,...
149833,B0001FZGPI,7.0
149834,B0001FZGRQ,9.0
149835,B0001GMSV2,8.0
149836,B0001I1KOG,10.0


In [15]:
#вибираємо лише унікальні книги
booksRatings = booksRatings.filter(items=['ISBN', 'Title', 'Author', 'Year', 'Publisher']).drop_duplicates()
booksRatings

Unnamed: 0,ISBN,Title,Author,Year,Publisher
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
16,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
19,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
31,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
64,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group
...,...,...,...,...,...
1031174,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin
1031179,1845170423,Cocktail Classics,David Biggs,2004,Connaught
1031181,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books
1031182,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)


In [16]:
# об'єднуємо книги з їх середніми рейтингами
booksRatings = pd.merge(booksRatings, booksAvgRatings, on='ISBN')
booksRatings

Unnamed: 0,ISBN,Title,Author,Year,Publisher,AvgRating
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,7.666667
1,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,7.500000
2,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,7.833333
3,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,8.176471
4,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,8.000000
...,...,...,...,...,...,...
149833,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin,10.000000
149834,1845170423,Cocktail Classics,David Biggs,2004,Connaught,7.000000
149835,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,5.000000
149836,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),7.000000


In [17]:
# найпопулярніші книги (без врахування кількості оцінок, поставлених кожній із них)
booksRatings.sort_values(by='AvgRating', ascending=False).head(20)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,AvgRating
93508,0810112787,Closely Watched Trains (European Classics),Bohumil Hrabal,1995,Northwestern,10.0
76947,0849940516,Inspirational Study Bible,Max Lucado,1997,Nelson Bibles,10.0
61364,0711912335,U2 Touch the Flame: An Illustrated Documentary,Geoff Parkyn,1987,Omnibus Press,10.0
92226,2070327884,Lettres Ã?Â un jeune poÃ?Â¨te,Rainer Maria Rilke,1993,Gallimard,10.0
92230,2266122320,L'enfant ocÃ?Â©an,Mourlevat,2002,Pocket,10.0
61363,1871307414,U2: Rattle & hum : the official book of the U2...,Peter Williams,1988,Pyramid,10.0
92235,2211048889,LÃ?Â©on,Leon Walter Tillage,1999,L'Ecole des loisirs,10.0
61362,0863698859,U2 Faraway So Close,B.P. Fallon,1994,Virgin Books,10.0
61359,1898141002,U2: Burning Desire : The Complete Story,Sam Goodman,1993,Castle Communications,10.0
41642,1561794856,Mysterious Love (Nikki Sheridan Series),Shirley Brinkerhoff,1996,Baker Book House,10.0


**Adjusted raiting**

In [20]:
# кількість оцінок, виставлених читачами кожній із книг
nonZeroRatings = ratings[ratings['Rating'] > 0]
nonZeroRatings = nonZeroRatings.groupby('ISBN')['Rating'].count().reset_index()
nonZeroRatings

Unnamed: 0,ISBN,Rating
0,0330299891,1
1,0375404120,1
2,9022906116,1
3,#6612432,1
4,'9607092910',1
...,...,...
185968,"\""8888809228\""",1
185969,"\""9170010242\""",1
185970,ooo7156103,1
185971,´3499128624,1


In [None]:
#залишимо лише книги, в яких є (умовно) більше 5 відгуків

In [31]:
nonZeroRatings = nonZeroRatings[nonZeroRatings['Rating'] > 5]
nonZeroRatings

Unnamed: 0,ISBN,Rating
126,0002005018,9
248,0002251760,7
295,0002259001,9
303,0002259834,8
353,0002558122,10
...,...,...
184489,9727591965,6
184514,9727722458,14
184583,9728436408,6
184829,9770390107900,12


In [36]:
nonZeroRatings.rename(columns={"Rating": "MarksCount"}, inplace = 'True')
nonZeroRatings



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,ISBN,MarksCount
126,0002005018,9
248,0002251760,7
295,0002259001,9
303,0002259834,8
353,0002558122,10
...,...,...
184489,9727591965,6
184514,9727722458,14
184583,9728436408,6
184829,9770390107900,12


In [38]:
# об'єднуємо книги із значеннями кількості оцінок, поставлених їм читачати
booksRatings = pd.merge(booksRatings, nonZeroRatings, on='ISBN')
booksRatings

Unnamed: 0,ISBN,Title,Author,Year,Publisher,AvgRating,MarksCount
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,7.666667,9
1,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,7.833333,6
2,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,8.176471,17
3,0440234743,The Testament,John Grisham,1999,Dell,7.704142,169
4,0452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,7.772152,79
...,...,...,...,...,...,...,...
10726,0380978482,"The Princess Diaries (The Princess Diaries, Vo...",Meg Cabot,2000,HarperCollins,7.666667,6
10727,006019250X,The Illustrated Alchemist: A Fable About Follo...,Paulo Coelho,1998,HarperCollins Publishers,9.714286,7
10728,3442350956,Ferne Ufer. Der 3. Band der groÃ?Â?en Highland...,Diana Gabaldon,1999,Blanvalet,9.000000,6
10729,0786890088,The Angel Maker,Ridley Pearson,2001,Hyperion Press,8.333333,6


In [39]:
# найпопулярніші книги (із врахуванням кількості оцінок, поставлених кожній із них)
booksRatings.sort_values(by='AvgRating', ascending=False).head(20)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,AvgRating,MarksCount
2309,1888054557,Postmarked Yesteryear: 30 Rare Holiday Postcards,Pamela E. Apkarian-Russell,2001,Collectors Press,10.0,11
8384,0395282659,The Hobbit,J. R. R. Tolkien,1988,Houghton Mifflin Co,10.0,6
6928,0394831292,"Oh, the Thinks You Can Think! (I Can Read It A...",Dr. Seuss,1975,Random House Children's Books,10.0,6
9543,0679821481,Six by Seuss: A Treasury of Dr. Seuss Classics,Seuss,1991,Random House Children's Books,10.0,6
6264,1571456988,Uncle John's Supremely Satisfying Bathroom Rea...,Bathroom Readers Institute,2001,Bathroom Reader Press,10.0,7
10071,0394800893,The Sneetches and Other Stories,Dr. Seuss,1961,Random House Children's Books,10.0,8
4948,0395193958,The Lord of the Rings (Leatherette Collector's...,J. R. R. Tolkien,1974,Houghton Mifflin Company,10.0,6
8775,089471838X,Natural California: A Postcard Book,Not Applicable (Na ),1990,Running Pr,10.0,7
9677,0836213319,Dilbert: A Book of Postcards,Scott Adams,1996,Andrews McMeel Pub,9.923077,13
10176,0439425220,Harry Potter and the Chamber of Secrets Postca...,J. K. Rowling,2002,Scholastic,9.869565,23


In [41]:
globalMean = booksRatings['AvgRating'].mean()
globalMean

7.684785549466644

In [44]:
# Adjusted raiting function
def calculateAdjustedRaiting(data, globalMean, k):
    data['AdjustedRating'] = (data['AvgRating'] * data['MarksCount'] + globalMean * k)/(data['MarksCount'] + k)
    return data

**порівняння top 20 рейтингу для різних значень k**

In [45]:
# для k = 0.1
booksRatings_k1 = calculateAdjustedRaiting(booksRatings, globalMean, 0.1)
booksRatings_k1.sort_values(by='AdjustedRating', ascending=False).head(20)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,AvgRating,MarksCount,AdjustedRating
2309,1888054557,Postmarked Yesteryear: 30 Rare Holiday Postcards,Pamela E. Apkarian-Russell,2001,Collectors Press,10.0,11,9.979142
10071,0394800893,The Sneetches and Other Stories,Dr. Seuss,1961,Random House Children's Books,10.0,8,9.971417
6264,1571456988,Uncle John's Supremely Satisfying Bathroom Rea...,Bathroom Readers Institute,2001,Bathroom Reader Press,10.0,7,9.967391
8775,089471838X,Natural California: A Postcard Book,Not Applicable (Na ),1990,Running Pr,10.0,7,9.967391
6928,0394831292,"Oh, the Thinks You Can Think! (I Can Read It A...",Dr. Seuss,1975,Random House Children's Books,10.0,6,9.962046
8384,0395282659,The Hobbit,J. R. R. Tolkien,1988,Houghton Mifflin Co,10.0,6,9.962046
9543,0679821481,Six by Seuss: A Treasury of Dr. Seuss Classics,Seuss,1991,Random House Children's Books,10.0,6,9.962046
4948,0395193958,The Lord of the Rings (Leatherette Collector's...,J. R. R. Tolkien,1974,Houghton Mifflin Company,10.0,6,9.962046
9677,0836213319,Dilbert: A Book of Postcards,Scott Adams,1996,Andrews McMeel Pub,9.923077,13,9.905991
10176,0439425220,Harry Potter and the Chamber of Secrets Postca...,J. K. Rowling,2002,Scholastic,9.869565,23,9.860107


In [48]:
# для k = 0.5
booksRatings_k2 = calculateAdjustedRaiting(booksRatings, globalMean, 0.5)
booksRatings_k2.sort_values(by='AdjustedRating', ascending=False).head(20)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,AvgRating,MarksCount,AdjustedRating
2309,1888054557,Postmarked Yesteryear: 30 Rare Holiday Postcards,Pamela E. Apkarian-Russell,2001,Collectors Press,10.0,11,9.899339
10071,0394800893,The Sneetches and Other Stories,Dr. Seuss,1961,Random House Children's Books,10.0,8,9.863811
6264,1571456988,Uncle John's Supremely Satisfying Bathroom Rea...,Bathroom Readers Institute,2001,Bathroom Reader Press,10.0,7,9.845652
8775,089471838X,Natural California: A Postcard Book,Not Applicable (Na ),1990,Running Pr,10.0,7,9.845652
9677,0836213319,Dilbert: A Book of Postcards,Scott Adams,1996,Andrews McMeel Pub,9.923077,13,9.840177
10176,0439425220,Harry Potter and the Chamber of Secrets Postca...,J. K. Rowling,2002,Scholastic,9.869565,23,9.823081
8384,0395282659,The Hobbit,J. R. R. Tolkien,1988,Houghton Mifflin Co,10.0,6,9.821907
9543,0679821481,Six by Seuss: A Treasury of Dr. Seuss Classics,Seuss,1991,Random House Children's Books,10.0,6,9.821907
4948,0395193958,The Lord of the Rings (Leatherette Collector's...,J. R. R. Tolkien,1974,Houghton Mifflin Company,10.0,6,9.821907
6928,0394831292,"Oh, the Thinks You Can Think! (I Can Read It A...",Dr. Seuss,1975,Random House Children's Books,10.0,6,9.821907


In [49]:
# для k = 1
booksRatings_k3 = calculateAdjustedRaiting(booksRatings, globalMean, 1)
booksRatings_k3.sort_values(by='AdjustedRating', ascending=False).head(20)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,AvgRating,MarksCount,AdjustedRating
2309,1888054557,Postmarked Yesteryear: 30 Rare Holiday Postcards,Pamela E. Apkarian-Russell,2001,Collectors Press,10.0,11,9.807065
10176,0439425220,Harry Potter and the Chamber of Secrets Postca...,J. K. Rowling,2002,Scholastic,9.869565,23,9.778533
9677,0836213319,Dilbert: A Book of Postcards,Scott Adams,1996,Andrews McMeel Pub,9.923077,13,9.763199
10071,0394800893,The Sneetches and Other Stories,Dr. Seuss,1961,Random House Children's Books,10.0,8,9.742754
6264,1571456988,Uncle John's Supremely Satisfying Bathroom Rea...,Bathroom Readers Institute,2001,Bathroom Reader Press,10.0,7,9.710598
8775,089471838X,Natural California: A Postcard Book,Not Applicable (Na ),1990,Running Pr,10.0,7,9.710598
4948,0395193958,The Lord of the Rings (Leatherette Collector's...,J. R. R. Tolkien,1974,Houghton Mifflin Company,10.0,6,9.669255
9543,0679821481,Six by Seuss: A Treasury of Dr. Seuss Classics,Seuss,1991,Random House Children's Books,10.0,6,9.669255
6928,0394831292,"Oh, the Thinks You Can Think! (I Can Read It A...",Dr. Seuss,1975,Random House Children's Books,10.0,6,9.669255
8384,0395282659,The Hobbit,J. R. R. Tolkien,1988,Houghton Mifflin Co,10.0,6,9.669255


In [50]:
# для k = 2
booksRatings_k4 = calculateAdjustedRaiting(booksRatings, globalMean, 2)
booksRatings_k4.sort_values(by='AdjustedRating', ascending=False).head(20)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,AvgRating,MarksCount,AdjustedRating
10176,0439425220,Harry Potter and the Chamber of Secrets Postca...,J. K. Rowling,2002,Scholastic,9.869565,23,9.694783
2309,1888054557,Postmarked Yesteryear: 30 Rare Holiday Postcards,Pamela E. Apkarian-Russell,2001,Collectors Press,10.0,11,9.643813
9677,0836213319,Dilbert: A Book of Postcards,Scott Adams,1996,Andrews McMeel Pub,9.923077,13,9.624638
4394,0618002235,"The Two Towers (The Lord of the Rings, Part 2)",J. R. R. Tolkien,1999,Houghton Mifflin Company,9.72,25,9.569243
4521,0060256656,The Giving Tree,Shel Silverstein,1964,HarperCollins Publishers,9.75,20,9.562253
10071,0394800893,The Sneetches and Other Stories,Dr. Seuss,1961,Random House Children's Books,10.0,8,9.536957
3646,0394800389,Fox in Socks (I Can Read It All by Myself Begi...,Dr. Seuss,1965,Random House Children's Books,9.785714,14,9.523098
8775,089471838X,Natural California: A Postcard Book,Not Applicable (Na ),1990,Running Pr,10.0,7,9.485508
6264,1571456988,Uncle John's Supremely Satisfying Bathroom Rea...,Bathroom Readers Institute,2001,Bathroom Reader Press,10.0,7,9.485508
8269,0394823370,The Lorax,Dr. Seuss,1971,Random House Children's Books,9.8,10,9.447464
