# Book ratings EDA

## 0. Set up

Libraries

In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import datetime

# EDA functions
from EDA_functions import ratings_cleaning_steps

# import requests
# from bs4 import BeautifulSoup
# from PIL import Image
# from io import BytesIO

Project folder

In [2]:
# Folder the notebook is executed from (presumably the GitHub checkout, going by the name)
github_folder = os.getcwd()
# Parent of that folder; Books_rating.csv is read from here
main_folder = os.path.split(github_folder)[0]

## 1. Import data

In [3]:
# Load the raw ratings file from the folder above the working directory
ratings_csv_path = os.path.join(main_folder, 'Books_rating.csv')
amz_ratings = pd.read_csv(ratings_csv_path)

## 2. EDA

Explore data types and head

In [4]:
# Print column names, dtypes and memory usage of the raw ratings frame
amz_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  object 
 1   Title               object 
 2   Price               float64
 3   User_id             object 
 4   profileName         object 
 5   review/helpfulness  object 
 6   review/score        float64
 7   review/time         int64  
 8   review/summary      object 
 9   review/text         object 
dtypes: float64(2), int64(1), object(7)
memory usage: 228.9+ MB


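All columns appear to have a suitable dtype except review/time, which is stored as an integer and would need to be converted to datetime before it can be used as a date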

In [5]:
# Preview the first rows to see what each column actually contains
amz_ratings.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


-> Questions/Notes:
- what is review/helpfulness? helpfulness rating of the review 
    - this doesn't seem relevant for our analysis
- price is NA for some observations
    - how many?
    - could this be an issue for our analysis?
- what is review/time? the time at which the review was given
    - doesn't seem relevant for our analysis, but it will need to be converted to datetime if we want to use it

### 2.1 Title as identifier of the ratings

How many titles are available?

In [6]:
# Count distinct titles (a missing title, if present, counts as one distinct value)
n_titles = len(amz_ratings['Title'].unique())
print(f"{n_titles:,}")

212,404


In [7]:
# Normalise the case of every title so that the same book written with different
# capitalisation is not counted as several titles.
# NOTE(review): text_in_lower_case is defined in EDA_functions.ratings_cleaning_steps and
# is not visible here; it is assumed to return the lower-cased 'Title' column — confirm.
amz_ratings['Title'] = ratings_cleaning_steps.text_in_lower_case(amz_ratings, 'Title')

In [8]:
# Count distinct titles again now that they are all in lower case
n_titles = len(amz_ratings['Title'].unique())
print(f"{n_titles:,}")

209,457


-> Questions: does this number match the number of titles available in the amazon_books dataset?

### 2.2 Missing values

Which columns contain missing values?

In [9]:
# Number of missing (NaN) values in each column
amz_ratings.isna().sum()

Id                          0
Title                     208
Price                 2518829
User_id                561787
profileName            561905
review/helpfulness          0
review/score                0
review/time                 0
review/summary            407
review/text                 8
dtype: int64

-> Questions/notes:
- there are some entries with a missing title
    - these need to be dropped
- price is missing for many entries 
    - is this problematic for our analysis?
    - is it possible to fill this information with other prices for the same title?
- not a big issue if user id and profile names are missing
- good that score is not missing for any title
- missing review summary and text should not be an issue

#### 2.2.1 Title

There are 208 reviews with a missing title.
These will be dropped as it won't be possible to match them with the other dataset

In [10]:
# Drop the reviews that have no title, using the helper written for this EDA.
# NOTE(review): remove_na_titles is defined in EDA_functions.ratings_cleaning_steps, not in
# this notebook; it is assumed to return the frame without the NaN-title rows — confirm.
amz_ratings = ratings_cleaning_steps.remove_na_titles(
    amz_ratings, 
    'Title')

In [11]:
# Check that na titles have been removed.
# Series.count() only counts non-null values, so counting 'Title' on the rows selected
# *because* their title is null returns 0 even when such rows are still present.
# Summing the null mask counts the remaining missing titles directly (expected: 0).
amz_ratings['Title'].isna().sum()

0

In [12]:
# Count distinct titles after the reviews without a title were dropped
n_titles = len(amz_ratings['Title'].unique())
print(f"{n_titles:,}")

209,456


-> Questions/Note:
- issue sorted

#### 2.2.2 Price
- How many titles have at least one review with a missing price?
- How many titles have only NA prices?

In [20]:
def aggregate_df(
            df,
            group_by_columns,
            col_to_aggregate,
            operation,
            new_name_for_col_aggregated):
    """Aggregate one column per group and return the result as a flat DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate.
    group_by_columns : str or list of str
        Column(s) passed to ``df.groupby``.
    col_to_aggregate : str
        Column whose values are aggregated within each group.
    operation : {'average', 'median', 'count'}
        Aggregation to apply: mean, median, or count of non-null values.
    new_name_for_col_aggregated : str
        Name given to the aggregated column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping column(s) as regular columns
        and the aggregated column renamed to ``new_name_for_col_aggregated``.

    Raises
    ------
    ValueError
        If ``operation`` is not one of the supported names. The previous
        version silently returned ``None`` in that case, which only surfaced
        later as an unrelated error.
    """
    # Maps the public operation name to the groupby method that implements it
    groupby_method_for = {
        'average': 'mean',
        'median': 'median',
        'count': 'count',
    }
    if operation not in groupby_method_for:
        raise ValueError(
            f"Unsupported operation {operation!r}; "
            f"expected one of {sorted(groupby_method_for)}"
        )

    grouped_column = df.groupby(group_by_columns)[col_to_aggregate]
    aggregated = getattr(grouped_column, groupby_method_for[operation])()

    return pd.DataFrame(aggregated).reset_index().rename(
        columns={col_to_aggregate: new_name_for_col_aggregated})

# Number of reviews per title, using the aggregate_df helper defined above in this cell
count_ratings = aggregate_df(
    df = amz_ratings,
    group_by_columns = 'Title',
    col_to_aggregate = 'review/score',
    operation = 'count',
    new_name_for_col_aggregated= 'reviews number'
)

In [25]:
# Number of ratings per title, computed with the shared helper from the EDA module.
# ratings_cleaning_steps is already imported in the set-up cell; importing it again here
# does not reload an already-imported module, so the redundant import was removed.
# NOTE(review): the NameError recorded for this cell ("name 'pd' is not defined") most
# likely comes from EDA_functions/ratings_cleaning_steps.py using `pd` without importing
# pandas. That has to be fixed in the module itself, followed by a kernel restart
# (or importlib.reload) so the notebook picks up the change — confirm.
count_ratings = ratings_cleaning_steps.aggregate_df(
    df = amz_ratings,
    group_by_columns = 'Title',
    col_to_aggregate = 'review/score',
    operation = 'count',
    new_name_for_col_aggregated= 'reviews number'
)

NameError: name 'pd' is not defined

In [None]:
# Build per-title summaries: number of reviews, number of reviews without a price,
# share of reviews without a price, and (separately) the average rating.

# 1. Number of ratings per title
# The call to ratings_cleaning_steps.aggregate_df that used to open this cell was removed:
# its result was overwritten straight away by this statement, and the same call is
# recorded as raising NameError, which would stop the rest of the cell from running.
count_ratings = (
    amz_ratings.groupby('Title')['review/score']
    .count()
    .reset_index(name='reviews number')
)

# 2. Number of ratings with a missing price, per title
# (titles that never have a missing price do not appear in this frame)
na_prices = (
    amz_ratings[amz_ratings['Price'].isna()]
    .groupby('Title')['Id']
    .count()
    .reset_index(name='missing prices')
)

# 3. Average rating per title
avg_ratings = (
    amz_ratings.groupby('Title')['review/score']
    .mean()
    .reset_index(name='average rating')
)

# Left merge keeps every title, including those without any missing price
ratings_per_title_and_na_prices = pd.merge(
    count_ratings,
    na_prices,
    on='Title',
    how='left'
)

# Titles with no missing price have NaN in 'missing prices' after the merge; replace with 0
ratings_per_title_and_na_prices['missing prices'] = (
    ratings_per_title_and_na_prices['missing prices'].fillna(0)
)

# Share (between 0 and 1) of each title's reviews that have a missing price
ratings_per_title_and_na_prices['percentage missing prices'] = (
    ratings_per_title_and_na_prices['missing prices']
    / ratings_per_title_and_na_prices['reviews number']
)

In [None]:
# Histogram of titles by share of reviews with a missing price.
#
# The bar heights and the bar labels are now derived from ONE binning. Previously the bars
# were binned by plotly (only the bin size was given, so plotly chose the edges itself)
# while the labels came from np.histogram with 11 equal-width bins over the data range,
# i.e. different edges — label i was not guaranteed to describe bar i.

bin_size = 0.1  # expected to divide 1 evenly
bins_per_unit = round(1 / bin_size)
# One extra bin so that titles with 100% missing prices fall in their own bar
n_bins = bins_per_unit + 1

# Edges 0.0, 0.1, ..., 1.1; integer / integer avoids accumulating float error on the edges
bin_edges = np.arange(n_bins + 1) / bins_per_unit
bin_widths = np.diff(bin_edges)
bin_centres = bin_edges[:-1] + bin_widths / 2

share_missing = ratings_per_title_and_na_prices['percentage missing prices']
total_titles = ratings_per_title_and_na_prices['Title'].count()

# Number of titles per bin, and the same expressed as a share of all titles for the labels
counts, _ = np.histogram(share_missing, bins=bin_edges)
formatted_counts = [f"{count / total_titles:.2%}" for count in counts]

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=bin_centres,
        y=counts,
        width=bin_widths,
        text=formatted_counts,
        textposition='auto',
        name="% of ratings with missing price")
)

fig.update_layout(
    title_text="Distribution of book titles based on % of ratings with missing price",
    title_font_size=20,
    title_x=0.5,
    title_y=0.95,
    yaxis_title="Number of titles",
    xaxis_title = "% of ratings with missing price",
    annotations=[
        dict(
            x=0,
            y=-0.2,
            xref="paper",
            yref="paper",
            text=f"Total number of books is {total_titles:,}",
            showarrow=False,
            font=dict(size=12)
        )
    ]
)

fig.show()

-> Questions/Note:
- we may not be able to use price in the model

### 2.3 Book ratings

#### 2.3.1 Distribution of books based on the number of ratings they received

In [None]:
# Keep only the titles whose number of ratings is at or below the 75th percentile,
# so the bulk of the distribution can be inspected without the long upper tail
perc_75 = count_ratings['reviews number'].quantile(0.75)
at_or_below_perc_75 = count_ratings['reviews number'] <= perc_75
no_outliers_count_ratings = count_ratings[at_or_below_perc_75]

In [None]:
# Two panels side by side: every title on the left, and on the right only the titles
# at or below the 75th percentile of number of ratings
panels = [
    (count_ratings['reviews number'],
     'Number of ratings per title'),
    (no_outliers_count_ratings['reviews number'],
     'Number of ratings per title (excluding outliers)'),
]

fig = make_subplots(rows=1, cols=2)

for col_number, (values, label) in enumerate(panels, start=1):
    fig.add_trace(
        go.Box(y=values, name=label, showlegend=False),
        row=1,
        col=col_number,
    )

# Figure title and y-axis label
fig.update_layout(
    title_text="Boxplots of number of ratings per book",
    title_font_size=24,
    title_x=0.5,
    title_y=0.95,
    yaxis_title="Number of Ratings",
)

fig.show()


-> Questions/notes:
Few books have a high number of ratings, while 75% of the books have between 1 and 8 ratings.
Question: do we want to take only those books with high number of ratings?
For example below is the number of books with at least 100 reviews:

In [None]:
# Titles that received 100 ratings or more
books_100_plus = count_ratings.loc[
    count_ratings['reviews number'] >= 100, 'reviews number'
].count()
print(f"Number of books with at least 100 ratings:{books_100_plus: ,}")

#### 2.3.2 Distribution of the average rating

In [None]:
# Histogram of titles by average rating.
#
# Bars and labels are now derived from ONE binning. Previously the bars were binned by
# plotly (nbinsx is only an upper bound on the number of bins plotly may choose) while
# the labels came from np.histogram with exactly n_bins bins, so a label could end up on
# a bar covering a different range.

# Number of equal-width bins between the lowest and highest average rating
n_bins = 5

counts, bin_edges = np.histogram(
    avg_ratings['average rating'],
    bins=n_bins)
bin_widths = np.diff(bin_edges)
bin_centres = bin_edges[:-1] + bin_widths / 2

# Format the counts with thousands separator
formatted_counts = [f"{count:,}" for count in counts]

# One bar per bin, labelled with the number of titles in that bin
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=bin_centres,
        y=counts,
        width=bin_widths,
        text=formatted_counts,
        textposition='auto',
        name="avg rating distribution")
)

# Update layout
fig.update_layout(
    title_text="Distribution of book titles based on average rating",
    title_font_size=20,
    title_x=0.5,
    title_y=0.95,
    yaxis_title="Number of titles",
    xaxis_title="Average rating",
    annotations=[
        dict(
            x=0,
            y=-0.2,
            xref="paper",
            yref="paper",
            text=f"Total number of books is {avg_ratings['Title'].count():,}",
            showarrow=False,
            font=dict(size=12)
        )
    ]
)

# Show the figure
fig.show()


-> Questions/notes:
- most books have an average rating between 4 and 5
    - could this skewed distribution impact our analysis?