In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from EDA_functions import books_cleaning_steps

import datetime

# EDA functions
from EDA_functions import ratings_cleaning_steps


In [2]:
# The notebook's working directory (presumably the repository folder) and its
# parent; the CSV inputs are read from the parent.
github_folder = os.getcwd()
main_folder = os.path.split(github_folder)[0]

In [3]:
# Read the ratings/reviews table from the folder above the working directory.
ratings_csv_path = os.path.join(main_folder, 'Books_rating.csv')
amz_ratings = pd.read_csv(ratings_csv_path)

In [4]:
amz_ratings.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


Questions/notes\
How is the 'review/summary' field produced?\
Is this Amazon doing some NLP?

In [5]:
amz_ratings["review/summary"][0]

'Nice collection of Julie Strain images'

In [6]:
amz_ratings["review/text"][0]

"This is only for Julie Strain fans. It's a collection of her photos -- about 80 pages worth with a nice section of paintings by Olivia.If you're looking for heavy literary content, this isn't the place to find it -- there's only about 2 pages with text and everything else is photos.Bottom line: if you only want one book, the Six Foot One ... is probably a better choice, however, if you like Julie like I like Julie, you won't go wrong on this one either."

In [7]:
# Read the book metadata table from the folder above the working directory.
books_csv_path = os.path.join(main_folder, 'books_data.csv')
amz_books = pd.read_csv(books_csv_path)

In [8]:
amz_books.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,


### Reviewing Megan and Margherita's EDA pre meet-up

In [9]:
print(f"{amz_ratings['Title'].unique().shape[0]:,}")

212,404


In [10]:
print(f"{amz_books['Title'].unique().shape[0]:,}")

212,404


-> There are the same number of unique book titles in each df

In [11]:
amz_books.isnull().sum()

Title                 1
description       68442
authors           31413
image             52075
previewLink       23836
publisher         75886
publishedDate     25305
infoLink          23836
categories        41199
ratingsCount     162652
dtype: int64

In [12]:
amz_ratings.isnull().sum()

Id                          0
Title                     208
Price                 2518829
User_id                561787
profileName            561905
review/helpfulness          0
review/score                0
review/time                 0
review/summary            407
review/text                 8
dtype: int64

Unique titles from each df combined, to verify they are the same unique titles in each df!

In [13]:
# Distinct 'Title' values of each frame (a missing title counts as one value).
unique_titles_amz_books = amz_books['Title'].unique()
unique_titles_amz_ratings = amz_ratings['Title'].unique()

# Stack both sets of titles into one Series; ignore_index gives the result a
# continuous 0..n-1 index.
combined_titles = pd.concat(
    [pd.Series(unique_titles_amz_books), pd.Series(unique_titles_amz_ratings)],
    axis=0,
    ignore_index=True,
)

In [14]:
print(f"{combined_titles.unique().shape[0]:,}")

212,404


In [15]:
combined_titles.isnull().sum()

2

# Books EDA

In [16]:
amz_books.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,


In [17]:
amz_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Title          212403 non-null  object 
 1   description    143962 non-null  object 
 2   authors        180991 non-null  object 
 3   image          160329 non-null  object 
 4   previewLink    188568 non-null  object 
 5   publisher      136518 non-null  object 
 6   publishedDate  187099 non-null  object 
 7   infoLink       188568 non-null  object 
 8   categories     171205 non-null  object 
 9   ratingsCount   49752 non-null   float64
dtypes: float64(1), object(9)
memory usage: 16.2+ MB


In [18]:
amz_books.shape

(212404, 10)

In [19]:
amz_books.isnull().sum()

Title                 1
description       68442
authors           31413
image             52075
previewLink       23836
publisher         75886
publishedDate     25305
infoLink          23836
categories        41199
ratingsCount     162652
dtype: int64

In [20]:
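# 'ratingsCount' is assumed to be the average rating for each title - TODO confirm (the column name suggests it may instead be the number of ratings)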

In [21]:
print(f"{amz_books['Title'].unique().shape[0]:,}")

212,404


In [22]:
print(f"{amz_books['authors'].unique().shape[0]:,}")

127,279


In [23]:
print(f"{amz_books['image'].unique().shape[0]:,}")

149,388


In [24]:
print(f"{amz_books['publisher'].unique().shape[0]:,}")

16,017


In [25]:
amz_books.shape

(212404, 10)

In [26]:
# Convert the 'publishedDate' column to datetime; values that cannot be parsed
# become NaT (errors='coerce').
# NOTE(review): the outputs show 187,099 non-null dates before this step and
# only 90,951 after it. The column appears to mix 'YYYY', 'YYYY-MM' and
# 'YYYY-MM-DD' strings, so valid dates are probably being coerced to NaT -
# confirm before relying on the filtered frame.
amz_books['publishedDate'] = pd.to_datetime(amz_books['publishedDate'], errors='coerce')

# Keep rows that have a parsed date and no '?' placeholder in their text form.
# The pattern is a raw string: '\?' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
has_date = amz_books['publishedDate'].notna()
has_placeholder = amz_books['publishedDate'].astype(str).str.contains(r'\?', na=False)
filtered_amz_books = amz_books[has_date & ~has_placeholder]


In [27]:
filtered_amz_books['publishedDate'].min()

Timestamp('1679-01-01 00:00:00')

In [28]:
filtered_amz_books['publishedDate'].mean()

Timestamp('1983-12-14 23:35:05.709667776')

In [29]:
filtered_amz_books['publishedDate'].max()

Timestamp('2025-01-01 00:00:00')

In [30]:
filtered_amz_books.shape

(90951, 10)

In [31]:
def clean_date(
    df,
    publishedDate_column: str
):
    """Return the rows of ``df`` whose published date can be parsed.

    The date column of the returned frame holds datetime values. The frame
    passed in is left unchanged, so the call can be repeated safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the date column.
    publishedDate_column : str
        Name of the column holding the publication dates.

    Returns
    -------
    pandas.DataFrame
        A new frame with only the rows whose date was parsed, keeping the
        original columns and index labels. Rows whose date is missing or
        cannot be parsed (for example placeholders such as '20??') are
        dropped.
    """
    raw_dates = df[publishedDate_column]

    if pd.api.types.is_datetime64_any_dtype(raw_dates):
        # Already datetime (e.g. the column was converted earlier): nothing
        # left to parse, only missing values need to be filtered out.
        parsed_dates = raw_dates
    else:
        # Parse each distinct value on its own. Converting the whole column in
        # a single call can make pandas settle on one format taken from the
        # first value and coerce every other layout ('YYYY' vs 'YYYY-MM' vs
        # 'YYYY-MM-DD') to NaT, silently discarding valid dates.
        parsed_by_value = {
            value: pd.to_datetime(value, errors='coerce')
            for value in raw_dates.dropna().unique()
        }
        # Values absent from the lookup (missing dates) map to NaN -> NaT.
        parsed_dates = pd.to_datetime(raw_dates.map(parsed_by_value), errors='coerce')

    # Unparseable text, including '?' placeholders, is NaT at this point, so a
    # single not-null mask is enough.
    has_date = parsed_dates.notna()

    # assign() builds a new frame, so the caller's column is not overwritten.
    return df.loc[has_date].assign(**{publishedDate_column: parsed_dates[has_date]})

In [32]:
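# The function works when defined in the notebook, but calling it from the .py file failed with:
#   NameError: name 'pd' is not defined
# (presumably the module was missing its own `import pandas as pd` - TODO confirm)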
filtered_amz_books_2 = clean_date(
    amz_books,
    'publishedDate'   
)

In [33]:
filtered_amz_books_2.shape

(90951, 10)

In [34]:
# checking that clean_date function works
filtered_amz_books_3 = books_cleaning_steps.clean_date(
    amz_books,
    'publishedDate'   
)

In [40]:
filtered_amz_books_3.shape

(90951, 10)

In [35]:
# amz_books_drop_nan_year = amz_books.dropna(how="any", subset=["publishedDate_year"])

In [36]:
212404 - 25305

187099

In [37]:
187099 - 45 - 3

187051

In [38]:
# amz_books_drop_nan_year = amz_books_drop_nan_year.drop(amz_books_drop_nan_year[amz_books_drop_nan_year['publishedDate_year'] == '20??'].index)

In [39]:
amz_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Title          212403 non-null  object        
 1   description    143962 non-null  object        
 2   authors        180991 non-null  object        
 3   image          160329 non-null  object        
 4   previewLink    188568 non-null  object        
 5   publisher      136518 non-null  object        
 6   publishedDate  90951 non-null   datetime64[ns]
 7   infoLink       188568 non-null  object        
 8   categories     171205 non-null  object        
 9   ratingsCount   49752 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 16.2+ MB
