# Library Analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from dateutil.parser import parse

In [None]:
df = pd.read_csv("books.csv")

# Data cleaning
# Remove the unnamed index column left over in the CSV; errors='ignore' keeps
# the cell runnable if the file does not have it.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df['title'] = df['title'].str.lstrip()

# 'num ratings' may contain thousands separators (e.g. "1,234"). Casting to str
# first lets the same cleanup run when pandas has already parsed the column as
# numbers, where the .str accessor would otherwise fail.
df['num ratings'] = (
    df['num ratings']
    .astype(str)
    .str.lstrip()
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Unknown page counts become 0. The placeholder is swapped for the text "0"
# (not the number) so the column holds one type until the float conversion.
df['num pages'] = (
    df['num pages']
    .where(df['num pages'] != "unknown", "0")
    .astype(float)
)

# Keep the raw 'rating' text and add a numeric copy in which "Not Rated" is 0.
df['rating_numeric'] = (
    df['rating']
    .where(df['rating'] != "Not Rated", "0")
    .astype(float)
)
df['avg rating'] = df['avg rating'].astype(float)

# My rating minus the public average. A difference against the 0 placeholder
# says nothing about the book, so it is left missing (NaN) for unrated books.
df['difference'] = (
    (df['rating_numeric'] - df['avg rating'])
    .where(df['rating_numeric'] != 0)
)

# Missing or unparseable dates become NaT instead of raising.
df['date pub'] = pd.to_datetime(df['date pub'], errors='coerce')
df['date read'] = pd.to_datetime(df['date read'], errors='coerce')

df.head(5)

In [None]:
# Number of books read / rated / with a reading date entered
# Number of pages read

books_read = len(df.index)
# 'date read' has already been converted with pd.to_datetime(errors='coerce'),
# so a missing date is NaT; comparing against the text '-' would never match.
no_date_entered = int(df['date read'].isna().sum())
with_date = books_read - no_date_entered
no_rating = int((df['rating'] == 'Not Rated').sum())
with_rating = books_read - no_rating
total_pages = df['num pages'].sum()
# Unknown page counts are stored as 0; leave those books out so they do not
# drag the average down.
average_pages = df.loc[df['num pages'] > 0, 'num pages'].mean()

print(
f"The number of books read is {books_read}"
f"\nOut of {books_read} books, {with_rating} have been rated and {no_rating} have not."
f"\nOut of {books_read} books, for {with_date}, the reading date has been entered and for {no_date_entered} it has not."
f"\nThe total number of pages read is {total_pages}"
f"\nThe average number of pages per book is {average_pages}")

In [None]:
# Books that were given the full five stars
top_books = df.loc[df['rating_numeric'].eq(5)]
top_books

In [None]:
# The five books with the earliest publication date
df.sort_values('date pub', ascending=True).head()

In [None]:
# The five books with the most recent publication date
df.sort_values('date pub', ascending=False).head()

In [None]:
# Distribution of my ratings; books with the placeholder rating 0 are left out
df_only_rated = df[df['rating_numeric'] != 0]
df_only_rated['rating_numeric'].plot.hist()

In [None]:
# Distribution of the number of pages per book
df['num pages'].plot.hist()

In [None]:
# Favourite writers: authors ranked by how many of their books appear in the list
writers = df.groupby('author')['author'].count()
best_writers = writers.sort_values(ascending=False)
best_writers.head()

In [None]:
# Favourite writers - average rating I gave per author.
# Uses the numeric rating column (the raw 'rating' column is text such as
# "Not Rated" and cannot be averaged) and skips unrated books, which are
# stored as 0 and would otherwise pull the averages down.
(
    df[df['rating_numeric'] > 0]
    .groupby('author')['rating_numeric']
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Favourite writers - average number of pages per author.
# Unknown page counts are stored as 0, so those books are left out of the mean.
(
    df[df['num pages'] > 0]
    .groupby('author')['num pages']
    .mean()
    .sort_values(ascending=False)
)

### Comparison: your ratings vs. the public's

In [None]:
# Books I rated much higher than other readers did (largest difference first)
df.sort_values('difference', ascending=False).head()

In [None]:
# Boeken waarover je veel negatiever bent
df = df.drop(df[df.rating_numeric == 0].index)
df.sort_values(by='difference').head(5)

In [None]:
# Mean difference between my ratings and those of other readers
mean_difference = df['difference'].apply(abs).mean()

In [None]:
# When were the books read? Histogram of the year taken from 'date read'.

## By year
year_read = df['date read'].dt.year

n, bins, patches = plt.hist(year_read, bins='auto', color='#0504aa')
plt.gca().grid(axis='y')
plt.gca().set(
    xlabel='Year',
    ylabel='Number of Books',
    title='When did I read most books?',
)
# Height of the tallest bar
maxfreq = n.max()

In [None]:
# When were the books published? Histogram of the year taken from 'date pub'.
year_pub = df['date pub'].dt.year

n, bins, patches = plt.hist(year_pub, bins='auto', color='#0504aa')
plt.gca().grid(axis='y')
plt.gca().set(
    xlabel='Year',
    ylabel='Number of Books',
    title='Date Published of Read Books',
)
# Height of the tallest bar
maxfreq = n.max()