In [None]:
#####################
# LIBRARIES IMPORTS #
#####################

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

# Loading the data into Pandas' Dataframe

The dataset provided is composed of a "lighter_authors.json" file of about 0.5 GB and a "lighter_books.json" file of about 15 GB. Since loading involves a data-type conversion, the dataset becomes even bigger once it is loaded into pandas, and it may not fit on machines with a limited amount of RAM. We can approach this problem from two sides:
* Divide the dataset into chunks, work on one chunk at a time and merge the results.
* For every request, extract only the columns we are interested in.

Both of these approaches are slow: we have to read every part of the dataset from storage and load it into RAM for every exercise, which considerably increases the time needed to execute each query. Instead, we try to load everything at once, making the dataset lighter by removing the columns that are useless for our analysis and, where possible, changing the data type of the useful columns to lighter versions.

## Authors dataset

In [None]:
# Read the authors file (line-delimited JSON: one record per line) into a DataFrame
authors = pd.read_json(
    "datasets/lighter_authors.json",
    lines=True,
)

In [None]:
# Preview the first rows of the authors frame to see which columns were loaded
# and what their values look like
authors.head()

In [None]:
# Summary of the authors frame: column names, non-null counts and dtypes
authors.info()

In [None]:
# Per-column memory footprint in bytes. deep=True also measures the Python objects
# (e.g. strings) stored in object columns; index=True adds the index to the report.
raw_authors_memory_usage = authors.memory_usage(deep=True, index=True)
raw_authors_memory_usage

In [None]:
# Report the total memory footprint of the raw authors frame and the share taken
# by the "about" column.
bytes_per_gib = 1024 ** 3  # same value as the literal 1073741824
authors_total_bytes = raw_authors_memory_usage.sum()
about_share = raw_authors_memory_usage["about"] / authors_total_bytes

print("The dataset just as imported uses", round(authors_total_bytes / bytes_per_gib, 2), "GBs of RAM!" )
# Scale to a percentage *before* rounding: round(ratio, 2) * 100 leaks floating-point
# noise into the output (e.g. 56.99999999999999 instead of 57).
print("The 'about' column covers", round(about_share * 100), "% of the total RAM usage alone!")

The columns "image_url" and "about" are useless for our analysis so they can be removed.

In [None]:
# Remove the columns that are not needed for the analysis.
# Reassigning the result (instead of inplace=True) together with errors="ignore"
# makes the cell safe to re-run: once the columns are gone a second run is a no-op
# rather than a KeyError.
authors = authors.drop(columns=["image_url", "about"], errors="ignore")
print("The dataset now uses", round(authors.memory_usage(index = True, deep = True).sum() / 1073741824, 2), "GBs of RAM!" )

[TODO] CHANGE DATA TYPES?

## Books

The books dataset is much bigger than the authors one and we can't load it in one go; we have to split it into chunks. First, we analyze what can be done with it by observing only some rows.

In [None]:
# Read only the first 10000 records of the books file (line-delimited JSON)
# so the structure can be inspected without loading the whole file
books = pd.read_json(
    "datasets/lighter_books.json",
    nrows=10000,
    lines=True,
)

In [None]:
# Preview the first rows of the sampled books chunk
books.head()

In [None]:
# Summary of the sampled books chunk: column names, non-null counts and dtypes
books.info()

In [None]:
# Per-column memory footprint (bytes) of the sampled chunk. deep=True also measures the
# Python objects stored in object columns; index=True adds the index to the report.
raw_books_memory_usage = books.memory_usage(deep=True, index=True)
raw_books_memory_usage

In [None]:
# Report the memory footprint of the sampled books chunk and the share taken by the
# "description" column.
bytes_per_gib = 1024 ** 3  # same value as the literal 1073741824
books_total_bytes = raw_books_memory_usage.sum()
description_share = raw_books_memory_usage["description"] / books_total_bytes

print("The dataset just as imported uses", round(books_total_bytes / bytes_per_gib, 2), "GBs of RAM!" )
# The measured column is "description" (the message used to say 'about', copied from the
# authors cell). Scale to a percentage before rounding to avoid floating-point noise
# such as 56.99999999999999.
print("The 'description' column covers", round(description_share * 100), "% of the total RAM usage alone!")

The scenario is similar to the authors dataset: there is a column of long text descriptions that occupies a large amount of memory and is useless to us. We remove it together with other useless columns such as "image_url", "isbn", "isbn13", "asin" [TODO].

In [None]:
# Remove the columns that are not needed for the analysis from the sampled chunk.
# Reassigning the result (instead of inplace=True) together with errors="ignore"
# makes the cell safe to re-run: once the columns are gone a second run is a no-op
# rather than a KeyError.
books = books.drop(
    columns=["isbn", "isbn13", "asin", "edition_information", "publisher", "image_url", "description", "shelves"],
    errors="ignore",
)
print("The dataset now uses", round(books.memory_usage(index = True, deep = True).sum() / 1073741824, 2), "GBs of RAM!" )

Now we try to load the whole books dataset, chunk by chunk, removing the useless parts.

In [None]:
# Load the whole books file chunk by chunk, dropping the unused columns from every
# chunk before it is kept.
chunk_size = 100000
col_to_drop = ["isbn", "isbn13", "asin", "edition_information", "image_url", "publisher", "shelves", "description"]

# Collect the trimmed chunks and concatenate them once at the end: calling pd.concat
# inside the loop copies everything accumulated so far on every iteration, which is
# quadratic in the number of chunks.
trimmed_chunks = []
# The reader is used as a context manager so the underlying file handle is closed
# when the loop finishes (or fails).
with pd.read_json("datasets/lighter_books.json", lines = True, chunksize = chunk_size) as chunks:
    for chunk in chunks:
        trimmed_chunks.append(chunk.drop(columns = col_to_drop))

# pd.concat raises on an empty list, so fall back to an empty frame in that case
books = pd.concat(trimmed_chunks) if trimmed_chunks else pd.DataFrame()
    

In [None]:
# Summary of the full (trimmed) books frame: column names, non-null counts and dtypes
books.info()

In [None]:
# Per-column memory footprint (bytes) of the full books frame. deep=True also measures
# the Python objects stored in object columns; index=True adds the index to the report.
books_memory_usage = books.memory_usage(deep=True, index=True)
books_memory_usage


In [None]:
# Total memory footprint of the books frame, converted from bytes to GiB
books_total_gib = books_memory_usage.sum() / 1073741824
print("The dataset uses", round(books_total_gib, 2), "GBs of RAM!" )

# [RQ1] Exploratory Data Analysis (EDA)

TODO

In the Authors dataset what's the difference between "book" and "work"?

The Books dataset has some empty-string entries in the num_pages column.

# [RQ2] Let’s finally dig into this vast dataset, retrieving some vital information:

**Request 2.1:** Plot the number of books for each author in descending order.

**Request 2.2:**  Which book has the highest number of reviews?

In [None]:
# Book(s) with the highest number of text reviews.
# Series.max() is vectorised and skips missing values, whereas the builtin max()
# iterates over the column element by element in Python.
max_text_reviews = books["text_reviews_count"].max()
books[books["text_reviews_count"] == max_text_reviews]

**Request 2.3:** Which are the top ten and ten worst books concerning the average score?

In [None]:
# The ten rows with the highest average_rating
books.nlargest(n=10, columns="average_rating")

In [None]:
# The ten rows with the lowest average_rating
books.nsmallest(n=10, columns="average_rating")

**Request 2.4:** Explore the different languages in the book’s dataset, providing a proper chart summarizing how these languages are distributed throughout our virtual library.

**Request 2.5:** How many books have more than 250 pages?

In [None]:
# Look for the first num_pages entry that is not a plain Python int and, if there
# is one, show the value together with its type.
_no_match = object()  # sentinel: distinguishes "nothing found" from any real value
first_non_int = next(
    (value for value in books["num_pages"] if type(value) is not int),
    _no_match,
)

if first_non_int is not _no_match:
    print("TROVATO")
    print(first_non_int)
    print(type(first_non_int))

In [None]:
# Request 2.5: number of books with more than 250 pages.
# num_pages contains some empty strings, so a direct "> 250" comparison is not
# reliable; coerce the column to numbers first. Entries that cannot be parsed become
# NaN and never satisfy the comparison.
# (The earlier draft called .shape(), but shape is an attribute, not a method.)
num_pages_numeric = pd.to_numeric(books["num_pages"], errors="coerce")
int((num_pages_numeric > 250).sum())

**Request 2.6:** Plot the distribution of the fans count for the 50 most prolific authors (the ones who have written more books).

# [RQ3] Let’s have a historical look at the dataset!

# [RQ4] Quirks questions about consistency. In most cases, we will not have a consistent dataset, and the one we are dealing with is no exception. So, let's enhance our analysis.

# [RQ5] We can consider the authors with the most fans to be influential. Let’s have a deeper look.


*Provide information about the general response from readers (number of fans, average rating, number of reviews, etc.), divide the authors by gender, and comment about anything eventually related to “structural bias.”*

**Request 5.1.a:** Plot the top 10 most influential authors regarding their fan count and number of books.

In [None]:
# Number of works credited to every author, taken as the length of the work_ids list.
# NOTE(review): the earlier comment spoke of "book ids" while the code reads work_ids;
# confirm which of the two columns should define an author's book count.
authors["book_count"] = authors["work_ids"].apply(len)

# The ten most influential authors, i.e. the ones with the largest fan count.
# (The name x is kept because later cells read x["id"].)
x = authors.nlargest(10, "fans_count")

# One figure with two panels sharing the author axis, each with its own title and
# axis label so the chart can be read on its own.
fig, (ax_fans, ax_books) = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

ax_fans.barh(x["name"], x["fans_count"], color='green')
ax_fans.set_title("Top 10 authors by fan count")
ax_fans.set_xlabel("Fan count")
ax_fans.set_ylabel("Author")
# barh draws the first row at the bottom; invert so the author with most fans is on top
# (the y axis is shared, so this applies to both panels)
ax_fans.invert_yaxis()

ax_books.barh(x["name"], x["book_count"], color='blue')
ax_books.set_title("Number of books of the same authors")
ax_books.set_xlabel("Book count")

fig.tight_layout()
plt.show()

**Request 5.1.b:** Who is the most influential author?

In [None]:
# Most influential author: the single row with the largest fan count
m_inf = authors.nlargest(1, "fans_count")

# m_inf["name"] is a one-row Series; printing it would also show the row index and the
# "Name: ..., dtype: ..." footer, so extract the scalar name instead.
print("The most influential author, having by far the largest fanbase is:", m_inf["name"].iloc[0])

**Request 5.2:** Have they published any series of books? If any, extract the longest series name among these authors.

In [None]:
# Books written by the ten most-followed authors (x holds those authors).
# .copy() makes this an independent frame: later cells add columns to it, which on a
# filtered view would raise SettingWithCopyWarning.
top_authors_books = books[books["author_id"].isin(x["id"])].copy()

# Use the highest "series position" reached as the length of a series.
# NOTE(review): series_position may be stored as text in the raw data (confirm). Coerce
# it to numbers so that max() compares values numerically (as text, "9" > "10");
# entries that are not a plain number become NaN and are treated as "not in a series".
series_books = top_authors_books.assign(
    series_position=pd.to_numeric(top_authors_books["series_position"], errors="coerce")
).dropna(subset=["series_position"])

# Authors that have at least one book with a series position, with their highest position
author_series_lengths = series_books.groupby(["author_id", "author_name"])["series_position"].max().reset_index()
# Highest position per (author, series); the overall maximum identifies the longest series
author_series = series_books.groupby(["author_id", "author_name", "series_name"])["series_position"].max().reset_index()
max_series = author_series[author_series["series_position"] == author_series["series_position"].max()]

# to_string(index=False) prints only the values, without the index and dtype footer
print("Authors of series are:")
print(author_series_lengths["author_name"].to_string(index=False))

print("\nThe author of the longest series")
print(max_series["author_name"].to_string(index=False))
print("And the series is:")
print(max_series["series_name"].to_string(index=False))


**Request 5.2 (continued):** How many of these authors have been published in different formats? Provide a meaningful chart on the distribution of the formats and comment on it.

In [None]:
# How many of the top 10 authors have been published in more than one format?

# Keep only the rows with a usable format value.
# NOTE(review): a missing format may be stored as NaN or as an empty string; both are
# excluded here. Confirm against the data.
book_formats = top_authors_books["format"]
has_format = book_formats.notna() & (book_formats.astype(str).str.strip() != "")
books_with_format = top_authors_books[has_format]

# An author counts as "published in different formats" when their books span at least
# two distinct formats. (duplicated(subset=["author_id"]) only detected authors with
# more than one book, whatever the format.)
formats_per_author = books_with_format.groupby("author_id")["format"].nunique()
unique_author_ids = formats_per_author[formats_per_author > 1].index
authors_with_different_formats = books_with_format[books_with_format["author_id"].isin(unique_author_ids)]

print(len(unique_author_ids), "of the top authors have been published in more than one format")

# One bar chart per author with multiple formats
for author_id in unique_author_ids:
    author_data = authors_with_different_formats[authors_with_different_formats["author_id"] == author_id]
    author_name = author_data["author_name"].iloc[0]
    # value_counts() already ignores missing values. The previous guard tested the
    # method object `.isnull` without calling it, which is always truthy, so
    # format_distribution was never assigned and the plot line raised NameError.
    format_distribution = author_data["format"].value_counts()

    fig, ax = plt.subplots()
    format_distribution.plot(kind='bar', ax=ax)
    ax.set_title(f"Formats Distribution for {author_name}")
    ax.set_xlabel("Format")
    ax.set_ylabel("Count")
    plt.show()
    # release the figure: one is created per author
    plt.close(fig)

Comments on the graphs above:

We can see that the books are mainly published in paperback and hardcover formats, whereas the "newer" digital formats are very few. This is probably due to three main reasons:
1. The age of most of the books written by these authors: audiobook and ebook formats are relatively new.
2. Even though digital formats are now more widely available, these are authors of novels, and readers who enjoy leisurely reading usually prefer the physical format of paper to reading on a screen.
3. Once a book has been published in a digital format, there is no need to republish it with another publisher, since it is already on the internet, a more universal "bookstore".


**Request 5.3:** Provide information about the general response from readers (number of fans, average rating, number of reviews, etc.), divide the authors by gender, and comment about anything eventually related to “structural bias.”

# [RQ6] For this question, consider the top 10 authors concerning the number of fans again. 


    Provide the average time gap between two subsequent publications for a series of books and those not belonging to a series. What do you expect to see, and what is the actual answer to this question?
    For each of the authors, give a convenient plot showing how many books has the given author published UP TO a given year. Are these authors contemporary with each other? Can you notice a range of years where their production rate was higher?


**Request 6.1:** Provide the average time gap between two subsequent publications for a series of books and those not belonging to a series. What do you expect to see, and what is the actual answer to this question?

In [None]:
# Work on an independent copy: top_authors_books was obtained by filtering books, and
# assigning columns to such a slice raises SettingWithCopyWarning.
top_authors_books = top_authors_books.copy()

# Step 1: parse the publication date. Entries that cannot be parsed become NaT instead
# of aborting the whole cell.
top_authors_books["original_publication_date"] = pd.to_datetime(
    top_authors_books["original_publication_date"], format = 'mixed', errors = 'coerce'
)

# Step 2: diff() subtracts the previous row of the same group, so the rows have to be
# in chronological order for the result to be the gap between consecutive publications.
top_authors_books = top_authors_books.sort_values(["author_id", "original_publication_date"])

# Step 3: time gap between consecutive "original" publications of the same author within
# the same series. dropna=False keeps the rows with a missing series_id as one group per
# author; with the default, groupby discards them and every book outside a series ends
# up without a time gap.
top_authors_books["time_gap"] = (
    top_authors_books
    .groupby(["author_id", "series_id"], dropna = False)["original_publication_date"]
    .diff()
)

# Step 4: average gap for books in a series and for books outside any series.
# NOTE(review): this assumes books outside a series have a missing (NaN) series_id;
# if they carry an empty string instead, they are all counted as "in a series". Confirm.
in_series = top_authors_books["series_id"].notna()
average_time_gap_series = top_authors_books.loc[in_series, "time_gap"].mean()
average_time_gap_non_series = top_authors_books.loc[~in_series, "time_gap"].mean()

# Show the two averages (the answer to the request) rather than dumping the whole frame
print(f"Average Time Gap for Books in a Series: {average_time_gap_series}")
print(f"Average Time Gap for Books Not in a Series: {average_time_gap_non_series}")

What do we expect to see?

As expected the books that belong to a series have a more periodic and regular output from the authors(on average), and this logically makes sense following the idea that the author is breaking down a full story into separate books.
Books that do not belong to a series however do not have a well-defined average gap between them since they are independent works and do not follow any periodic output "expectation" from the public.

**Request 6.2:** For each of the authors, give a convenient plot showing how many books has the given author published UP TO a given year. Are these authors contemporary with each other? Can you notice a range of years where their production rate was higher?

In [None]:
# Publication year of every book, read from the datetime original_publication_date.
# assign() returns a new frame, so nothing is written into a slice of books.
top_authors_books = top_authors_books.assign(
    year=top_authors_books["original_publication_date"].dt.year
)

# Books per author and year. groupby sorts its keys, so within each author the years
# are in ascending order.
writers_counts = top_authors_books.groupby(['author_name', 'year']).size().reset_index(name='Count')
# Running total per author = number of books published UP TO each year, which is what
# the request asks for (the per-year count alone only shows the yearly output).
writers_counts['Cumulative'] = writers_counts.groupby('author_name')['Count'].cumsum()

# Plot the cumulative publication history of each author
fig, ax = plt.subplots(figsize=(12, 6))
writers = writers_counts['author_name'].unique()
for writer in writers:
    author_data = writers_counts[writers_counts['author_name'] == writer]
    # step plot: the total stays flat until the next year with a publication
    ax.step(author_data['year'], author_data['Cumulative'], where='post', label=writer)

ax.set_title("Publication History of Authors")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Books Published up to Year")
ax.legend()
plt.show()

# [RQ7] Estimating probabilities is a core skill for a data scientist: show us your best!

# [RQ8] Charts, statistical tests, and analysis methods are splendid tools to illustrate your data-driven decisions to check whether a hypothesis is correct.