In [None]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Notebook-wide display defaults: let text columns show up to 200 characters
# before truncation, and draw seaborn figures on a white grid.
pd.options.display.max_colwidth = 200
sns.set_style("whitegrid")

# Load the books CSV and preview its first five rows as the cell output.
books = pd.read_csv("../data/books_cleaned.csv")
books.head(n=5)

In [None]:
# Structural overview of the frame: info() prints its summary to stdout,
# and the describe() result below is rendered as the cell output.
books.info()
# include="all" summarizes every column rather than only the numeric ones.
books.describe(include="all")

In [None]:
# Count of missing values in each column.
books.isna().sum()

In [None]:
# Histogram of the average_rating column, saved as a PNG and shown inline.
# savefig() does not create missing directories, so make sure the output
# folder exists first (a no-op when it is already there).
Path("../outputs").mkdir(parents=True, exist_ok=True)

# Explicit figure/axes instead of the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(8, 5))
# 20 bins with a kernel-density overlay.
sns.histplot(books['average_rating'], bins=20, kde=True, ax=ax)
ax.set_title("Distribution of Average Ratings")
ax.set_xlabel("Average Rating")
ax.set_ylabel("Number of Books")
fig.savefig("../outputs/ratings_distribution.png", bbox_inches="tight")
plt.show()

In [None]:
# Flatten the comma-separated `genres` strings into one list of genre names.
# Rows with a missing `genres` value are skipped, each name is stripped of
# surrounding whitespace, and blank names (from trailing or doubled commas,
# e.g. "Fantasy, ,Fiction,") are dropped so "" is never counted as a genre.
genre_list = [
    name
    for entry in books['genres'].dropna()
    for name in (part.strip() for part in entry.split(","))
    if name
]

# Ten most frequent genres as (genre, count) pairs, shown as a small table.
top_genres = Counter(genre_list).most_common(10)
pd.DataFrame(top_genres, columns=["Genre", "Count"])

In [None]:
# The ten most frequent values of the `authors` column with their row counts.
# NOTE(review): each distinct `authors` string is counted as one author; if a
# cell can list several co-authors they are not split here - confirm.
top_authors = books['authors'].value_counts().head(10)

# savefig() does not create missing directories, so make sure the output
# folder exists first (a no-op when it is already there).
Path("../outputs").mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(8, 5))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
# author is also mapped to hue to keep one viridis color per bar.
# dodge=False keeps every bar at full width on its own row.
sns.barplot(
    x=top_authors.values,
    y=top_authors.index,
    hue=top_authors.index,
    palette="viridis",
    dodge=False,
    ax=ax,
)
# The hue legend would only repeat the y-axis labels; drop it when the
# installed seaborn version draws one.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Top 10 Authors with Most Books")
ax.set_xlabel("Number of Books")
ax.set_ylabel("Author")
fig.savefig("../outputs/top_authors.png", bbox_inches="tight")
plt.show()

In [None]:
# Scatter plot of ratings_count against average_rating, saved as a PNG and
# shown inline.
# savefig() does not create missing directories, so make sure the output
# folder exists first (a no-op when it is already there).
Path("../outputs").mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(6, 5))
# alpha=0.6 keeps overlapping points distinguishable.
sns.scatterplot(x="ratings_count", y="average_rating", data=books, alpha=0.6, ax=ax)
# Log-scaled x-axis for the ratings counts.
# NOTE(review): a log axis cannot place non-positive values, so any row with
# ratings_count <= 0 will not be visible - confirm that is acceptable.
ax.set_xscale("log")
ax.set_title("Ratings Count vs Average Rating")
ax.set_xlabel("Ratings Count (log scale)")
ax.set_ylabel("Average Rating")
fig.savefig("../outputs/ratings_vs_count.png", bbox_inches="tight")
plt.show()

In [None]:
# Spot-check a handful of random rows across the key columns.
# random_state pins the draw so Restart & Run All shows the same rows, and
# min() avoids a ValueError when the frame has fewer than five rows.
books[['title', 'authors', 'genres', 'average_rating']].sample(
    n=min(5, len(books)), random_state=42
)

In [None]:
# Closing summary: a status line, the row count, and the number of distinct
# values in the `authors` column, printed one per line.
print(
    "Dataset cleaned and ready for feature engineering.",
    f"Total books: {books.shape[0]}",
    f"Unique authors: {books['authors'].nunique()}",
    sep="\n",
)