In [None]:
# Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Read the book records from the CSV file into a DataFrame
books = pd.read_csv(filepath_or_buffer="data/book.csv")

In [None]:
# Basic EDA
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would add a stray "None".
books.info()

# Bar chart of how many rows fall into each value of the "popularity" column.
sns.countplot(data=books, x="popularity")
plt.title("Popularity Distribution")
plt.show()

In [None]:
# Keep only rows whose category occurs in more than 100 rows
category_groups = books.groupby("categories")
books = category_groups.filter(lambda group: len(group) > 100)

In [None]:
# Turn the "categories" column into indicator columns (first level dropped),
# then swap the original column for those indicators.
categories = pd.get_dummies(books["categories"], drop_first=True)
books_without_categories = books.drop(columns=["categories"])
books = pd.concat([books_without_categories, categories], axis=1)

# Extract review counts and helpfulness
# "review/helpfulness" is split on "/" into two integer columns: the part
# before the slash becomes num_helpful, the part after becomes num_reviews.
books[["num_helpful", "num_reviews"]] = books["review/helpfulness"].str.split("/", expand=True).astype(int)

# Share of helpful votes. A 0/0 division yields NaN, which is replaced by 0.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# acts on a temporary object and does not update `books` under pandas
# Copy-on-Write (and raises a FutureWarning on pandas 2.x).
books["perc_helpful_reviews"] = (books["num_helpful"] / books["num_reviews"]).fillna(0)

# The raw "helpful/total" string is no longer needed once it has been parsed.
books = books.drop(columns=["review/helpfulness"])


In [None]:
# Lower-case every free-text column
text_columns = ["review/summary", "review/text", "description"]
for col in text_columns:
    lowered = books[col].str.lower()
    books[col] = lowered

# Words treated as signals of positive sentiment
positive_words = [
    "great", "excellent", "good",
    "interesting", "enjoy", "helpful",
    "useful", "like", "love",
    "beautiful", "fantastic", "perfect",
    "wonderful", "impressive", "amazing",
    "outstanding", "remarkable", "brilliant",
    "exceptional", "positive", "thrilling",
]

# Vectorizer whose vocabulary is fixed to the list above
vectorizer = CountVectorizer(vocabulary=positive_words)

In [None]:
# For each text column, count positive-vocabulary hits per row and store the
# total in a new "positive_words_<name>" column (<name> is the part of the
# column label after the last "/").
for feature in ("review/text", "review/summary", "description"):
    documents = books[feature].fillna("")
    transformed = vectorizer.fit_transform(documents)
    suffix = feature.rpartition("/")[2]
    row_totals = np.asarray(transformed.sum(axis=1)).ravel()
    books[f"positive_words_{suffix}"] = row_totals

In [None]:
# Remove the raw text columns from the frame
raw_text_columns = ["review/text", "review/summary", "description"]
books = books.drop(columns=raw_text_columns)

# Feature matrix: every remaining column except title, authors and the label
feature_frame = books.drop(columns=["title", "authors", "popularity"])
X = feature_frame.values

# Target vector: the "popularity" column
y = books["popularity"].values

# Hold out 30% of the rows for testing, using a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Random forest hyper-parameters, gathered in one place
rf_params = {
    "n_estimators": 120,
    "max_depth": 50,
    "min_samples_split": 5,
    "random_state": 42,
    "class_weight": "balanced",
}

# Build the classifier and fit it on the training split
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on both splits, then report the two accuracies
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)