# Amazon Book Review Analysis
## Capstone Project Notebook
- **Sector**: Retail
- **Problem**: Predict customer satisfaction trends for Amazon book reviews.
- **Dataset**: `ratings_Books_cleaned_200K.csv` (198,117 rows, 5 columns).

This notebook documents the analysis process, including data cleaning, EDA, modeling, and evaluation.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
import os

# Set working directory
os.chdir(r'C:\Users\munez\OneDrive\Documents\AUCA\Big-Data\Retail_Amazon_Project')

In [2]:
# Function to load and clean data
def load_and_clean_data(file_path):
    df = pd.read_csv(file_path)
    df.columns = ['item', 'user', 'rating', 'timestamp', 'reviewTime']
    df['reviewTime'] = pd.to_datetime(df['timestamp'], unit='s')
    df = df.dropna().drop_duplicates()
    return df

# Load cleaned dataset
df = load_and_clean_data('data/ratings_Books_cleaned_200K.csv')

In [3]:
# Function for EDA
def perform_eda(df):
    print("Descriptive Statistics:")
    print(df.describe())
    # Rating Distribution
    plt.figure(figsize=(8, 5))
    df['rating'].hist(bins=5, color='skyblue', edgecolor='black')
    plt.title('Distribution of Book Ratings')
    plt.xlabel('Rating (1-5)')
    plt.ylabel('Count')
    plt.grid(True, alpha=0.3)
    plt.savefig('data/rating_distribution.png')
    plt.close()
    # Rating Over Time
    plt.figure(figsize=(10, 6))
    df.groupby(df['reviewTime'].dt.to_period('M'))['rating'].mean().plot(color='green')
    plt.title('Average Rating Over Time (Monthly)')
    plt.xlabel('Time')
    plt.ylabel('Average Rating')
    plt.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    plt.savefig('data/rating_over_time.png')
    plt.close()
    print("EDA visualizations saved.")

# Perform EDA
perform_eda(df)

Descriptive Statistics:
              rating     timestamp                     reviewTime
count  198117.000000  1.981170e+05                         198117
mean        4.361034  1.351103e+09  2012-10-24 18:15:54.744923648
min         1.000000  8.710848e+08            1997-08-09 00:00:00
25%         4.000000  1.325030e+09            2011-12-28 00:00:00
50%         5.000000  1.400717e+09            2014-05-22 00:00:00
75%         5.000000  1.446163e+09            2015-10-30 00:00:00
max         5.000000  1.524701e+09            2018-04-26 00:00:00
std         1.090925  1.480994e+08                            NaN
EDA visualizations saved.


In [4]:

# Function for modeling
def train_and_evaluate_model(df):
    df['satisfied'] = np.where(df['rating'] >= 4, 1, 0)
    X = df[['timestamp']]
    y = df['satisfied']
    class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y)
    weight_dict = {0: class_weights[0], 1: class_weights[1]}
    print(f"Class weights: {weight_dict}")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LogisticRegression(max_iter=1000, class_weight=weight_dict)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Not Satisfied', 'Satisfied']))
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    df_test = X_test.copy()
    df_test['actual'] = y_test
    df_test['predicted'] = y_pred
    df_test['probability'] = y_pred_prob
    df_test.to_csv('data/model_predictions.csv', index=False)
    print("Model predictions saved to data/model_predictions.csv")

# Train and evaluate model
train_and_evaluate_model(df)

Class weights: {0: np.float64(3.0341368537123254), 1: np.float64(0.598652919882274)}
Model Accuracy: 0.83

Classification Report:
               precision    recall  f1-score   support

Not Satisfied       0.00      0.00      0.00      6613
    Satisfied       0.83      1.00      0.91     33011

     accuracy                           0.83     39624
    macro avg       0.42      0.50      0.45     39624
 weighted avg       0.69      0.83      0.76     39624

ROC-AUC Score: 0.57
Model predictions saved to data/model_predictions.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [5]:
# Function for innovation: Add user review count and use Random Forest
def innovate_model(df):
    # Aggregate user review count
    user_counts = df.groupby('user').size().reset_index(name='user_review_count')
    df = df.merge(user_counts, on='user', how='left')
    
    # Feature Engineering
    df['satisfied'] = np.where(df['rating'] >= 4, 1, 0)
    X = df[['timestamp', 'user_review_count']]  # New feature
    y = df['satisfied']
    
    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y)
    weight_dict = {0: class_weights[0], 1: class_weights[1]}
    print(f"Innovative Class weights: {weight_dict}")
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train Random Forest
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=100, class_weight=weight_dict, random_state=42)
    model.fit(X_train, y_train)
    
    # Predict and evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Innovative Model Accuracy: {accuracy:.2f}")
    print("\nInnovative Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Not Satisfied', 'Satisfied'], zero_division=0))
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    print(f"Innovative ROC-AUC Score: {roc_auc:.2f}")
    
    # Save innovative predictions
    df_test = X_test.copy()
    df_test['actual'] = y_test
    df_test['predicted'] = y_pred
    df_test['probability'] = y_pred_prob
    df_test.to_csv('data/model_predictions_innovative.csv', index=False)
    print("Innovative model predictions saved to data/model_predictions_innovative.csv")

# Apply innovation
innovate_model(df)

Innovative Class weights: {0: np.float64(3.0341368537123254), 1: np.float64(0.598652919882274)}
Innovative Model Accuracy: 0.67

Innovative Classification Report:
               precision    recall  f1-score   support

Not Satisfied       0.21      0.35      0.26      6613
    Satisfied       0.85      0.74      0.79     33011

     accuracy                           0.67     39624
    macro avg       0.53      0.54      0.52     39624
 weighted avg       0.74      0.67      0.70     39624

Innovative ROC-AUC Score: 0.55
Innovative model predictions saved to data/model_predictions_innovative.csv


In [7]:
import pandas as pd
import os

# Set working directory
os.chdir(r'C:\Users\munez\OneDrive\Documents\AUCA\Big-Data\Retail_Amazon_Project')

# Load cleaned dataset
file_path = 'data/ratings_Books_cleaned_200K.csv'
df = pd.read_csv(file_path)

# Add satisfied column
df['satisfied'] = (df['rating'] >= 4).astype(int)

# Save updated file
df.to_csv('data/ratings_Books_cleaned_200K.csv', index=False)
print("Updated dataset with 'satisfied' saved to data/ratings_Books_cleaned_200K_updated.csv")

Updated dataset with 'satisfied' saved to data/ratings_Books_cleaned_200K_updated.csv
