# Phase 4: Sentiment Modeling

This notebook implements sentiment classification models to predict review sentiment from text and metadata features.

## Objectives
1. Prepare data for sentiment modeling
2. Implement baseline models (Logistic Regression, Naive Bayes, SVM, Random Forest)
3. Engineer advanced text features
4. Build advanced models (XGBoost, Neural Networks, LSTM, BERT)
5. Perform hyperparameter tuning
6. Evaluate and compare all models
7. Select and save best model

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)

# Plot appearance used by every figure in this notebook
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Project root: step up one level when the notebook is launched from notebooks/
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name == 'notebooks':
    PROJECT_ROOT = PROJECT_ROOT.parent

# Input and output locations, all derived from the project root
DATA_DIR = PROJECT_ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed" / "fused"
REPORTS_DIR = PROJECT_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"
MODELS_DIR = PROJECT_ROOT / "models"

# Make sure the output folders exist (parents=True also creates reports/)
for _output_dir in (FIGURES_DIR, MODELS_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Data directory: {DATA_DIR}")
print(f"Models directory: {MODELS_DIR}")

## 2. Load Data

In [None]:
# Load fused dataset (using similar approach to EDA, but larger sample for modeling)
import pyarrow.parquet as pq
import gc

fused_file = PROCESSED_DIR / "books_books_fused.parquet"

print(f"Loading dataset from: {fused_file}")

# Fail fast with a clear message when the input file is missing
if not fused_file.exists():
    raise FileNotFoundError(f"File not found: {fused_file}")

# Only the parquet metadata is read here; no row data is loaded yet
parquet_file = pq.ParquetFile(fused_file)
num_rows = parquet_file.metadata.num_rows
num_row_groups = parquet_file.num_row_groups

print(f"Total rows in file: {num_rows:,}")
print(f"Number of row groups: {num_row_groups}")

# Sample size for modeling (100K rows, larger than EDA for better model performance)
SAMPLE_SIZE = 100_000

if num_rows > SAMPLE_SIZE:
    print(f"\nSampling {SAMPLE_SIZE:,} rows for modeling...")

    # Pick up to 20 row groups with a fixed seed so the sample is reproducible
    max_row_groups_to_try = min(20, num_row_groups)
    rng = np.random.RandomState(42)
    row_groups_to_try = sorted(rng.choice(num_row_groups,
                                          size=max_row_groups_to_try,
                                          replace=False))

    print(f"Attempting to read from {len(row_groups_to_try)} row groups...")
    batches = []

    for rg_idx in row_groups_to_try:
        try:
            batch = parquet_file.read_row_groups([rg_idx]).to_pandas()

            # Stringify object columns whose first non-null values are dicts
            for col in batch.columns:
                if batch[col].dtype == 'object':
                    sample_vals = batch[col].dropna().head(5)
                    if len(sample_vals) > 0 and any(isinstance(val, dict) for val in sample_vals):
                        batch[col] = batch[col].astype(str)

            batches.append(batch)

            # Stop once 1.5x the target is collected; the final draw below
            # trims the pool down to SAMPLE_SIZE
            total_rows = sum(len(b) for b in batches)
            if total_rows >= SAMPLE_SIZE * 1.5:
                break
        except Exception as e:
            # Best effort: report the unreadable row group and try the next one
            print(f"Warning: Could not read row group {rg_idx}: {str(e)[:100]}")
            continue

    if not batches:
        raise ValueError("Could not read any data from parquet file")

    df_temp = pd.concat(batches, ignore_index=True)

    if len(df_temp) > SAMPLE_SIZE:
        df = df_temp.sample(n=SAMPLE_SIZE, random_state=42)
    else:
        df = df_temp.copy()

    # Release the intermediate frames before modeling
    del batches, df_temp
    gc.collect()
else:
    # Small enough to load the whole file in one go
    df = pd.read_parquet(fused_file)

# Ensure helpfulness_ratio is numeric (unparseable values become NaN)
if 'helpfulness_ratio' in df.columns:
    df['helpfulness_ratio'] = pd.to_numeric(df['helpfulness_ratio'], errors='coerce')

print(f"\nDataset loaded successfully!")
print(f"   Shape: {df.shape}")
print(f"   Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

## 3. Data Preparation and Target Creation

In [None]:
# List every column together with its 1-based position
print("Available columns:")
for i, col in enumerate(df.columns, start=1):
    print(f"  {i:2d}. {col}")

# Candidate review-text columns: the name must contain both "review" and
# "text" (case-insensitive)
text_cols = []
for column_name in df.columns:
    lowered_name = column_name.lower()
    if 'review' in lowered_name and 'text' in lowered_name:
        text_cols.append(column_name)
print(f"\nReview text columns found: {text_cols}")

# Quick data-quality summary
print(f"\nDataset info:")
print(f"  Total rows: {len(df):,}")
print(f"  Missing values: {df.isnull().sum().sum():,}")
if 'overall' in df.columns:
    # Counts per rating value, ordered by rating
    print(f"  Rating distribution:")
    print(df['overall'].value_counts().sort_index())

In [None]:
# Create sentiment labels from ratings
if 'overall' not in df.columns:
    raise ValueError("Rating column 'overall' not found")

# Rows without a rating get no label (NaN) instead of crashing the integer
# cast or being counted as 'Negative'; a later dropna() on the target can
# then discard them.
has_rating = df['overall'].notna()
n_unrated = int((~has_rating).sum())

# Multi-class: 5 classes (1-5 stars)
if n_unrated == 0:
    df['sentiment_5class'] = df['overall'].astype(int)
else:
    print(f"Warning: {n_unrated:,} rows have no rating; their sentiment labels are left missing")
    # Nullable integer dtype keeps the star values integral alongside <NA>
    df['sentiment_5class'] = df['overall'].map(int, na_action='ignore').astype('Int64')

# 3-class: Positive (>= 4), Neutral (== 3), Negative (everything else)
df['sentiment_3class'] = df['overall'].map(
    lambda x: 'Positive' if x >= 4 else ('Neutral' if x == 3 else 'Negative'),
    na_action='ignore',
)

# Binary: Positive (>= 4) vs Negative
df['sentiment_binary'] = df['overall'].map(
    lambda x: 'Positive' if x >= 4 else 'Negative',
    na_action='ignore',
)

print("Sentiment distribution (3-class):")
print(df['sentiment_3class'].value_counts())
print(f"\nSentiment distribution (binary):")
print(df['sentiment_binary'].value_counts())
print(f"\nSentiment distribution (5-class):")
print(df['sentiment_5class'].value_counts().sort_index())

In [None]:
# Modeling target: the 3-class label is used for the main analysis
TARGET = 'sentiment_3class'

# Text column: prefer the first detected review-text column; otherwise fall
# back to the first object column whose mean string length exceeds 50
if text_cols:
    TEXT_COL = text_cols[0]
else:
    TEXT_COL = next(
        (
            candidate
            for candidate in df.columns
            if df[candidate].dtype == 'object' and df[candidate].str.len().mean() > 50
        ),
        None,
    )

if TEXT_COL is None:
    raise ValueError("No suitable text column found")
print(f"Using text column: {TEXT_COL}")

# Keep only text + target and drop rows where either one is missing
df_clean = df[[TEXT_COL, TARGET]].dropna()
print(f"\nClean dataset shape: {df_clean.shape}")
print(f"Removed {len(df) - len(df_clean)} rows with missing data")

# Character-length summary of the remaining reviews
if not df_clean.empty:
    text_lengths = df_clean[TEXT_COL].str.len()
    print(f"\nText length statistics:")
    print(f"  Mean: {text_lengths.mean():.0f} characters")
    print(f"  Median: {text_lengths.median():.0f} characters")
    print(f"  Min: {text_lengths.min():.0f} characters")
    print(f"  Max: {text_lengths.max():.0f} characters")

## 4. Train-Test Split

In [None]:
# Features are the raw review text; labels are the chosen sentiment target
X, y = df_clean[TEXT_COL], df_clean[TARGET]

# Step 1: hold out 15% of all rows as the test set, stratified so each split
# keeps the class distribution
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Step 2: carve 15% of the remaining rows out as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    stratify=y_train,
    random_state=42,
)

# Split sizes
print(f"Training set: {len(X_train):,} samples")
print(f"Validation set: {len(X_val):,} samples")
print(f"Test set: {len(X_test):,} samples")

# Per-split label counts, to confirm the stratification
print(f"\nClass distribution in training set:")
print(y_train.value_counts())
print(f"\nClass distribution in validation set:")
print(y_val.value_counts())
print(f"\nClass distribution in test set:")
print(y_test.value_counts())

## 5. Baseline Models

### 5.1 Feature Extraction (TF-IDF)

In [None]:
# TF-IDF over unigrams and bigrams with English stop words removed.
# Terms must appear in at least 2 documents and in at most 95% of them;
# the vocabulary is capped at 5000 features.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    max_features=5000,
)

# Fit on the training split only, then reuse that vocabulary for the
# validation and test splits
print("Fitting TF-IDF vectorizer...")
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (tfidf.transform(split) for split in (X_val, X_test))

print(f"TF-IDF feature matrix shape: {X_train_tfidf.shape}")
print(f"Vocabulary size: {len(tfidf.vocabulary_)}")
print(f"Feature matrix is sparse: {hasattr(X_train_tfidf, 'toarray')}")

### 5.2 Model 1: Logistic Regression

In [None]:
# Fit Logistic Regression on the full TF-IDF training matrix
print("Training Logistic Regression...")
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
).fit(X_train_tfidf, y_train)

# Predict on both splits so training and validation accuracy can be compared
y_train_pred_lr, y_val_pred_lr = (
    lr_model.predict(features) for features in (X_train_tfidf, X_val_tfidf)
)

# Report metrics
print("\nLogistic Regression Results:")
print(f"Training Accuracy: {accuracy_score(y_train, y_train_pred_lr):.4f}")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_lr):.4f}")
print(f"Validation F1-Score (macro): {f1_score(y_val, y_val_pred_lr, average='macro'):.4f}")
print(f"Validation F1-Score (weighted): {f1_score(y_val, y_val_pred_lr, average='weighted'):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_lr))

### 5.3 Model 2: Naive Bayes

In [None]:
# Fit Multinomial Naive Bayes (default settings) on the TF-IDF training matrix
print("Training Naive Bayes...")
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Validation-set predictions
y_val_pred_nb = nb_model.predict(X_val_tfidf)

# Report metrics
print("\nNaive Bayes Results:")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_nb):.4f}")
print(f"Validation F1-Score (macro): {f1_score(y_val, y_val_pred_nb, average='macro'):.4f}")
print(f"Validation F1-Score (weighted): {f1_score(y_val, y_val_pred_nb, average='weighted'):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_nb))

### 5.4 Model 3: Support Vector Machine (SVM)

In [None]:
# Train SVM on a random subsample of at most 10,000 training rows (for speed)
print("Training SVM (this may take a while)...")
svm_sample_size = min(10000, X_train_tfidf.shape[0])

# np.random.choice takes no random_state argument, so draw the row positions
# from a seeded RandomState to keep the subsample reproducible
svm_rng = np.random.RandomState(42)
svm_indices = svm_rng.choice(X_train_tfidf.shape[0], svm_sample_size, replace=False)

# The indices are positions, so the labels are selected with .iloc
svm_model = SVC(kernel='linear', random_state=42, probability=True)
svm_model.fit(X_train_tfidf[svm_indices], y_train.iloc[svm_indices])

# Predictions on the full validation set
y_val_pred_svm = svm_model.predict(X_val_tfidf)

# Evaluation
print("\nSVM Results:")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_svm):.4f}")
print(f"Validation F1-Score (macro): {f1_score(y_val, y_val_pred_svm, average='macro'):.4f}")
print(f"Validation F1-Score (weighted): {f1_score(y_val, y_val_pred_svm, average='weighted'):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_svm))

### 5.5 Model 4: Random Forest

In [None]:
# Train Random Forest on a random subsample of at most 20,000 training rows (for speed)
print("Training Random Forest (this may take a while)...")
rf_sample_size = min(20000, X_train_tfidf.shape[0])

# np.random.choice takes no random_state argument, so draw the row positions
# from a seeded RandomState to keep the subsample reproducible
rf_rng = np.random.RandomState(42)
rf_indices = rf_rng.choice(X_train_tfidf.shape[0], rf_sample_size, replace=False)

# The indices are positions, so the labels are selected with .iloc
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, max_depth=20)
rf_model.fit(X_train_tfidf[rf_indices], y_train.iloc[rf_indices])

# Predictions on the full validation set
y_val_pred_rf = rf_model.predict(X_val_tfidf)

# Evaluation
print("\nRandom Forest Results:")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_rf):.4f}")
print(f"Validation F1-Score (macro): {f1_score(y_val, y_val_pred_rf, average='macro'):.4f}")
print(f"Validation F1-Score (weighted): {f1_score(y_val, y_val_pred_rf, average='weighted'):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_rf))

## 6. Model Comparison

In [None]:
# Each baseline paired with its validation-set predictions
models = {
    'Logistic Regression': (lr_model, y_val_pred_lr),
    'Naive Bayes': (nb_model, y_val_pred_nb),
    'SVM': (svm_model, y_val_pred_svm),
    'Random Forest': (rf_model, y_val_pred_rf)
}

# One row of validation metrics per model (precision and recall are macro-averaged)
results = [
    {
        'Model': model_name,
        'Accuracy': accuracy_score(y_val, val_predictions),
        'F1-Macro': f1_score(y_val, val_predictions, average='macro'),
        'F1-Weighted': f1_score(y_val, val_predictions, average='weighted'),
        'Precision': precision_score(y_val, val_predictions, average='macro'),
        'Recall': recall_score(y_val, val_predictions, average='macro')
    }
    for model_name, (_, val_predictions) in models.items()
]

# Rank the models by macro F1, best first
results_df = pd.DataFrame(results).sort_values('F1-Macro', ascending=False)
print("Baseline Models Comparison:")
print(results_df.to_string(index=False))

# Grouped bar chart: one group per model, one bar per metric
metrics = ['Accuracy', 'F1-Macro', 'F1-Weighted', 'Precision', 'Recall']
width = 0.15
x = np.arange(len(results_df))
fig, ax = plt.subplots(figsize=(12, 6))

for slot, metric in enumerate(metrics):
    # Shift each metric's bars one bar-width to the right of the previous one
    ax.bar(x + slot * width, results_df[metric], width, label=metric)

ax.set_xlabel('Models')
ax.set_ylabel('Score')
ax.set_title('Baseline Models Performance Comparison')
# Centre the tick under the middle (third) of the five bars
ax.set_xticks(x + width * 2)
ax.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(FIGURES_DIR / 'baseline_models_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Persist the comparison table next to the other reports
results_df.to_csv(REPORTS_DIR / 'baseline_models_results.csv', index=False)
print(f"\nResults saved to: {REPORTS_DIR / 'baseline_models_results.csv'}")

## 7. Baseline Models Summary

We've successfully implemented and evaluated 4 baseline models:
1. **Logistic Regression** - Linear classifier with good interpretability
2. **Naive Bayes** - Probabilistic classifier, fast training
3. **SVM** - Support Vector Machine with linear kernel
4. **Random Forest** - Ensemble tree-based model

All models have been evaluated on the validation set with comprehensive metrics.

## 8. Next Steps

### Completed ✅
- Data loading and preparation
- Target variable creation (3-class, binary, 5-class)
- Train-test-validation split
- TF-IDF feature extraction
- 4 baseline models implemented and evaluated
- Model comparison and visualization

### To Be Implemented 🔜
- **Advanced Feature Engineering**:
  - Word embeddings (Word2Vec, GloVe)
  - Character n-grams
  - Metadata features integration
  - Sentiment scores (VADER, TextBlob)
  
- **Advanced Models**:
  - XGBoost/LightGBM
  - Neural Networks (MLP)
  - LSTM/GRU for sequence modeling
  - Transformer models (BERT/RoBERTa)
  - Ensemble methods
  
- **Hyperparameter Tuning**:
  - Grid search / Random search
  - Bayesian optimization
  - Cross-validation
  
- **Final Steps**:
  - Best model selection
  - Test set evaluation
  - Model persistence
  - Production pipeline