In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from collections import Counter
from tqdm import tqdm
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import os
from torch.utils.data import DataLoader, Dataset as TorchDataset
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
# === 1. Load and Split Data ===

data_path = 'E:/amazon-sentiment-analyzer/backend/data/train_all_3class.csv'

# Display names for the integer sentiment labels. A label missing from this
# mapping is printed using its own string form.
LABEL_NAMES = {0: "Negative", 1: "Neutral", 2: "Positive"}

print(f"📂 Loading dataset from: {data_path} ...")
df = pd.read_csv(data_path)
print(f"✅ Loaded {len(df)} rows.")

# Remove rows whose text is missing or blank before splitting. pandas reads an
# empty CSV field as NaN, and a NaN document cannot be vectorized or tokenized
# by the modelling cells that consume this split.
df = df.dropna(subset=['text'])
df = df.assign(text=df['text'].astype(str))
df = df[df['text'].str.strip() != ""]
print(f"✅ After cleaning: {len(df)} rows.")

# Show basic dataset info
print("\n📊 Label distribution BEFORE split:")
label_counts = Counter(df['label'])
for label, count in sorted(label_counts.items()):
    label_name = LABEL_NAMES.get(label, str(label))
    print(f"  {label} ({label_name}): {count:,} samples")

# Split (stratified so both parts keep the label proportions of the full set)
print("\n✂️ Splitting dataset into Train and Test (80/20)...")
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Convert to DataFrames to view shape
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
test_df = pd.DataFrame({'text': test_texts, 'label': test_labels})

print(f"\n✅ Split complete!")
print(f"  ➤ Train set: {len(train_df)} samples")
print(f"  ➤ Test set:  {len(test_df)} samples")

# Show post-split label distribution
print("\n📊 Label distribution AFTER split:")
for split_name, labels in [("Train", train_labels), ("Test", test_labels)]:
    counts = Counter(labels)
    print(f"  {split_name} set:")
    for label, count in sorted(counts.items()):
        label_name = LABEL_NAMES.get(label, str(label))
        print(f"    {label} ({label_name}): {count:,} samples")

In [None]:
# === 2. TF-IDF + Classical ML Models ===

print("\n🔧 Step 2: TF-IDF Vectorization + Classical ML Models")

start_time = time.time()
print("⏳ Vectorizing text with TF-IDF (max_features=100000)...")
tfidf = TfidfVectorizer(max_features=100000)

# Fit the vocabulary and IDF weights on the training texts only, then reuse
# them to transform the test texts.
X_train_tfidf = tfidf.fit_transform(train_texts)
X_test_tfidf = tfidf.transform(test_texts)

print(f"✅ TF-IDF complete. Vector shapes: Train={X_train_tfidf.shape}, Test={X_test_tfidf.shape}")
print(f"⏱️ Time taken: {time.time() - start_time:.2f} seconds")

# Define models. The stochastic learners are seeded with the same value the
# standalone per-model cells use (42) so that repeated runs of this comparison
# produce the same accuracies.
models = {
    "Logistic Regression": LogisticRegression(max_iter=300, n_jobs=-1),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='mlogloss', random_state=42, verbosity=0),
    "LightGBM": LGBMClassifier(n_estimators=100, random_state=42, verbosity=-1)
}

# Maps model name -> test accuracy.
results = {}

print("\n🚀 Training models...")
for name, model in models.items():
    print(f"\n📚 Training: {name}")
    start = time.time()
    model.fit(X_train_tfidf, train_labels)
    preds = model.predict(X_test_tfidf)

    acc = accuracy_score(test_labels, preds)
    print(f"🎯 Accuracy: {acc:.4f}")
    print(f"📋 Classification Report:\n{classification_report(test_labels, preds, target_names=['Negative', 'Neutral', 'Positive'])}")
    # Reported time covers fitting, prediction and scoring for this model.
    print(f"⏱️ Time taken: {time.time() - start:.2f} sec")

    results[name] = acc

In [None]:
# Logistic Regression
print("\n📚 Training: Logistic Regression")

start_time = time.time()

# Build the classifier and fit it on the TF-IDF training matrix in one step
# (scikit-learn's fit() returns the fitted estimator itself).
print("⏳ Fitting model...")
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, train_labels)

# Predict labels for the held-out test matrix
print("🔎 Predicting on test set...")
y_pred_lr = lr.predict(X_test_tfidf)

# Evaluate: overall accuracy first, then the per-class report
acc_lr = accuracy_score(test_labels, y_pred_lr)
print(f"✅ Accuracy: {acc_lr:.4f}")
report_lr = classification_report(
    test_labels,
    y_pred_lr,
    target_names=['Negative', 'Neutral', 'Positive'],
)
print(f"📋 Classification Report:\n{report_lr}")

# Record the accuracy in the shared results dict
results['LogisticRegression'] = acc_lr

# Elapsed time covers fitting, prediction and evaluation
elapsed_lr = time.time() - start_time
print(f"⏱️ Time taken: {elapsed_lr:.2f} sec")

In [None]:
# Random Forest
print("\n🌲 Training: Random Forest")

start_time = time.time()

# Forest configuration: 100 trees, fixed seed, all CPU cores
rf_params = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}
rf = RandomForestClassifier(**rf_params)

print("⏳ Fitting model...")
rf.fit(X_train_tfidf, train_labels)

# Predict labels for the held-out test matrix
print("🔎 Predicting on test set...")
y_pred_rf = rf.predict(X_test_tfidf)

# Evaluate: overall accuracy first, then the per-class report
acc_rf = accuracy_score(test_labels, y_pred_rf)
print(f"✅ Accuracy: {acc_rf:.4f}")
report_rf = classification_report(
    test_labels,
    y_pred_rf,
    target_names=['Negative', 'Neutral', 'Positive'],
)
print(f"📋 Classification Report:\n{report_rf}")

# Record the accuracy in the shared results dict
results['RandomForest'] = acc_rf

# Elapsed time covers fitting, prediction and evaluation
elapsed_rf = time.time() - start_time
print(f"⏱️ Time taken: {elapsed_rf:.2f} sec")

In [None]:
# XGBoost
print("\n⚡ Training: XGBoost (GPU enabled)")

start_time = time.time()

# Estimator settings collected in one place; 'gpu_hist' selects the GPU
# histogram tree builder.
xgb_params = {
    'tree_method': 'gpu_hist',
    'use_label_encoder': False,
    'eval_metric': 'mlogloss',
    'random_state': 42,
    'verbosity': 1,
}
xgb_clf = XGBClassifier(**xgb_params)

print("⏳ Fitting model...")
xgb_clf.fit(X_train_tfidf, train_labels)

# Predict labels for the held-out test matrix
print("🔎 Predicting on test set...")
y_pred_xgb = xgb_clf.predict(X_test_tfidf)

# Evaluate: overall accuracy first, then the per-class report
acc_xgb = accuracy_score(test_labels, y_pred_xgb)
print(f"✅ Accuracy: {acc_xgb:.4f}")
report_xgb = classification_report(
    test_labels,
    y_pred_xgb,
    target_names=['Negative', 'Neutral', 'Positive'],
)
print(f"📋 Classification Report:\n{report_xgb}")

# Record the accuracy in the shared results dict
results['XGBoost'] = acc_xgb

# Elapsed time covers fitting, prediction and evaluation
elapsed_xgb = time.time() - start_time
print(f"⏱️ Time taken: {elapsed_xgb:.2f} sec")

In [None]:
# LightGBM
print("\n⚡ Training: LightGBM (GPU enabled)")

start_time = time.time()

# Estimator settings collected in one place: GPU device, gradient-boosted
# trees, explicit 3-class multiclass objective, fixed seed.
lgb_params = {
    'device': 'gpu',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'random_state': 42,
    'verbose': 1,
}
lgb_clf = LGBMClassifier(**lgb_params)

print("⏳ Fitting model...")
lgb_clf.fit(X_train_tfidf, train_labels)

# Predict labels for the held-out test matrix
print("🔎 Predicting on test set...")
y_pred_lgb = lgb_clf.predict(X_test_tfidf)

# Evaluate: overall accuracy first, then the per-class report
acc_lgb = accuracy_score(test_labels, y_pred_lgb)
print(f"✅ Accuracy: {acc_lgb:.4f}")
report_lgb = classification_report(
    test_labels,
    y_pred_lgb,
    target_names=['Negative', 'Neutral', 'Positive'],
)
print(f"📋 Classification Report:\n{report_lgb}")

# Record the accuracy in the shared results dict
results['LightGBM'] = acc_lgb

# Elapsed time covers fitting, prediction and evaluation
elapsed_lgb = time.time() - start_time
print(f"⏱️ Time taken: {elapsed_lgb:.2f} sec")

In [None]:
# === 3. LSTM with PyTorch ===
print("\n🧠 Training: LSTM with PyTorch")

# Dataset definition
class TextDataset(TorchDataset):
    """Map-style dataset of padded token-id sequences and their integer labels.

    Args:
        texts: iterable of raw strings, passed to ``tokenizer.texts_to_sequences``.
        labels: integer class labels, one per text. May be a pandas Series, a
            NumPy array or a plain Python list.
        tokenizer: object exposing ``texts_to_sequences(texts)``.
        max_len: target sequence length handed to ``pad_sequences`` as ``maxlen``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.sequences = tokenizer.texts_to_sequences(texts)
        self.sequences = pad_sequences(self.sequences, maxlen=max_len)
        # np.asarray accepts Series, arrays and lists alike, so the dataset
        # keeps working when the label variables have been rebound to plain
        # lists by another cell. int64 is the dtype CrossEntropyLoss expects
        # for class-index targets.
        self.labels = torch.tensor(np.asarray(labels), dtype=torch.long)

    def __len__(self):
        """Return the number of samples (one label per sample)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for sample ``idx``; token ids are int64."""
        return torch.tensor(self.sequences[idx], dtype=torch.long), self.labels[idx]

# Hyperparameters
max_len, vocab_size = 200, 50000    # padded sequence length / tokenizer vocabulary cap
batch_size, num_epochs = 512, 3     # samples per batch / passes over the training set
embed_dim, hidden_dim = 128, 64     # embedding width / LSTM hidden-state width

# Tokenization: the vocabulary is learned from the training texts only
print("🔠 Tokenizing and preparing sequences...")
tokenizer_lstm = Tokenizer(num_words=vocab_size)
tokenizer_lstm.fit_on_texts(train_texts)

# Dataset & Loader: both splits share the same tokenizer and sequence length
train_dataset, test_dataset = (
    TextDataset(split_texts, split_labels, tokenizer_lstm, max_len)
    for split_texts, split_labels in ((train_texts, train_labels), (test_texts, test_labels))
)

# Only the training batches are shuffled; test batches keep dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# LSTM Model
class LSTMClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear layer over the final hidden state.

    Args:
        vocab_size: number of rows in the embedding table; every token id fed
            to the model must be smaller than this value.
        embed_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        num_classes: number of output logits. Defaults to 3, the number of
            sentiment classes used elsewhere in this notebook.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first=True: inputs are laid out as (batch, sequence, feature).
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return unnormalized class scores of shape ``(batch, num_classes)``.

        Args:
            x: integer tensor of token ids with shape ``(batch, sequence)``.
        """
        embedded = self.embedding(x)
        # Only the final hidden state is needed; per-step outputs and the cell
        # state are discarded.
        _, (hidden, _) = self.lstm(embedded)
        # hidden[-1] selects the hidden state of the last LSTM layer.
        return self.fc(hidden[-1])

# Run on the GPU when CUDA is available and fall back to the CPU otherwise, so
# the cell also runs on machines without a CUDA device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LSTMClassifier(vocab_size=vocab_size, embed_dim=embed_dim, hidden_dim=hidden_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

start_time = time.time()

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0
    print(f"\n📚 Epoch {epoch + 1}/{num_epochs}")
    for batch_idx, (batch_x, batch_y) in enumerate(train_loader):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Every 20 batches, report the mean loss over the batches seen so far
        # in this epoch.
        if (batch_idx + 1) % 20 == 0:
            print(f"  ➤ Batch {batch_idx+1}/{len(train_loader)} | Loss: {running_loss / (batch_idx+1):.4f}")

# Evaluation
print("\n🔍 Evaluating model on test set...")
model.eval()
preds, actuals = [], []

with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device)
        outputs = model(batch_x)
        # argmax over the class dimension gives the predicted label per sample.
        preds.extend(outputs.argmax(1).cpu().numpy())
        # Targets never leave the CPU, so they convert to NumPy directly.
        actuals.extend(batch_y.numpy())

# Accuracy and report
acc_lstm = accuracy_score(actuals, preds)
print(f"\n✅ Accuracy: {acc_lstm:.4f}")
print(f"📋 Classification Report:\n{classification_report(actuals, preds, target_names=['Negative', 'Neutral', 'Positive'])}")

results['LSTM'] = acc_lstm

# Total time covers training and evaluation.
print(f"⏱️ Total Time for LSTM: {time.time() - start_time:.2f} sec")

In [None]:
import os
import time
import torch
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report
from datasets import Dataset
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    TrainingArguments, Trainer
)

# === 1. Load Data ===
data_path = "E:/amazon-sentiment-analyzer/backend/data/train_all_3class.csv"
print(f"📂 Loading dataset from: {data_path} ...")
df = pd.read_csv(data_path)
print(f"✅ Loaded {len(df)} rows.")

# 🧼 Clean 'text' column
print("\n🧹 Cleaning text column...")
df = df.dropna(subset=["text"])
# assign() returns a new frame, so no column is written into a filtered frame.
df = df.assign(text=df["text"].astype(str))
df = df[df["text"].str.strip() != ""]  # remove empty strings
print(f"✅ After cleaning: {len(df)} rows.")

# 📊 Label distribution
print("\n📊 Label distribution:")
# Dictionary lookup with a fallback: a label outside 0-2 is printed using its
# own string form instead of raising while indexing a list.
label_names = {0: "Negative", 1: "Neutral", 2: "Positive"}
label_counts = Counter(df["label"])
for label, count in sorted(label_counts.items()):
    label_name = label_names.get(label, str(label))
    print(f"  {label} ({label_name}): {count} samples")

# === 2. Split into Train/Test ===
# NOTE(review): this is a plain random 80/20 split, not stratified by label
# like the train_test_split call in the first data-loading cell — confirm
# whether the two splits are meant to match.
print("\n✂️ Splitting dataset into 80% train / 20% test ...")
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)
print(f"✅ Train size: {len(train_df)} | Test size: {len(test_df)}")

train_texts, train_labels = train_df["text"].tolist(), train_df["label"].tolist()
test_texts, test_labels = test_df["text"].tolist(), test_df["label"].tolist()

# === 3. Tokenizer ===
print("\n🔠 Loading BERT tokenizer (bert-base-uncased)...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# === 4. Convert to HuggingFace Datasets ===
print("📦 Converting to HuggingFace Dataset...")
train_ds = Dataset.from_dict({'text': train_texts, 'label': train_labels})
test_ds = Dataset.from_dict({'text': test_texts, 'label': test_labels})
print(f"✅ HuggingFace Datasets ready: Train={train_ds.num_rows}, Test={test_ds.num_rows}")

# === 5. Tokenization ===
print("\n🧼 Tokenizing datasets...")
def tokenize_fn(batch):
    """Tokenize the ``'text'`` entry of ``batch`` with the module-level tokenizer.

    Every text is padded or truncated to exactly 128 tokens. Returns whatever
    the tokenizer call returns.
    """
    encode_options = {'padding': 'max_length', 'truncation': True, 'max_length': 128}
    texts = batch['text']
    return tokenizer(texts, **encode_options)

train_ds = train_ds.map(tokenize_fn, batched=True)
test_ds = test_ds.map(tokenize_fn, batched=True)
# Expose only the columns the model consumes, as torch tensors.
train_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
print("✅ Tokenization complete.")

# === 6. Load Model ===
print("\n🧠 Loading BERT model for classification...")
# No explicit .cuda(): Trainer moves the model to its training device itself,
# and an unconditional .cuda() fails on machines without a CUDA device.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# === 7. Training Arguments ===
print("⚙️ Setting up training arguments...")
training_args = TrainingArguments(
    output_dir="E:/amazon-sentiment-analyzer/backend/model/bert_output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    report_to="none"
)

# === 8. Trainer Setup ===
# NOTE(review): the test set is also the evaluation set, and
# load_best_model_at_end=True picks the checkpoint using it, so the test
# metrics reported below are not from untouched data — consider a separate
# validation split.
print("👨‍🏫 Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer
)

# === 9. Training ===
print("\n🚀 Starting training...")
start_time = time.time()
trainer.train()
print(f"✅ Training done in {time.time() - start_time:.2f} seconds.")

# === 10. Evaluation ===
print("\n📊 Evaluating model on test set...")
eval_results = trainer.evaluate()
print("📈 Evaluation Results:")
for k, v in eval_results.items():
    print(f"  {k}: {v:.4f}")

# === 11. Accuracy & Classification Report ===
print("\n🔍 Running predictions on test set...")
preds = trainer.predict(test_ds).predictions
# predictions holds one score per class for each sample; argmax over axis 1
# turns them into predicted class ids.
preds_cls = torch.argmax(torch.tensor(preds), axis=1).numpy()

acc = accuracy_score(test_labels, preds_cls)
print(f"\n✅ Test Accuracy: {acc:.4f}")
print("📋 Classification Report:")
print(classification_report(test_labels, preds_cls, target_names=["Negative", "Neutral", "Positive"]))

# === 12. Save Model and Tokenizer ===
save_dir = "E:/amazon-sentiment-analyzer/backend/model/bert_model"
print(f"\n💾 Saving model & tokenizer to: {save_dir}")
os.makedirs(save_dir, exist_ok=True)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("✅ Save complete.")


In [None]:
# # === 5. Compare All Models ===
# print("\n📊 Model Accuracies:")
# for name, acc in results.items():
#     print(f"  {name}: {acc:.4f}")

# best_model = max(results, key=results.get)
# print(f"\n🏆 Best model: {best_model} with accuracy {results[best_model]:.4f}")

In [None]:
import pandas as pd
import joblib
import os
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from scipy.sparse import vstack

# === Paths ===
data_dir = "E:/amazon-sentiment-analyzer/backend/data/"
train_path = os.path.join(data_dir, "train01.csv")
val_path = os.path.join(data_dir, "val01.csv")
test_path = os.path.join(data_dir, "test01.csv")

CHUNKSIZE = 500_000  # Adjust based on RAM


def _stream_train_texts(path, chunksize):
    """Yield training texts one document at a time, reading the CSV in chunks.

    Missing texts are yielded as empty strings. Only one chunk is held in
    memory at a time, so the raw corpus is never materialized as a single
    Python list. A running total is printed after each chunk.
    """
    collected = 0
    for chunk in pd.read_csv(path, chunksize=chunksize):
        texts = chunk['text'].fillna("").astype(str)
        yield from texts
        collected += len(texts)
        print(f"🧠 Collected: {collected:,} samples")


print("📂 Loading val/test...")
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

val_df['text'] = val_df['text'].fillna("").astype(str)
test_df['text'] = test_df['text'].fillna("").astype(str)

# === Fit TF-IDF on chunks from training set ===
print("🔠 Fitting TF-IDF vectorizer on full training data...")
vectorizer = TfidfVectorizer(max_features=100_000)

# fit() makes a single pass over its input, so a generator is sufficient and
# avoids keeping every raw training text in memory at once.
vectorizer.fit(_stream_train_texts(train_path, CHUNKSIZE))
print("✅ TF-IDF fit complete.")

# === Convert all train data to vectors (in chunks) ===
print("🔄 Transforming train data to vectors...")
X_train_chunks = []
y_train_chunks = []

reader = pd.read_csv(train_path, chunksize=CHUNKSIZE)
for chunk in reader:
    chunk['text'] = chunk['text'].fillna("").astype(str)
    X_chunk = vectorizer.transform(chunk['text'])
    X_train_chunks.append(X_chunk)
    y_train_chunks.extend(chunk['label'].tolist())
    print(f"📦 Processed chunk with {len(chunk):,} rows")

# Stack the per-chunk sparse matrices row-wise into one training matrix.
X_train = vstack(X_train_chunks)
y_train = y_train_chunks

print(f"✅ Final X_train shape: {X_train.shape}")

# === Transform val/test ===
# NOTE(review): X_val / y_val are computed here but not used by the training
# loop below.
X_val = vectorizer.transform(val_df['text'])
X_test = vectorizer.transform(test_df['text'])
y_val = val_df['label']
y_test = test_df['label']

# === Train Models ===
models = {
    "LogisticRegression": LogisticRegression(max_iter=200, solver='saga', n_jobs=-1),
    "LightGBM": LGBMClassifier(n_estimators=300, learning_rate=0.1, num_leaves=64, n_jobs=-1),
    "XGBoost": XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=6, n_jobs=-1, tree_method='gpu_hist')
}

# Maps model name -> {"accuracy": test accuracy, "time": fit time in seconds}.
results = {}

for name, model in models.items():
    print(f"\n🚀 Training: {name}")
    start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"✅ {name} Accuracy: {acc:.4f} | Time: {train_time:.2f}s")
    print(classification_report(y_test, y_pred))

    # Each model gets its own directory holding the model and a copy of the
    # fitted vectorizer.
    model_dir = f"E:/amazon-sentiment-analyzer/backend/model/{name.lower()}_model"
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, "model.pkl"))
    joblib.dump(vectorizer, os.path.join(model_dir, "vectorizer.pkl"))
    print(f"💾 Saved model to {model_dir}")

    results[name] = {"accuracy": acc, "time": train_time}

# === Summary ===
print("\n📊 Summary of Results:")
for name, metrics in results.items():
    print(f"{name}: Accuracy={metrics['accuracy']:.4f}, Time={metrics['time']:.2f}s")


📂 Loading val/test...
🔠 Fitting TF-IDF vectorizer on full training data...
🧠 Collected: 500,000 samples
🧠 Collected: 1,000,000 samples
🧠 Collected: 1,500,000 samples
🧠 Collected: 2,000,000 samples
🧠 Collected: 2,500,000 samples
🧠 Collected: 3,000,000 samples
🧠 Collected: 3,500,000 samples
🧠 Collected: 4,000,000 samples
🧠 Collected: 4,500,000 samples
🧠 Collected: 5,000,000 samples
🧠 Collected: 5,500,000 samples
🧠 Collected: 6,000,000 samples
🧠 Collected: 6,500,000 samples
🧠 Collected: 7,000,000 samples
🧠 Collected: 7,500,000 samples
🧠 Collected: 8,000,000 samples
🧠 Collected: 8,500,000 samples
🧠 Collected: 9,000,000 samples
🧠 Collected: 9,500,000 samples
🧠 Collected: 10,000,000 samples
🧠 Collected: 10,500,000 samples
🧠 Collected: 11,000,000 samples
🧠 Collected: 11,500,000 samples
🧠 Collected: 12,000,000 samples
🧠 Collected: 12,500,000 samples
🧠 Collected: 13,000,000 samples
🧠 Collected: 13,500,000 samples
🧠 Collected: 14,000,000 samples
🧠 Collected: 14,500,000 samples
🧠 Collected: 15,00

In [1]:
import pandas as pd
import joblib
import os
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from scipy.sparse import vstack

# Names used below that this cell's own import list does not provide.
from lightgbm import early_stopping
from sklearn.metrics import accuracy_score, classification_report

# === Paths ===
data_dir = "E:/amazon-sentiment-analyzer/backend/data/"
train_path = os.path.join(data_dir, "train01.csv")
val_path = os.path.join(data_dir, "val01.csv")
test_path = os.path.join(data_dir, "test01.csv")
CHUNKSIZE = 500_000

# === Load val/test ===
print("📂 Loading val/test...")
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)
val_df['text'] = val_df['text'].fillna("").astype(str)
test_df['text'] = test_df['text'].fillna("").astype(str)

# === Fit TF-IDF ===
print("🔠 Fitting TF-IDF vectorizer...")
vectorizer = TfidfVectorizer(max_features=50_000)
train_texts = []
# Only the first 100,000 rows are read here: the vocabulary and IDF weights
# are fitted on that subsample, not on the full training file.
reader = pd.read_csv(train_path, chunksize=CHUNKSIZE, nrows=100_000)  # Subsample for fitting
for chunk in reader:
    chunk['text'] = chunk['text'].fillna("").astype(str)
    train_texts.extend(chunk['text'].tolist())
vectorizer.fit(train_texts)
print("✅ TF-IDF fit complete.")

# === Transform train data ===
print("🔄 Transforming train data...")
X_train_chunks = []
y_train_chunks = []
reader = pd.read_csv(train_path, chunksize=CHUNKSIZE)
for i, chunk in enumerate(reader):
    start = time.time()
    chunk['text'] = chunk['text'].fillna("").astype(str)
    X_chunk = vectorizer.transform(chunk['text'])
    X_train_chunks.append(X_chunk)
    y_train_chunks.extend(chunk['label'].tolist())
    print(f"📦 Processed chunk {i+1} in {time.time() - start:.2f}s")
# Stack the per-chunk sparse matrices row-wise into one training matrix.
X_train = vstack(X_train_chunks)
y_train = y_train_chunks
print(f"✅ X_train shape: {X_train.shape}")

# === Transform val/test ===
X_val = vectorizer.transform(val_df['text'])
X_test = vectorizer.transform(test_df['text'])
y_val = val_df['label']
y_test = test_df['label']

# === Train Models ===
models = {
    # 'log_loss' is the logistic-regression loss; it is the spelling the
    # SGDClassifier cell later in this notebook uses as well.
    "SGDClassifier": SGDClassifier(loss='log_loss', max_iter=50, n_jobs=-1),
    "LightGBM": LGBMClassifier(n_estimators=100, learning_rate=0.1, num_leaves=31, n_jobs=-1),
    # Early stopping is configured on the estimator. No eval_metric is given,
    # so XGBoost monitors the default metric of the objective it selects.
    "XGBoost": XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, n_jobs=-1, tree_method='gpu_hist',
                             early_stopping_rounds=10)
}

# Maps model name -> {"accuracy": test accuracy, "time": fit time in seconds}.
results = {}
for name, model in models.items():
    print(f"\n🚀 Training: {name}")
    start = time.time()
    if name == "LightGBM":
        # LightGBM takes early stopping as a callback; training stops after 10
        # rounds without improvement on the validation set.
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='logloss',
                  callbacks=[early_stopping(stopping_rounds=10)])
    elif name == "XGBoost":
        # The validation set drives the early stopping configured above.
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    else:
        model.fit(X_train, y_train)
    train_time = time.time() - start
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"✅ {name} Accuracy: {acc:.4f} | Time: {train_time:.2f}s")
    print(classification_report(y_test, y_pred))
    # Each model gets its own directory holding the model and a copy of the
    # fitted vectorizer, both compressed.
    model_dir = f"E:/amazon-sentiment-analyzer/backend/model/{name.lower()}_model"
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, "model.pkl"), compress=3)
    joblib.dump(vectorizer, os.path.join(model_dir, "vectorizer.pkl"), compress=3)
    results[name] = {"accuracy": acc, "time": train_time}

# === Summary ===
print("\n📊 Summary of Results:")
# Highest accuracy first.
sorted_results = sorted(results.items(), key=lambda x: x[1]['accuracy'], reverse=True)
for name, metrics in sorted_results:
    print(f"{name}: Accuracy={metrics['accuracy']:.4f}, Time={metrics['time']:.2f}s")

📂 Loading val/test...
🔠 Fitting TF-IDF vectorizer...
✅ TF-IDF fit complete.
🔄 Transforming train data...
📦 Processed chunk 1 in 29.12s
📦 Processed chunk 2 in 24.26s
📦 Processed chunk 3 in 22.25s
📦 Processed chunk 4 in 24.77s
📦 Processed chunk 5 in 23.28s


KeyboardInterrupt: 

In [6]:
import pandas as pd
import joblib
import os
import time
import sklearn
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import psutil
import numpy as np

# === Check scikit-learn version ===
print(f"📦 scikit-learn version: {sklearn.__version__}")

# === Paths ===
data_dir = "E:/amazon-sentiment-analyzer/backend/data/"
train_path = os.path.join(data_dir, "train01.csv")
val_path = os.path.join(data_dir, "val01.csv")
test_path = os.path.join(data_dir, "test01.csv")
CHUNKSIZE = 100_000  # Suitable for 8GB RAM


def _clean_eval_split(frame, split_name):
    """Return a copy of ``frame`` keeping only rows with a numeric label.

    Labels are coerced to numbers, rows whose label cannot be parsed are
    dropped (with a printed count), and the remaining labels are cast to int.
    Texts and labels therefore stay aligned row for row.
    """
    numeric_labels = pd.to_numeric(frame['label'], errors='coerce')
    keep = numeric_labels.notna()
    dropped = int((~keep).sum())
    if dropped:
        print(f"⚠️ Dropping {dropped:,} {split_name} rows with non-numeric labels.")
    cleaned = frame.loc[keep].copy()
    cleaned['label'] = numeric_labels[keep].astype(int)
    return cleaned


# === Load val/test ===
print("📂 Loading val/test...")
try:
    val_df = pd.read_csv(val_path)
    test_df = pd.read_csv(test_path)
    val_df['text'] = val_df['text'].fillna("").astype(str).str.lower()  # Basic preprocessing
    test_df['text'] = test_df['text'].fillna("").astype(str).str.lower()
    print(f"Memory usage after loading: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")
except FileNotFoundError as e:
    raise FileNotFoundError(f"File not found: {e}") from e

# === Compute Class Weights ===
# The weights are estimated from the first `sample_size` rows of the training
# file only.
print("🔍 Computing class weights from sample...")
sample_size = 200_000  # Increased for better estimation
y_sample = []
reader = pd.read_csv(train_path, chunksize=CHUNKSIZE, nrows=sample_size)
for chunk in reader:
    try:
        chunk['label'] = pd.to_numeric(chunk['label'], errors='coerce')
        chunk = chunk.dropna(subset=['label'])
        chunk['label'] = chunk['label'].astype(int)
        y_sample.extend(chunk['label'].tolist())
    except Exception as e:
        print(f"⚠️ Error processing chunk for class weights: {e}")
if not y_sample:
    raise ValueError("No valid labels found in sample. Check 'label' column in train01.csv.")

# Inspect unique labels
unique_labels = np.unique(y_sample)
print(f"Unique labels in sample: {unique_labels}")
if len(unique_labels) < 2:
    raise ValueError(f"Expected at least 2 classes, found {len(unique_labels)}: {unique_labels}")

# Compute class weights
try:
    class_weights = compute_class_weight('balanced', classes=unique_labels, y=y_sample)
    class_weight_dict = {unique_labels[i]: class_weights[i] for i in range(len(unique_labels))}
    print(f"Class weights: {class_weight_dict}")
except ValueError as e:
    print(f"⚠️ Error computing class weights: {e}. Using equal weights.")
    class_weight_dict = {label: 1.0 for label in unique_labels}

# === Initialize Vectorizer ===
# HashingVectorizer keeps no fitted state, so it can transform each chunk
# without a separate fitting pass over the training data.
print("🔠 Initializing HashingVectorizer...")
vectorizer = HashingVectorizer(n_features=50_000, norm='l2', alternate_sign=False)  # Increased features
print("✅ Vectorizer ready.")

# === Initialize Model ===
model = SGDClassifier(
    loss='log_loss',
    max_iter=30,  # Increased for better convergence
    learning_rate='adaptive',
    eta0=0.001,  # Lowered for stability
    alpha=0.0001,  # Adjusted regularization
    n_jobs=-1
)

# === Train Model Incrementally ===
print("🚀 Training SGDClassifier on chunks...")
start = time.time()
for epoch in range(2):  # Two passes over data
    reader = pd.read_csv(train_path, chunksize=CHUNKSIZE)
    for i, chunk in enumerate(reader):
        chunk_start = time.time()
        try:
            chunk['text'] = chunk['text'].fillna("").astype(str).str.lower()
            chunk['label'] = pd.to_numeric(chunk['label'], errors='coerce')
            chunk = chunk.dropna(subset=['label'])
            chunk['label'] = chunk['label'].astype(int)
            X_chunk = vectorizer.transform(chunk['text'])
            y_chunk = chunk['label']
            # Labels absent from the weight sample have no class weight and are
            # treated as invalid. The mask is converted to a plain NumPy
            # boolean array before it indexes the sparse matrix.
            valid_idx = y_chunk.isin(list(class_weight_dict)).to_numpy()
            if not valid_idx.all():
                print(f"⚠️ Chunk {i+1} contains invalid labels. Filtering...")
                X_chunk = X_chunk[valid_idx]
                y_chunk = y_chunk[valid_idx]
            if len(y_chunk) == 0:
                print(f"⚠️ Chunk {i+1} has no valid labels. Skipping.")
                continue
            model.partial_fit(X_chunk, y_chunk, classes=unique_labels, sample_weight=[class_weight_dict[y] for y in y_chunk])
            print(f"📦 Epoch {epoch+1}, Chunk {i+1} with {len(chunk):,} rows in {time.time() - chunk_start:.2f}s")
            print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")
        except Exception as e:
            # Best effort: a failing chunk is reported and skipped so the
            # remaining chunks are still used for training.
            print(f"⚠️ Error in epoch {epoch+1}, chunk {i+1}: {e}")
train_time = time.time() - start
print(f"✅ Training complete in {train_time:.2f}s")

# === Transform and Evaluate Val/Test ===
print("🔄 Transforming val/test data...")
# Rows with unparseable labels are removed before vectorizing, so features and
# labels stay aligned and the integer cast cannot fail on NaN.
val_eval = _clean_eval_split(val_df, "validation")
test_eval = _clean_eval_split(test_df, "test")
X_val = vectorizer.transform(val_eval['text'])
X_test = vectorizer.transform(test_eval['text'])
y_val = val_eval['label']
y_test = test_eval['label']

print("📊 Evaluating on validation set...")
y_val_pred = model.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_acc:.4f}")

print("📊 Evaluating on test set...")
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"✅ SGDClassifier Accuracy: {acc:.4f} | Time: {train_time:.2f}s")
print(classification_report(y_test, y_pred, zero_division=0))

# === Save Model and Vectorizer ===
model_dir = "E:/amazon-sentiment-analyzer/backend/model/sgdclassifier_model"
os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, os.path.join(model_dir, "model.pkl"), compress=3)
joblib.dump(vectorizer, os.path.join(model_dir, "vectorizer.pkl"), compress=3)
print(f"💾 Saved model to {model_dir}")

📦 scikit-learn version: 1.6.1
📂 Loading val/test...
Memory usage after loading: 2520.18 MB
🔍 Computing class weights from sample...
Unique labels in sample: [0 1 2 3 4]
Class weights: {np.int64(0): np.float64(3.2229473853839337), np.int64(1): np.float64(4.4692737430167595), np.int64(2): np.float64(2.3066720489014476), np.int64(3): np.float64(1.1158534884369682), np.int64(4): np.float64(0.318849590676838)}
🔠 Initializing HashingVectorizer...
✅ Vectorizer ready.
🚀 Training SGDClassifier on chunks...
📦 Epoch 1, Chunk 1 with 100,000 rows in 4.90s
Memory usage: 2005.37 MB
📦 Epoch 1, Chunk 2 with 100,000 rows in 3.29s
Memory usage: 1906.19 MB
📦 Epoch 1, Chunk 3 with 100,000 rows in 2.55s
Memory usage: 1910.76 MB
📦 Epoch 1, Chunk 4 with 100,000 rows in 2.75s
Memory usage: 1912.59 MB
📦 Epoch 1, Chunk 5 with 100,000 rows in 2.40s
Memory usage: 1908.84 MB
📦 Epoch 1, Chunk 6 with 100,000 rows in 2.72s
Memory usage: 1906.91 MB
📦 Epoch 1, Chunk 7 with 100,000 rows in 2.57s
Memory usage: 1907.55 MB
