In [1]:
import pandas as pd

# Load data
df = pd.read_csv("/content/Data_1.csv")

# Remove neutral samples (sentiment=1)
df = df[df["sentiment"] != 1]

# Remap labels: 0 = negative, 2 = positive → 0 = negative, 1 = positive
df["sentiment"] = df["sentiment"].replace(2, 1)

# Check distribution
print(df["sentiment"].value_counts())
# Expected output:
# 0    527,381 (negative)
# 1    584,436 (positive)

sentiment
1    584436
0    527381
Name: count, dtype: int64


In [2]:
from sklearn.model_selection import train_test_split

texts = df["text"].tolist()
labels = df["sentiment"].tolist()

# Stratified split (80% train, 10% val, 10% test)
X_train, X_temp, y_train, y_temp = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights correctly
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Initialize model with proper config
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1}
).to(device)

# Optimized training setup
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Enhanced Dataset class with caching
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=96):  # Reduced max_len
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.encodings = None

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        if self.encodings is None:
            text = str(self.texts[idx])
            encoding = self.tokenizer(
                text,
                max_length=self.max_len,
                truncation=True,
                padding="max_length",
                return_tensors="pt"
            )
            return {
                "input_ids": encoding["input_ids"].flatten(),
                "attention_mask": encoding["attention_mask"].flatten(),
                "labels": torch.tensor(self.labels[idx], dtype=torch.long)
            }
        else:
            return self.encodings[idx]

# Create datasets with optimized tokenization
train_dataset = SentimentDataset(X_train, y_train, tokenizer)
val_dataset = SentimentDataset(X_val, y_val, tokenizer)

# Use larger batch sizes with gradient accumulation
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

# Training loop with mixed precision
scaler = torch.cuda.amp.GradScaler()
grad_accum_steps = 2

for epoch in range(3):
    # Training
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]")

    for step, batch in enumerate(progress_bar):
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        labels = batch["labels"].to(device)

        with torch.cuda.amp.autocast():
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, labels)
            loss = loss / grad_accum_steps

        scaler.scale(loss).backward()

        if (step + 1) % grad_accum_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item() * grad_accum_steps
        avg_loss = total_loss / (step + 1)
        progress_bar.set_postfix({"loss": f"{avg_loss:.4f}"})

    # Validation
    model.eval()
    val_preds, val_labels = [], []
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device)

            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, labels)
            val_loss += loss.item()

            preds = torch.argmax(outputs.logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    val_accuracy = (np.array(val_preds) == np.array(val_labels)).mean()
    val_f1 = f1_score(val_labels, val_preds, average="macro")

    print(f"\nEpoch {epoch+1} Results:")
    print(f"Train Loss: {avg_loss:.4f} | Val Loss: {val_loss/len(val_loader):.4f}")
    print(f"Val Accuracy: {val_accuracy:.4f} | Val F1: {val_f1:.4f}")
    print(classification_report(val_labels, val_preds, target_names=["negative", "positive"]))
    print("-" * 80)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Epoch 1 [Train]: 100%|██████████| 13898/13898 [32:21<00:00,  7.16it/s, loss=0.4331]
Epoch 1 [Val]: 100%|██████████| 869/869 [05:22<00:00,  2.69it/s]



Epoch 1 Results:
Train Loss: 0.4331 | Val Loss: 0.4189
Val Accuracy: 0.8031 | Val F1: 0.8031
              precision    recall  f1-score   support

    negative       0.76      0.86      0.80     52738
    positive       0.85      0.76      0.80     58444

    accuracy                           0.80    111182
   macro avg       0.81      0.81      0.80    111182
weighted avg       0.81      0.80      0.80    111182

--------------------------------------------------------------------------------


  with torch.cuda.amp.autocast():
Epoch 2 [Train]: 100%|██████████| 13898/13898 [32:12<00:00,  7.19it/s, loss=0.3831]
Epoch 2 [Val]: 100%|██████████| 869/869 [05:23<00:00,  2.69it/s]



Epoch 2 Results:
Train Loss: 0.3831 | Val Loss: 0.4084
Val Accuracy: 0.8140 | Val F1: 0.8134
              precision    recall  f1-score   support

    negative       0.81      0.80      0.80     52738
    positive       0.82      0.83      0.82     58444

    accuracy                           0.81    111182
   macro avg       0.81      0.81      0.81    111182
weighted avg       0.81      0.81      0.81    111182

--------------------------------------------------------------------------------


  with torch.cuda.amp.autocast():
Epoch 3 [Train]: 100%|██████████| 13898/13898 [32:12<00:00,  7.19it/s, loss=0.3413]
Epoch 3 [Val]: 100%|██████████| 869/869 [05:23<00:00,  2.69it/s]



Epoch 3 Results:
Train Loss: 0.3413 | Val Loss: 0.4229
Val Accuracy: 0.8130 | Val F1: 0.8124
              precision    recall  f1-score   support

    negative       0.81      0.80      0.80     52738
    positive       0.82      0.83      0.82     58444

    accuracy                           0.81    111182
   macro avg       0.81      0.81      0.81    111182
weighted avg       0.81      0.81      0.81    111182

--------------------------------------------------------------------------------


In [8]:
# Save model and tokenizer
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

# Optional: Save PyTorch weights
torch.save(model.state_dict(), "./sentiment_model/pytorch_model.bin")

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load from saved directory
model = AutoModelForSequenceClassification.from_pretrained("./sentiment_model").to(device)
tokenizer = AutoTokenizer.from_pretrained("./sentiment_model")

In [10]:
def predict_sentiment(text, model, tokenizer, device):
    model.eval()
    encoding = tokenizer(
        text,
        max_length=96,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**encoding)
        probs = torch.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs).item()

    return "positive" if pred == 1 else "negative", probs.cpu().numpy()[0]

In [11]:
test_cases = [
    ("This product is absolutely amazing! Worth every penny!", "positive"),
    ("Worst customer service I've ever experienced.", "negative"),
    ("The item arrived damaged but the replacement process was smooth.", "negative"),  # Mixed sentiment
    ("Meh, it's okay I guess.", "negative"),
    ("", "neutral"),  # Edge case: empty input
    ("The", "neutral"),  # Edge case: very short input
]

for text, expected in test_cases:
    if not text.strip():  # Handle empty input
        print(f"Input: '{text}' => Error: Empty input")
        continue

    pred, probs = predict_sentiment(text, model, tokenizer, device)
    confidence = probs[1] if pred == "positive" else probs[0]
    print(f"Input: {text}")
    print(f"Predicted: {pred} ({confidence:.2f}) | Expected: {expected}")
    print("-" * 80)

Input: This product is absolutely amazing! Worth every penny!
Predicted: positive (0.99) | Expected: positive
--------------------------------------------------------------------------------
Input: Worst customer service I've ever experienced.
Predicted: negative (1.00) | Expected: negative
--------------------------------------------------------------------------------
Input: The item arrived damaged but the replacement process was smooth.
Predicted: positive (0.87) | Expected: negative
--------------------------------------------------------------------------------
Input: Meh, it's okay I guess.
Predicted: negative (0.69) | Expected: negative
--------------------------------------------------------------------------------
Input: '' => Error: Empty input
Input: The
Predicted: negative (0.61) | Expected: neutral
--------------------------------------------------------------------------------


In [12]:
def evaluate_real_world(model, tokenizer, device):
    # Test with different categories
    categories = {
        "Sarcasm": [
            ("Oh great, another broken feature!", "negative"),
            ("Because I love waiting on hold for hours!", "negative")
        ],
        "Mixed Sentiments": [
            ("The food was excellent but the service ruined everything.", "negative"),
            ("Expensive but worth it for the quality.", "positive")
        ],
        "Emojis": [
            ("🔥🔥 Best purchase ever! 😍", "positive"),
            ("Never again 😤👎", "negative")
        ],
        "Typos": [
            ("This produc is amezing!", "positive"),
            ("Terible exprience!!!", "negative")
        ]
    }

    for category, examples in categories.items():
        print(f"\n{category} Testing:")
        for text, expected in examples:
            pred, _ = predict_sentiment(text, model, tokenizer, device)
            result = "✓" if pred == expected else "✗"
            print(f"{result} Text: {text}")
            print(f"Predicted: {pred} | Expected: {expected}")
            print("-" * 60)

In [14]:
# Find problematic predictions
error_cases = []
# Assuming you have labeled test data, create test_dataset here
# Example:
# test_dataset = SentimentDataset(X_test, y_test, tokenizer) # Replace X_test, y_test if necessary

for text, true_label in test_dataset:
    pred, _ = predict_sentiment(text, model, tokenizer, device)
    if pred != true_label:
        error_cases.append({
            "text": text,
            "true": true_label,
            "predicted": pred,
            "length": len(text.split()),
            "caps_ratio": sum(1 for c in text if c.isupper())/len(text) if text else 0
        })

# Analyze error patterns
import pandas as pd
error_df = pd.DataFrame(error_cases)
print("\nError Analysis:")
print(error_df.describe())
print("\nCommon Characteristics:")
print(error_df.groupby(["true", "predicted"]).size())

NameError: name 'test_dataset' is not defined