In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from sklearn.metrics import classification_report
from torch.amp import autocast, GradScaler
import numpy as np

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

def prepare_data(train_size=0.6, max_samples=None, seed=42):
    """Load Sentiment140, binarise its labels and return a stratified train/validation split.

    Args:
        train_size: Accepted for backward compatibility but not used by this
            function; the validation share is fixed at 20% (``test_size=0.2``).
        max_samples: If truthy, draw a class-balanced random subset of the
            training split (``max_samples // 2`` examples per class, capped by
            the size of the smaller class) before splitting. ``None`` or ``0``
            keeps the whole training split.
        seed: Seed for the subset sampling so repeated runs select the same
            examples. Pass ``None`` for a non-deterministic draw.

    Returns:
        ``(train_texts, val_texts, train_labels, val_labels)`` as produced by
        ``train_test_split``.
    """
    dataset = load_dataset('sentiment140')

    # Convert sentiment labels from [0, 4] to [0, 1]
    # 0 = negative (0), 4 = positive (1)
    dataset = dataset.map(lambda x: {'sentiment': 1 if x['sentiment'] == 4 else 0})

    train_data = dataset['train']
    if max_samples:
        # A dedicated generator keeps the draw reproducible and leaves the
        # global NumPy RNG state untouched.
        rng = np.random.default_rng(seed)

        # Read the label column once and locate each class with a vectorised scan.
        all_labels = np.asarray(train_data['sentiment'])
        pos_indices = np.flatnonzero(all_labels == 1)
        neg_indices = np.flatnonzero(all_labels == 0)

        # Equal number of examples per class, never more than the rarer class has.
        samples_per_class = min(len(pos_indices), len(neg_indices), max_samples // 2)
        pos_indices = rng.choice(pos_indices, samples_per_class, replace=False)
        neg_indices = rng.choice(neg_indices, samples_per_class, replace=False)

        # Interleave the two classes before selecting rows.
        selected_indices = np.concatenate([pos_indices, neg_indices])
        rng.shuffle(selected_indices)
        train_data = train_data.select(selected_indices)

    # Fetch each column once; the labels double as the stratification target.
    texts = train_data['text']
    labels = train_data['sentiment']

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels  # Keep the class ratio identical in both splits
    )

    return train_texts, val_texts, train_labels, val_labels

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one (text, label) pair per lookup.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the
    tokenizer output with its leading batch dimension removed) and ``labels``
    (a scalar ``torch.long`` tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Coerce defensively so non-str texts / non-int labels still work.
        raw_text = str(self.texts[idx])
        target = int(self.labels[idx])

        tokenizer_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        encoded = self.tokenizer(raw_text, **tokenizer_kwargs)

        # The tokenizer is asked for a batch of one; drop that batch axis.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

def _run_train_epoch(model, train_loader, criterion, optimizer, scaler):
    """Run one optimisation pass over ``train_loader``.

    Returns ``(summed per-batch loss, correct predictions, examples seen)``.
    """
    model.train()
    # Only turn autocast on when the module-level `device` is actually CUDA, so
    # the CPU fallback runs in plain fp32 without requesting CUDA autocast.
    use_amp = device.type == 'cuda'
    loss_sum = 0
    correct = 0
    seen = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        with autocast(device.type, enabled=use_amp):
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

        # Scale the loss before backward, then let the scaler drive the optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        loss_sum += loss.item()
        _, predicted = torch.max(outputs.logits, 1)
        seen += labels.size(0)
        correct += (predicted == labels).sum().item()

    return loss_sum, correct, seen


def _run_validation(model, val_loader, criterion):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Returns ``(summed per-batch loss, correct predictions, examples seen,
    predicted class ids, true class ids)``; the last two are flat Python lists.
    """
    model.eval()
    loss_sum = 0
    correct = 0
    seen = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss_sum += criterion(outputs.logits, labels).item()

            _, predicted = torch.max(outputs.logits, 1)
            seen += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_predictions.extend(predicted.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    return loss_sum, correct, seen, all_predictions, all_labels


def train_model(model, train_loader, val_loader, criterion, optimizer, n_epochs, scaler, patience=3):
    """Fine-tune ``model`` with per-epoch validation, checkpointing and early stopping.

    Uses the module-level ``device`` for tensor placement. Whenever validation
    accuracy improves, a checkpoint (epoch, model and optimizer state, best
    accuracy) is written to ``best_model.pth`` in the working directory.
    Training stops early after ``patience`` consecutive epochs without a new
    best validation accuracy. The model is not reloaded from the checkpoint, so
    on return it holds the weights of the last epoch that ran.

    Args:
        model: Model whose forward output exposes ``.logits``.
        train_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` tensors; must support ``len()``.
        val_loader: Same batch format, used for validation; must support ``len()``.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped through ``scaler``.
        n_epochs: Maximum number of epochs.
        scaler: Gradient scaler used for the backward pass and optimizer step.
        patience: Number of non-improving epochs tolerated before stopping.

    Returns:
        Dict with keys ``train_loss``, ``train_acc``, ``val_loss``, ``val_acc``;
        each maps to a list with one entry per completed epoch. Losses are
        means over batches, accuracies are percentages.
    """
    best_val_acc = 0
    no_improvement_epochs = 0
    history = {
        'train_loss': [], 'train_acc': [],
        'val_loss': [], 'val_acc': []
    }

    for epoch in range(n_epochs):
        total_train_loss, train_correct, train_total = _run_train_epoch(
            model, train_loader, criterion, optimizer, scaler
        )
        total_val_loss, val_correct, val_total, all_predictions, all_labels = _run_validation(
            model, val_loader, criterion
        )

        # Calculate metrics
        train_loss = total_train_loss / len(train_loader)
        train_acc = (train_correct / train_total) * 100
        val_loss = total_val_loss / len(val_loader)
        val_acc = (val_correct / val_total) * 100

        # Store metrics
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        # Print metrics
        print(f'\nEpoch {epoch+1}/{n_epochs}')
        print('-' * 60)
        print(f'Training Loss: {train_loss:.4f}')
        print(f'Training Accuracy: {train_acc:.2f}%')
        print(f'Validation Loss: {val_loss:.4f}')
        print(f'Validation Accuracy: {val_acc:.2f}%')
        print('\nClassification Report:')
        print(classification_report(all_labels, all_predictions))

        # Early stopping: checkpoint on improvement, otherwise count towards patience.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            no_improvement_epochs = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_accuracy': best_val_acc,
            }, 'best_model.pth')
            print(f'New best model saved with validation accuracy: {val_acc:.2f}%')
        else:
            no_improvement_epochs += 1
            if no_improvement_epochs >= patience:
                print(f'Early stopping triggered at epoch {epoch+1}')
                break

    return history

def main(max_samples=100000, batch_size=32, lr=2e-5, n_epochs=5, patience=3, seed=42):
    """Fine-tune ``bert-base-uncased`` for binary sentiment classification.

    Prepares the data, builds the tokenizer, model and data loaders, then runs
    ``train_model``. Uses the module-level ``device``.

    Args:
        max_samples: Size of the class-balanced subset passed to ``prepare_data``.
        batch_size: Batch size for both the training and validation loaders.
        lr: Learning rate for AdamW.
        n_epochs: Maximum number of training epochs.
        patience: Early-stopping patience forwarded to ``train_model``.
        seed: Seed applied to NumPy and PyTorch before any random work.
            Pass ``None`` to leave the RNGs unseeded.

    Returns:
        The training history returned by ``train_model``.
    """
    # Seed up front: subset sampling, the newly initialised classifier head and
    # the shuffled loader all draw random numbers.
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Prepare data
    train_texts, val_texts, train_labels, val_labels = prepare_data(max_samples=max_samples)

    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=2
    ).to(device)

    # Create datasets and dataloaders
    train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
    val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2  # Reduced from 4 to avoid warnings
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        num_workers=2
    )

    # Initialize training components
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    # Only enable loss scaling when actually training on CUDA.
    scaler = GradScaler('cuda', enabled=device.type == 'cuda')

    # Train model
    history = train_model(
        model,
        train_loader,
        val_loader,
        criterion,
        optimizer,
        n_epochs=n_epochs,
        scaler=scaler,
        patience=patience
    )

    return history

if __name__ == "__main__":
    main()

Generating train split:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/498 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Map:   0%|          | 0/498 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/5
------------------------------------------------------------
Training Loss: 0.3916
Training Accuracy: 82.24%
Validation Loss: 0.3614
Validation Accuracy: 84.14%

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.88      0.85     10000
           1       0.87      0.81      0.84     10000

    accuracy                           0.84     20000
   macro avg       0.84      0.84      0.84     20000
weighted avg       0.84      0.84      0.84     20000

New best model saved with validation accuracy: 84.14%

Epoch 2/5
------------------------------------------------------------
Training Loss: 0.2827
Training Accuracy: 88.25%
Validation Loss: 0.3598
Validation Accuracy: 84.25%

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.85     10000
           1       0.85      0.83      0.84     10000

    accuracy                           0.84     20000
   macro 

In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Ensure you're using the right device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the tokenizer and the model architecture; the fine-tuned weights are
# loaded on top of this freshly initialised classifier head below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
).to(device)

# Load the trained model weights. weights_only=True restricts unpickling to
# tensors and plain containers instead of executing arbitrary pickled objects.
checkpoint = torch.load('best_model.pth', map_location=device, weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Define function to preprocess and make predictions
def predict_sentiment(sentences, tokenizer, model):
    """Classify ``sentences`` and return a human-readable label for each one.

    Uses the module-level ``device`` for the tokenized inputs.

    Args:
        sentences: Text (or list of texts) passed straight to ``tokenizer``.
        tokenizer: Callable producing model inputs; called with truncation and
            padding to 128 tokens and ``return_tensors='pt'``.
        model: Model whose forward output exposes ``.logits``.

    Returns:
        List of ``'Negative'`` (class 0) / ``'Positive'`` (any other class)
        strings, one per input row.
    """
    inputs = tokenizer(
        sentences,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        _, predictions = torch.max(outputs.logits, 1)

    # Convert to plain Python ints first, rather than truth-testing one 0-d
    # tensor per element while iterating the prediction tensor.
    sentiment_labels = ['Negative' if label == 0 else 'Positive' for label in predictions.tolist()]
    return sentiment_labels

# Hand-written probe sentences (with emojis) to sanity-check the fine-tuned model
custom_sentences = [
    "I love this! 😊❤️",
    "This is terrible... 😡👎",
    "I had a great day 😍🎉",
    "I can't stand this anymore 😤😞",
    "Absolutely amazing! 👍😊",
    "This is disappointing 😢",
]

# Classify the whole batch at once, then report one line per sentence
predictions = predict_sentiment(custom_sentences, tokenizer, model)
for sentence, prediction in zip(custom_sentences, predictions):
    report_line = f"Sentence: '{sentence}' => Sentiment: {prediction}"
    print(report_line)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load('best_model.pth', map_location=device)


Sentence: 'I love this! 😊❤️' => Sentiment: Positive
Sentence: 'This is terrible... 😡👎' => Sentiment: Negative
Sentence: 'I had a great day 😍🎉' => Sentiment: Positive
Sentence: 'I can't stand this anymore 😤😞' => Sentiment: Negative
Sentence: 'Absolutely amazing! 👍😊' => Sentiment: Positive
Sentence: 'This is disappointing 😢' => Sentiment: Negative


In [12]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score

# Ensure you're using the right device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the tokenizer and the model architecture; the fine-tuned weights are
# loaded on top of this freshly initialised classifier head below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
).to(device)

# Load the trained model weights. weights_only=True restricts unpickling to
# tensors and plain containers instead of executing arbitrary pickled objects.
checkpoint = torch.load('best_model.pth', map_location=device, weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Predict a numeric class id for every sentence
def predict_sentiment(sentences, tokenizer, model):
    """Return the predicted class id for each input as a NumPy array.

    The inputs are tokenized with truncation and padding to 128 tokens, moved
    to the module-level ``device``, and scored without gradient tracking. The
    result is the index of the largest logit per row, copied back to the CPU.
    """
    tokenizer_kwargs = dict(
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt',
    )
    batch = tokenizer(sentences, **tokenizer_kwargs).to(device)

    with torch.no_grad():
        logits = model(**batch).logits
        class_ids = torch.max(logits, 1)[1]

    return class_ids.cpu().numpy()

# Probe sentences and, below, the class each one is expected to get
# (0 = Negative, 1 = Positive)
custom_sentences = [
    "I love this! 😊❤️",
    "This is terrible... 😡👎",
    "I had a great day 😍🎉",
    "I can't stand this anymore 😤😞",
    "Absolutely amazing! 👍😊",
    "This is disappointing 😢",
]

# Expected class per sentence, in the same order as custom_sentences
true_labels = [1, 0, 1, 0, 1, 0]

# Score the batch and compare against the expected classes
predictions = predict_sentiment(custom_sentences, tokenizer, model)
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy on custom sentences: {accuracy * 100:.2f}%")

# Per-sentence breakdown: predicted vs. expected label
for sentence, prediction, true_label in zip(custom_sentences, predictions, true_labels):
    sentiment = "Negative" if prediction != 1 else "Positive"
    expected = "Negative" if true_label != 1 else "Positive"
    print(f"Sentence: '{sentence}' => Predicted: {sentiment}, True: {expected}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load('best_model.pth', map_location=device)


Accuracy on custom sentences: 100.00%
Sentence: 'I love this! 😊❤️' => Predicted: Positive, True: Positive
Sentence: 'This is terrible... 😡👎' => Predicted: Negative, True: Negative
Sentence: 'I had a great day 😍🎉' => Predicted: Positive, True: Positive
Sentence: 'I can't stand this anymore 😤😞' => Predicted: Negative, True: Negative
Sentence: 'Absolutely amazing! 👍😊' => Predicted: Positive, True: Positive
Sentence: 'This is disappointing 😢' => Predicted: Negative, True: Negative
