<a href="https://colab.research.google.com/github/HoseinNekouei/sentiment_analysis/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import math
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [8]:
# Constants
EPOCHS = 2
BATCH_SIZE = 32
CHECKPOINT = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load tokenizer and model
def load_model_and_tokenizer(checkpoint):
    """Load the tokenizer and model from the specified checkpoint."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, device_map='auto')
    model.to(DEVICE)
    return tokenizer, model

# Tokenize texts
def tokenize_texts(tokenizer, texts):
    """Tokenize a list of texts using the provided tokenizer."""
    if not isinstance(texts, list):
        texts = texts.tolist()

    encodings = tokenizer(
        texts,
        padding=True,
        max_length=256,
        truncation=True,
        return_tensors='pt'
    )
    return encodings

# Prepare datasets and dataloaders
def prepare_dataloaders(input_ids, attention_mask, labels, batch_size, train_idx, test_idx):
    """Prepare training and test dataloaders."""
    train_dataset = TensorDataset(input_ids[train_idx], attention_mask[train_idx], labels[train_idx])
    test_dataset = TensorDataset(input_ids[test_idx], attention_mask[test_idx], labels[test_idx])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    return train_loader, test_loader

# Train the model
def train_model(model, train_loader, optimizer, epochs, batch_size):
    """Train the model for the specified number of epochs."""
    model.train()
    for epoch in range(epochs):
        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids_batch, attention_mask_batch, labels_batch = [b.to(DEVICE) for b in batch]
            outputs = model(input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            if (i + 1) % 100 == 0:
                denominator = math.ceil(len(train_loader.dataset) / batch_size)
                print(f'[Epoch: {epoch + 1}] -> Batch: [{i + 1}/{denominator}]')

        print(f'[Epoch: {epoch + 1}] -> Batch: [{denominator}/{denominator}]')

# Evaluate the model
def evaluate_model(model, test_loader):
    """Evaluate the model on the validation set and return accuracy."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids_batch, attention_mask_batch, labels_batch = [b.to(DEVICE) for b in batch]
            outputs = model(input_ids_batch, attention_mask=attention_mask_batch)
            preds = torch.argmax(outputs.logits, dim=1)
            correct += torch.sum(preds == labels_batch).item()
            total += len(labels_batch)
    accuracy = correct / total
    return accuracy

# Main function for k-fold cross-validation
def kfold_cross_validation(train_data, checkpoint, epochs, batch_size, n_splits=5):
    """Perform k-fold cross-validation on the provided dataset."""
    # Load tokenizer and model
    tokenizer, model = load_model_and_tokenizer(checkpoint)
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Tokenize texts and prepare labels
    texts = train_data['text']
    labels = torch.tensor(train_data['labels'].tolist()).to(DEVICE)
    encodings = tokenize_texts(tokenizer, texts)
    input_ids = encodings['input_ids'].to(DEVICE)
    attention_mask = encodings['attention_mask'].to(DEVICE)

    # Initialize KFold
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracy_scores = []

    # Perform k-fold cross-validation
    for fold, (train_idx, test_idx) in enumerate(kfold.split(input_ids)):
        print(f"Fold {fold + 1}...")

        # Prepare dataloaders
        train_loader, test_loader = prepare_dataloaders(input_ids, attention_mask, labels, batch_size, train_idx, test_idx)

        # Train the model
        train_model(model, train_loader, optimizer, epochs, batch_size)

        # Evaluate the model
        fold_accuracy = evaluate_model(model, test_loader)
        accuracy_scores.append(fold_accuracy)
        print(f"Validation Accuracy: {fold_accuracy:.4f}")
        print('=' * 40)

    # Calculate and print final results
    accuracy_scores = torch.tensor(accuracy_scores)
    print(f"Cross-validation accuracy scores: {accuracy_scores}")
    print(f"Mean accuracy: {torch.mean(accuracy_scores):.4f}")
    print(f"Standard deviation: {torch.std(accuracy_scores):.4f}")

    # Save the model
    model_save_path = '/content/drive/MyDrive/Projects/Sentiment_Analysis/aspect_generation/senti_model_batch32.pt'
    torch.save(model, model_save_path)
    print(f"Model saved to {model_save_path}!")

In [9]:
if __name__ == "__main__":
    import pandas as pd
    from google.colab import drive
    drive.mount('/content/drive')

    !mkdir /content/dataset
    !cp /content/drive/MyDrive/Projects/Sentiment_Analysis/aspect_generation/Augmented_dataset_US_airline_Tweet.csv /content/dataset

    file_path= '/content/drive/MyDrive/Projects/Sentiment_Analysis/aspect_generation/Augmented_dataset_US_airline_Tweet.csv'
    # Load your training data
    train_data = pd.read_csv(file_path, nrows=5)

    # Perform k-fold cross-validation
    kfold_cross_validation(train_data, CHECKPOINT, EPOCHS, BATCH_SIZE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mkdir: cannot create directory ‘/content/dataset’: File exists


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: too many dimensions 'str'