In [1]:
import inspect

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments

In [3]:
class SentimentDataset(Dataset):
    """Map-style dataset that pairs tokenized texts with their labels.

    All texts are tokenized once, up front, in ``__init__``; ``__getitem__``
    only converts the stored encodings for one row into tensors.

    Args:
        texts: Sequence of strings (list, tuple, pandas Series, ...). A single
            string is treated as a one-element sequence.
        labels: Sequence of labels, one per text, in the same order.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``
            that returns a mapping from field name to a per-text sequence.
        max_length: Maximum sequence length forwarded to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if isinstance(texts, str):
            texts = [texts]
        # Materialise both inputs as plain lists. A pandas Series that went
        # through a shuffle/split keeps its original index, so ``labels[idx]``
        # would look up by index label rather than by position.
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )

        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields for row ``idx`` plus a ``'labels'`` tensor."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.labels)

In [11]:
def _read_labelled_reviews(csv_path):
    """Read one review CSV and return a clean ``review``/``label`` frame.

    Rows with a missing review or sentiment are dropped. Sentiment is matched
    case- and whitespace-insensitively; ``positive`` maps to 1 and ``negative``
    to 0.

    Raises:
        ValueError: If a required column is missing, or a sentiment value is
            neither ``positive`` nor ``negative``.
    """
    frame = pd.read_csv(csv_path)

    missing = {'review', 'sentiment'} - set(frame.columns)
    if missing:
        raise ValueError(f"{csv_path}: missing required column(s) {sorted(missing)}")

    frame = frame.dropna(subset=['review', 'sentiment'])
    sentiment = frame['sentiment'].astype(str).str.strip().str.lower()

    # Fail loudly instead of silently labelling unexpected values as negative.
    unknown = sorted(set(sentiment) - {'positive', 'negative'})
    if unknown:
        raise ValueError(f"{csv_path}: unexpected sentiment value(s) {unknown}")

    return pd.DataFrame({
        'review': frame['review'].astype(str),
        'label': (sentiment == 'positive').astype(int),
    })


def load_and_preprocess_data(
    imdb_path="input/imdb_dataset.csv",
    backtranslated_path="backtranslated_train_reviews.csv",
    generated_path="generated_reviews.csv",
):
    """Load the original, back-translated and generated reviews as one frame.

    Args:
        imdb_path: CSV with the original IMDB reviews.
        backtranslated_path: CSV with the back-translated reviews.
        generated_path: CSV with the generated reviews.
            Each file must have ``review`` and ``sentiment`` columns.

    Returns:
        DataFrame with columns ``review`` (str) and ``label`` (1 = positive,
        0 = negative) and a fresh 0..n-1 index, rows ordered original,
        back-translated, generated.

    Raises:
        ValueError: If a file lacks a required column or contains a sentiment
            other than positive/negative.
    """
    sources = (imdb_path, backtranslated_path, generated_path)
    combined_reviews = pd.concat(
        [_read_labelled_reviews(path) for path in sources],
        ignore_index=True,
    )
    return combined_reviews

In [18]:
def train_model(seed=42):
    """Fine-tune ``roberta-base`` as a two-class sentiment classifier.

    Loads the combined review data, holds out 10% for validation, trains with
    the Hugging Face ``Trainer`` (evaluating and checkpointing every 500
    steps, reloading the checkpoint with the lowest validation loss at the
    end), then writes model and tokenizer to ``roberta_sentiment_final``.

    Args:
        seed: Seed used for the train/validation split, the initialisation of
            the new classification head, and the ``Trainer`` itself, so that
            repeated runs are comparable.
    """
    # Load and prepare data
    combined_data = load_and_preprocess_data()
    # NOTE(review): the combined frame mixes original reviews with
    # back-translated and generated ones before splitting. If the augmented
    # rows are derived from the original reviews, a random split can put
    # near-duplicates of training rows into validation — confirm, and split
    # before augmenting if so.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        combined_data['review'].tolist(),
        combined_data['label'].tolist(),
        test_size=0.1,
        random_state=seed,
    )

    # Initialize tokenizer and model. The classification head is freshly
    # initialised, so seed torch first to make that initialisation repeatable.
    torch.manual_seed(seed)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

    # Create datasets
    train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
    val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

    # Newer transformers releases name the evaluation-schedule argument
    # ``eval_strategy`` and deprecate ``evaluation_strategy``; use whichever
    # the installed version accepts.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_key = (
        "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir="roberta_sentiment",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        seed=seed,
        **{eval_strategy_key: "steps"},
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Train the model
    trainer.train()

    # Save final model together with its tokenizer so the output directory
    # can be loaded on its own.
    trainer.save_model("roberta_sentiment_final")
    tokenizer.save_pretrained("roberta_sentiment_final")

In [20]:
# Start fine-tuning. The output recorded below shows this run stopping with an
# ImportError because `accelerate>=0.26.0` was not available to the kernel.
train_model()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [16]:
!pip install transformers[torch]

Collecting accelerate>=0.26.0 (from transformers[torch])
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.1.1-py3-none-any.whl (333 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.1.1
