In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset


In [None]:

# Define your own dataset class
class TweetDataset(Dataset):
    """Map-style dataset that pairs tweet strings with class labels.

    Items are tokenized lazily in ``__getitem__`` and returned as tensors
    padded/truncated to ``max_length`` tokens.

    Args:
        tweets: Indexable collection of tweet strings.
        labels: Indexable collection of class labels aligned with ``tweets``.
        tokenizer: Optional tokenizer to reuse (e.g. to share one instance
            between several datasets). When ``None``, the
            'distilbert-base-uncased' tokenizer is loaded.
        max_length: Number of tokens every encoded tweet is padded or
            truncated to. Defaults to 128.

    Raises:
        ValueError: If ``tweets`` and ``labels`` differ in length.
    """

    def __init__(self, tweets, labels, tokenizer=None, max_length=128):
        # Fail early: a length mismatch would otherwise surface as an
        # IndexError (or silently ignored labels) in the middle of training.
        if len(tweets) != len(labels):
            raise ValueError(
                f"tweets and labels must have the same length, "
                f"got {len(tweets)} and {len(labels)}"
            )
        self.tweets = tweets
        self.labels = labels
        self.max_length = max_length
        # Only load the pretrained tokenizer when the caller did not supply one.
        if tokenizer is None:
            tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Tokenize the tweet at ``idx`` and return it with its label.

        Returns:
            dict with keys 'input_ids' and 'attention_mask' (the tokenizer
            output with the leading batch dimension removed) and 'label'
            (a 0-dim ``torch.long`` tensor).
        """
        tweet = self.tweets[idx]
        label = self.labels[idx]
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated ``encode_plus``.
        encoding = self.tokenizer(
            tweet,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a batch dimension of size 1; drop only that
        # dimension so a length-1 sequence is not collapsed to a scalar.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            # Classification losses expect integer class indices.
            'label': torch.as_tensor(label, dtype=torch.long)
        }


In [None]:

# Define your own training and evaluation data
train_tweets = [...]  # List of training tweets
train_labels = [...]  # List of corresponding training labels
eval_tweets = [...]  # List of evaluation tweets
eval_labels = [...]  # List of corresponding evaluation labels

# Seed PyTorch's random number generator so that batch shuffling and any
# randomly initialised weights are repeatable across "Restart & Run All".
seed = 42
torch.manual_seed(seed)

# Create instances of your dataset
train_dataset = TweetDataset(train_tweets, train_labels)
eval_dataset = TweetDataset(eval_tweets, eval_labels)

# Define training parameters
batch_size = 32
epochs = 5
learning_rate = 2e-5

# Load pretrained DistilBERT model and adjust it for sequence classification
# (num_labels sets the size of the classification output).
model_name = 'distilbert-base-uncased'
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the device. The datasets stay where they are; each batch
# is moved to the device inside the training/evaluation loops.
model.to(device)
# Shuffle only the training data; evaluation order is kept fixed.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)


In [None]:

# Training loop: every epoch makes one full pass over train_dataloader,
# taking one optimizer step per mini-batch, then reports the mean batch loss.
for epoch in range(epochs):
    model.train()  # switch the model to training mode
    total_loss = 0
    for batch in train_dataloader:
        # Move the three tensors of this mini-batch onto the model's device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        )

        # Reset gradients left over from the previous step before the
        # forward pass; passing labels makes the model return the loss.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backpropagate, update the weights, then record this batch's loss.
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Avg. Training Loss: {avg_loss:.4f}")


In [None]:

    # Evaluation loop: run the model over eval_dataloader without gradient
    # tracking, accumulating the per-batch loss and the number of correct
    # predictions.
    model.eval()  # switch the model to evaluation mode
    eval_loss, eval_accuracy = 0, 0
    for batch in eval_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            eval_loss += outputs.loss.item()
            # The predicted class is the index of the largest logit per example.
            predicted_labels = torch.argmax(outputs.logits, dim=1)
            # NOTE: this accumulates a count of correct predictions, not a ratio.
            eval_accuracy += (predicted_labels == labels).sum().item()

    # eval_loss is a sum of per-batch losses; divide by the number of batches
    # to get an average. max(..., 1) avoids a ZeroDivisionError when the
    # evaluation loader is empty.
    avg_eval_loss = eval_loss / max(len(eval_dataloader), 1)
