In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Labelled training tweets; missing text is replaced by an empty string so
# every row can be handed to the tokenizer as a str.
train_df = pd.read_csv("datasets/train.csv")
train_df['text'] = train_df['text'].fillna("")

# Unlabelled test tweets, cleaned the same way.
test_df = pd.read_csv("datasets/test.csv")
test_df['text'] = test_df['text'].fillna("")

In [7]:
# Fixed sequence length (in tokens) used when encoding texts.
MAX_LEN = 128

# Tokenizer loaded from the 'bert-base-uncased' checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(texts):
    """Encode an iterable of texts with the module-level ``tokenizer``.

    The texts are materialised into a list and encoded with padding and
    truncation to ``MAX_LEN`` tokens, requesting PyTorch tensors back.

    Args:
        texts: Iterable of text entries to encode.

    Returns:
        Whatever the tokenizer call returns for the whole batch.
    """
    batch = list(texts)
    options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': MAX_LEN,
        'return_tensors': 'pt',
    }
    return tokenizer(batch, **options)

In [8]:
class TweetDataset(Dataset):
    """Map-style dataset of pre-tokenized texts with optional integer labels.

    All texts are tokenized once in ``__init__`` via the notebook's
    ``tokenize`` helper; ``__getitem__`` then only slices the stored tensors.

    Args:
        texts: Iterable of texts, forwarded unchanged to ``tokenize``.
        targets: Optional iterable of labels, one per text. Any iterable
            works (list, NumPy array, pandas Series, ...); values are
            converted with ``int()`` and stored as a ``torch.long`` tensor.
            ``None`` (the default) builds an unlabelled dataset.

    Raises:
        ValueError: If ``targets`` is given but its length differs from the
            number of encoded texts.
    """

    def __init__(self, texts, targets=None):
        self.encodings = tokenize(texts)
        if targets is None:
            self.targets = None
        else:
            # Iterate instead of indexing: `targets[idx]` on a pandas Series
            # is label-based, so a shuffled index (e.g. straight out of
            # train_test_split) would raise KeyError or fetch the wrong row.
            # Converting once also avoids building a tensor on every access.
            self.targets = torch.tensor([int(t) for t in targets], dtype=torch.long)
            if len(self.targets) != len(self):
                raise ValueError(
                    f"Got {len(self.targets)} targets for {len(self)} texts"
                )

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return the encoding fields for example ``idx`` as a dict.

        A ``'labels'`` entry (0-dim ``torch.long`` tensor) is added only when
        the dataset was built with targets.
        """
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.targets is not None:
            item['labels'] = self.targets[idx]
        return item

In [9]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        model_name: Checkpoint identifier passed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
        num_classes: Number of output units of the final linear layer.
            Defaults to 2.
        dropout: Dropout probability applied to the pooled encoder output.
            Defaults to 0.3.
    """

    def __init__(self, model_name='bert-base-uncased', num_classes=2, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # The head's input width follows the loaded encoder's configuration,
        # so differently sized checkpoints work without further changes.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's raw scores for a batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            Output of ``self.fc`` applied to the dropped-out ``pooler_output``
            of the encoder; no softmax is applied here.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoded.pooler_output)
        return self.fc(pooled)

In [10]:
# Hold out 10% of the labelled data for validation, stratified on the target
# column and with a fixed random_state so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'],
    train_df['target'],
    test_size=0.1,
    stratify=train_df['target'],
    random_state=42,
)

# Training data: reshuffled by the loader.
train_dataset = TweetDataset(X_train.tolist(), y_train.tolist())
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Validation data: kept in its original order.
val_dataset = TweetDataset(X_val.tolist(), y_val.tolist())
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [11]:
# Pick the compute device in priority order: MPS first, then CUDA,
# falling back to the CPU when neither reports as available.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")


Using device: mps


In [12]:
def train(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Sum of the per-batch ``loss.item()`` values divided by ``len(loader)``.
    """
    model.train()
    running_loss = 0
    for batch in loader:
        optimizer.zero_grad()

        labels = batch['labels'].to(device)
        logits = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )

        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(loader)

In [13]:
def evaluate(model, loader, device):
    """Score ``model`` on every batch of ``loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; it is
            switched to eval mode first.
        loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, f1)`` from ``accuracy_score`` and ``f1_score``,
        each called with the collected labels first and the argmax
        predictions second.
    """
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for batch in loader:
            labels = batch['labels'].to(device)
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
            )
            # Predicted class = index of the largest score along dim 1.
            y_pred.extend(logits.argmax(dim=1).cpu().numpy())
            y_true.extend(labels.cpu().numpy())
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return accuracy, f1

In [14]:
# Settings for the fine-tuning run, named so they are easy to find and tune.
SEED = 42
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5

# Seed PyTorch's global RNG before the model is built so that the randomly
# initialised classifier head, the dropout masks and the loader's shuffle
# order start from the same state on every run. Bit-for-bit identical
# results across devices/backends are still not guaranteed.
torch.manual_seed(SEED)

# torch.device accepts either a string or an existing torch.device.
device = torch.device(device)
model = BERTClassifier().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

# One training pass followed by one validation pass per epoch.
for epoch in range(NUM_EPOCHS):
    loss = train(model, train_loader, optimizer, criterion, device)
    acc, f1 = evaluate(model, val_loader, device)
    print(f"Epoch {epoch+1} - Loss: {loss:.4f} | Val Acc: {acc:.4f} | Val F1: {f1:.4f}")


Epoch 1 - Loss: 0.4569 | Val Acc: 0.8268 | Val F1: 0.8018
Epoch 2 - Loss: 0.3286 | Val Acc: 0.8255 | Val F1: 0.8018
Epoch 3 - Loss: 0.2311 | Val Acc: 0.8451 | Val F1: 0.7986


In [15]:
# Unlabelled test set, batched in its original row order so predictions
# line up with test_df.
test_dataset = TweetDataset(test_df['text'].tolist())
test_loader = DataLoader(test_dataset, batch_size=16)

model.eval()
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask')
        )
        outputs = model(input_ids, attention_mask)
        # Predicted class = index of the largest score along dim 1.
        preds = outputs.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())

In [16]:
# Pair each test id with its predicted class and write the result to
# 'submission.csv' without the DataFrame index.
submission = pd.DataFrame({'id': test_df['id'], 'target': all_preds})
submission.to_csv('submission.csv', index=False)