In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, Subset
from tqdm import tqdm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint_folder = './checkpoints'

In [3]:
language_model = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(language_model)
device = "cpu"
# GPU can cause memory issues - using CPU for stability
if torch.cuda.is_available():
    device = "cuda"
device

'cuda'

In [2]:
# Load local TSV (expects columns 'text' and 'label')
import os
df = pd.read_csv("data/data.tsv", sep='\t')
if 'text' not in df.columns or 'label' not in df.columns:
    raise ValueError("data.tsv must contain 'text' and 'label' columns")
# If labels are strings, convert to integer categories
if df['label'].dtype == object:
    df['label'] = df['label'].astype('category').cat.codes
class LocalDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self._df = df.reset_index(drop=True)
    def to_pandas(self):
        return self._df
    def __len__(self):
        return len(self._df)
    def __getitem__(self, idx):
        row = self._df.iloc[idx]
        return {'text': row['text'], 'label': int(row['label'])}
dataset = {'train': LocalDataset(df)}

In [5]:
class TransformerClassifier(nn.Module):
    def __init__(self, model_name, n_classes):
        super(TransformerClassifier, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        layer_size = self.transformer.config.hidden_size

        self.classifer = nn.Sequential(
            nn.Linear(layer_size, n_classes),
            nn.Softmax(dim=1)
        )


    def forward(self, x, attention_mask):
        with torch.no_grad():
            x = self.transformer(input_ids=x, attention_mask=attention_mask)
        x = x.last_hidden_state[:, 0, :]
        x = self.classifer(x)
        return x


model = TransformerClassifier(language_model, 2).to(device)
print(model)

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TransformerClassifier(
  (transformer): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (L

In [6]:
batch_size = 32
n = len(dataset['train'])
train_data, validation_data, test_data= torch.utils.data.random_split(dataset['train'], [int(n * 0.7), int(n * 0.15), int(n * 0.15)])
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

for i in train_loader:
    print(i['text'][0])
    break

President Raul Castro called on Cubans on Monday to unite in swiftly rebuilding the Caribbean nation in the wake of Hurricane Irma, which killed at least 10 people during a devastating three-day rampage along the length of the island. The storm crashed into Cuba late on Friday, with sustained winds of than 157 miles per hour (253 km per hour). It tore along the island s northern shore for some 200 miles (322 km) - lashing tourist resorts on the island s pristine keys - before turning northward to batter Florida. In Havana, people set about removing debris from the streets on Monday and mopping up homes hit by widespread flooding. The hurricane - the first Category 5 storm to make landfall in Cuba since 1932 - tore off roofs, felled trees and downed electricity poles, leaving millions without power and water. State media said on Monday Irma had seriously damaged Cuba s already dilapidated sugar industry, flooding and flattening an extensive area of sugar cane.  Given the immensity of it

In [7]:
# Check Hugging Face dataset structure
print("\nLabel distribution in train:")
print(dataset['train'].to_pandas()['label'].value_counts().sort_index())


Label distribution in train:
label
0    15478
1    14522
Name: count, dtype: int64


In [8]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


def tokenize(text, device):
    tokens = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True
    )
    return tokens['input_ids'].to(device), tokens['attention_mask'].to(device)

In [None]:
start_epoch = 0
max_epochs = 2
save_snapshots = True

if start_epoch != 0:
    model.load_state_dict(torch.load(f"{checkpoint_folder}/epoch-{start_epoch}.pth"))


best_acc = 0

for t in range(start_epoch+1, max_epochs+1):
    print(f"epoch {t}: ", end='')


    # TRAIN
    model.train()
    for row in tqdm(train_loader):
        tokens, attention_mask = tokenize(row['text'], device)
        label = row["label"].to(device)

        loss_fn(model(tokens, attention_mask), label).backward()
        optimizer.step()
        optimizer.zero_grad()

    # VALIDATE
    model.eval()
    total_loss = 0
    correct = 0
    print(f"validation: ", end='')

    with torch.no_grad():
        for row in tqdm(validation_loader):
            tokens, attention_mask = tokenize(row['text'], device)
            label = row["label"].to(device)
            pred = model(tokens, attention_mask)
            correct += (pred.argmax(1) == label).type(torch.float).sum().item()
            total_loss += loss_fn(pred, label).item()

    avg_error = total_loss / len(validation_loader)
    accuracy = correct / len(validation_loader.dataset)
    print("error:", avg_error)
    print("accuracy:", accuracy)


    if save_snapshots and accuracy > best_acc:
        best_acc = accuracy
        torch.save(model.state_dict(),  f"{checkpoint_folder}/epoch-{t}.pth")
print("BEST ACC:", best_acc)


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/2 - roberta-base


Training:   2%|▏         | 10/657 [00:08<08:14,  1.31it/s]

  Batch 10: Loss = 0.6902


Training:   3%|▎         | 20/657 [00:15<07:52,  1.35it/s]

  Batch 20: Loss = 0.6823


Training:   5%|▍         | 30/657 [00:22<07:44,  1.35it/s]

  Batch 30: Loss = 0.6761


Training:   5%|▌         | 36/657 [00:27<07:40,  1.35it/s]

In [None]:
# TEST
def test(filename):
    model.load_state_dict(torch.load(filename, weights_only=True))
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for row in tqdm(test_loader):
            tokens, attention_mask = tokenize(row['text'], device)
            label = row["label"].to(device)
            pred = model(tokens, attention_mask)
            correct += (pred.argmax(1) == label).type(torch.float).sum().item()
            total_loss += loss_fn(pred, label).item()

    avg_error = total_loss / len(test_loader)
    accuracy = correct / len(test_loader.dataset)
    print("error:", avg_error)
    print("accuracy:", accuracy)
test(f"{checkpoint_folder}\epoch-4.pth")

100%|██████████| 141/141 [05:15<00:00,  2.24s/it]

error: 0.3429108549517097
accuracy: 0.9895555555555555





In [None]:
torch.cuda.empty_cache()