In [43]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, AutoTokenizer, AutoModel 
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm

In [37]:
tokenizer = AutoTokenizer.from_pretrained("ixa-ehu/berteus-base-cased")
model = BertForSequenceClassification.from_pretrained("ixa-ehu/berteus-base-cased", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ixa-ehu/berteus-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
data = pd.read_csv('AUGMENTED.csv') 
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [50]:
class SentimentAnalysisDataset(Dataset):

    def __init__(self, tokenizer, data, max_length=128):

        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length
        self.num_classes = 3  
        
    def __len__(self):

        return len(self.data)
    
    def __getitem__(self, idx):

        text = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        label = torch.tensor([label], dtype=torch.long)  

        return {'input_ids': encoding['input_ids'].squeeze(0), 
                'attention_mask': encoding['attention_mask'].squeeze(0), 
                'labels': label}

train_dataset = SentimentAnalysisDataset(tokenizer, train_data)
test_dataset = SentimentAnalysisDataset(tokenizer, test_data)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [51]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

epochs = 3

for epoch in range(epochs):

    model.train()
    train_dataloader_tqdm = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs}', leave=False)

    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].squeeze(1).to(device)  

        outputs = model(input_ids, 
                        attention_mask=attention_mask, 
                        labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_dataloader_tqdm.set_postfix(loss=loss)

    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

Epoch 1/3:   0%|          | 0/792 [03:47<?, ?it/s, loss=tensor(1.0123, grad_fn=<NllLossBackward0>)]

KeyboardInterrupt: 

In [None]:
model.eval()
correct, total = 0, 0

for batch in test_dataloader:
    
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    
    with torch.no_grad():

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0] 
        predictions = torch.argmax(logits, dim=1)
        
    correct += (predictions == labels).sum().item()
    total += labels.size(0)

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.4f}')