In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, AutoTokenizer, AutoModel 
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
# Hugging Face Hub identifier shared by the tokenizer and the encoder so the
# two are always loaded from the same checkpoint.
BERTEUS_CHECKPOINT = "ixa-ehu/berteus-base-cased"

# AutoTokenizer / AutoModel resolve the concrete classes from the checkpoint's
# configuration; the files are downloaded on first use.
tokenizer = AutoTokenizer.from_pretrained(BERTEUS_CHECKPOINT)
model = AutoModel.from_pretrained(BERTEUS_CHECKPOINT)

Downloading: 100%|██████████| 24.0/24.0 [00:00<00:00, 6.11kB/s]
Downloading: 100%|██████████| 450/450 [00:00<00:00, 338kB/s]
Downloading: 100%|██████████| 422k/422k [00:00<00:00, 4.12MB/s]
Downloading: 100%|██████████| 498M/498M [02:00<00:00, 4.12MB/s] 


In [4]:
# Read the full corpus into a DataFrame.
# Adjust the file name and format to match your own dataset.
data = pd.read_csv('AUGMENTED.csv')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
class SentimentAnalysisDataset(Dataset):
    """Map-style dataset that tokenizes one row of a table per item.

    Each item is built from the ``'text'`` and ``'label'`` entries of the row
    at the requested position in ``data``.
    """

    def __init__(self, tokenizer, data, max_length=128):
        """Store the tokenizer, the source table and the padded length.

        Args:
            tokenizer: Callable invoked as
                ``tokenizer(text, return_tensors=..., max_length=...,
                padding=..., truncation=...)`` whose result is indexed with
                ``'input_ids'`` and ``'attention_mask'``.
            data: Table supporting ``len()`` and positional ``.iloc[idx]``
                access, with ``'text'`` and ``'label'`` entries per row.
            max_length: Length passed to the tokenizer for padding/truncation.
        """
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx`` and return it as tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (leading
            dimension of the tokenizer output squeezed away) and ``'label'``
            (the row's label wrapped in ``torch.tensor``).
        """
        # Positional lookup: independent of the DataFrame's index labels.
        row = self.data.iloc[idx]
        text, label = row['text'], row['label']

        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )

        # The tokenizer returns a batch of one; drop that batch axis so the
        # DataLoader can stack items itself.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(label),
        }

# Wrap each split in the tokenizing dataset (default max_length applies).
train_dataset = SentimentAnalysisDataset(tokenizer=tokenizer, data=train_data)
test_dataset = SentimentAnalysisDataset(tokenizer=tokenizer, data=test_data)

# Training batches are reshuffled every epoch; evaluation keeps row order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    shuffle=False,
)