In [1]:
import torch

device = 'mps' if torch.backends.mps.is_available() else 'cpu'
device

'mps'

In [2]:
import pandas as pd

train = pd.read_csv('./dataset/semeval_train.tsv', delimiter='\t')
test = pd.read_csv('./dataset/semeval_test.tsv', delimiter='\t')


In [3]:
train.head()

Unnamed: 0,id,text,label,comment
0,1,The system as described above has its greatest...,"Component-Whole(e2,e1)",Comment: Not a collection: there is structure ...
1,2,The <e1>child</e1> was carefully wrapped and b...,Other,Comment:
2,3,The <e1>author</e1> of a keygen uses a <e2>dis...,"Instrument-Agency(e2,e1)",Comment:
3,4,A misty <e1>ridge</e1> uprises from the <e2>su...,Other,Comment:
4,5,The <e1>student</e1> <e2>association</e2> is t...,"Member-Collection(e1,e2)",Comment:


In [4]:
test.head()

Unnamed: 0,id,text,label
0,8001,The most common <e1>audits</e1> were about <e2...,Message-Topic
1,8002,The <e1>company</e1> fabricates plastic <e2>ch...,Product-Producer
2,8003,The school <e1>master</e1> teaches the lesson ...,Instrument-Agency
3,8004,The suspect dumped the dead <e1>body</e1> into...,Entity-Destination
4,8005,Avian <e1>influenza</e1> is an infectious dise...,Cause-Effect


In [5]:
train.shape

(8000, 4)

In [6]:
test.shape

(2717, 3)

In [7]:
train['label'].unique()

array(['Component-Whole(e2,e1)', 'Other', 'Instrument-Agency(e2,e1)',
       'Member-Collection(e1,e2)', 'Cause-Effect(e2,e1)',
       'Entity-Destination(e1,e2)', 'Content-Container(e1,e2)',
       'Message-Topic(e1,e2)', 'Product-Producer(e2,e1)',
       'Member-Collection(e2,e1)', 'Entity-Origin(e1,e2)',
       'Cause-Effect(e1,e2)', 'Component-Whole(e1,e2)',
       'Message-Topic(e2,e1)', 'Product-Producer(e1,e2)',
       'Entity-Origin(e2,e1)', 'Content-Container(e2,e1)',
       'Instrument-Agency(e1,e2)', 'Entity-Destination(e2,e1)'],
      dtype=object)

In [13]:
import transformers
from transformers import RobertaModel, RobertaTokenizer

MAX_LEN = 256
TRAIN_BATCH_SIZE = 64
LEARNING_RATE = 1e-05

tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [16]:
with open('./dataset/semeval_label.tsv') as f:
    labels = [i.strip() for i in f.readlines()]

labels

['Other',
 'Component-Whole',
 'Cause-Effect',
 'Entity-Destination',
 'Member-Collection',
 'Message-Topic',
 'Entity-Origin',
 'Product-Producer',
 'Content-Container',
 'Instrument-Agency']

In [21]:
label2id = {
    'Other': 0,
    'Component-Whole': 1,
    'Cause-Effect': 2,
    'Entity-Destination': 3,
    'Member-Collection': 4,
    'Message-Topic': 5,
    'Entity-Origin': 6,
    'Product-Producer': 7,
    'Content-Container': 8,
    'Instrument-Agency': 9
}

10

In [22]:
from torch.utils.data import Dataset, DataLoader

class SemEvalDataset(Dataset):
    def __init__(self, data:pd.DataFrame, tokenizer:transformers.PreTrainedTokenizer, max_len:int) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.text = data['text']
        self.labels = data['label']
        self.max_len = max_len
    
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        inputs = self.tokenizer.encode(
            self.text[index],
            max_length=self.max_len,
            padding=True,
            add_special_tokens=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(label2id[self.labels[index]])
        }

In [33]:
train_set = SemEvalDataset(train, tokenizer, MAX_LEN)

In [35]:
print(train_set)

<__main__.SemEvalDataset object at 0x289466dd0>


In [36]:
training_loader = DataLoader(train_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

In [40]:
for i, data in training_loader:
    print(data)
    break

TypeError: SemEvalDataset.__getitem__() takes 1 positional argument but 2 were given