In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

In [2]:
def edit_categories(x):
    """Translate a category name into its integer class id.

    The id of a name is its position in the tuple below
    ('info_news' -> 0 through 'others' -> 9).

    Args:
        x: The category value to encode (normally a string).

    Returns:
        int: The class id, or -1 when ``x`` equals none of the known names.
    """
    known_names = (
        'info_news',
        'celebrity',
        'plan',
        'requests',
        'rumors',
        'advice',
        'restrictions',
        'personal',
        'unrelated',
        'others',
    )
    # Compare with == (rather than hashing) so that any input, including
    # unhashable values, simply falls through to -1.
    for class_id, name in enumerate(known_names):
        if x == name:
            return class_id
    return -1

In [3]:
# build the pytorch dataset
class ArabertDataset(torch.utils.data.Dataset):
    """Pre-computed embeddings paired with stance and category labels.

    Every embedding is zero-padded along the sequence axis to the length of
    the longest one, so all items share the shape ``(max_len, hidden)`` and
    can be batched by a default ``DataLoader``.

    Attributes:
        embeddings: list of 2-D tensors, one per sample (not a single tensor).
        stance: 1-D tensor holding the CSV ``stance`` column shifted by +1.
        category: 1-D tensor of ids produced by ``edit_categories``
            (-1 for names it does not recognise).
    """

    def __init__(self,
                 embeddings_path='dataset/train_arabert_not_padded.pkl',
                 labels_path='dataset/train.csv'):
        """Load, pad and index the data.

        Args:
            embeddings_path: Pickle holding an integer-indexable collection of
                tensors shaped ``(1, seq_len, hidden)``.
            labels_path: CSV file with ``stance`` and ``category`` columns,
                one row per embedding, in the same order.

        Raises:
            ValueError: If the number of embeddings and label rows differ.
        """
        # NOTE(review): unpickling can execute arbitrary code; only load
        # files that come from a trusted source.
        raw_embeddings = pd.read_pickle(embeddings_path)
        sequences = [raw_embeddings[i] for i in range(len(raw_embeddings))]

        # default=0 keeps an empty collection from raising inside max().
        max_len = max((seq.shape[1] for seq in sequences), default=0)

        padded = []
        for seq in sequences:
            hidden = seq.shape[2]
            missing = max_len - seq.shape[1]
            if missing > 0:
                # new_zeros matches the dtype/device of the embedding so the
                # concatenation never mixes types or devices.
                pad = seq.new_zeros(1, missing, hidden)
                seq = torch.cat((seq, pad), dim=1)
            # Drop the leading singleton axis; reshape (unlike view) also
            # accepts non-contiguous input.
            padded.append(seq.reshape(max_len, hidden))

        labels = pd.read_csv(labels_path)
        if len(labels) != len(padded):
            raise ValueError(
                f'{len(padded)} embeddings but {len(labels)} label rows; '
                'the two files must describe the same samples'
            )

        self.embeddings = padded
        # Shift stance by one so the labels can serve as non-negative class
        # indices (presumably the raw values are -1/0/1 -- verify against
        # the CSV).
        self.stance = torch.tensor((labels['stance'] + 1).to_numpy())
        category_ids = labels['category'].apply(edit_categories).astype('int64')
        self.category = torch.tensor(category_ids.to_numpy())

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.stance)

    def __getitem__(self, idx):
        """Return ``(embedding, stance, category)`` for sample ``idx``."""
        return self.embeddings[idx], self.stance[idx], self.category[idx]

In [4]:
# Run on the GPU whenever CUDA is usable, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
# Wrap the training set in a loader that serves reshuffled mini-batches of 32.
train_dataset = ArabertDataset()
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)

# build the model
class RNN(torch.nn.Module):
    """Stacked Elman RNN followed by a linear classification head.

    Args:
        input_size: Feature size of each time step.
        hidden_size: Size of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = torch.nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.
        """
        # With no explicit initial state torch.nn.RNN starts from zeros that
        # match the input's dtype and device, so the module no longer depends
        # on a notebook-global `device` or allocates unused state tensors.
        out, _ = self.rnn(x)
        # Classify from the top layer's output at the final time step.
        # NOTE(review): if inputs are zero-padded at the end, this step is
        # computed after the padding -- confirm that is intended.
        out = self.fc(out[:, -1, :])
        return out

In [17]:
# train the model
NUM_EPOCHS = 10  # single source of truth for the loop bound and the log line

model = RNN(768, 256, 4, 3).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

model.train()
for epoch in range(NUM_EPOCHS):
    loss = None
    for i, (embedding, stance, category) in enumerate(tqdm(train_loader)):
        # The model lives on `device`; the batch must be moved there too,
        # otherwise the forward pass fails as soon as CUDA is available.
        embedding = embedding.to(device)
        stance = stance.to(device)

        outputs = model(embedding)
        loss = criterion(outputs, stance)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # TODO: calculate the epoch accuracy

    # Report once per epoch, after the progress bar has finished. The loss
    # shown is that of the last batch only; the guard covers an empty loader.
    if loss is not None:
        print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

100%|██████████| 219/219 [03:23<00:00,  1.07it/s]
  0%|          | 0/219 [00:00<?, ?it/s]

Epoch [1/10], Step [219/219], Loss: 0.4172


100%|██████████| 219/219 [02:53<00:00,  1.26it/s]
  0%|          | 0/219 [00:00<?, ?it/s]

Epoch [2/10], Step [219/219], Loss: 0.6149


100%|██████████| 219/219 [03:08<00:00,  1.16it/s]
  0%|          | 0/219 [00:00<?, ?it/s]

Epoch [3/10], Step [219/219], Loss: 0.3338


 82%|████████▏ | 179/219 [02:30<00:35,  1.12it/s]