In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

In [2]:
def edit_categories(x):
    if x == 'info_news':
        return 0
    elif x == 'celebrity':
        return 1
    elif x == 'plan':
        return 2
    elif x == 'requests':
        return 3
    elif x == 'rumors':
        return 4
    elif x == 'advice':
        return 5
    elif x == 'restrictions':
        return 6
    elif x == 'personal':
        return 7
    elif x == 'unrelated':
        return 8
    elif x == 'others':
        return 9
    else:
        return -1

In [3]:
# build the pytorch dataset
class ArabertDataset(torch.utils.data.Dataset):
    def __init__(self, train=True, test=False):
        if train:
            embeddings = pd.read_pickle('dataset/train_arabert_not_padded.pkl')
        elif test:
            embeddings = pd.read_pickle('dataset/test_arabert_not_padded.pkl')
        else:
            embeddings = pd.read_pickle('dataset/val_arabert_not_padded.pkl')
        pad_embedding = torch.zeros(1, 1, 768)
        max_len = 0
        embeddings = embeddings.values
        for i in range(len(embeddings)):
            if embeddings[i].shape[1] > max_len:
                max_len = embeddings[i].shape[1]
        for i in range(len(embeddings)):
            if embeddings[i].shape[1] < max_len:
                pad = torch.zeros(1, max_len - embeddings[i].shape[1], 768).to(torch.device('cuda'))
                embeddings[i] = torch.cat((embeddings[i], pad), dim=1)
            embeddings[i] = embeddings[i].view(max_len, 768)

        if train:
            labels = pd.read_pickle('dataset/stances_arabert.pkl')
        elif test:
            labels = pd.read_pickle('dataset/test_stances_arabert.pkl')
        else:   
            labels = pd.read_pickle('dataset/val_stances_arabert.pkl')
        labels['category'] = labels['category'].apply(edit_categories)
        self.embeddings = embeddings # already a tensor
        self.stance = labels['stance']
        self.category = labels['category']
        self.stance = torch.tensor(self.stance.values)
        self.category = torch.tensor(self.category.values)

    def __len__(self):
        return len(self.stance)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.stance[idx], self.category[idx]

In [4]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
torch.cuda.is_available()

True

In [6]:
# build the pytorch dataloader
train_dataset = ArabertDataset()
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

# build the model
class RNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = torch.nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

In [7]:
# train the model
model = RNN(768, 256, 4, 3).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):
    for i, (embedding, stance, category) in enumerate(tqdm(train_loader)):
        embedding = embedding.to(device)
        stance = stance.to(device)
        category = category.to(device)

        outputs = model(embedding)
        loss = criterion(outputs, stance)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # calculate the total epoch accuracy
        _, predicted = torch.max(outputs.data, 1)
        total = stance.size(0)
        correct = (predicted == stance).sum().item()
        accuracy = correct / total

        if (i+1) % len(train_loader) == 0:
            print(f'Epoch [{epoch+1}/{10}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
            print(f'Epoch [{epoch+1}/{10}], Step [{i+1}/{len(train_loader)}], Accuracy: {accuracy:.4f}')

100%|██████████| 110/110 [00:02<00:00, 42.16it/s]
  5%|▍         | 5/110 [00:00<00:02, 43.80it/s]

Epoch [1/10], Step [110/110], Loss: 0.2272
Epoch [1/10], Step [110/110], Accuracy: 1.0000


100%|██████████| 110/110 [00:02<00:00, 42.16it/s]
  5%|▍         | 5/110 [00:00<00:02, 45.48it/s]

Epoch [2/10], Step [110/110], Loss: 0.3697
Epoch [2/10], Step [110/110], Accuracy: 0.9167


100%|██████████| 110/110 [00:02<00:00, 43.78it/s]
  5%|▍         | 5/110 [00:00<00:02, 43.05it/s]

Epoch [3/10], Step [110/110], Loss: 0.9107
Epoch [3/10], Step [110/110], Accuracy: 0.7500


100%|██████████| 110/110 [00:02<00:00, 43.41it/s]
  5%|▍         | 5/110 [00:00<00:02, 45.00it/s]

Epoch [4/10], Step [110/110], Loss: 0.2448
Epoch [4/10], Step [110/110], Accuracy: 1.0000


100%|██████████| 110/110 [00:02<00:00, 42.72it/s]
  5%|▍         | 5/110 [00:00<00:02, 42.15it/s]

Epoch [5/10], Step [110/110], Loss: 0.5887
Epoch [5/10], Step [110/110], Accuracy: 0.8333


100%|██████████| 110/110 [00:02<00:00, 44.16it/s]
  5%|▍         | 5/110 [00:00<00:02, 41.00it/s]

Epoch [6/10], Step [110/110], Loss: 0.6620
Epoch [6/10], Step [110/110], Accuracy: 0.7500


100%|██████████| 110/110 [00:02<00:00, 43.15it/s]
  5%|▍         | 5/110 [00:00<00:02, 43.40it/s]

Epoch [7/10], Step [110/110], Loss: 0.5516
Epoch [7/10], Step [110/110], Accuracy: 0.8333


100%|██████████| 110/110 [00:02<00:00, 44.16it/s]
  5%|▍         | 5/110 [00:00<00:02, 43.42it/s]

Epoch [8/10], Step [110/110], Loss: 0.6678
Epoch [8/10], Step [110/110], Accuracy: 0.8333


100%|██████████| 110/110 [00:02<00:00, 43.53it/s]
  5%|▍         | 5/110 [00:00<00:02, 41.50it/s]

Epoch [9/10], Step [110/110], Loss: 0.7528
Epoch [9/10], Step [110/110], Accuracy: 0.7500


100%|██████████| 110/110 [00:02<00:00, 43.31it/s]

Epoch [10/10], Step [110/110], Loss: 0.7512
Epoch [10/10], Step [110/110], Accuracy: 0.7500





In [8]:
# inference mode
model.eval()

# load the dev set
dev_dataset = ArabertDataset(train=False)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=64, shuffle=True)

# get accuracy on development set
with torch.no_grad():
    correct = 0
    total = 0
    for embedding, stance, category in dev_loader:
        embedding = embedding.to(device)
        stance = stance.to(device)
        category = category.to(device)

        outputs = model(embedding)
        _, predicted = torch.max(outputs.data, 1)
        total += stance.size(0)
        correct += (predicted == stance).sum().item()
    print(f'Accuracy of the model on the dev set: {100 * correct / total}%')


Accuracy of the model on the dev set: 80.4%
