In [1]:
import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable 
from tqdm import tqdm

In [2]:
def edit_categories(x):
    if x == 'info_news':
        return 0
    elif x == 'celebrity':
        return 1
    elif x == 'plan':
        return 2
    elif x == 'requests':
        return 3
    elif x == 'rumors':
        return 4
    elif x == 'advice':
        return 5
    elif x == 'restrictions':
        return 6
    elif x == 'personal':
        return 7
    elif x == 'unrelated':
        return 8
    elif x == 'others':
        return 9
    else:
        return -1

In [3]:
'''

# build the pytorch dataset
class ArabertDataset(torch.utils.data.Dataset):
    def __init__(self, dataset_path):
        dataset = pd.read_pickle(dataset_path)
        pad_embedding = torch.zeros(1, 768)
        max_len = 0
        embeddings = dataset['embeddings'].values
        for i in range(len(embeddings)):
            if embeddings[i].shape[0] > max_len:
                max_len = embeddings[i].shape[0]
        for i in range(len(embeddings)):
            if embeddings[i].shape[0] < max_len:
                pad = torch.zeros(max_len - embeddings[i].shape[0], 768).to(torch.device('cuda'))
                embeddings[i] = torch.cat((embeddings[i], pad), dim=0)
        categories = dataset['category'].apply(edit_categories)
        self.embeddings = embeddings # already a tensor
        self.stance = dataset['stance']
        self.category = categories
        self.stance = torch.tensor(self.stance.values)
        self.category = torch.tensor(self.category.values)



    def __len__(self):
        return len(self.stance)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.stance[idx], self.category[idx]

'''

In [None]:
# build the pytorch dataset
class ArabertDataset(torch.utils.data.Dataset):
    def __init__(self):
        embeddings = pd.read_pickle('dataset/train_arabert_not_padded.pkl')
        pad_embedding = torch.zeros(1, 1, 768)
        max_len = 0
        for i in range(len(embeddings)):
            if embeddings[i].shape[1] > max_len:
                max_len = embeddings[i].shape[1]
        for i in range(len(embeddings)):
            if embeddings[i].shape[1] < max_len:
                pad = torch.zeros(1, max_len - embeddings[i].shape[1], 768)
                embeddings[i] = torch.cat((embeddings[i], pad), dim=1)
            embeddings[i] = embeddings[i].view(max_len, 768)

        labels = pd.read_csv('dataset/train.csv').drop('text', axis=1, inplace=False)
        labels['category'] = labels['category'].apply(edit_categories)
        #self.embeddings = embeddings # already a tensor
        self.stance = labels['stance']

        self.stance = self.stance + 1
        self.category = labels['category']
        #self.stance = torch.tensor(self.stance)
        self.category = torch.tensor(self.category)


        self.embeddings = Variable(embeddings)              # I saw someone doing this
        self.stance = Variable(torch.Tensor(self.stance))


    def __len__(self):
        return len(self.stance)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.stance[idx], self.category[idx]

In [4]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
torch.cuda.is_available()

True

In [6]:
# build the pytorch dataloader
train_dataset = ArabertDataset('output/train_1_arabert.pkl')
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

'''
# build the model
class RNN(torch.nn.Module):

        def __init__(self, input_size, hidden_size, num_layers, num_classes):
            super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = torch.nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, num_classes)




    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

    '''



In [None]:

class RNN(torch.nn.Module):
#I also saw him doing this

    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length):
        super(RNN, self).__init__()
        self.num_classes = num_classes #number of classes
        self.num_layers = num_layers #number of layers
        self.input_size = input_size #input size
        self.hidden_size = hidden_size #hidden state
        self.seq_length = seq_length #sequence length

        self.lstm = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True) #lstm
        self.fc_1 =  torch.nn.Linear(hidden_size, 128) #fully connected 1
        self.fc = torch.nn.Linear(128, num_classes) #fully connected last layer

        self.relu = torch.nn.ReLU()
    
    def forward(self,x):
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)) #hidden state
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)) #internal state
        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(x, (h_0, c_0)) #lstm with input, hidden, and internal state
        hn = hn.view(-1, self.hidden_size) #reshaping the data for Dense layer next
        out = self.relu(hn)
        out = self.fc_1(out) #first Dense
        out = self.relu(out) #relu
        out = self.fc(out) #Final Output
        return out

In [7]:
# train the stance_model
stance_model = RNN(768, 1024, 2, 3).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(stance_model.parameters(), lr=1e-3)
stance_model.train()
for epoch in range(10):
    for i, (embedding, stance, category) in enumerate(tqdm(train_loader)):
        embedding = embedding.to(device)
        stance = stance.to(device)
        category = category.to(device)

        outputs = stance_model(embedding)
        loss = criterion(outputs, stance)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # calculate the total epoch accuracy
        _, predicted = torch.max(outputs.data, 1)
        total = stance.size(0)
        correct = (predicted == stance).sum().item()
        accuracy = correct / total

        if (i+1) % len(train_loader) == 0:
            print(f'Epoch [{epoch+1}/{10}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
            print(f'Epoch [{epoch+1}/{10}], Step [{i+1}/{len(train_loader)}], Accuracy: {accuracy:.4f}')

100%|██████████| 110/110 [00:10<00:00, 10.76it/s]
  2%|▏         | 2/110 [00:00<00:09, 10.91it/s]

Epoch [1/10], Step [110/110], Loss: 1.0126
Epoch [1/10], Step [110/110], Accuracy: 0.5000


100%|██████████| 110/110 [00:10<00:00, 10.81it/s]
  2%|▏         | 2/110 [00:00<00:09, 10.86it/s]

Epoch [2/10], Step [110/110], Loss: 0.8392
Epoch [2/10], Step [110/110], Accuracy: 0.6667


100%|██████████| 110/110 [00:10<00:00, 10.62it/s]
  2%|▏         | 2/110 [00:00<00:10, 10.07it/s]

Epoch [3/10], Step [110/110], Loss: 0.8901
Epoch [3/10], Step [110/110], Accuracy: 0.7500


100%|██████████| 110/110 [00:10<00:00, 10.65it/s]
  2%|▏         | 2/110 [00:00<00:10, 10.67it/s]

Epoch [4/10], Step [110/110], Loss: 0.5482
Epoch [4/10], Step [110/110], Accuracy: 0.8333


100%|██████████| 110/110 [00:10<00:00, 10.64it/s]
  2%|▏         | 2/110 [00:00<00:10, 10.58it/s]

Epoch [5/10], Step [110/110], Loss: 0.7098
Epoch [5/10], Step [110/110], Accuracy: 0.7500


100%|██████████| 110/110 [00:10<00:00, 10.62it/s]
  2%|▏         | 2/110 [00:00<00:10, 10.39it/s]

Epoch [6/10], Step [110/110], Loss: 0.4360
Epoch [6/10], Step [110/110], Accuracy: 0.8333


100%|██████████| 110/110 [00:10<00:00, 10.65it/s]
  2%|▏         | 2/110 [00:00<00:10, 10.64it/s]

Epoch [7/10], Step [110/110], Loss: 0.1548
Epoch [7/10], Step [110/110], Accuracy: 1.0000


100%|██████████| 110/110 [00:10<00:00, 10.64it/s]
  2%|▏         | 2/110 [00:00<00:10, 10.79it/s]

Epoch [8/10], Step [110/110], Loss: 0.6748
Epoch [8/10], Step [110/110], Accuracy: 0.8333


100%|██████████| 110/110 [00:10<00:00, 10.60it/s]
  2%|▏         | 2/110 [00:00<00:10, 10.47it/s]

Epoch [9/10], Step [110/110], Loss: 0.8360
Epoch [9/10], Step [110/110], Accuracy: 0.7500


100%|██████████| 110/110 [00:10<00:00, 10.63it/s]

Epoch [10/10], Step [110/110], Loss: 0.2328
Epoch [10/10], Step [110/110], Accuracy: 1.0000





In [8]:
# inference mode
stance_model.eval()

# load the dev set
dev_dataset = ArabertDataset('output/dev_1_arabert.pkl')
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=64, shuffle=True)

# get accuracy on development set
with torch.no_grad():
    correct = 0
    total = 0
    for embedding, stance, category in dev_loader:
        embedding = embedding.to(device)
        stance = stance.to(device)
        category = category.to(device)

        outputs = stance_model(embedding)
        _, predicted = torch.max(outputs.data, 1)
        total += stance.size(0)
        correct += (predicted == stance).sum().item()
    print(f'Accuracy of the model on the dev set: {100 * correct / total}%')


Accuracy of the model on the dev set: 80.4%


In [9]:
# Results
# RNN + train_1_arabert.pkl --> 80.4% dev accuracy
# RNN + train_2_arabert.pkl --> 71.0% dev accuracy
# RNN + train_3_arabert.pkl --> 7.2% dev accuracy
# RNN + train_4_arabert.pkl --> 28.9% dev accuracy

In [10]:
# train the category_model
category_model = RNN(768, 512, 4, 10).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(category_model.parameters(), lr=1e-3)
category_model.train()
for epoch in range(10):
    for i, (embedding, stance, category) in enumerate(tqdm(train_loader)):
        embedding = embedding.to(device)
        stance = stance.to(device)
        category = category.to(device)

        outputs = category_model(embedding)
        loss = criterion(outputs, category)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # calculate the total epoch accuracy
        _, predicted = torch.max(outputs.data, 1)
        total = category.size(0)
        correct = (predicted == category).sum().item()
        accuracy = correct / total

        if (i+1) % len(train_loader) == 0:
            print(f'Epoch [{epoch+1}/{10}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
            print(f'Epoch [{epoch+1}/{10}], Step [{i+1}/{len(train_loader)}], Accuracy: {accuracy:.4f}')

100%|██████████| 110/110 [00:07<00:00, 15.46it/s]
  2%|▏         | 2/110 [00:00<00:06, 15.50it/s]

Epoch [1/10], Step [110/110], Loss: 1.8118
Epoch [1/10], Step [110/110], Accuracy: 0.3333


100%|██████████| 110/110 [00:07<00:00, 15.62it/s]
  2%|▏         | 2/110 [00:00<00:07, 15.39it/s]

Epoch [2/10], Step [110/110], Loss: 1.8754
Epoch [2/10], Step [110/110], Accuracy: 0.4167


100%|██████████| 110/110 [00:07<00:00, 15.55it/s]
  2%|▏         | 2/110 [00:00<00:06, 15.96it/s]

Epoch [3/10], Step [110/110], Loss: 1.8705
Epoch [3/10], Step [110/110], Accuracy: 0.3333


100%|██████████| 110/110 [00:07<00:00, 15.71it/s]
  2%|▏         | 2/110 [00:00<00:06, 15.77it/s]

Epoch [4/10], Step [110/110], Loss: 1.4215
Epoch [4/10], Step [110/110], Accuracy: 0.5833


100%|██████████| 110/110 [00:06<00:00, 16.01it/s]
  2%|▏         | 2/110 [00:00<00:06, 15.47it/s]

Epoch [5/10], Step [110/110], Loss: 1.6378
Epoch [5/10], Step [110/110], Accuracy: 0.4167


100%|██████████| 110/110 [00:06<00:00, 15.96it/s]
  2%|▏         | 2/110 [00:00<00:06, 16.06it/s]

Epoch [6/10], Step [110/110], Loss: 1.2903
Epoch [6/10], Step [110/110], Accuracy: 0.5833


100%|██████████| 110/110 [00:07<00:00, 15.56it/s]
  2%|▏         | 2/110 [00:00<00:06, 15.62it/s]

Epoch [7/10], Step [110/110], Loss: 1.8242
Epoch [7/10], Step [110/110], Accuracy: 0.5000


100%|██████████| 110/110 [00:06<00:00, 15.76it/s]
  2%|▏         | 2/110 [00:00<00:06, 15.77it/s]

Epoch [8/10], Step [110/110], Loss: 1.3903
Epoch [8/10], Step [110/110], Accuracy: 0.6667


100%|██████████| 110/110 [00:06<00:00, 15.76it/s]
  2%|▏         | 2/110 [00:00<00:06, 16.01it/s]

Epoch [9/10], Step [110/110], Loss: 2.0631
Epoch [9/10], Step [110/110], Accuracy: 0.4167


100%|██████████| 110/110 [00:06<00:00, 16.26it/s]

Epoch [10/10], Step [110/110], Loss: 1.9842
Epoch [10/10], Step [110/110], Accuracy: 0.2500





In [11]:
# inference mode
category_model.eval()

# load the dev set
dev_dataset = ArabertDataset('output/dev_1_arabert.pkl')
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=64, shuffle=True)

# get accuracy on development set
with torch.no_grad():
    correct = 0
    total = 0
    for embedding, stance, category in dev_loader:
        embedding = embedding.to(device)
        stance = stance.to(device)
        category = category.to(device)

        outputs = category_model(embedding)
        _, predicted = torch.max(outputs.data, 1)
        total += category.size(0)
        correct += (predicted == category).sum().item()
    print(f'Accuracy of the model on the dev set: {100 * correct / total}%')


Accuracy of the model on the dev set: 50.6%


In [12]:
# Results
# RNN + train_1_arabert.pkl --> 54.5% dev accuracy
# RNN + train_2_arabert.pkl --> 54.4% dev accuracy
# RNN + train_3_arabert.pkl --> 7.2% dev accuracy
# RNN + train_4_arabert.pkl --> 54.5% dev accuracy