In [1]:
import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable 
from tqdm import tqdm

In [2]:
def edit_categories(x):
    if x == 'info_news':
        return 0
    elif x == 'celebrity':
        return 1
    elif x == 'plan':
        return 2
    elif x == 'requests':
        return 3
    elif x == 'rumors':
        return 4
    elif x == 'advice':
        return 5
    elif x == 'restrictions':
        return 6
    elif x == 'personal':
        return 7
    elif x == 'unrelated':
        return 8
    elif x == 'others':
        return 9
    else:
        return -1

In [4]:

# build the pytorch dataset
class ArabertDataset(torch.utils.data.Dataset):
    def __init__(self, dataset_path):
        dataset = pd.read_pickle(dataset_path)
        pad_embedding = torch.zeros(1, 768)
        max_len = 0
        embeddings = dataset['embeddings'].values
        for i in range(len(embeddings)):
            if embeddings[i].shape[0] > max_len:
                max_len = embeddings[i].shape[0]
        for i in range(len(embeddings)):
            if embeddings[i].shape[0] < max_len:
                pad = torch.zeros(max_len - embeddings[i].shape[0], 768).to(torch.device('cuda'))
                embeddings[i] = torch.cat((embeddings[i], pad), dim=0)
        categories = dataset['category'].apply(edit_categories)
        self.embeddings = embeddings # already a tensor
        self.stance = dataset['stance']
        self.category = categories
        self.stance = torch.tensor(self.stance.values)
        self.category = torch.tensor(self.category.values)



    def __len__(self):
        return len(self.stance)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.stance[idx], self.category[idx]


In [5]:
# build the pytorch dataset
# class ArabertDataset(torch.utils.data.Dataset):
#     def __init__(self):
#         embeddings = pd.read_pickle('dataset/train_arabert_not_padded.pkl')
#         pad_embedding = torch.zeros(1, 768)
#         max_len = 0
#         for i in range(len(embeddings)):
#             if embeddings[i].shape[1] > max_len:
#                 max_len = embeddings[i].shape[1]
#         for i in range(len(embeddings)):
#             if embeddings[i].shape[1] < max_len:
#                 pad = torch.zeros(1, max_len - embeddings[i].shape[1], 768)
#                 embeddings[i] = torch.cat((embeddings[i], pad), dim=1)
#             embeddings[i] = embeddings[i].view(max_len, 768)

#         labels = pd.read_csv('dataset/train.csv').drop('text', axis=1, inplace=False)
#         labels['category'] = labels['category'].apply(edit_categories)
#         #self.embeddings = embeddings # already a tensor
#         self.stance = labels['stance']

#         self.stance = self.stance + 1
#         self.category = labels['category']
#         #self.stance = torch.tensor(self.stance)
#         self.category = torch.tensor(self.category)


#         self.embeddings = Variable(embeddings)              # I saw someone doing this
#         self.stance = Variable(torch.Tensor(self.stance))


#     def __len__(self):
#         return len(self.stance)

#     def __getitem__(self, idx):
#         return self.embeddings[idx], self.stance[idx], self.category[idx]

In [6]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [7]:
torch.cuda.is_available()

True

In [8]:
# build the pytorch dataloader
train_dataset = ArabertDataset('output/train_1_arabert.pkl')
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

# build the model
class RNN(torch.nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = torch.nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, num_classes)




    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out




In [8]:

# class RNN(torch.nn.Module):
# #I also saw him doing this

#     def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length):
#         super(RNN, self).__init__()
#         self.num_classes = num_classes #number of classes
#         self.num_layers = num_layers #number of layers
#         self.input_size = input_size #input size
#         self.hidden_size = hidden_size #hidden state
#         self.seq_length = seq_length #sequence length

#         self.lstm = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size,
#                           num_layers=num_layers, batch_first=True) #lstm
#         self.fc_1 =  torch.nn.Linear(hidden_size, 128) #fully connected 1
#         self.fc = torch.nn.Linear(128, num_classes) #fully connected last layer

#         self.relu = torch.nn.ReLU()
    
#     def forward(self,x):
#         h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)) #hidden state
#         c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)) #internal state
#         # Propagate input through LSTM
#         output, (hn, cn) = self.lstm(x, (h_0, c_0)) #lstm with input, hidden, and internal state
#         hn = hn.view(-1, self.hidden_size) #reshaping the data for Dense layer next
#         out = self.relu(hn)
#         out = self.fc_1(out) #first Dense
#         out = self.relu(out) #relu
#         out = self.fc(out) #Final Output
#         return out

In [9]:
# train the stance_model
stance_model = RNN(768, 256, 2, 3).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(stance_model.parameters(), lr=1e-3)
stance_model.train()
for epoch in range(10):
    for i, (embedding, stance, category) in enumerate(tqdm(train_loader)):
        embedding = embedding.to(device)
        stance = stance.to(device)
        category = category.to(device)

        outputs = stance_model(embedding)
        loss = criterion(outputs, stance)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # calculate the total epoch accuracy
        _, predicted = torch.max(outputs.data, 1)
        total = stance.size(0)
        correct = (predicted == stance).sum().item()
        accuracy = correct / total

        if (i+1) % len(train_loader) == 0:
            print(f'Epoch [{epoch+1}/{10}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
            print(f'Epoch [{epoch+1}/{10}], Step [{i+1}/{len(train_loader)}], Accuracy: {accuracy:.4f}')

100%|██████████| 110/110 [00:02<00:00, 54.53it/s]
  5%|▌         | 6/110 [00:00<00:01, 54.56it/s]

Epoch [1/10], Step [110/110], Loss: 0.3829
Epoch [1/10], Step [110/110], Accuracy: 0.9167


100%|██████████| 110/110 [00:01<00:00, 56.32it/s]
  5%|▌         | 6/110 [00:00<00:01, 56.40it/s]

Epoch [2/10], Step [110/110], Loss: 0.5202
Epoch [2/10], Step [110/110], Accuracy: 0.8333


100%|██████████| 110/110 [00:02<00:00, 54.85it/s]
  5%|▌         | 6/110 [00:00<00:01, 56.11it/s]

Epoch [3/10], Step [110/110], Loss: 0.5041
Epoch [3/10], Step [110/110], Accuracy: 0.8333


100%|██████████| 110/110 [00:01<00:00, 56.09it/s]
  5%|▌         | 6/110 [00:00<00:01, 55.76it/s]

Epoch [4/10], Step [110/110], Loss: 0.5196
Epoch [4/10], Step [110/110], Accuracy: 0.8333


100%|██████████| 110/110 [00:01<00:00, 56.53it/s]
  5%|▌         | 6/110 [00:00<00:01, 54.63it/s]

Epoch [5/10], Step [110/110], Loss: 0.5418
Epoch [5/10], Step [110/110], Accuracy: 0.8333


100%|██████████| 110/110 [00:01<00:00, 56.10it/s]
  5%|▌         | 6/110 [00:00<00:01, 57.99it/s]

Epoch [6/10], Step [110/110], Loss: 1.4795
Epoch [6/10], Step [110/110], Accuracy: 0.4167


100%|██████████| 110/110 [00:01<00:00, 55.83it/s]
  5%|▌         | 6/110 [00:00<00:01, 56.14it/s]

Epoch [7/10], Step [110/110], Loss: 0.6795
Epoch [7/10], Step [110/110], Accuracy: 0.7500


100%|██████████| 110/110 [00:01<00:00, 57.38it/s]
  5%|▌         | 6/110 [00:00<00:01, 59.17it/s]

Epoch [8/10], Step [110/110], Loss: 1.5010
Epoch [8/10], Step [110/110], Accuracy: 0.4167


100%|██████████| 110/110 [00:01<00:00, 57.95it/s]
  5%|▌         | 6/110 [00:00<00:01, 56.81it/s]

Epoch [9/10], Step [110/110], Loss: 0.2602
Epoch [9/10], Step [110/110], Accuracy: 1.0000


100%|██████████| 110/110 [00:01<00:00, 57.91it/s]

Epoch [10/10], Step [110/110], Loss: 0.4488
Epoch [10/10], Step [110/110], Accuracy: 0.9167





In [10]:
# inference mode
stance_model.eval()

# load the dev set
dev_dataset = ArabertDataset('output/dev_1_arabert.pkl')
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=64, shuffle=True)

# get accuracy on development set
with torch.no_grad():
    correct = 0
    total = 0
    for embedding, stance, category in dev_loader:
        embedding = embedding.to(device)
        stance = stance.to(device)
        category = category.to(device)

        outputs = stance_model(embedding)
        _, predicted = torch.max(outputs.data, 1)
        total += stance.size(0)
        correct += (predicted == stance).sum().item()
    print(f'Accuracy of the model on the dev set: {100 * correct / total}%')


Accuracy of the model on the dev set: 80.4%


In [11]:
# Results
# RNN + train_1_arabert.pkl --> 80.4% dev accuracy
# RNN + train_2_arabert.pkl --> 71.0% dev accuracy
# RNN + train_3_arabert.pkl --> 7.2% dev accuracy
# RNN + train_4_arabert.pkl --> 28.9% dev accuracy

In [12]:
# train the category_model
category_model = RNN(768, 512, 4, 10).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(category_model.parameters(), lr=1e-3)
category_model.train()
for epoch in range(10):
    for i, (embedding, stance, category) in enumerate(tqdm(train_loader)):
        embedding = embedding.to(device)
        stance = stance.to(device)
        category = category.to(device)

        outputs = category_model(embedding)
        loss = criterion(outputs, category)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # calculate the total epoch accuracy
        _, predicted = torch.max(outputs.data, 1)
        total = category.size(0)
        correct = (predicted == category).sum().item()
        accuracy = correct / total

        if (i+1) % len(train_loader) == 0:
            print(f'Epoch [{epoch+1}/{10}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
            print(f'Epoch [{epoch+1}/{10}], Step [{i+1}/{len(train_loader)}], Accuracy: {accuracy:.4f}')

100%|██████████| 110/110 [00:07<00:00, 15.17it/s]
  2%|▏         | 2/110 [00:00<00:07, 14.97it/s]

Epoch [1/10], Step [110/110], Loss: 1.9709
Epoch [1/10], Step [110/110], Accuracy: 0.4167


100%|██████████| 110/110 [00:07<00:00, 15.39it/s]
  2%|▏         | 2/110 [00:00<00:06, 15.70it/s]

Epoch [2/10], Step [110/110], Loss: 1.9573
Epoch [2/10], Step [110/110], Accuracy: 0.5000


100%|██████████| 110/110 [00:07<00:00, 15.54it/s]
  2%|▏         | 2/110 [00:00<00:07, 14.88it/s]

Epoch [3/10], Step [110/110], Loss: 1.8193
Epoch [3/10], Step [110/110], Accuracy: 0.4167


100%|██████████| 110/110 [00:07<00:00, 15.40it/s]
  2%|▏         | 2/110 [00:00<00:07, 13.73it/s]

Epoch [4/10], Step [110/110], Loss: 1.7431
Epoch [4/10], Step [110/110], Accuracy: 0.4167


100%|██████████| 110/110 [00:07<00:00, 15.41it/s]
  2%|▏         | 2/110 [00:00<00:06, 15.93it/s]

Epoch [5/10], Step [110/110], Loss: 1.7083
Epoch [5/10], Step [110/110], Accuracy: 0.5000


100%|██████████| 110/110 [00:07<00:00, 15.51it/s]
  2%|▏         | 2/110 [00:00<00:07, 15.15it/s]

Epoch [6/10], Step [110/110], Loss: 1.4416
Epoch [6/10], Step [110/110], Accuracy: 0.5833


100%|██████████| 110/110 [00:07<00:00, 15.32it/s]
  2%|▏         | 2/110 [00:00<00:07, 15.24it/s]

Epoch [7/10], Step [110/110], Loss: 1.9162
Epoch [7/10], Step [110/110], Accuracy: 0.4167


100%|██████████| 110/110 [00:07<00:00, 15.06it/s]
  2%|▏         | 2/110 [00:00<00:06, 15.67it/s]

Epoch [8/10], Step [110/110], Loss: 1.6688
Epoch [8/10], Step [110/110], Accuracy: 0.5000


100%|██████████| 110/110 [00:07<00:00, 15.34it/s]
  2%|▏         | 2/110 [00:00<00:07, 14.66it/s]

Epoch [9/10], Step [110/110], Loss: 1.4062
Epoch [9/10], Step [110/110], Accuracy: 0.5833


100%|██████████| 110/110 [00:07<00:00, 15.20it/s]

Epoch [10/10], Step [110/110], Loss: 1.6545
Epoch [10/10], Step [110/110], Accuracy: 0.5833





In [19]:
# inference mode
category_model.eval()

# load the dev set
dev_dataset = ArabertDataset('output/dev_1_arabert.pkl')
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=64, shuffle=True)

# get accuracy on development set
with torch.no_grad():
    correct = 0
    total = 0
    for embedding, stance, category in dev_loader:
        embedding = embedding.to(device)
        stance = stance.to(device)
        category = category.to(device)

        outputs = category_model(embedding)
        _, predicted = torch.max(outputs.data, 1)
        total += category.size(0)
        correct += (predicted == category).sum().item()
    print(f'Accuracy of the model on the dev set: {100 * correct / total}%')


NameError: name 'category_model' is not defined

In [14]:
# Results
# RNN + train_1_arabert.pkl --> 54.5% dev accuracy
# RNN + train_2_arabert.pkl --> 54.4% dev accuracy
# RNN + train_3_arabert.pkl --> 7.2% dev accuracy
# RNN + train_4_arabert.pkl --> 54.5% dev accuracy