In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from torchtext.vocab import GloVe

from sklearn.model_selection import train_test_split

# Build a classifier pytorch model

In [10]:
class TweetDisasterClassifier(nn.Module):
    def __init__(self, input_size, hidden_sizes, num_classes):
        super(TweetDisasterClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.relu = nn.ReLU()
        self.hidden_layers = nn.ModuleList()
        
        for i in range(len(hidden_sizes) - 1):
            self.hidden_layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
        
        self.fc_last = nn.Linear(hidden_sizes[-1], num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)

        for hidden_layer in self.hidden_layers:
            out = hidden_layer(out)
            out = self.relu(out)
        
        out = self.fc_last(out)
        return out


In [11]:
input_size = 74 # dimensionality for maximum of 30 words
hidden_size = [74, 264, 128]
num_classes = 2

model = TweetDisasterClassifier(input_size, hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Tokenize data

In [12]:
tokenizer = get_tokenizer('basic_english')

df = pd.read_csv("../data/train.csv")
df = df.sample(frac=1)
df['tokenized_text'] = df['text'].apply(tokenizer) # todo maybe try with n-grams



# Embed the data

In [13]:
glove = GloVe(name='840B', dim=300)

word_to_index = {}
index = 0

for df_index, row in df.iterrows():
    for word in row["tokenized_text"]:
        if word not in word_to_index:
            word_to_index[word] = index
            index += 1

def embed(tokens):
    numerical_sentence = [word_to_index[word] for word in tokens]
    return numerical_sentence


df["embeddings"] = df["tokenized_text"].apply(embed)
df


Unnamed: 0,id,keyword,location,text,target,tokenized_text,embeddings
7365,10545,windstorm,,Texas Seeks Comment on Rules for Changes to Wi...,1,"[texas, seeks, comment, on, rules, for, change...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"
6783,9717,tragedy,'Merica,'Sometimes God uses sorrowful tragedy to set t...,0,"[', sometimes, god, uses, sorrowful, tragedy, ...","[13, 14, 15, 16, 17, 18, 7, 19, 20, 21, 5, 22,..."
6907,9905,traumatised,,I'm traumatised???? @megancoopy @laurathorne97...,0,"[i, ', m, traumatised, ?, ?, ?, ?, @megancoopy...","[30, 13, 31, 32, 33, 33, 33, 33, 34, 35, 9, 10..."
6659,9542,threat,Everywhere,Build and share your own custom applications a...,0,"[build, and, share, your, own, custom, applica...","[37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 4..."
4893,6966,massacre,The World,Petition/No Medals for 1890 Massacre Justice f...,1,"[petition/no, medals, for, 1890, massacre, jus...","[54, 55, 5, 56, 57, 58, 5, 59, 60, 61, 62, 63,..."
...,...,...,...,...,...,...,...
1544,2231,chemical%20emergency,"Tyler, TX",@pjcoyle ... need to be included in emergency ...,0,"[@pjcoyle, ., ., ., need, to, be, included, in...","[23534, 11, 11, 11, 187, 7, 253, 23535, 89, 14..."
3327,4766,evacuated,Gold Coast,Powerlines down over tram on GC Highway. Passe...,1,"[powerlines, down, over, tram, on, gc, highway...","[12924, 533, 86, 8392, 3, 23537, 778, 11, 5250..."
4608,6551,injury,Lahore,Live Cricket Score In All Match International\...,0,"[live, cricket, score, in, all, match, interna...","[1627, 21209, 8387, 89, 44, 2976, 1230, 11397,..."
5387,7687,panic,Toronto,tomorrow's going to be a year since I went to ...,1,"[tomorrow, ', s, going, to, be, a, year, since...","[159, 13, 70, 1746, 7, 253, 95, 2548, 1076, 30..."


In [14]:
numerical_sentences = [torch.tensor(sentence) for sentence in df["embeddings"].tolist()]
labels = torch.tensor(df["target"].tolist())

numerical_sentences[len(numerical_sentences) -1]

tensor([  360,   521,    20,   749,    62,  3827,   150,   749,    62,  3827,
          248,  4690,  4410,   148,   722,    20,  4410,   148,    11,    11,
           11,     9,    10,    11, 23544])

In [15]:
padded_sentences = pad_sequence(numerical_sentences, batch_first=True)

embedding_train, embedding_validation = train_test_split(padded_sentences, test_size=0.1)
label_train, label_validation = train_test_split(labels, test_size=0.1)

assert len(embedding_train) == len(label_train)
assert len(embedding_validation) == len(label_validation)

batch_size = 128
num_epochs = 1000

train_dataset = TensorDataset(embedding_train, label_train)
val_dataset = TensorDataset(embedding_validation, label_validation)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Train the model

In [16]:
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(dtype=model.parameters().__next__().dtype)

        outputs = model(data)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


Epoch [1/1000], Step [1/54], Loss: 30.1217
Epoch [1/1000], Step [2/54], Loss: 58995.3320
Epoch [1/1000], Step [3/54], Loss: 7192721.5000
Epoch [1/1000], Step [4/54], Loss: 2188347310080.0000
Epoch [1/1000], Step [5/54], Loss: 704139712.0000
Epoch [1/1000], Step [6/54], Loss: 1823307776.0000
Epoch [1/1000], Step [7/54], Loss: 9783951606415360.0000
Epoch [1/1000], Step [8/54], Loss: 58808210554880.0000
Epoch [1/1000], Step [9/54], Loss: 3531909445693502128128.0000
Epoch [1/1000], Step [10/54], Loss: 552503429196005277252863393792.0000
Epoch [1/1000], Step [11/54], Loss: 0.6969
Epoch [1/1000], Step [12/54], Loss: 0.6974
Epoch [1/1000], Step [13/54], Loss: 0.6958
Epoch [1/1000], Step [14/54], Loss: 0.6936
Epoch [1/1000], Step [15/54], Loss: 0.6953
Epoch [1/1000], Step [16/54], Loss: 0.6943
Epoch [1/1000], Step [17/54], Loss: 0.6935
Epoch [1/1000], Step [18/54], Loss: 0.6944
Epoch [1/1000], Step [19/54], Loss: 0.6941
Epoch [1/1000], Step [20/54], Loss: 0.6938
Epoch [1/1000], Step [21/54], L

# Test the model

In [25]:
test_input = df.iloc[0]
embedding = test_input["embeddings"]


padding_size = input_size - len(embedding)

for i in range(padding_size):
    embedding.append(0.0)


model(torch.tensor(embedding))

RuntimeError: expected scalar type Long but found Float