In [76]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from torchtext.vocab import GloVe

from sklearn.model_selection import train_test_split

# Build a classifier pytorch model

In [77]:
class TweetDisasterClassifier(nn.Module):
    def __init__(self, input_size, hidden_sizes, num_classes):
        super(TweetDisasterClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.relu = nn.ReLU()
        self.hidden_layers = nn.ModuleList()
        
        for i in range(len(hidden_sizes) - 1):
            self.hidden_layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
        
        self.fc_last = nn.Linear(hidden_sizes[-1], num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)

        for hidden_layer in self.hidden_layers:
            out = hidden_layer(out)
            out = self.relu(out)
        
        out = self.fc_last(out)
        return out


In [78]:
input_size = 74 # dimensionality for maximum of 30 words
hidden_size = [74, 264, 128]
num_classes = 2

model = TweetDisasterClassifier(input_size, hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Tokenize data

In [79]:
tokenizer = get_tokenizer('basic_english')

df = pd.read_csv("../data/train.csv")
df = df.sample(frac=1)
df['tokenized_text'] = df['text'].apply(tokenizer) # todo maybe try with n-grams



# Embed the data

In [80]:
glove = GloVe(name='840B', dim=300)

word_to_index = {}
index = 0

for df_index, row in df.iterrows():
    for word in row["tokenized_text"]:
        if word not in word_to_index:
            word_to_index[word] = index
            index += 1

def embed(tokens):
    numerical_sentence = [word_to_index[word] for word in tokens]
    return numerical_sentence


df["embeddings"] = df["tokenized_text"].apply(embed)
df


Unnamed: 0,id,keyword,location,text,target,tokenized_text,embeddings
3067,4401,electrocute,,@devon_breneman hopefully it doesn't electrocu...,0,"[@devon_breneman, hopefully, it, doesn, ', t, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
3452,4936,exploded,,Reasons @BlueWestlo has exploded on @YouTube #...,0,"[reasons, @bluewestlo, has, exploded, on, @you...","[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]"
4536,6450,injured,Tropical SE FLorida,#WakeUpFlorida... #Floridians more likely to b...,0,"[#wakeupflorida, ., ., ., #floridians, more, l...","[22, 20, 20, 20, 23, 24, 25, 26, 27, 28, 29, 3..."
4941,7043,mayhem,"Detroit, Michigan",I liked a @YouTube video from @itsjustinstuart...,0,"[i, liked, a, @youtube, video, from, @itsjusti...","[38, 39, 30, 16, 40, 41, 42, 43, 19, 20, 44, 3..."
1295,1870,burned,Cherry Creek Denver CO,Metal Cutting Sparks Brush Fire In Brighton: A...,1,"[metal, cutting, sparks, brush, fire, in, brig...","[48, 49, 50, 51, 52, 53, 54, 30, 51, 52, 55, 5..."
...,...,...,...,...,...,...,...
6705,9605,thunder,,Thunder???,0,"[thunder, ?, ?, ?]","[1416, 87, 87, 87]"
5386,7686,panic,Milwaukee WI,Someone asked me about a monkey fist about 2 f...,0,"[someone, asked, me, about, a, monkey, fist, a...","[2431, 4239, 617, 211, 30, 13081, 9137, 211, 8..."
2078,2984,dead,"ÌÏT: -26.695807,27.837865",@kg4vaal lmaov.v hard the 'Ny' is the the new ...,0,"[@kg4vaal, lmaov, ., v, hard, the, ', ny, ', i...","[23534, 23535, 20, 6234, 2363, 126, 4, 7306, 4..."
6649,9525,terrorist,MAD as Hell,RT AbbsWinston: #Zionist #Terrorist Demolish T...,1,"[rt, abbswinston, #zionist, #terrorist, demoli...","[1208, 1471, 1472, 1473, 988, 23539, 8444, 161..."


In [81]:
numerical_sentences = [torch.tensor(sentence) for sentence in df["embeddings"].tolist()]
labels = torch.tensor(df["target"].tolist())

numerical_sentences[len(numerical_sentences) -1]

tensor([  119,  5005,  5006,    53,    30,  2131,    43,    19,    20, 23543,
           15,   126,  5009,  5010,  5008,  5011,  5012,  5013, 23544,  5014])

In [82]:
padded_sentences = pad_sequence(numerical_sentences, batch_first=True)

embedding_train, embedding_validation = train_test_split(padded_sentences, test_size=0.1)
label_train, label_validation = train_test_split(labels, test_size=0.1)

assert len(embedding_train) == len(label_train)
assert len(embedding_validation) == len(label_validation)

batch_size = 128
num_epochs = 1000

train_dataset = TensorDataset(embedding_train, label_train)
val_dataset = TensorDataset(embedding_validation, label_validation)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Train the model

In [87]:
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(dtype=model.parameters().__next__().dtype)

        outputs = model(data)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
