In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe

from sklearn.model_selection import train_test_split

# Build a classifier pytorch model

In [10]:
class TweetDisasterClassifier(nn.Module):
    def __init__(self, input_size, hidden_sizes, num_classes):
        super(TweetDisasterClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.relu = nn.ReLU()
        self.hidden_layers = nn.ModuleList()
        
        for i in range(len(hidden_sizes) - 1):
            self.hidden_layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
        
        self.fc_last = nn.Linear(hidden_sizes[-1], num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)

        for hidden_layer in self.hidden_layers:
            out = hidden_layer(out)
            out = self.relu(out)
        
        out = self.fc_last(out)
        return out


In [11]:
input_size = 30 * 300 # dimensionality for maximum of 30 words
hidden_size = [128, 264, 128]
num_classes = 2

model = TweetDisasterClassifier(input_size, hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Tokenize data

In [14]:
tokenizer = get_tokenizer('basic_english')

df = pd.read_csv("../data/train.csv")
df = df.sample(frac=1)
df['tokenized_text'] = df['text'].apply(tokenizer)



# Embed the data

In [26]:
glove = GloVe(name='840B', dim=300)

def embed(tokens):
    embeds = []
    for token in tokens:
        if token in glove.stoi:
            embeds.append(glove.stoi[token])
        embeds.append(3000000) # maybe change this we ignore alot of unknowns here
    
    return embeds


df["embeddings"] = df["tokenized_text"].apply(embed)
df


Unnamed: 0,id,keyword,location,text,target,tokenized_text,embeddings
1318,1905,burning,,@nagel_ashley @Vicken52 @BasedLaRock @goonc1ty...,0,"[@nagel_ashley, @vicken52, @basedlarock, @goon...","[3000000, 3000000, 3000000, 3000000, 10252, 30..."
2622,3763,destruction,"Houston, TX",REPUBLICAN STYLED ECONOMIC DESTRUCTION | Under...,0,"[republican, styled, economic, destruction, |,...","[21414, 3000000, 20347, 3000000, 1423, 3000000..."
6670,9559,thunder,"Enfield, UK",Illusoria Icarus nowplaying check out http://t...,0,"[illusoria, icarus, nowplaying, check, out, ht...","[3000000, 249002, 3000000, 790059, 3000000, 52..."
5116,7296,nuclear%20reactor,,Check out this awesome profile on #GE's swimmi...,0,"[check, out, this, awesome, profile, on, #ge, ...","[528, 3000000, 56, 3000000, 27, 3000000, 1378,..."
4663,6629,inundated,United States,#tech Data Overload: The Growing Demand for Co...,0,"[#tech, data, overload, the, growing, demand, ...","[272712, 3000000, 335, 3000000, 21856, 3000000..."
...,...,...,...,...,...,...,...
7320,10479,wild%20fires,,@EnzasBargains A5 Donated some fruit snacks &a...,1,"[@enzasbargains, a5, donated, some, fruit, sna...","[3000000, 94242, 3000000, 9038, 3000000, 85, 3..."
1529,2211,chemical%20emergency,"Virginia, United States",#Illinois: Emergency units simulate a chemical...,1,"[#illinois, emergency, units, simulate, a, che...","[3000000, 3006, 3000000, 2312, 3000000, 24647,..."
3119,4479,electrocuted,"Redondo Beach, CA",Do babies actually get electrocuted from wall ...,1,"[do, babies, actually, get, electrocuted, from...","[47, 3000000, 5034, 3000000, 406, 3000000, 86,..."
2169,3111,debris,Nigeria,Malaysia confirms plane debris washed up on Re...,1,"[malaysia, confirms, plane, debris, washed, up...","[36811, 3000000, 14125, 3000000, 3957, 3000000..."


# Train the model

In [12]:
num_epochs = 10
batch_size = 64

# Convert the data into PyTorch tensors
X_train = torch.Tensor(X_train)
y_train = torch.LongTensor(y_train)

# Create a data loader
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, targets)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


NameError: name 'X_train' is not defined