In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe

from sklearn.model_selection import train_test_split

# Build a classifier pytorch model

In [3]:
class TweetDisasterClassifier(nn.Module):
    def __init__(self, input_size, hidden_sizes, num_classes):
        super(TweetDisasterClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.relu = nn.ReLU()
        self.hidden_layers = nn.ModuleList()
        
        for i in range(len(hidden_sizes) - 1):
            self.hidden_layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
        
        self.fc_last = nn.Linear(hidden_sizes[-1], num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)

        for hidden_layer in self.hidden_layers:
            out = hidden_layer(out)
            out = self.relu(out)
        
        out = self.fc_last(out)
        return out


In [4]:
input_size = 30 * 300 # dimensionality for maximum of 30 words
hidden_size = [128, 264, 128]
num_classes = 2

model = TweetDisasterClassifier(input_size, hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Tokenize data

In [5]:
tokenizer = get_tokenizer('basic_english')

df = pd.read_csv("../data/train.csv")
df = df.sample(frac=1)
df['tokenized_text'] = df['text'].apply(tokenizer)



# Embed the data

In [9]:
glove = GloVe(name='840B', dim=300)

def embed(tokens):
    embeds = []
    for token in tokens:
        if token in glove.stoi:
            embeds.append(glove.stoi[token])
        embeds.append(3000000) # maybe change this we ignore alot of unknowns here
    
    return embeds


df["embeddings"] = df["tokenized_text"].apply(embed)
df


Unnamed: 0,id,keyword,location,text,target,tokenized_text,embeddings
560,809,battle,"Baton Rouge, LA",#DU19 who gon get in this rap battle with me,0,"[#du19, who, gon, get, in, this, rap, battle, ...","[3000000, 74, 3000000, 43641, 3000000, 86, 300..."
3515,5025,eyewitness,,Interesting approach but doesn't replace Eyewi...,1,"[interesting, approach, but, doesn, ', t, repl...","[939, 3000000, 1399, 3000000, 42, 3000000, 234..."
1704,2459,collided,"Peterborough, On",#Newswatch: 2 vehicles collided at Lock and La...,1,"[#newswatch, 2, vehicles, collided, at, lock, ...","[3000000, 80, 3000000, 2951, 3000000, 43975, 3..."
5649,8060,rescue,Trinidad & Tobago,Policyholders object to Clico rescue plan http...,1,"[policyholders, object, to, clico, rescue, pla...","[66948, 3000000, 2355, 3000000, 4, 3000000, 30..."
43,63,ablaze,,SOOOO PUMPED FOR ABLAZE ???? @southridgelife,0,"[soooo, pumped, for, ablaze, ?, ?, ?, ?, @sout...","[15809, 3000000, 16846, 3000000, 11, 3000000, ..."
...,...,...,...,...,...,...,...
4530,6441,injured,MD,And you wonder why he's injured every year htt...,0,"[and, you, wonder, why, he, ', s, injured, eve...","[3, 3000000, 18, 3000000, 1817, 3000000, 297, ..."
3523,5037,eyewitness,Philippines,Read an eyewitness account from #Hiroshima fro...,1,"[read, an, eyewitness, account, from, #hiroshi...","[301, 3000000, 39, 3000000, 48864, 3000000, 90..."
6287,8980,storm,#PhanTrash,The sky's clear the storm has passed but it's ...,0,"[the, sky, ', s, clear, the, storm, has, passe...","[2, 3000000, 4027, 3000000, 55, 3000000, 269, ..."
1590,2297,cliff%20fall,DRAW A CIRCLE THAT'S THE EARTH,*Jumps off of a cliff while drinking tea*\n\nT...,0,"[*jumps, off, of, a, cliff, while, drinking, t...","[3000000, 184, 3000000, 5, 3000000, 6, 3000000..."


# Train the model

In [7]:
num_epochs = 10
batch_size = 64

# Convert the data into PyTorch tensors
X_train = torch.Tensor(X_train)
y_train = torch.LongTensor(y_train)

# Create a data loader
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Forward pass
        outputs = model(data)
        loss = criterion(outputs, targets)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


NameError: name 'X_train' is not defined