# Deep Learning Classification

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from tqdm import tqdm
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [63]:
# Load the preprocessed tweets and force the text column to str so that
# every row can be handed to the tokenizer.
df = pd.read_csv("data/processed_tweets.csv")
df = df.astype({"post_text": str})
X = df["post_text"]
y = df["label"]

# One list of word tokens per tweet, in the same order as the rows of df.
data = list(map(word_tokenize, X))

# Stratify on the label so both splits keep the class balance; the fixed
# seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    stratify=y,
    random_state=611,
)

In [3]:
# Vocabulary: index2word maps id -> token, word2index maps token -> id.
# The three special tokens occupy ids 0, 1 and 2; every other token gets the
# next free id in order of first appearance (training split first, then test).
index2word = ["<PAD>", "<SOS>", "<EOS>"]
word2index = {token: idx for idx, token in enumerate(index2word)}

for ds in (X_train, X_test):
    for tweet in ds:
        for token in tweet:
            # Membership is checked against the dict (constant time) rather
            # than the list, whose scan grows with the vocabulary size.
            if token not in word2index:
                word2index[token] = len(index2word)
                index2word.append(token)

In [13]:
# Fixed number of ids per encoded tweet, including the <SOS>/<EOS> markers.
seq_length = 32


def encode_and_pad(tweet, length):
    """Convert a tokenized tweet into a list of vocabulary ids.

    The result is ``<SOS>``, the token ids, ``<EOS>``, then ``<PAD>`` ids up
    to ``length`` entries. Tweets with ``length - 2`` tokens or more are cut
    to their first ``length - 2`` tokens and receive no padding.

    Args:
        tweet: Sequence of tokens; each one must be a key of ``word2index``.
        length: Target number of ids (the output has exactly this many
            entries when ``length >= 2``).

    Returns:
        A list of integer ids.

    Raises:
        KeyError: If any token of ``tweet`` is missing from ``word2index``.
    """
    start_id = word2index["<SOS>"]
    end_id = word2index["<EOS>"]
    pad_id = word2index["<PAD>"]

    # Two slots are reserved for the <SOS> and <EOS> markers.
    budget = length - 2
    # Every token is looked up, even those that end up truncated.
    token_ids = [word2index[word] for word in tweet]

    if len(token_ids) >= budget:
        # Exact fit or too long: keep the leading tokens, no padding needed.
        return [start_id, *token_ids[:budget], end_id]

    filler = [pad_id] * (budget - len(token_ids))
    return [start_id, *token_ids, end_id, *filler]

# Encode both splits (training first, then test) into id lists of
# seq_length entries each.
train_encoded, test_encoded = (
    [encode_and_pad(tokens, seq_length) for tokens in split]
    for split in (X_train, X_test)
)

In [14]:
# Mini-batch size used by both loaders.
batch_size = 32

# Pair each encoded tweet with its label, both converted to tensors.
train_ds = TensorDataset(
    torch.as_tensor(train_encoded),
    torch.as_tensor(list(y_train)),
)
test_ds = TensorDataset(
    torch.as_tensor(test_encoded),
    torch.as_tensor(list(y_test)),
)

# Both loaders shuffle and drop the final incomplete batch, so every batch
# holds exactly batch_size samples.
# NOTE(review): this also discards the leftover test samples — confirm that
# is intended for evaluation.
_loader_options = dict(shuffle=True, batch_size=batch_size, drop_last=True)
train_dl = DataLoader(train_ds, **_loader_options)
test_dl = DataLoader(test_ds, **_loader_options)

In [54]:
class BiLSTM_SentimentAnalysis(torch.nn.Module):
    """Stacked-LSTM binary classifier over sequences of token ids.

    Pipeline: embedding -> multi-layer LSTM -> Linear/Dropout/ReLU ->
    Linear/Sigmoid. The score produced at the last time step is returned as
    the prediction for each sequence.

    NOTE(review): despite the class name, the LSTM is built without
    ``bidirectional=True`` and is therefore unidirectional.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state and of the first linear layer.
        dropout: Dropout probability applied after the first linear layer.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=64, dropout=0.1, num_layers=2):
        super().__init__()
        # Remembered so init_hidden() builds states that match the LSTM.
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # Index 0 is declared as the padding id of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(dropout),
            nn.ReLU()
        )
        self.dense = nn.Sequential(
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x, hidden=None):
        """Score a batch of sequences.

        Args:
            x: Integer tensor of token ids; the LSTM is batch-first, so this
                is expected to be laid out as (batch, seq_len).
            hidden: Optional ``(h0, c0)`` initial LSTM state. When ``None``
                the LSTM starts from a zero state.

        Returns:
            A tuple ``(out, hidden)``: ``out`` holds one sigmoid score per
            sequence, taken at the last time step, and ``hidden`` is the
            final ``(h_n, c_n)`` state of the LSTM.
        """
        embs = self.embedding(x)
        out, hidden = self.lstm(embs, hidden)
        out = self.fc(out)
        out = self.dense(out)
        # Remove only the trailing size-1 feature axis. A bare squeeze()
        # would also drop the batch axis for a single-sequence batch and
        # make the indexing below fail.
        out = out.squeeze(-1)
        out = out[:, -1]
        return out, hidden

    def init_hidden(self, n_batch=None):
        """Return a zero ``(h0, c0)`` state sized for this model.

        Args:
            n_batch: Number of sequences per batch. When omitted, the
                module-level ``batch_size`` is used, as before.

        Returns:
            Two zero tensors of shape ``(num_layers, n_batch, hidden_dim)``.
        """
        if n_batch is None:
            # Backward-compatible fallback to the notebook-level global.
            n_batch = batch_size
        shape = (self.num_layers, n_batch, self.hidden_dim)
        return (torch.zeros(shape), torch.zeros(shape))

In [55]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One embedding row per vocabulary entry; the model is moved to the device.
model = BiLSTM_SentimentAnalysis(len(word2index))
model = model.to(device)

# Binary cross-entropy on the model's sigmoid outputs, optimized with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
# Train for a fixed number of epochs. `losses` collects the loss of the last
# batch of each epoch.
epochs = 50
losses = []
model.train()

for e in tqdm(range(epochs)):

    # Zero initial LSTM state, rebuilt each epoch and reused for every batch.
    h0, c0 = model.init_hidden()

    h0 = h0.to(device)
    c0 = c0.to(device)

    for batch_idx, batch in enumerate(train_dl):

        # Named `inputs` so the builtin input() is not shadowed.
        inputs = batch[0].to(device)
        target = batch[1].to(device)

        optimizer.zero_grad()
        out, hidden = model(inputs, (h0, c0))
        # Cast the labels to the prediction's dtype on the same device.
        # Casting both tensors to torch.DoubleTensor (a CPU tensor type)
        # copied them off the GPU on every batch.
        loss = criterion(out, target.to(out.dtype))
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            print("\nepoch:", e+1 ,' num:', batch_idx, 'loss:', loss.item())
    losses.append(loss.item())