In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import (precision_score,
                             recall_score, f1_score, roc_auc_score,
                             precision_recall_curve, roc_curve,
                             auc)

import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv('twitter_data.csv')

In [3]:
df = df.rename(columns = {'tweet': 'text', 'intention': 'target'})

In [4]:
len(df)

9119

In [5]:
df['target'].unique()

array([1, 0])

In [6]:
df['text'] = df['text'].astype(str)

In [7]:
def clean_text(text):
  text = text.lower()
  text = re.sub(r'@\w+\s?', '', text)
  text = re.sub(r'#\w+\s?', '', text)
  text = re.sub(r'[^\w\s]', '', text)
  text = re.sub(r'\d+', '', text)

  return text

df['text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,text,target
0,my life is meaningless i just want to end my l...,1
1,muttering i wanna die to myself daily for a fe...,1
2,work slave i really feel like my only purpose ...,1
3,i did something on the of october i overdosed...,1
4,i feel like no one cares i just want to die ma...,1


In [8]:
def normalize_text(text):
  text = word_tokenize(text)
  wnl = WordNetLemmatizer()
  text = [wnl.lemmatize(word) for word in text if word not in set(stopwords.words('english'))]

  text = " ".join(text)
  return text


df['text'] = df['text'].apply(normalize_text)
df['tokens'] = df['text'].apply(lambda x: x.split())
df.head()

Unnamed: 0,text,target,tokens
0,life meaningless want end life badly life comp...,1,"[life, meaningless, want, end, life, badly, li..."
1,muttering wan na die daily month feel worthles...,1,"[muttering, wan, na, die, daily, month, feel, ..."
2,work slave really feel like purpose life make ...,1,"[work, slave, really, feel, like, purpose, lif..."
3,something october overdosed felt alone horribl...,1,"[something, october, overdosed, felt, alone, h..."
4,feel like one care want die maybe feel le lonely,1,"[feel, like, one, care, want, die, maybe, feel..."


In [9]:
import torch
from torch import nn
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import tqdm
from tqdm.auto import tqdm
import torch.cuda
from IPython.display import clear_output
import matplotlib.pyplot as plt
from torchtext.vocab import build_vocab_from_iterator
import torchtext
from torchmetrics import Accuracy

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [11]:
min_freq = 15
specials = ['<pad>', '<bos>', '<eos>', '<unk>']
vocab = build_vocab_from_iterator(iterator = df['tokens'],
                                  specials = specials,
                                  min_freq = min_freq)
pad_idx = vocab['<pad>']
bos_idx = vocab['<bos>']
eos_idx = vocab['<eos>']
unk_idx = vocab['<unk>']
vocab.set_default_index(unk_idx)

In [12]:
class Tweets_dataset:
  def __init__(self, tokens, labels, vocab):
    self.tokens = tokens
    self.labels = labels
    self.min_freq = min_freq
    self.vocab = vocab
    self.itos = self.vocab.get_itos()
    self.pad_idx = self.vocab['<pad>']
    self.bos_idx = self.vocab['<bos>']
    self.eos_idx = self.vocab['<eos>']
    self.unk_idx = self.vocab['<unk>']
    self.vocab.set_default_index(self.unk_idx)
    self.max_len = self.tokens.apply(len).max() + 2

  def __len__(self):
    return len(self.tokens)

  def tokens_to_idx(self, tokens):
    return [self.vocab[token] for token in tokens]

  def idx_to_tokens(self, indicies):
    return [self.itos(idx) for idx in indicies]

  def encode(self, sentence):
    sentence = ['<bos>'] + sentence + ['<eos>']
    return self.tokens_to_idx(sentence)

  def decode(self, sentence):
    tokens = self.idx_to_tokens(sentence)
    return " ".join(token for token in tokens if token not in self.specials)

  def __getitem__(self, idx):
    tokens = self.tokens.iloc[idx]
    label = self.labels.iloc[idx]
    encoded = self.encode(tokens)
    padded = torch.full((self.max_len, ), self.pad_idx, dtype = torch.int64)
    padded[:len(encoded)] = torch.tensor(encoded)

    return padded, label

In [13]:
BATCH_SIZE = 64
X = df['tokens']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [14]:
train_dataset = Tweets_dataset(X_train, y_train, vocab)
test_dataset = Tweets_dataset(X_test, y_test, vocab)

In [15]:
train_dataloader = DataLoader(train_dataset,
                              batch_size = BATCH_SIZE, shuffle = True)
test_dataloader = DataLoader(test_dataset,
                             batch_size = BATCH_SIZE, shuffle = False)

In [16]:
EMBED_DIM = 64
HIDDEN_DIM = 64
NUM_CLASSES = 2
VOCAB_SIZE = len(vocab)

In [17]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class LSTM_classifier(nn.Module):
  def __init__(self, embed_dim = EMBED_DIM,
               hidden_dim = HIDDEN_DIM, output_dim = NUM_CLASSES):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings = VOCAB_SIZE,
                                  embedding_dim=embed_dim,
                                  padding_idx = pad_idx)
    self.lstm = nn.LSTM(input_size = embed_dim,
                        hidden_size = hidden_dim,
                        batch_first = True)
    self.lin = nn.Linear(in_features = hidden_dim, out_features = output_dim)


  def forward(self, tokens):
    embeds = self.embedding(tokens)
    output, _ = self.lstm(embeds)
    output = output.max(dim = 1)[0]

    output = self.lin(output)
    return output

In [18]:
model = LSTM_classifier()
model.to(device)

LSTM_classifier(
  (embedding): Embedding(2340, 64, padding_idx=0)
  (lstm): LSTM(64, 64, batch_first=True)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)

In [19]:
LR = 0.005

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params = model.parameters(), lr = LR)
acc_fn = Accuracy(task = 'multiclass', num_classes = 2).to(device)

In [20]:
def train_step(train_loader, model, loss_fn, acc_fn, optimizer, device):
  train_loss = 0.0
  train_acc = 0.0
  model.to(device)

  for batch, (X, y) in enumerate(train_loader):
    X, y = X.to(device), y.to(device)
    model.train()

    y_pred = model(X)
    loss = loss_fn(y_pred, y)
    acc = acc_fn(y_pred, y)

    train_loss += loss
    train_acc += acc

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()



  train_loss /= len(train_loader)
  train_acc /= len(train_loader)
  print(f"Train loss = {train_loss}, Train accuracy = {train_acc}")

In [21]:
def test_step(test_loader, model, loss_fn, acc_fn, optimizer, device):
  test_loss, test_acc = 0.0, 0.0
  model.to(device)
  model.eval()

  with torch.inference_mode():
    for X, y in test_loader:
      X, y = X.to(device), y.to(device)
      test_pred = model(X)
      test_loss += loss_fn(test_pred, y)
      test_acc += acc_fn(test_pred, y)

    test_loss /= len(test_loader)
    test_acc /= len(test_loader)

    print(f"Test loss = {test_loss}, Test accuracy = {test_acc}")

In [22]:
NUM_EPOCHS = 7
for epoch in tqdm(range(NUM_EPOCHS)):
  print(f"Epoch = {epoch}\n----------------------")
  train_step(train_dataloader, model, loss_fn, acc_fn, optimizer, device)
  test_step(test_dataloader, model, loss_fn, acc_fn, optimizer, device)

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch = 0
----------------------
Train loss = 0.30994006991386414, Train accuracy = 0.873324453830719
Test loss = 0.24599620699882507, Test accuracy = 0.9005571603775024
Epoch = 1
----------------------
Train loss = 0.18515770137310028, Train accuracy = 0.930342435836792
Test loss = 0.20838510990142822, Test accuracy = 0.9142441749572754
Epoch = 2
----------------------
Train loss = 0.1195107251405716, Train accuracy = 0.9557679891586304
Test loss = 0.21345700323581696, Test accuracy = 0.9178779125213623
Epoch = 3
----------------------
Train loss = 0.08640360087156296, Train accuracy = 0.9682247042655945
Test loss = 0.2410614788532257, Test accuracy = 0.9087936282157898
Epoch = 4
----------------------
Train loss = 0.05313383415341377, Train accuracy = 0.9817187786102295
Test loss = 0.2574859857559204, Test accuracy = 0.9164243936538696
Epoch = 5
----------------------
Train loss = 0.03171321004629135, Train accuracy = 0.9901562333106995
Test loss = 0.2919921278953552, Test accuracy =