In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import string
import nltk
import re
import pandas as pd
import gensim
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch import Tensor
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **Filtering + Testing**

In [4]:
def remove_punctuation(text):
    punctuationfree = "".join([i for i in text if i not in string.punctuation])
    return punctuationfree

def lower_case(text):
    lowered_case = "".join([i.lower() for i in text])
    return lowered_case

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('')
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

def tokenization(text):
    tokens = re.split('\W+',text)
    return tokens

nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

def remove_non_alphabets(text):
    regex = re.compile('[^a-zA-Z]')
    return regex.sub(' ', text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
def FilterText(text):
    return remove_stopwords(lemmatizer(tokenization(lower_case(remove_non_alphabets(remove_punctuation(text))))))

# **Loading Dataset**

In [6]:
train = pd.read_csv('./drive/MyDrive/NN-HW4/twitter-suicidal_data.csv')
x = train['tweet']
y = train['intention']

In [7]:
xs = [FilterText(text) for text in x]
ys = [ele for ele in y]

In [8]:
x_train, x_test, y_train, y_test = train_test_split(xs, ys, test_size=0.2, random_state=42)

In [9]:
FilterText(x[0])

['life',
 'meaningless',
 'want',
 'end',
 'life',
 'badly',
 'life',
 'completely',
 'empty',
 'dont',
 'want',
 'create',
 'meaning',
 'creating',
 'meaning',
 'pain',
 'long',
 'hold',
 'back',
 'urge',
 'run',
 'car',
 'head',
 'first',
 'next',
 'person',
 'coming',
 'opposite',
 'way',
 'stop',
 'feeling',
 'jealous',
 'tragic',
 'character',
 'like',
 'gomer',
 'pile',
 'swift',
 'end',
 'able',
 'bring',
 'life']

# **Embedding Matrix: Word2Vec**

In [10]:
# Create CBOW model
word2vec1 = gensim.models.Word2Vec(x_train, min_count = 1, vector_size = 300, window = 10)
# Create Skip Gram model
word2vec2 = gensim.models.Word2Vec(x_train, min_count = 1, vector_size = 300, window = 10, sg = 1)

In [11]:
# Print results
print("Cosine similarity between 'meaningless' and 'empty' - CBOW : ", word2vec1.wv.similarity('meaningless', 'empty'))
# Print results
print("Cosine similarity between 'meaningless' and 'empty' - Skip Gram : ", word2vec2.wv.similarity('meaningless', 'empty'))

Cosine similarity between 'meaningless' and 'empty' - CBOW :  0.98537517
Cosine similarity between 'meaningless' and 'empty' - Skip Gram :  0.8748165


In [12]:
word2vec1.save('./drive/MyDrive/NN-HW4/word2vec1.bin')
word2vec2.save('./drive/MyDrive/NN-HW4/word2vec2.bin')

In [13]:
max_sequence_length = max([len(x) for x in x_train])
word2vec1.wv.vectors = torch.tensor(word2vec1.wv.vectors).to(device)
embedding_matrix = word2vec1.wv.vectors

In [14]:
class CustomDataset(Dataset):
  def __init__(self, data, labels, word2vec_model, max_sequence_length):
    self.data = data
    self.labels = labels
    self.word2vec_model = word2vec_model
    self.max_sequence_length = max_sequence_length

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    indices = [self.word2vec_model.wv.key_to_index[token] for token in self.data[index] if token in self.word2vec_model.wv.key_to_index]
    indices = indices[:self.max_sequence_length] + [0] * max(0, self.max_sequence_length - len(indices))
    sample = {"data": torch.tensor(indices), "label": torch.tensor(self.labels[index], dtype=torch.float)}
    return sample

In [15]:
def calculate_accuracy(predictions, targets):
  correct = ((predictions.reshape(targets.shape)>=0.5) == targets).sum().item()
  total = targets.size(0)
  accuracy = correct / total
  return accuracy

# **LSTM Neural Network**

In [16]:
class CustomLSTM(nn.Module):
  def __init__(self, embedding_matrix, hidden_size, output_size):
    super(CustomLSTM, self).__init__()
    self.embedding = nn.Embedding(embedding_matrix.size(0), embedding_matrix.size(1))
    self.embedding.weight = nn.Parameter(embedding_matrix)
    self.embedding.weight.requires_grad = False
    self.lstm = nn.LSTM(embedding_matrix.size(1), hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, output_size)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    embedded = self.embedding(x)
    lstm_out, _ = self.lstm(embedded)
    aggregated = torch.mean(lstm_out, dim=1)
    # output = self.fc(lstm_out[-1, :, :])
    output = self.fc(aggregated)
    output = self.sigmoid(output)
    return output

In [17]:
custom_dataset = CustomDataset(x_train, y_train, word2vec1, max_sequence_length)
dataloader = DataLoader(custom_dataset, batch_size=64, shuffle=True)

In [18]:
hidden_size = 50
output_size = 1
lstm_model = CustomLSTM(embedding_matrix, hidden_size, output_size).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.01)

In [19]:
num_epochs = 20
for epoch in range(num_epochs):
  total_loss = 0.0
  total_accuracy = 0.0

  for batch in dataloader:
    data_batch = batch["data"].to(device)
    label_batch = batch["label"].to(device)

    optimizer.zero_grad()

    outputs = lstm_model(data_batch)
    loss = criterion(outputs.squeeze(), label_batch)

    total_loss += loss.item()

    loss.backward()
    optimizer.step()

    batch_accuracy = calculate_accuracy(outputs, label_batch)
    total_accuracy += batch_accuracy

  average_loss = total_loss / len(dataloader)
  average_acc = total_accuracy / len(dataloader)

  print(f'Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}, Accuracy: {average_acc:.4f}')

Epoch 1/20, Loss: 0.6201, Accuracy: 0.6435
Epoch 2/20, Loss: 0.3562, Accuracy: 0.8606
Epoch 3/20, Loss: 0.3129, Accuracy: 0.8757
Epoch 4/20, Loss: 0.2962, Accuracy: 0.8792
Epoch 5/20, Loss: 0.2845, Accuracy: 0.8853
Epoch 6/20, Loss: 0.2751, Accuracy: 0.8898
Epoch 7/20, Loss: 0.2704, Accuracy: 0.8917
Epoch 8/20, Loss: 0.2614, Accuracy: 0.8954
Epoch 9/20, Loss: 0.2571, Accuracy: 0.8949
Epoch 10/20, Loss: 0.2564, Accuracy: 0.8975
Epoch 11/20, Loss: 0.2553, Accuracy: 0.8956
Epoch 12/20, Loss: 0.2473, Accuracy: 0.8988
Epoch 13/20, Loss: 0.2489, Accuracy: 0.8961
Epoch 14/20, Loss: 0.2390, Accuracy: 0.9014
Epoch 15/20, Loss: 0.2457, Accuracy: 0.8994
Epoch 16/20, Loss: 0.2352, Accuracy: 0.9042
Epoch 17/20, Loss: 0.2401, Accuracy: 0.8999
Epoch 18/20, Loss: 0.2347, Accuracy: 0.9057
Epoch 19/20, Loss: 0.2334, Accuracy: 0.9039
Epoch 20/20, Loss: 0.2387, Accuracy: 0.9009


In [20]:
custom_dataset_test = CustomDataset(x_test, y_test, word2vec1, max_sequence_length)
dataloader_test = DataLoader(custom_dataset_test, batch_size=64, shuffle=True)

total_loss = 0.0
total_accuracy = 0.0
ops = []
trgs = []

for batch in dataloader_test:
  data_batch = batch["data"].to(device)
  label_batch = batch["label"].to(device)

  with torch.no_grad():
    outputs = lstm_model(data_batch)
    loss = criterion(outputs.squeeze(), label_batch)

    total_loss += loss.item()

    batch_accuracy = calculate_accuracy(outputs, label_batch)
    total_accuracy += batch_accuracy

    for el in outputs:
      ops.append(Tensor.cpu(el >= 0.5).item())

    for el in label_batch:
      trgs.append(Tensor.cpu(el).item())

average_loss = total_loss / len(dataloader_test)
average_acc = total_accuracy / len(dataloader_test)

print(f'Test, Loss: {average_loss:.4f}, Accuracy: {average_acc:.4f}')

Test, Loss: 0.2728, Accuracy: 0.8885


In [21]:
tn, fp, fn, tp = confusion_matrix(ops, trgs).ravel()
tn, fp, fn, tp

(939, 121, 80, 684)

In [22]:
print(classification_report(trgs, ops))

              precision    recall  f1-score   support

         0.0       0.89      0.92      0.90      1019
         1.0       0.90      0.85      0.87       805

    accuracy                           0.89      1824
   macro avg       0.89      0.89      0.89      1824
weighted avg       0.89      0.89      0.89      1824



# **2-Layer LSTM Neural Network**

In [23]:
class CustomTwoLSTM(nn.Module):
  def __init__(self, embedding_matrix, hidden_size, output_size):
    super(CustomTwoLSTM, self).__init__()
    self.embedding = nn.Embedding(embedding_matrix.size(0), embedding_matrix.size(1))
    self.embedding.weight = nn.Parameter(embedding_matrix)
    self.embedding.weight.requires_grad = False
    self.lstm = nn.LSTM(embedding_matrix.size(1), hidden_size, num_layers=2)
    self.fc = nn.Linear(hidden_size, output_size)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    embedded = self.embedding(x)
    lstm_out, _ = self.lstm(embedded)
    aggregated = torch.mean(lstm_out, dim=1)
    # output = self.fc(lstm_out[-1, :, :])
    output = self.fc(aggregated)
    output = self.sigmoid(output)
    return output

In [24]:
custom_dataset = CustomDataset(x_train, y_train, word2vec1, max_sequence_length)
dataloader = DataLoader(custom_dataset, batch_size=64, shuffle=True)

In [25]:
hidden_size = 50
output_size = 1
twolstm_model = CustomTwoLSTM(embedding_matrix, hidden_size, output_size).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(twolstm_model.parameters(), lr=0.01)

In [26]:
num_epochs = 20
for epoch in range(num_epochs):
  total_loss = 0.0
  total_accuracy = 0.0

  for batch in dataloader:
    data_batch = batch["data"].to(device)
    label_batch = batch["label"].to(device)

    optimizer.zero_grad()

    outputs = twolstm_model(data_batch)
    loss = criterion(outputs.squeeze(), label_batch)

    total_loss += loss.item()

    loss.backward()
    optimizer.step()

    batch_accuracy = calculate_accuracy(outputs, label_batch)
    total_accuracy += batch_accuracy

  average_loss = total_loss / len(dataloader)
  average_acc = total_accuracy / len(dataloader)

  print(f'Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}, Accuracy: {average_acc:.4f}')

Epoch 1/20, Loss: 0.6684, Accuracy: 0.5974
Epoch 2/20, Loss: 0.5565, Accuracy: 0.7642
Epoch 3/20, Loss: 0.4980, Accuracy: 0.7981
Epoch 4/20, Loss: 0.4645, Accuracy: 0.8250
Epoch 5/20, Loss: 0.4638, Accuracy: 0.8225
Epoch 6/20, Loss: 0.4398, Accuracy: 0.8289
Epoch 7/20, Loss: 0.4196, Accuracy: 0.8408
Epoch 8/20, Loss: 0.4102, Accuracy: 0.8459
Epoch 9/20, Loss: 0.4064, Accuracy: 0.8454
Epoch 10/20, Loss: 0.3958, Accuracy: 0.8465
Epoch 11/20, Loss: 0.3963, Accuracy: 0.8473
Epoch 12/20, Loss: 0.3893, Accuracy: 0.8502
Epoch 13/20, Loss: 0.3872, Accuracy: 0.8473
Epoch 14/20, Loss: 0.3800, Accuracy: 0.8500
Epoch 15/20, Loss: 0.3828, Accuracy: 0.8466
Epoch 16/20, Loss: 0.3714, Accuracy: 0.8517
Epoch 17/20, Loss: 0.3609, Accuracy: 0.8561
Epoch 18/20, Loss: 0.3540, Accuracy: 0.8578
Epoch 19/20, Loss: 0.3499, Accuracy: 0.8602
Epoch 20/20, Loss: 0.3440, Accuracy: 0.8609


In [27]:
custom_dataset_test = CustomDataset(x_test, y_test, word2vec1, max_sequence_length)
dataloader_test = DataLoader(custom_dataset_test, batch_size=64, shuffle=True)

total_loss = 0.0
total_accuracy = 0.0
ops = []
trgs = []

for batch in dataloader_test:
  data_batch = batch["data"].to(device)
  label_batch = batch["label"].to(device)

  with torch.no_grad():
    outputs = twolstm_model(data_batch)
    loss = criterion(outputs.squeeze(), label_batch)

    total_loss += loss.item()

    batch_accuracy = calculate_accuracy(outputs, label_batch)
    total_accuracy += batch_accuracy

    for el in outputs:
      ops.append(Tensor.cpu(el >= 0.5).item())

    for el in label_batch:
      trgs.append(Tensor.cpu(el).item())

average_loss = total_loss / len(dataloader_test)
average_acc = total_accuracy / len(dataloader_test)

print(f'Test, Loss: {average_loss:.4f}, Accuracy: {average_acc:.4f}')

Test, Loss: 0.3360, Accuracy: 0.8696


In [28]:
tn, fp, fn, tp = confusion_matrix(ops, trgs).ravel()
tn, fp, fn, tp

(983, 199, 36, 606)

In [29]:
print(classification_report(trgs, ops))

              precision    recall  f1-score   support

         0.0       0.83      0.96      0.89      1019
         1.0       0.94      0.75      0.84       805

    accuracy                           0.87      1824
   macro avg       0.89      0.86      0.87      1824
weighted avg       0.88      0.87      0.87      1824



# **CNN + 2-Layer LSTM Neural Network**

In [30]:
class CustomCNNTwoLSTM(nn.Module):
  def __init__(self, embedding_matrix, hidden_size, output_size):
    super(CustomCNNTwoLSTM, self).__init__()
    self.embedding = nn.Embedding(embedding_matrix.size(0), embedding_matrix.size(1))
    self.embedding.weight = nn.Parameter(embedding_matrix)
    self.embedding.weight.requires_grad = False
    self.conv = nn.Conv1d(embedding_matrix.size(1), 64, kernel_size=3, stride=1, padding=1)
    self.relu = nn.ReLU()
    self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
    self.flatten = nn.Flatten()
    self.dropout = nn.Dropout(0.5)
    self.lstm = nn.LSTM(64, hidden_size, num_layers=2, batch_first=True)
    self.fc = nn.Linear(hidden_size, output_size)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x = self.embedding(x)
    x = x.permute(0, 2, 1)
    x = self.conv(x)

    x = self.relu(x)
    x = self.pool(x)
    batch_size, features, length = x.size()
    x = self.flatten(x)
    x = self.dropout(x)

    x = x.reshape(batch_size, length, features)
    lstm_out, _ = self.lstm(x)
    aggregated = torch.mean(lstm_out, dim=1)
    output = self.fc(aggregated)
    output = self.sigmoid(output)
    return output

In [31]:
custom_dataset = CustomDataset(x_train, y_train, word2vec1, max_sequence_length)
dataloader = DataLoader(custom_dataset, batch_size=64, shuffle=True)

In [32]:
hidden_size = 50
output_size = 1
cnntwolstm_model = CustomCNNTwoLSTM(embedding_matrix, hidden_size, output_size).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(cnntwolstm_model.parameters(), lr=0.001)

In [33]:
num_epochs = 20
for epoch in range(num_epochs):
  total_loss = 0.0
  total_accuracy = 0.0

  for batch in dataloader:
    data_batch = batch["data"].to(device)
    label_batch = batch["label"].to(device)

    optimizer.zero_grad()

    outputs = cnntwolstm_model(data_batch)
    loss = criterion(outputs.squeeze(), label_batch)

    total_loss += loss.item()

    loss.backward()
    optimizer.step()

    batch_accuracy = calculate_accuracy(outputs, label_batch)
    total_accuracy += batch_accuracy

  average_loss = total_loss / len(dataloader)
  average_acc = total_accuracy / len(dataloader)

  print(f'Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}, Accuracy: {average_acc:.4f}')

Epoch 1/20, Loss: 0.6853, Accuracy: 0.5582
Epoch 2/20, Loss: 0.6585, Accuracy: 0.6045
Epoch 3/20, Loss: 0.4383, Accuracy: 0.8261
Epoch 4/20, Loss: 0.3715, Accuracy: 0.8469
Epoch 5/20, Loss: 0.3517, Accuracy: 0.8551
Epoch 6/20, Loss: 0.3472, Accuracy: 0.8618
Epoch 7/20, Loss: 0.3477, Accuracy: 0.8583
Epoch 8/20, Loss: 0.3355, Accuracy: 0.8618
Epoch 9/20, Loss: 0.3374, Accuracy: 0.8584
Epoch 10/20, Loss: 0.3379, Accuracy: 0.8609
Epoch 11/20, Loss: 0.3357, Accuracy: 0.8585
Epoch 12/20, Loss: 0.3309, Accuracy: 0.8624
Epoch 13/20, Loss: 0.3332, Accuracy: 0.8620
Epoch 14/20, Loss: 0.3312, Accuracy: 0.8647
Epoch 15/20, Loss: 0.3282, Accuracy: 0.8633
Epoch 16/20, Loss: 0.3265, Accuracy: 0.8668
Epoch 17/20, Loss: 0.3298, Accuracy: 0.8706
Epoch 18/20, Loss: 0.3318, Accuracy: 0.8663
Epoch 19/20, Loss: 0.3230, Accuracy: 0.8637
Epoch 20/20, Loss: 0.3223, Accuracy: 0.8670


In [34]:
custom_dataset_test = CustomDataset(x_test, y_test, word2vec1, max_sequence_length)
dataloader_test = DataLoader(custom_dataset_test, batch_size=64, shuffle=True)

total_loss = 0.0
total_accuracy = 0.0
ops = []
trgs = []

for batch in dataloader_test:
  data_batch = batch["data"].to(device)
  label_batch = batch["label"].to(device)

  with torch.no_grad():
    outputs = cnntwolstm_model(data_batch)
    loss = criterion(outputs.squeeze(), label_batch)

    total_loss += loss.item()

    batch_accuracy = calculate_accuracy(outputs, label_batch)
    total_accuracy += batch_accuracy

    for el in outputs:
      ops.append(Tensor.cpu(el >= 0.5).item())

    for el in label_batch:
      trgs.append(Tensor.cpu(el).item())

average_loss = total_loss / len(dataloader_test)
average_acc = total_accuracy / len(dataloader_test)

print(f'Test, Loss: {average_loss:.4f}, Accuracy: {average_acc:.4f}')

Test, Loss: 0.3101, Accuracy: 0.8766


In [35]:
tn, fp, fn, tp = confusion_matrix(ops, trgs).ravel()
tn, fp, fn, tp

(989, 195, 30, 610)

In [36]:
print(classification_report(trgs, ops))

              precision    recall  f1-score   support

         0.0       0.84      0.97      0.90      1019
         1.0       0.95      0.76      0.84       805

    accuracy                           0.88      1824
   macro avg       0.89      0.86      0.87      1824
weighted avg       0.89      0.88      0.87      1824

