In [1]:
import re
import numpy as np
import pandas as pd
import gzip
import gensim.downloader as api
import torch
import torch.nn as nn
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from gensim.models import KeyedVectors, Word2Vec

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# text pre-processing
!pip install pyspellchecker
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from spellchecker import SpellChecker

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words("english"))

from google.colab import drive

Collecting pyspellchecker
  Downloading pyspellchecker-0.7.2-py3-none-any.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
drive.mount("/content/drive")

file_path = "/content/drive/MyDrive/Neural Network/data/text_emotion.csv"
data = pd.read_csv(file_path)
data = data[['sentiment','content']]
data = data.rename(columns={'sentiment': 'emotion', 'content': 'text'})

print(data['emotion'].value_counts())
data.head()

Mounted at /content/drive
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: emotion, dtype: int64


Unnamed: 0,emotion,text
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


### Preprocessing the data for training

In [3]:
le = LabelEncoder()
data['emotion'] = le.fit_transform(data['emotion'])

data.head()

Unnamed: 0,emotion,text
0,2,@tiffanylue i know i was listenin to bad habi...
1,10,Layin n bed with a headache ughhhh...waitin o...
2,10,Funeral ceremony...gloomy friday...
3,3,wants to hang out with friends SOON!
4,8,@dannycastillo We want to trade with someone w...


In [4]:
spell = SpellChecker()
lemmatizer= WordNetLemmatizer()

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'website', text)

def clean_text(text):
    ## Remove at(username)
    pattern = r'@[^@\s]+'
    text = re.sub(pattern, '', text)

    ## Remove punctuations
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
    text = text.replace('؛',"", )

    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    text =  " ".join(text.split())
    return text.strip()

def correct_word(word):
    corrected_word = spell.correction(word)

    if corrected_word is not None:
        return corrected_word
    else:
        return word

def preprocess_text(text):
  # Look into custom tokenizer later
  words = nltk.word_tokenize(text)
  process_words = []
  for word in words:
    if word.isdigit():
      continue

    word = word.lower()

    if word in stop_words:
      continue

    # word  = correct_word(word)

    process_words.append(word)

  return process_words

def normalize_and_tokenize(text):
  # Look into custom tokenizer later
  text = remove_urls(text)
  text = clean_text(text)
  return preprocess_text(text)

In [5]:
data['tokens'] = data.text.apply(lambda text : normalize_and_tokenize(text))
data.head()

Unnamed: 0,emotion,text,tokens
0,2,@tiffanylue i know i was listenin to bad habi...,"[know, listenin, bad, habit, earlier, started,..."
1,10,Layin n bed with a headache ughhhh...waitin o...,"[layin, n, bed, headache, ughhhh, waitin, call]"
2,10,Funeral ceremony...gloomy friday...,"[funeral, ceremony, gloomy, friday]"
3,3,wants to hang out with friends SOON!,"[wants, hang, friends, soon]"
4,8,@dannycastillo We want to trade with someone w...,"[want, trade, someone, houston, tickets, one]"


### Word embedding

In [6]:
model_path = "/content/drive/MyDrive/Neural Network/data/word2vec.bin"
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [7]:
model.similar_by_word("friend", topn=5)

[('pal', 0.7476359009742737),
 ('friends', 0.7098034024238586),
 ('buddy', 0.6972493529319763),
 ('dear_friend', 0.6960037350654602),
 ('acquaintance', 0.6843010783195496)]

In [8]:
model['friends'].shape

(300,)

In [9]:
vocab = set()
for tokens in data['tokens']:
  for token in tokens:
    vocab.add(token)

len(vocab)

29680

In [10]:
matrix_len = len(vocab) + 1 # extra word embedding for padding
weights_matrix = np.zeros((matrix_len, 300))
word2idx = {}
words_found = 0

words_missed = []

for i, word in enumerate(vocab):
  word2idx[word] = i+1
  if word in model:
    weights_matrix[i+1] = model[word]
    words_found += 1
  else:
    weights_matrix[i+1] = np.random.normal(scale=0.3, size=(300,))
    words_missed.append(word)
print(f"words found: {words_found} out of total of {len(vocab)} words")

words found: 19672 out of total of 29680 words


In [11]:
def text_to_sequence(tokens):
  return [ word2idx[word] for word in tokens]

data['indices'] = data['tokens'].apply(text_to_sequence)
data.head()

Unnamed: 0,emotion,text,tokens,indices
0,2,@tiffanylue i know i was listenin to bad habi...,"[know, listenin, bad, habit, earlier, started,...","[22087, 20299, 16078, 16204, 24828, 13621, 211..."
1,10,Layin n bed with a headache ughhhh...waitin o...,"[layin, n, bed, headache, ughhhh, waitin, call]","[2381, 25488, 10672, 8932, 14909, 8635, 26199]"
2,10,Funeral ceremony...gloomy friday...,"[funeral, ceremony, gloomy, friday]","[18757, 3750, 8340, 13631]"
3,3,wants to hang out with friends SOON!,"[wants, hang, friends, soon]","[20998, 27748, 24431, 24814]"
4,8,@dannycastillo We want to trade with someone w...,"[want, trade, someone, houston, tickets, one]","[21753, 305, 4325, 16886, 17119, 20427]"


### Padding

In [12]:
target_length = 16
# Function to pad or truncate a sequence to the target length
def pad_or_truncate(sequence):
  if len(sequence) < target_length:
      # Pad with zeros at the end if the sequence is shorter
      return sequence + [0] * (target_length - len(sequence))
  else:
      # Truncate the sequence if it is longer
      return sequence[:target_length]


# Apply the function to the 'indices' column
data['indices'] = data['indices'].apply(pad_or_truncate)
data.head()

Unnamed: 0,emotion,text,tokens,indices
0,2,@tiffanylue i know i was listenin to bad habi...,"[know, listenin, bad, habit, earlier, started,...","[22087, 20299, 16078, 16204, 24828, 13621, 211..."
1,10,Layin n bed with a headache ughhhh...waitin o...,"[layin, n, bed, headache, ughhhh, waitin, call]","[2381, 25488, 10672, 8932, 14909, 8635, 26199,..."
2,10,Funeral ceremony...gloomy friday...,"[funeral, ceremony, gloomy, friday]","[18757, 3750, 8340, 13631, 0, 0, 0, 0, 0, 0, 0..."
3,3,wants to hang out with friends SOON!,"[wants, hang, friends, soon]","[20998, 27748, 24431, 24814, 0, 0, 0, 0, 0, 0,..."
4,8,@dannycastillo We want to trade with someone w...,"[want, trade, someone, houston, tickets, one]","[21753, 305, 4325, 16886, 17119, 20427, 0, 0, ..."


### Dataset and Dataloader

In [13]:
class TextEmotionDataset(Dataset):
  def __init__(self, df):
      self.indices = df['indices'].values
      self.emotion = df['emotion'].values

  def __len__(self):
      return len(self.emotion)

  def __getitem__(self, idx):
    indices = self.indices[idx]
    emotion = self.emotion[idx]

    indices = torch.tensor(indices, dtype=torch.int)

    return indices, emotion

In [14]:
train_df, test_df = train_test_split(data, test_size=0.2)

train_dataset = TextEmotionDataset(train_df)
test_dataset = TextEmotionDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

### Training

In [15]:
emb_layer = nn.Embedding.from_pretrained(torch.tensor(weights_matrix), padding_idx=0)
input, output = train_loader.dataset[0]
embedding = emb_layer(input)
print(input)
embedding.shape

tensor([15490, 17708, 16906, 14842, 25106, 23025,  9170, 28973,     0,     0,
            0,     0,     0,     0,     0,     0], dtype=torch.int32)


torch.Size([16, 300])

In [16]:
class BLSTM_MaxGlobalPooling(nn.Module):
  def __init__(self, hidden_dim, num_classes, kernel_size=2):
    super().__init__()

    self.embedding = nn.Embedding.from_pretrained(torch.tensor(weights_matrix, dtype=torch.float), padding_idx=0)
    self.embedding_dim = 300
    self.hidden_dim = hidden_dim

    self.lstm = nn.LSTM(self.embedding_dim, hidden_dim, num_layers=1, batch_first=True, bidirectional=True)
    self.conv1 = nn.Conv2d(1, 128, kernel_size=(2, 2*hidden_dim))
    self.fc = nn.Linear(128*(target_length-1), num_classes)
    self.dropout = nn.Dropout(p=0.5)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x):
    embedded = self.embedding(x) # (batch, target_length, length of embedding)
    out, _ = self.lstm(embedded) # (batch, target_length, 2 * hidden_dim)
    out = out.reshape(-1, 1, target_length, 2 * self.hidden_dim)
    out = self.conv1(out) # (batch, 128, target_length-1, 1)
    out = out.reshape(-1, 128*(target_length-1)) # (batch, 128, target_length-1)
    out = self.dropout(out)
    out = self.fc(out)
    h = self.softmax(out)

    return h

model1 = BLSTM_MaxGlobalPooling(300, 13)
model1

BLSTM_MaxGlobalPooling(
  (embedding): Embedding(29681, 300, padding_idx=0)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (conv1): Conv2d(1, 128, kernel_size=(2, 600), stride=(1, 1))
  (fc): Linear(in_features=1920, out_features=13, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (softmax): Softmax(dim=1)
)

In [None]:
def train_loop(model, optimizer, criterion, dataloader):
  loss_total, train_correct = 0, 0
  for x, y in dataloader:
    optimizer.zero_grad()
    h = model(x)
    loss = criterion(h, y)
    loss.backward()
    optimizer.step()

    loss_total += loss.item()

    pred = torch.argmax(h, axis=1)
    train_correct += (pred == y).float().sum().item()

  print(f"train acc: {train_correct / len(dataloader.dataset)}")
  return loss_total, train_correct / len(dataloader.dataset)

def test_loop(model, optimizer, dataloader):
  test_correct = 0
  with torch.no_grad():
    for x, y in test_loader:
      h = model(x)
      pred = torch.argmax(h, axis=1)
      test_correct += (pred == y).float().sum().item()
  print(f"test acc: {test_correct / len(dataloader.dataset)}")
  return test_correct / len(dataloader.dataset)

def train(no_epochs, model, optimizer, criterion, train_dataloader, test_dataloader):
  losses, test_accuracies, train_accuracies = [], [], []

  for epoch in range(no_epochs):
    model.train()
    loss, train_acc = train_loop(model, optimizer, criterion, train_dataloader)
    losses.append(loss)
    train_accuracies.append(train_acc)

    test_acc = test_loop(model, optimizer, test_dataloader)
    test_accuracies.append(test_acc)

  return losses, train_accuracies, test_accuracies

criterion = nn.CrossEntropyLoss()
optimizer1 = torch.optim.Adam(model1.parameters(), lr=0.0005)

train(30, model1, optimizer1, criterion, train_loader, test_loader)

train acc: 0.26571875
test acc: 0.28875
train acc: 0.313
test acc: 0.3095
train acc: 0.3245
test acc: 0.327625
train acc: 0.3329375
test acc: 0.317375
train acc: 0.33640625
test acc: 0.310625
train acc: 0.337125
test acc: 0.314625
train acc: 0.34153125
test acc: 0.317375
train acc: 0.34609375
test acc: 0.3125
train acc: 0.3489375
test acc: 0.316
train acc: 0.3526875
test acc: 0.313625
train acc: 0.35559375
test acc: 0.3115
train acc: 0.361875
test acc: 0.326125
train acc: 0.36459375
test acc: 0.318125
train acc: 0.37196875
test acc: 0.329
train acc: 0.38578125
test acc: 0.328875
train acc: 0.393125
test acc: 0.331875
train acc: 0.39725
test acc: 0.331
train acc: 0.3991875
test acc: 0.328
train acc: 0.3950625
test acc: 0.332
train acc: 0.4068125
test acc: 0.330375
train acc: 0.4118125
test acc: 0.326875
train acc: 0.4165
test acc: 0.32725
