In [4]:
import re
import os

os.environ['SSL_CERT_FILE'] = certifi.where()

import numpy as np
import pandas as pd
import gzip
import gensim.downloader as api
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from gensim.models import KeyedVectors, Word2Vec

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# text pre-processing
!pip install pyspellchecker
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from spellchecker import SpellChecker

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# from google.colab import drive



[nltk_data] Downloading package punkt to /Users/larrywu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /Users/larrywu/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/larrywu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/larrywu/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [5]:
# drive.mount("/content/drive")

# file_path = "/content/drive/MyDrive/Neural Network/data/text_emotion.csv"
file_path = "./text_emotion.csv"
data = pd.read_csv(file_path)
data = data[['sentiment','content']]
data = data.rename(columns={'sentiment': 'emotion', 'content': 'text'})

print(data['emotion'].value_counts())
data.head()

emotion
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64


Unnamed: 0,emotion,text
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


### Preprocessing the data for training

In [6]:
le = LabelEncoder()
data['emotion'] = le.fit_transform(data['emotion'])

data.head()

Unnamed: 0,emotion,text
0,2,@tiffanylue i know i was listenin to bad habi...
1,10,Layin n bed with a headache ughhhh...waitin o...
2,10,Funeral ceremony...gloomy friday...
3,3,wants to hang out with friends SOON!
4,8,@dannycastillo We want to trade with someone w...


In [7]:
spell = SpellChecker()
lemmatizer= WordNetLemmatizer()

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'website', text)

def clean_text(text):
    ## Remove at(username)
    pattern = r'@[^@\s]+'
    text = re.sub(pattern, '', text)

    ## Remove punctuations
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
    text = text.replace('؛',"", )

    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    text =  " ".join(text.split())
    return text.strip()

def correct_word(word):
    corrected_word = spell.correction(word)

    if corrected_word is not None:
        return corrected_word
    else:
        return word


def preprocess_text(text):
  # Look into custom tokenizer later
  words = nltk.word_tokenize(text)
  correct_words = []
  for word in words:
    # Perform spelling correction
    corrected_word = correct_word(word)

    correct_words.append(corrected_word)

  return " ".join(correct_words)

def normalize_text(text):
  text = remove_urls(text)
  text = clean_text(text)
  # text = preprocess_text(text)
  return text

def tokenize(text):
  # Look into custom tokenizer later
  return nltk.word_tokenize(text)

In [8]:
data.text = data.text.apply(lambda text : normalize_text(text))
data['tokens'] = data.text.apply(lambda text : tokenize(text))
data.head()

Unnamed: 0,emotion,text,tokens
0,2,i know i was listenin to bad habit earlier and...,"[i, know, i, was, listenin, to, bad, habit, ea..."
1,10,Layin n bed with a headache ughhhh waitin on y...,"[Layin, n, bed, with, a, headache, ughhhh, wai..."
2,10,Funeral ceremony gloomy friday,"[Funeral, ceremony, gloomy, friday]"
3,3,wants to hang out with friends SOON,"[wants, to, hang, out, with, friends, SOON]"
4,8,We want to trade with someone who has Houston ...,"[We, want, to, trade, with, someone, who, has,..."


### Word embedding

In [12]:
# model_path = "/content/drive/MyDrive/Neural Network/data/word2vec.bin"
model_path = "./word2vec.bin"
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [13]:
model.similar_by_word("friends", topn=5)

[('acquaintances', 0.7328366041183472),
 ('friend', 0.7098035216331482),
 ('buddies', 0.7023523449897766),
 ('pals', 0.6816098093986511),
 ('relatives', 0.6528787016868591)]

In [14]:
model['friends'].shape

(300,)

In [15]:
vocab = set()
for tokens in data['tokens']:
  for token in tokens:
    vocab.add(token)

len(vocab)

38405

In [16]:
matrix_len = len(vocab) + 1 # extra word embedding for padding
weights_matrix = np.zeros((matrix_len, 300))
word2idx = {}
words_found = 0

words_missed = []
for i, word in enumerate(vocab):
  word2idx[word] = i+1
  try:
      weights_matrix[i+1] = model[word]
      words_found += 1
  except KeyError:
      weights_matrix[i+1] = np.random.normal(scale=0.6, size=(300, ))
      words_missed.append(word)

print(f"words found: {words_found} out of total of {len(vocab)} words")

words found: 29003 out of total of 38405 words


In [17]:
def text_to_sequence(tokens):
  return [ word2idx[word] for word in tokens]

data['indices'] = data['tokens'].apply(text_to_sequence)
data.head()

Unnamed: 0,emotion,text,tokens,indices
0,2,i know i was listenin to bad habit earlier and...,"[i, know, i, was, listenin, to, bad, habit, ea...","[36993, 37129, 36993, 34779, 38250, 36231, 301..."
1,10,Layin n bed with a headache ughhhh waitin on y...,"[Layin, n, bed, with, a, headache, ughhhh, wai...","[23034, 36560, 10185, 8914, 12793, 28797, 2637..."
2,10,Funeral ceremony gloomy friday,"[Funeral, ceremony, gloomy, friday]","[29581, 8824, 35250, 32243]"
3,3,wants to hang out with friends SOON,"[wants, to, hang, out, with, friends, SOON]","[21484, 36231, 9351, 5063, 8914, 11483, 36799]"
4,8,We want to trade with someone who has Houston ...,"[We, want, to, trade, with, someone, who, has,...","[13453, 19489, 36231, 30773, 8914, 32140, 3792..."


### Padding

In [24]:
target_length = 32

# Function to pad or truncate a sequence to the target length
def pad_or_truncate(sequence):
    if len(sequence) < target_length:
        # Pad with zeros at the end if the sequence is shorter
        return sequence + [0] * (target_length - len(sequence))
    else:
        # Truncate the sequence if it is longer
        return sequence[:target_length]

# Apply the function to the 'indices' column
data['indices'] = data['indices'].apply(pad_or_truncate)
data.head()

Unnamed: 0,emotion,text,tokens,indices
0,2,i know i was listenin to bad habit earlier and...,"[i, know, i, was, listenin, to, bad, habit, ea...","[36993, 37129, 36993, 34779, 38250, 36231, 301..."
1,10,Layin n bed with a headache ughhhh waitin on y...,"[Layin, n, bed, with, a, headache, ughhhh, wai...","[23034, 36560, 10185, 8914, 12793, 28797, 2637..."
2,10,Funeral ceremony gloomy friday,"[Funeral, ceremony, gloomy, friday]","[29581, 8824, 35250, 32243, 0, 0, 0, 0, 0, 0, ..."
3,3,wants to hang out with friends SOON,"[wants, to, hang, out, with, friends, SOON]","[21484, 36231, 9351, 5063, 8914, 11483, 36799,..."
4,8,We want to trade with someone who has Houston ...,"[We, want, to, trade, with, someone, who, has,...","[13453, 19489, 36231, 30773, 8914, 32140, 3792..."


### Dataset and Dataloader

In [25]:
class TextEmotionDataset(Dataset):
  def __init__(self, df):
      self.indices = df['indices'].values
      self.emotion = df['emotion'].values

  def __len__(self):
      return len(self.emotion)

  def __getitem__(self, idx):
    indices = self.indices[idx]
    emotion = self.emotion[idx]

    indices = torch.tensor(indices, dtype=torch.int)

    return indices, emotion

In [26]:
train_df, test_df = train_test_split(data, test_size=0.1)

train_dataset = TextEmotionDataset(train_df)
test_dataset = TextEmotionDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

### Training

In [27]:
emb_layer = nn.Embedding.from_pretrained(torch.tensor(weights_matrix), padding_idx=0)
input, output = train_loader.dataset[0]
embedding = emb_layer(input)
print(input)
embedding.shape

tensor([35378,  9729,  9729, 36791, 12793, 23486,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0], dtype=torch.int32)


torch.Size([32, 300])

In [28]:
class TextClassificationModel(nn.Module):
  def __init__(self, hidden_dim, num_layers, num_classes, kernel_size=2, dropout_prob=0.2):
    super().__init__()

    self.embedding = nn.Embedding.from_pretrained(torch.tensor(weights_matrix, dtype=torch.float), padding_idx=0)
    self.embedding_dim = 300
    self.hidden_dim = hidden_dim

    self.lstm = nn.LSTM(self.embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
    self.max_pooling = nn.MaxPool1d(kernel_size=target_length)
    self.fc = nn.Linear(2*hidden_dim, num_classes)
    self.dropout = nn.Dropout(p=dropout_prob)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x):
    embedded = self.embedding(x) # (batch, length of text, length of embedding)
    out, _ = self.lstm(embedded) # (batch, length of text, 2 * hidden size)
    out = out.permute(0, 2, 1)
    out = self.max_pooling(out) # (batch, 2 * hidden size, 1)
    out = out.reshape(-1, 2*self.hidden_dim)
    out = self.dropout(out)
    out = self.fc(out)
    preds = self.softmax(out)

    return preds

model = TextClassificationModel(128, 1, 13)
model

TextClassificationModel(
  (embedding): Embedding(38406, 300, padding_idx=0)
  (lstm): LSTM(300, 128, batch_first=True, bidirectional=True)
  (max_pooling): MaxPool1d(kernel_size=32, stride=32, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=256, out_features=13, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (softmax): Softmax(dim=1)
)

In [29]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
losses = []
test_accuracies = []
train_accuracies = []

for epoch in range(num_epochs):
  loss_total, correct_total = 0, 0
  model.train()


  for x, y in train_loader:
    optimizer.zero_grad()
    h = model(x)
    loss = criterion(h, y)
    loss.backward()
    optimizer.step()

    loss_total += loss.item()

  losses.append(loss_total / len(train_loader.dataset))
  print(f"loss: {losses[-1]}")

  with torch.no_grad():
    for x, y in test_loader:
      h = model(x)
      pred = torch.argmax(h, axis=1)
      print(pred)
      correct_total += (pred == y).float().sum().item()
    test_accuracies.append(correct_total / len(test_loader.dataset))
    print(f"testing accuracy: {test_accuracies[-1]}")



loss: 0.009579474462403191
tensor([12, 12, 12, 12,  8, 12, 12, 12, 12,  8, 12,  8, 12,  8, 12,  8,  8,  8,
         8,  8,  8, 12, 12, 12,  8, 12, 12, 12,  8,  8,  8, 12,  8, 12, 12, 12,
         8,  8, 12, 12,  8,  8,  8,  8,  8,  8,  8, 12, 12,  8, 12,  8,  8, 12,
         8, 12, 12,  8, 12,  8,  8, 12,  8,  8,  8,  8,  8, 12,  8, 12, 12,  8,
         8,  8, 12, 12,  8,  8,  8, 12, 12, 12, 12,  8, 12,  8, 12,  8, 12, 12,
        12, 12,  8,  8, 12, 12,  8, 12,  8,  8,  8,  8, 12, 12, 12, 12,  8, 12,
         8, 12, 12,  8,  8, 12,  8,  8, 12, 12,  8, 12,  8,  8, 12, 12, 12, 12,
        12, 12, 12,  8, 12,  8, 12, 12, 12,  8, 12,  8,  8,  8,  8, 12, 12, 12,
        12,  8,  8,  8, 12,  8,  8, 12,  8, 12, 12, 12, 12,  8, 12,  8,  8,  8,
        12, 12,  8,  8,  8, 12,  8, 12,  8, 12,  8, 12, 12,  8,  8, 12, 12, 12,
        12,  8, 12, 12,  8,  8,  8, 12,  8,  8,  8,  8,  8, 12,  8,  8, 12,  8,
        12, 12, 12, 12, 12,  8,  8, 12,  8,  8, 12,  8,  8,  8, 12, 12,  8, 12,
        12,  

KeyboardInterrupt: 