## Exercise 3 - Emotion Recognition

In [39]:
import pandas as pd
import numpy as np
import csv
import re
import string
from collections import defaultdict
import torch
import requests
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import spacy
import random
import time

from sklearn.metrics import precision_recall_fscore_support

In [2]:
# use the GPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3090


In [3]:
# download datasets from github by using github api 
def download_datasets(overwrite = False):
    api_url = f"https://api.github.com/repos/cardiffnlp/tweeteval/contents/datasets/emotion"

    # if there is a not empty folder called ex3_datasets at present word dictionary, we think you have already downloaded datasets
    # if you want to download anyway, please set the parameter overwrite as True
    # When the overwrite is set as True, the old txt file will be overwrited.
    if not os.path.exists("./ex3_datasets"):
        os.makedirs("./ex3_datasets")
        response = requests.get(api_url)
    elif os.path.exists("./ex3_datasets") and not os.listdir("./ex3_datasets"):
        response = requests.get(api_url)
    elif not overwrite:
        print("Files have downloaded before, if you want to overwrite them, please pass parameter: overwrite = True")
        return
    
    if response.status_code == 200:
        data = response.json()
 
        for item in data:
            file_url = item['download_url']
            file_name = item['name']
            response = requests.get(file_url)
        
            if response.status_code == 200:
                with open('./ex3_datasets/' + file_name, 'wb') as file:
                    file.write(response.content)
                print(f"{file_name} is downloaded sucessfully!")
            else:
                print(f"{file_name}: failed!")
    else:
        print(f"Fail! HTTP respond code: {response.status_code}")

download_datasets()

Files have downloaded before, if you want to overwrite them, please pass parameter: overwrite = True


In [4]:
# load dataset
with open(f'./ex3_datasets/train_text.txt') as f:
    x_train = f.read().splitlines()
with open(f'./ex3_datasets/train_labels.txt') as f:
    y_train = f.read().splitlines()
with open(f'./ex3_datasets/val_text.txt') as f:
    x_val = f.read().splitlines()
with open(f'./ex3_datasets/val_labels.txt') as f:
    y_val = f.read().splitlines()
with open(f'./ex3_datasets/test_text.txt') as f:
    x_test = f.read().splitlines()
with open(f'./ex3_datasets/test_labels.txt') as f:
    y_test = f.read().splitlines()


In [5]:
def load_text(path):
    with open(path, 'rb') as f:
        texts = []
        for line in f:
            texts.append(line.decode(errors='ignore').lower().strip())
    return texts

In [7]:
# filter data set: data with labels 0 and 1 left
def filter_data(arr, x, y):
    index = [i for i, v in enumerate(y) if v in arr]
    y = [int(v) for v in y if v in arr]
    x = [v for i, v in enumerate(x) if i in index]
    return x, y

anger_joy_arr = ['0','1']
anger_sadness_arr = ['0','3']

# prepare anger and joy record
x_train_1, y_train_1 = filter_data(anger_joy_arr, x_train, y_train)
x_val_1, y_val_1 = filter_data(anger_joy_arr, x_val, y_val)
x_test_1, y_test_1 = filter_data(anger_joy_arr, x_test, y_test)

# prepare anger and sadness record
x_train_2, y_train_2 = filter_data(anger_sadness_arr, x_train, y_train)
x_val_2, y_val_2 = filter_data(anger_sadness_arr, x_val, y_val)
x_test_2, y_test_2 = filter_data(anger_sadness_arr, x_test, y_test)


In [8]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\10405\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\10405\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
def preprocess_text(text, use_stem=False):
    # Convert to lowercase
    text = text.lower()

    # Tokenization
    words = word_tokenize(text)

    # Remove punctuation and non-alphabetic characters
    words = [word for word in words if word.isalpha()]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Stemming (you can choose to use stemming or lemmatization)
    if use_stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    return words

In [12]:
nlp = spacy.blank('en')

In [15]:
def tokenize(texts):
  """
  Assign unique id to each token
  """
  max_lens = []
  tokenized_texts = []
  word2idx = {}

  # Add <pad> and <unk> tokens to the vocabulary
  word2idx['<pad>'] = 0
  word2idx['<unk>'] = 1

  # Building our vocab from the corpus starting from index 2
  idx = 2
  for text in texts:
    tokenized_text = []
    max_len = 0
    for sent in text:
      tokenized_sent = nlp(sent)
      # Add `tokenized_sent` to `tokenized_texts`
      tokenized_text.append(tokenized_sent)
      # Add new token to `word2idx`
      for token in tokenized_sent:
        # string any token objects are different things, be careful.
        if token.text not in word2idx:
          word2idx[token.text] = idx
          idx += 1

          # Update `max_len`
      max_len = max(max_len, len(tokenized_sent))

    tokenized_texts.append(tokenized_text)
    max_lens.append(max_len)

  return tokenized_texts, word2idx, max_lens

In [16]:
def encode(tokenized_texts, word2idx, max_len):
    input_ids = []
    for i, tokenized_text in enumerate(tokenized_texts):
        input_ids_temp = []
        for tokenized_sent in tokenized_text:
            # Pad sentences to max_len
            tokenized_padded_sent = list(tokenized_sent) + ['<pad>'] * (max_len[i] - len(tokenized_sent))

            # Encode tokens to input_ids
            input_id = [word2idx.get(str(token)) for token in tokenized_padded_sent]
            input_ids_temp.append(input_id)
        
        input_ids.append(np.array(input_ids_temp, dtype=np.int64))

    return input_ids

In [17]:
tokenized_texts, word2idx, max_lens = tokenize([x_train_1,x_val_1, x_test_1])
input_ids_1 = encode(tokenized_texts, word2idx, max_lens)

In [18]:
# Convert data type to torch.Tensor
train_inputs_1 = torch.from_numpy(input_ids_1[0])
train_labels_1 = torch.tensor(y_train_1, dtype=torch.long)

val_input_1 = torch.from_numpy(input_ids_1[1])
val_labels_1 = torch.tensor(y_val_1, dtype=torch.long)

test_input_1 = torch.from_numpy(input_ids_1[2])
test_labels_1 = torch.tensor(y_test_1, dtype=torch.long)

In [21]:
train_data_1 = TensorDataset(train_inputs_1, train_labels_1)
val_data_1 = TensorDataset(val_input_1, val_labels_1)
test_data_1 = TensorDataset(test_input_1, test_labels_1) 

In [22]:
batch_size = 8

train_dataloader = DataLoader(train_data_1, batch_size=batch_size)
val_dataloader = DataLoader(val_data_1)
test_dataloader = DataLoader(test_data_1)

In [23]:
class CNN(nn.Module):
    def __init__(self,
                 vocab_size=len(word2idx),
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):
        """
        The constructor for CNN class.
        Args:
            vocab_size (int): Need to be specified when pretrained word
                embeddings are not used.
            embed_dim (int): Dimension of word vectors. Need to be specified
                when pretrained word embeddings are not used. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100]
            n_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate. Default: 0.5
        """

        super(CNN, self).__init__()
        # Embedding layer
        self.embed_dim = embed_dim

        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        # Conv Network
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])
        # Fully-connected layer and Dropout
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Perform a forward pass through the network.

        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                n_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (batch_size, max_len, embed_dim)
        x_embed = self.embedding(input_ids).float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]

        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)
        # print(x_fc.shape)

        # Compute logits. Output shape: (b, n_classes)
        logits = self.fc(self.dropout(x_fc))

        return logits

In [46]:
# Instantiate CNN model
cnn1_1 = CNN(embed_dim=300,
            filter_sizes=[3, 4, 5],
            num_filters=[100, 100, 100],
            num_classes=2,
            dropout=0.5)

# Send model to `device` (GPU/CPU)
cnn1_1.to(device)

# Instantiate Adadelta optimizer
optimizer_1 = optim.Adadelta(cnn1_1.parameters(),
                               lr=0.0001,
                               rho=0.95)

In [47]:
def train(model, optimizer, train_dataloader, val_dataloader): 
  # Specify loss function
  loss_fn = nn.CrossEntropyLoss()

  # Start training loop
  print("Start training...\n")
  print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^12}")
  print("-"*60)

  for epoch_i in range(10):
    total_loss = 0
    # Put the model into the training mode
    model.train()
    for step, batch in enumerate(train_dataloader):

      # Load batch to GPU
      b_input_ids, b_labels = tuple(t.to(device) for t in batch)

      # Zero out any previously calculated gradients
      model.zero_grad()

      # Perform a forward pass. This will return logits.
      logits = model(b_input_ids)
      #break

      # Compute loss and accumulate the loss values
      loss = loss_fn(logits, b_labels)

      total_loss += loss.item()

      # Perform a backward pass to calculate gradients
      loss.backward()

      # Update parameters
      optimizer.step()

      # Calculate the average loss over the entire training data
      avg_train_loss = total_loss / len(train_dataloader)

    with torch.no_grad():


      total_val_loss = 0

      for step, batch in enumerate(val_dataloader):

        # Load batch to GPU
        b_input_ids, b_labels = tuple(t.to(device) for t in batch)

        logits = model(b_input_ids)

        val_loss = loss_fn(logits, b_labels)

        total_val_loss += val_loss.item()

        avg_val_loss = total_val_loss / len(val_dataloader)

    print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f} | {avg_val_loss:^12.6f} \n")
  
  return model

In [45]:
cnn1_1 = train(cnn1_1, optimizer_1, train_dataloader, val_dataloader)

Start training...

 Epoch  |  Train Loss  |   Val Loss  
------------------------------------------------------------
   1    |   5.354104   |   6.015400   

   2    |   5.310530   |   6.006713   

   3    |   5.331689   |   6.030020   

   4    |   5.297817   |   5.981511   

   5    |   5.262202   |   6.023051   

   6    |   5.215111   |   5.963070   

   7    |   5.241347   |   6.048886   

   8    |   5.228076   |   5.882992   

   9    |   5.228333   |   5.891965   

  10    |   5.173050   |   5.780343   



In [82]:
def model_eval(model, labels, dataloader):

    predicted_labels = []

    correct = 0

    with torch.no_grad():

        for batch in dataloader:

            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            logits = model(b_input_ids)

            predicted = torch.argmax(logits, dim=1).cpu().flatten().numpy()

            predicted_labels.extend(predicted.tolist())

            correct += np.sum((predicted==(b_labels.cpu().numpy())).astype(int))

        _, _, f1, _ = precision_recall_fscore_support(labels.tolist(), predicted_labels, average='macro')

        acc = correct / len(labels)
    
    print(acc)
    print(f1)

    return acc, f1

In [83]:
acc1_1, f1_1_1 = model_eval(cnn1_1, test_labels_1, test_dataloader)

0.5786026200873362
0.4536936892393837


In [55]:
# Instantiate CNN model
cnn1_2 = CNN(embed_dim=300,
            filter_sizes=[3, 4, 5],
            num_filters=[100, 100, 100],
            num_classes=2,
            dropout=0.5)

# Send model to `device` (GPU/CPU)
cnn1_2.to(device)

# Instantiate Adadelta optimizer
optimizer_2 = optim.Adam(cnn1_2.parameters(),
                               lr=0.00001)

In [56]:
cnn1_2 = train(cnn1_2, optimizer_2, train_dataloader, val_dataloader)

Start training...

 Epoch  |  Train Loss  |   Val Loss  
------------------------------------------------------------
   1    |   0.717986   |   0.693579   

   2    |   0.665863   |   0.680757   

   3    |   0.635314   |   0.669511   

   4    |   0.626067   |   0.670366   

   5    |   0.616355   |   0.679254   

   6    |   0.605228   |   0.679100   

   7    |   0.596914   |   0.677550   

   8    |   0.597610   |   0.681434   

   9    |   0.588854   |   0.674347   

  10    |   0.587875   |   0.660362   



In [84]:
acc1_2, f1_1_2 = model_eval(cnn1_2, test_labels_1, test_dataloader)

0.611353711790393
0.3871988574435299


In [74]:
# Instantiate CNN model
cnn1_3 = CNN(embed_dim=300,
            filter_sizes=[3, 5, 7],
            num_filters=[100, 100, 100],
            num_classes=2,
            dropout=0.3)

# Send model to `device` (GPU/CPU)
cnn1_3.to(device)

# Instantiate Adadelta optimizer
optimizer_3 = optim.AdamW(cnn1_3.parameters(),
                               lr=0.00001)

In [75]:
cnn1_3 = train(cnn1_3, optimizer_3, train_dataloader, val_dataloader)

Start training...

 Epoch  |  Train Loss  |   Val Loss  
------------------------------------------------------------
   1    |   0.666585   |   0.669307   

   2    |   0.633026   |   0.661011   

   3    |   0.615734   |   0.661195   

   4    |   0.606105   |   0.648152   

   5    |   0.596460   |   0.655633   

   6    |   0.590363   |   0.657398   

   7    |   0.582103   |   0.666376   

   8    |   0.570982   |   0.653069   

   9    |   0.564006   |   0.646005   

  10    |   0.554060   |   0.655791   



In [85]:
acc1_3, f1_1_3 = model_eval(cnn1_3, test_labels_1, test_dataloader)

0.6146288209606987
0.3961247910655424
