In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import torch

# Importing libraries
import copy
import os

import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Subset
from typing import Tuple, List
from torch.optim import Optimizer

from nltk.corpus import words, wordnet
import nltk

import numpy as np

In [None]:
# Colab-only cell: mounts Google Drive at /content/drive.
# NOTE(review): `google.colab` is not importable outside Colab — skip this cell for local runs.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Root folder of the project; the cells below read from `datasets/` and write to
# `tensors/` underneath it. The trailing slash matters because paths are built by
# plain string concatenation.
BASE_PATH = 'drive/MyDrive/NLP_project/'
# BASE_PATH = './'
# Fail fast. BASE_PATH is relative, so report the working directory it was resolved
# against as well — otherwise the error gives no hint about what to fix.
if not os.path.exists(BASE_PATH):
    raise ValueError(f'path does not exist: {BASE_PATH!r} (cwd: {os.getcwd()!r})')

In [None]:
from IPython.core.magic import register_cell_magic


@register_cell_magic
def skip(line, cell):
    """Cell magic that ignores the cell body: a cell starting with ``%%skip`` is not run.

    Both arguments (the text after ``%%skip`` and the cell source) are discarded.
    """

In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Load dataset

In [None]:
# Raw train / evaluation splits, read from the project's datasets folder.
train_csv_path = BASE_PATH + 'datasets/EmoTrain.csv'
test_csv_path = BASE_PATH + 'datasets/EmoVal.csv'
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index written out by an earlier
# `to_csv` — confirm). `errors='ignore'` keeps the cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
df_train = df_train.drop(columns=['Unnamed: 0'], errors='ignore')
df_test = df_test.drop(columns=['Unnamed: 0'], errors='ignore')
df_train

In [None]:
# step = 5_000
# Keep at most the first MAX_ROWS rows of each split.
MAX_ROWS = 190_000
df_train = df_train.iloc[:MAX_ROWS]
df_test = df_test.iloc[:MAX_ROWS]
# Smaller alternative used while experimenting:
# df_train = df_train[:100_000]
# df_test = df_test[:100_000]

# Preprocessing

## Description of data

In [None]:
# Column names, dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Number of missing values in each column of the training frame.
print(df_train.isna().sum())

In [None]:
def split_features_labels(df) -> Tuple[pd.Series, pd.DataFrame]:
    """Separate the raw text from the label columns.

    Args:
        df: frame with a 'text' column; every other column is treated as a label.

    Returns:
        ``(texts, labels)`` — the 'text' column as a Series and a new frame holding
        the remaining columns. ``df`` itself is left untouched.
    """
    labels = df.drop(columns=['text'])
    texts = df['text']
    return texts, labels


# Split both frames the same way: texts go to x_*, label columns to y_*.
(x_train, y_train), (x_test, y_test) = map(split_features_labels, (df_train, df_test))

In [None]:
# Preview of the raw training texts (pandas truncates the display).
x_train

In [None]:
# Preview of the training label columns (pandas truncates the display).
y_train

## Data cleaning

Lowercase all the text.

In [None]:
def lower_case(x):
    """Return a lower-cased copy of a pandas Series of strings (input is not modified)."""
    return x.str.lower()

# Lower-case both splits.
x_train, x_test = lower_case(x_train), lower_case(x_test)

In [None]:
# Preview of the lower-cased training texts.
x_train

Convert to tokens

In [None]:
# From here on the texts are plain Python lists rather than pandas Series.
x_train, x_test = x_train.tolist(), x_test.tolist()

In [None]:
# Number of training texts, then the first five of them.
print(len(x_train), x_train[:5], sep='\n')

In [None]:
def list_of_words(x):
    """Split every text on whitespace.

    Args:
        x: iterable of strings.

    Returns:
        A new list with one list of tokens per input text (an empty or
        whitespace-only text yields an empty token list).
    """
    return [sentence.split() for sentence in x]

# Tokenise both splits.
x_train, x_test = map(list_of_words, (x_train, x_test))

We now have a **list** of **lists** of **tokens**.

In [None]:
# Preview only the first few tokenised texts: echoing the whole nested list would
# embed every document in the cell output.
x_train[:5]

What kind of **characters** are there in the dataset?

In [None]:
# Character inventory of the training tokens: all alphanumeric characters are pooled
# under the 'isalnum' key, every other character is counted under its own key.
characters = {'isalnum': 0}
for text in x_train:
    for word in text:
        for c in word:
            key = 'isalnum' if c.isalnum() else c
            # get(..., 0) + 1 counts the first sighting as 1; it was previously stored
            # as 0, which left every non-alphanumeric count one too low.
            characters[key] = characters.get(key, 0) + 1

In [None]:
# Number of distinct buckets, their sorted keys, then the counts themselves.
print(len(characters))
keys = sorted(characters)
print(keys)
characters

- Remove non-semantic characters
- Convert `?` and `!` and `emojis` to tokens

In [None]:
def split_special_chars(word: str):
    """Strip a token down to its alphanumeric characters and pull out '!' / '?'.

    Args:
        word: a single whitespace-delimited token.

    Returns:
        ``(cleaned, marks)`` where ``cleaned`` is ``word`` with every
        non-alphanumeric character removed (possibly the empty string) and
        ``marks`` lists each '!' and '?' of ``word`` in order of appearance.
        All other non-alphanumeric characters are dropped.
    """
    kept_chars = []
    marks = []
    for ch in word:
        if ch.isalnum():
            kept_chars.append(ch)
        elif ch in ('!', '?'):
            marks.append(ch)
    return ''.join(kept_chars), marks

def clean_tokens(x):
    """Clean every token list of ``x`` in place.

    Each token is reduced to its alphanumeric characters; every '!' and '?' found in
    the text is re-added as its own one-character token at the end of that text.
    Tokens that end up empty (the original token was punctuation only) are removed,
    so no '' entries reach the stop-word filter or the vocabulary counts, and running
    the function again on already-cleaned data does not add any.

    Args:
        x: list of token lists; the inner lists are modified in place.

    Returns:
        None.
    """
    for text in tqdm(x):
        kept = []
        specials = []
        for word in text:
            cleaned, special_chars = split_special_chars(word)
            if cleaned:
                kept.append(cleaned)
            specials.extend(special_chars)
        # Slice assignment keeps the same list object, so callers holding a reference
        # to `text` see the cleaned content.
        text[:] = kept + specials

# clean_tokens works in place, so there is nothing to re-assign.
for token_lists in (x_train, x_test):
    clean_tokens(token_lists)

In [None]:
# Preview only the first few cleaned texts: echoing the whole nested list would
# embed every document in the cell output.
x_train[:5]

Remove stop-words

In [None]:
# Size and content of scikit-learn's built-in English stop-word list.
print(len(ENGLISH_STOP_WORDS), ENGLISH_STOP_WORDS, sep='\n')

In [None]:
def remove_stop_words(x):
    """Return new token lists with every token found in ENGLISH_STOP_WORDS removed.

    Args:
        x: iterable of token lists.

    Returns:
        A new list of new token lists; the inputs are not modified.
    """
    return [
        [token for token in text if token not in ENGLISH_STOP_WORDS]
        for text in x
    ]

# Filter stop words out of both splits.
x_train, x_test = remove_stop_words(x_train), remove_stop_words(x_test)

# Vectorize the dataset

An example of vectorization

In [None]:
# Toy bag-of-words for the first five training texts: one row per text, one boolean
# column per distinct token, True where the token occurs in that text.
# The frame is built in one go from per-text records instead of being enlarged cell
# by cell with `.loc`. Every stored value is True, so `notna()` is the membership
# matrix with a proper bool dtype — no object columns and no in-place `fillna`.
# A text with no tokens now gets an all-False row instead of being left out.
df = pd.DataFrame([dict.fromkeys(text, True) for text in tqdm(x_train[:5])]).notna()

In [None]:
# Show the toy vectorisation built in the previous cell.
df

In [None]:
# Corpus frequency of every token left in the training texts.
top_words = {}
for text in x_train:
    for word in text:
        top_words[word] = top_words.get(word, 0) + 1

# Ascending by count (ties keep first-seen order), so the most frequent words sit at
# the end of the list.
top_words_ordered = sorted(top_words.items(), key=lambda kv: kv[1])
# Report the vocabulary size and the 20 most frequent words rather than printing
# every (word, count) pair of the vocabulary.
print(len(top_words_ordered))
print(top_words_ordered[-20:])

In [None]:
# Position in the ascending ranking at which words occurring exactly 1, 2, ... 5 times
# start, i.e. how many words are rarer than each threshold.
ordered_counts = [count for _, count in top_words_ordered]
for frequency in range(1, 6):
    print(ordered_counts.index(frequency))

In [None]:
# Vocabulary = the 1000 most frequent words (the tail of the ascending ranking).
word_list = [token for token, _ in top_words_ordered[-1000:]]
print(len(word_list))

Create an index for every unique word

In [None]:
# Map every vocabulary word to its column position in the feature tensor.
tensor_index = {token: column for column, token in enumerate(tqdm(word_list))}

In [None]:
# Number of vocabulary entries, i.e. the width of the feature tensor.
print(len(tensor_index))

Check synonyms

In [None]:
nltk.download('wordnet')  # Download WordNet if not already installed

def get_synonyms(word):
    """Return the lower-cased WordNet lemma names that share a synset with ``word``.

    Args:
        word: the word to look up.

    Returns:
        A set of lower-cased lemma names, never containing ``word`` itself. The
        input is lower-cased before being excluded because every collected name is
        lower-cased; comparing against the raw input would let a capitalised query
        come back as its own synonym. The set is empty when WordNet has no synset
        for ``word``.
    """
    synonyms = {
        lemma.name().lower()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    synonyms.discard(word.lower())
    return synonyms

word = "happy"
print(get_synonyms(word))

Create a tensor

In [None]:
def create_x_tensor(x, syn=False):
    """Build a bag-of-words count matrix over the ``tensor_index`` vocabulary.

    Args:
        x: list of token lists, one per text.
        syn: when True, each in-vocabulary token occurrence also adds 1 to the
            column of every synonym (per ``get_synonyms``) that is itself in the
            vocabulary.

    Returns:
        A float16 tensor of shape ``(len(x), len(tensor_index))``; entry ``[i, j]``
        is the accumulated count for vocabulary column ``j`` in text ``i``.
        Out-of-vocabulary tokens are ignored.
    """
    x_tensor = torch.zeros(len(x), len(tensor_index), dtype=torch.float16)
    # word -> vocabulary columns of its synonyms. Filled lazily so that WordNet is
    # queried once per distinct word instead of once per token occurrence.
    synonym_columns = {}
    for i, text in enumerate(tqdm(x)):
        for word in text:
            column = tensor_index.get(word)
            if column is None:  # explicit None test: column 0 is a valid index
                continue
            x_tensor[i, column] += 1
            if not syn:
                continue
            if word not in synonym_columns:
                synonym_columns[word] = [
                    tensor_index[syn_word]
                    for syn_word in get_synonyms(word)
                    if syn_word in tensor_index
                ]
            for syn_column in synonym_columns[word]:
                x_tensor[i, syn_column] += 1
    return x_tensor

In [None]:
# x_train
# last_index = 0
# for i in range(0, len(x_train), step):
#     last_index = i
#     torch.save(create_x_tensor(x_train[i:i+step]), BASE_PATH + f'tensors/x_train_tensor{i//step}.pt')
# torch.save(create_x_tensor(x_train[last_index:]), BASE_PATH + f'tensors/x_train_tensor{last_index//step + 1}.pt')

In [None]:
# Build the feature matrices and persist them under BASE_PATH + 'tensors/'.
# NOTE(review): the training matrix is built with syn=True while the test matrix uses
# the default syn=False, so the two splits are featurised differently — confirm that
# this is intended.
torch.save(create_x_tensor(x_train, syn=True), BASE_PATH + 'tensors/x_train_tensor.pt')
torch.save(create_x_tensor(x_test), BASE_PATH + 'tensors/x_test_tensor.pt')

Convert labels to tensor

In [None]:
# Summary statistics of the training label columns.
y_train.describe()

In [None]:
def create_y_tensor(y):
    """Convert a label DataFrame into a float16 tensor with the same rows and columns."""
    label_matrix = y.to_numpy()
    return torch.tensor(label_matrix, dtype=torch.float16)

In [None]:
# y_train
# last_index = 0
# for i in range(0, len(y_train), step):
#     last_index = i
#     torch.save(create_y_tensor(y_train[i:i+step]), BASE_PATH + f'tensors/y_train_tensor{i//step}.pt')
# torch.save(create_y_tensor(y_train[last_index:]), BASE_PATH + f'tensors/y_train_tensor{last_index//step + 1}.pt')

In [None]:
# Persist the label tensors next to the feature tensors.
y_outputs = (
    (y_train, 'tensors/y_train_tensor.pt'),
    (y_test, 'tensors/y_test_tensor.pt'),
)
for label_frame, relative_path in y_outputs:
    torch.save(create_y_tensor(label_frame), BASE_PATH + relative_path)