In [1]:
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import torch

# Importing libraries
import copy
import os

import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time

import torchvision
from torchvision import transforms

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Subset
from typing import Tuple, List
from torch.optim import Optimizer
from torch.utils.data import DataLoader

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

BASE_PATH = 'drive/MyDrive/NLP_project/'
# BASE_PATH = './'
if not os.path.exists(BASE_PATH):
    raise ValueError('path does not exist')

In [4]:
from IPython.core.magic import register_cell_magic

@register_cell_magic
def skip(line, cell):
    return

In [5]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cpu


# Load dataset

In [6]:
df_train = pd.read_csv(BASE_PATH + 'datasets/EmoTrain.csv')
df_test = pd.read_csv(BASE_PATH + 'datasets/EmoVal.csv')

In [7]:
df_train = df_train.drop(axis=1, columns=['Unnamed: 0'])
df_test = df_test.drop(axis=1, columns=['Unnamed: 0'])
df_train

Unnamed: 0,text,praise,amusement,anger,disapproval,confusion,interest,sadness,fear,joy,love
0,Is there some scripture you could quote me? I'...,1,0,0,0,0,1,0,0,0,0
1,Good. Now we just need people to dislike commi...,1,0,0,0,0,0,0,0,0,0
2,This was driving me NUTS!,0,1,0,0,0,0,0,0,0,0
3,Thank you for your advice!,0,0,0,0,0,0,0,0,0,1
4,Some do. Some don't. Blanket generalizations a...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
190097,They needed to insert the phrase “over mainten...,0,0,0,0,1,0,0,0,0,0
190098,Back in the seventies and eighties we all did ...,1,0,0,0,0,0,0,0,0,0
190099,"6lbs is a lap dog, if someone shoots that caus...",0,0,0,0,0,0,0,1,0,0
190100,This gets much worse on the 2nd loop.,0,0,0,0,1,0,0,0,0,0


In [8]:
# df_train = df_train[:190100]
# df_test = df_test[:190100]

# Pre processing

## Description of data

In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190102 entries, 0 to 190101
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   text         190102 non-null  object
 1   praise       190102 non-null  int64 
 2   amusement    190102 non-null  int64 
 3   anger        190102 non-null  int64 
 4   disapproval  190102 non-null  int64 
 5   confusion    190102 non-null  int64 
 6   interest     190102 non-null  int64 
 7   sadness      190102 non-null  int64 
 8   fear         190102 non-null  int64 
 9   joy          190102 non-null  int64 
 10  love         190102 non-null  int64 
dtypes: int64(10), object(1)
memory usage: 16.0+ MB


In [10]:
print(df_train.isnull().sum())

text           0
praise         0
amusement      0
anger          0
disapproval    0
confusion      0
interest       0
sadness        0
fear           0
joy            0
love           0
dtype: int64


In [11]:
def split_features_labels(df) -> Tuple[pd.Series, pd.DataFrame]:
    x = df['text']
    y = df.drop(axis=1, columns=['text'])
    return x, y


x_train, y_train = split_features_labels(df_train)
x_test, y_test = split_features_labels(df_test)

In [12]:
x_train

Unnamed: 0,text
0,Is there some scripture you could quote me? I'...
1,Good. Now we just need people to dislike commi...
2,This was driving me NUTS!
3,Thank you for your advice!
4,Some do. Some don't. Blanket generalizations a...
...,...
190097,They needed to insert the phrase “over mainten...
190098,Back in the seventies and eighties we all did ...
190099,"6lbs is a lap dog, if someone shoots that caus..."
190100,This gets much worse on the 2nd loop.


In [13]:
y_train

Unnamed: 0,praise,amusement,anger,disapproval,confusion,interest,sadness,fear,joy,love
0,1,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1
4,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
190097,0,0,0,0,1,0,0,0,0,0
190098,1,0,0,0,0,0,0,0,0,0
190099,0,0,0,0,0,0,0,1,0,0
190100,0,0,0,0,1,0,0,0,0,0


## Data cleaning

Lower case all the words

In [14]:
def lower_case(x):
    x = x.str.lower()
    return x

x_train = lower_case(x_train)
x_test = lower_case(x_test)

In [15]:
x_train

Unnamed: 0,text
0,is there some scripture you could quote me? i'...
1,good. now we just need people to dislike commi...
2,this was driving me nuts!
3,thank you for your advice!
4,some do. some don't. blanket generalizations a...
...,...
190097,they needed to insert the phrase “over mainten...
190098,back in the seventies and eighties we all did ...
190099,"6lbs is a lap dog, if someone shoots that caus..."
190100,this gets much worse on the 2nd loop.


Convert to tokens

In [16]:
x_train = x_train.to_list()
x_test = x_test.to_list()

In [17]:
print(len(x_train))
print(x_train[:5])

190102
["is there some scripture you could quote me? i'd like to read up on it just to be sure for myself", 'good. now we just need people to dislike commies more than they do now', 'this was driving me nuts!', 'thank you for your advice!', "some do. some don't. blanket generalizations are almost always false and unhelpful."]


In [18]:
def list_of_words(x):
    x = list(map(lambda i: i.split(), x))
    return x

x_train = list_of_words(x_train)
x_test = list_of_words(x_test)

Then we have **list** of **list** of **tokens**

In [19]:
x_train

[['is',
  'there',
  'some',
  'scripture',
  'you',
  'could',
  'quote',
  'me?',
  "i'd",
  'like',
  'to',
  'read',
  'up',
  'on',
  'it',
  'just',
  'to',
  'be',
  'sure',
  'for',
  'myself'],
 ['good.',
  'now',
  'we',
  'just',
  'need',
  'people',
  'to',
  'dislike',
  'commies',
  'more',
  'than',
  'they',
  'do',
  'now'],
 ['this', 'was', 'driving', 'me', 'nuts!'],
 ['thank', 'you', 'for', 'your', 'advice!'],
 ['some',
  'do.',
  'some',
  "don't.",
  'blanket',
  'generalizations',
  'are',
  'almost',
  'always',
  'false',
  'and',
  'unhelpful.'],
 ['those', 'are', 'separate', 'issues', 'from', 'the', 'sample', 'size.'],
 ['my',
  'fur',
  'son',
  'was',
  'the',
  'first',
  'i',
  'came',
  'out',
  'to.',
  'most',
  'important',
  'also'],
 ['really',
  'appreciate',
  'this',
  'post',
  'and',
  'article.',
  'i',
  'just',
  'subscribed',
  'and',
  'between',
  'this',
  'and',
  'the',
  'sidebar',
  "i'm",
  'feeling',
  'pumped',
  'about',
  'start

What kind of **characters** are there in the dataset?

In [20]:
characters = {'isalnum': 0}
for text in x_train:
    for word in text:
        for c in word:
            if c.isalnum():
                characters['isalnum'] += 1
            elif c in characters:
                characters[c] += 1
            else:
                characters[c] = 0

In [21]:
print(len(characters))
keys = list(characters.keys())
keys.sort()
print(keys)
characters

464
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'isalnum', '{', '|', '}', '~', '¡', '£', '¦', '©', '«', '¬', '¯', '°', '´', '·', '»', '̕', '̖', '̗', '̘', '̙', '̜', '̝', '̞', '̟', '̠', '̢', '̣', '̤', '̥', '̦', '̧', '̨', '̪', '̫', '̬', '̭', '̮', '̯', '̰', '̱', '̲', '̳', '̶', '̷', '̹', '̺', '̻', '̼', 'ͅ', '͈', '͍', '͎', '͏', '͓', '͔', '͕', '͖', '͘', '͙', '͚', '͜', '͝', '͞', '͟', '͠', '͡', '͢', '׳', '\u200d', '–', '—', '‘', '’', '“', '”', '„', '•', '…', '\u202a', '\u202c', '‽', '€', '™', '√', '≈', '≠', '▀', '▫', '☁', '☂', '☆', '☕', '☝', '☠', '☹', '☺', '♀', '♂', '♡', '♥', '♪', '♫', '♭', '♾', '♿', '⚔', '⚡', '⚰', '⛏', '⛑', '✊', '✋', '✌', '✔', '✨', '❄', '❣', '❤', '⠀', '⠁', '⠃', '⠄', '⠇', '⠈', '⠉', '⠊', '⠋', '⠏', '⠓', '⠘', '⠙', '⠚', '⠛', '⠞', '⠟', '⠢', '⠦', '⠫', '⠳', '⠴', '⠶', '⠸', '⠹', '⠻', '⠾', '⠿', '⡀', '⡄', '⡆', '⡇', '⡏', '⡜', '⡟', '⡶', '⡷', '⡼', '⡾', '⡿', '⢀', '⢠', '⢣', '⢤', '⢧', '⢰', '⢸', '⢹'

{'isalnum': 10318511,
 '?': 23258,
 "'": 65641,
 '.': 207020,
 '!': 37673,
 '’': 29723,
 '[': 34053,
 ']': 34038,
 ',': 62975,
 '"': 10806,
 ':': 6846,
 ')': 5248,
 '-': 8630,
 '💩': 2,
 '/': 5537,
 '*': 8644,
 '(': 3894,
 '^': 1509,
 '>': 2991,
 '~': 999,
 '”': 2183,
 '😒': 18,
 '£': 50,
 '“': 2205,
 '🙄': 92,
 '🍻': 7,
 '—': 112,
 '😂': 1143,
 '=': 421,
 '$': 760,
 '😍': 107,
 '😪': 14,
 ';': 880,
 '&': 356,
 '😭': 226,
 '❤': 355,
 '️': 723,
 '%': 930,
 '͡': 49,
 '͜': 32,
 '🅱': 33,
 '😔': 47,
 '+': 516,
 '🤕': 7,
 '@': 122,
 '🤣': 319,
 '#': 755,
 '🙌': 46,
 '😇': 10,
 '🎶': 50,
 '💃': 22,
 '<': 484,
 '\u200d': 223,
 '♀': 143,
 '☺': 57,
 '😢': 79,
 '😦': 1,
 '😜': 12,
 '♡': 3,
 '_': 609,
 '‘': 340,
 '💪': 26,
 '🤔': 154,
 '😣': 12,
 '😊': 100,
 '🐢': 2,
 '😄': 26,
 '😐': 32,
 '😤': 273,
 '😞': 14,
 '¦': 2,
 '¯': 83,
 '😱': 24,
 '👍': 129,
 '🍑': 2,
 '😹': 2,
 '👌': 84,
 '🏾': 11,
 '😕': 19,
 '♂': 76,
 '🏻': 128,
 '–': 13,
 '💕': 37,
 '❣': 10,
 '😎': 86,
 '😡': 48,
 '🤬': 11,
 '😴': 5,
 '€': 32,
 '👏': 173,
 '™': 31,
 '🤢': 5

- Remove non-semantic characters
- Convert `?` and `!` and `emojis` to tokens

In [22]:
def remove_special_chars(word: str):
    if word.isalnum():
        return word, []
    specials = list(filter(lambda c: (c in ('?', '!') or c > '\u202c') and c.isprintable(), filter(lambda c: not c.isalnum(), word)))
    return ''.join(filter(lambda c: c.isalnum(), word)), specials

def clean_tokens(x):
    for text in tqdm(x):
        specials = []
        for i, word in enumerate(text):
            text[i], special_word = remove_special_chars(word)
            specials.extend(special_word)
        text.extend(specials)

clean_tokens(x_train)
clean_tokens(x_test)

  0%|          | 0/190102 [00:00<?, ?it/s]

  0%|          | 0/10562 [00:00<?, ?it/s]

In [23]:
x_train

[['is',
  'there',
  'some',
  'scripture',
  'you',
  'could',
  'quote',
  'me',
  'id',
  'like',
  'to',
  'read',
  'up',
  'on',
  'it',
  'just',
  'to',
  'be',
  'sure',
  'for',
  'myself',
  '?'],
 ['good',
  'now',
  'we',
  'just',
  'need',
  'people',
  'to',
  'dislike',
  'commies',
  'more',
  'than',
  'they',
  'do',
  'now'],
 ['this', 'was', 'driving', 'me', 'nuts', '!'],
 ['thank', 'you', 'for', 'your', 'advice', '!'],
 ['some',
  'do',
  'some',
  'dont',
  'blanket',
  'generalizations',
  'are',
  'almost',
  'always',
  'false',
  'and',
  'unhelpful'],
 ['those', 'are', 'separate', 'issues', 'from', 'the', 'sample', 'size'],
 ['my',
  'fur',
  'son',
  'was',
  'the',
  'first',
  'i',
  'came',
  'out',
  'to',
  'most',
  'important',
  'also'],
 ['really',
  'appreciate',
  'this',
  'post',
  'and',
  'article',
  'i',
  'just',
  'subscribed',
  'and',
  'between',
  'this',
  'and',
  'the',
  'sidebar',
  'im',
  'feeling',
  'pumped',
  'about',
  's

Remove stop-words

In [24]:
print(len(ENGLISH_STOP_WORDS))
print(ENGLISH_STOP_WORDS)

318
frozenset({'somehow', 'against', 'sometime', 'onto', 'fifteen', 'then', 'the', 'he', 'often', 'whom', 'wherever', 'thereby', 'anything', 'nor', 'must', 'thus', 'last', 'also', 'an', 'would', 'into', 'too', 'out', 'our', 'myself', 'except', 'them', 'cannot', 'thru', 'per', 'their', 'being', 'until', 'four', 'enough', 'me', 'thin', 'else', 'other', 'who', 'in', 'whether', 'during', 'therein', 'was', 'top', 'side', 'whereas', 'beyond', 'afterwards', 'been', 'keep', 'con', 'nowhere', 'above', 'every', 'back', 'yet', 'own', 'take', 'be', 'sincere', 'behind', 'even', 'anyway', 'together', 'and', 'one', 'is', 'if', 'moreover', 'his', 'forty', 'while', 'another', 'about', 'mostly', 'ten', 'co', 'sixty', 'whole', 'bill', 'ie', 'formerly', 'noone', 'someone', 'made', 'almost', 'everywhere', 'towards', 'whereupon', 'toward', 'thereupon', 'down', 'everyone', 'first', 'everything', 'ourselves', 'no', 'us', 'within', 'third', 'always', 'they', 'between', 'meanwhile', 'at', 'less', 'can', 'never'

In [25]:
def remove_stop_words(x):
    x = list(map(lambda text: list(filter(lambda word: word not in ENGLISH_STOP_WORDS, text)), x))
    return x

x_train = remove_stop_words(x_train)
x_test = remove_stop_words(x_test)

In [26]:
x_train

[['scripture', 'quote', 'id', 'like', 'read', 'just', 'sure', '?'],
 ['good', 'just', 'need', 'people', 'dislike', 'commies'],
 ['driving', 'nuts', '!'],
 ['thank', 'advice', '!'],
 ['dont', 'blanket', 'generalizations', 'false', 'unhelpful'],
 ['separate', 'issues', 'sample', 'size'],
 ['fur', 'son', 'came', 'important'],
 ['really',
  'appreciate',
  'post',
  'article',
  'just',
  'subscribed',
  'sidebar',
  'im',
  'feeling',
  'pumped',
  'starting',
  'learning'],
 ['chuckle',
  'hope',
  'raptors',
  'v',
  'warriors',
  'finals',
  'thats',
  'gonna',
  'fun',
  'series',
  'watch'],
 ['favorite', 'shows', 'time'],
 ['hope', 'theres', 'dirt', 'jerk', 'rid'],
 ['oh', 'man', 'forgot', 'ebay', 'old', 'textbooks', 'ive', 'meaning', '!'],
 ['buncha', 'bastards', 'favorite', 'quotes', 'hahaha'],
 ['really', 'hope', 'wont', 'chance', 'enjoy', 'man', 'didnt', 'like', ''],
 ['question', 'say', 'want'],
 ['hey', 'good', 'ya', '!'],
 ['exactly', 'play', 'pvp', 'game', 'pc'],
 ['u', 'got

# Vectorize the dataset

An example of vectorization

In [27]:
df = pd.DataFrame(dtype=bool)
for i, text in enumerate(tqdm(x_train[:5])):
    for word in text:
        df.loc[i, word] = True
df.fillna(False, inplace=True)

  0%|          | 0/5 [00:00<?, ?it/s]

  df.fillna(False, inplace=True)


In [28]:
df

Unnamed: 0,scripture,quote,id,like,read,just,sure,?,good,need,...,driving,nuts,!,thank,advice,dont,blanket,generalizations,false,unhelpful
0,True,True,True,True,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,True,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,True,True,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,True,True,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,True,True,True


Create an index for every unique word

In [29]:
def generate_tensor_index(x):
    tensor_index = {}
    for i, text in enumerate(tqdm(x)):
        for word in text:
            tensor_index[word] = tensor_index.get(word, len(tensor_index))
    return tensor_index

train_tensor_index = generate_tensor_index(x_train)
test_tensor_index = generate_tensor_index(x_test)

  0%|          | 0/190102 [00:00<?, ?it/s]

  0%|          | 0/10562 [00:00<?, ?it/s]

In [30]:
print('unique tokens=', len(train_tensor_index))
train_tensor_index

unique tokens= 33222


{'scripture': 0,
 'quote': 1,
 'id': 2,
 'like': 3,
 'read': 4,
 'just': 5,
 'sure': 6,
 '?': 7,
 'good': 8,
 'need': 9,
 'people': 10,
 'dislike': 11,
 'commies': 12,
 'driving': 13,
 'nuts': 14,
 '!': 15,
 'thank': 16,
 'advice': 17,
 'dont': 18,
 'blanket': 19,
 'generalizations': 20,
 'false': 21,
 'unhelpful': 22,
 'separate': 23,
 'issues': 24,
 'sample': 25,
 'size': 26,
 'fur': 27,
 'son': 28,
 'came': 29,
 'important': 30,
 'really': 31,
 'appreciate': 32,
 'post': 33,
 'article': 34,
 'subscribed': 35,
 'sidebar': 36,
 'im': 37,
 'feeling': 38,
 'pumped': 39,
 'starting': 40,
 'learning': 41,
 'chuckle': 42,
 'hope': 43,
 'raptors': 44,
 'v': 45,
 'warriors': 46,
 'finals': 47,
 'thats': 48,
 'gonna': 49,
 'fun': 50,
 'series': 51,
 'watch': 52,
 'favorite': 53,
 'shows': 54,
 'time': 55,
 'theres': 56,
 'dirt': 57,
 'jerk': 58,
 'rid': 59,
 'oh': 60,
 'man': 61,
 'forgot': 62,
 'ebay': 63,
 'old': 64,
 'textbooks': 65,
 'ive': 66,
 'meaning': 67,
 'buncha': 68,
 'bastards': 

Create a tensor

In [32]:
def create_x_tensor(x, tensor_index):
    x_tensor = torch.zeros(len(x), len(tensor_index), dtype=torch.float16)
    for i, text in enumerate(tqdm(x)):
        for word in text:
            x_tensor[i, tensor_index[word]] += 1
    return x_tensor

In [33]:
# x_train
torch.save(create_x_tensor(x_train[:50000], train_tensor_index), BASE_PATH + 'tensors/x_train_tensor1.pt')
torch.save(create_x_tensor(x_train[50000:100_000], train_tensor_index), BASE_PATH + 'tensors/x_train_tensor2.pt')
torch.save(create_x_tensor(x_train[100_000:150_000], train_tensor_index), BASE_PATH + 'tensors/x_train_tensor3.pt')
torch.save(create_x_tensor(x_train[150_000:], train_tensor_index), BASE_PATH + 'tensors/x_train_tensor4.pt')

  0%|          | 0/50000 [00:00<?, ?it/s]

  0%|          | 0/50000 [00:00<?, ?it/s]

  0%|          | 0/50000 [00:00<?, ?it/s]

  0%|          | 0/40102 [00:00<?, ?it/s]

In [34]:
# x_test
torch.save(create_x_tensor(x_test, test_tensor_index), BASE_PATH + 'tensors/x_test_tensor.pt')

  0%|          | 0/10562 [00:00<?, ?it/s]

Convert labels to tensor

In [35]:
y_train.describe()

Unnamed: 0,praise,amusement,anger,disapproval,confusion,interest,sadness,fear,joy,love
count,190102.0,190102.0,190102.0,190102.0,190102.0,190102.0,190102.0,190102.0,190102.0,190102.0
mean,0.157705,0.069841,0.096201,0.077138,0.333463,0.063177,0.080909,0.021299,0.127563,0.118489
std,0.364465,0.25488,0.294867,0.26681,0.471452,0.243281,0.272696,0.14438,0.333604,0.323187
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
def create_y_tensor(y):
    y_tensor = torch.tensor(y.values, dtype=torch.float16)
    return y_tensor

In [32]:
# y_train
torch.save(create_y_tensor(y_train[:50000]), BASE_PATH + 'tensors/y_train_tensor1.pt')
torch.save(create_y_tensor(y_train[50000:100_000]), BASE_PATH + 'tensors/y_train_tensor2.pt')
torch.save(create_y_tensor(y_train[100_000:150_000]), BASE_PATH + 'tensors/y_train_tensor3.pt')
torch.save(create_y_tensor(y_train[150_000:]), BASE_PATH + 'tensors/y_train_tensor4.pt')

In [33]:
torch.save(create_y_tensor(y_test), BASE_PATH + 'tensors/y_test_tensor.pt')