In [2]:
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import torch

# Load dataset

In [3]:
df = pd.read_csv('datasets/EmoTrain.csv')

In [4]:
df = df.drop(axis=1, columns=['Unnamed: 0'])
df

Unnamed: 0,text,praise,amusement,anger,disapproval,confusion,interest,sadness,fear,joy,love
0,Is there some scripture you could quote me? I'...,1,0,0,0,0,1,0,0,0,0
1,Good. Now we just need people to dislike commi...,1,0,0,0,0,0,0,0,0,0
2,This was driving me NUTS!,0,1,0,0,0,0,0,0,0,0
3,Thank you for your advice!,0,0,0,0,0,0,0,0,0,1
4,Some do. Some don't. Blanket generalizations a...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
190097,They needed to insert the phrase “over mainten...,0,0,0,0,1,0,0,0,0,0
190098,Back in the seventies and eighties we all did ...,1,0,0,0,0,0,0,0,0,0
190099,"6lbs is a lap dog, if someone shoots that caus...",0,0,0,0,0,0,0,1,0,0
190100,This gets much worse on the 2nd loop.,0,0,0,0,1,0,0,0,0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190102 entries, 0 to 190101
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   text         190102 non-null  object
 1   praise       190102 non-null  int64 
 2   amusement    190102 non-null  int64 
 3   anger        190102 non-null  int64 
 4   disapproval  190102 non-null  int64 
 5   confusion    190102 non-null  int64 
 6   interest     190102 non-null  int64 
 7   sadness      190102 non-null  int64 
 8   fear         190102 non-null  int64 
 9   joy          190102 non-null  int64 
 10  love         190102 non-null  int64 
dtypes: int64(10), object(1)
memory usage: 16.0+ MB


In [6]:
print(df.isnull().sum())

text           0
praise         0
amusement      0
anger          0
disapproval    0
confusion      0
interest       0
sadness        0
fear           0
joy            0
love           0
dtype: int64


In [7]:
x = df['text']
y = df.drop(axis=1, columns=['text'])

In [8]:
x

0         Is there some scripture you could quote me? I'...
1         Good. Now we just need people to dislike commi...
2                                 This was driving me NUTS!
3                                Thank you for your advice!
4         Some do. Some don't. Blanket generalizations a...
                                ...                        
190097    They needed to insert the phrase “over mainten...
190098    Back in the seventies and eighties we all did ...
190099    6lbs is a lap dog, if someone shoots that caus...
190100                This gets much worse on the 2nd loop.
190101    Any food shortages will be caused by these idi...
Name: text, Length: 190102, dtype: object

In [9]:
y

Unnamed: 0,praise,amusement,anger,disapproval,confusion,interest,sadness,fear,joy,love
0,1,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1
4,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
190097,0,0,0,0,1,0,0,0,0,0
190098,1,0,0,0,0,0,0,0,0,0
190099,0,0,0,0,0,0,0,1,0,0
190100,0,0,0,0,1,0,0,0,0,0


In [10]:
print(len(ENGLISH_STOP_WORDS))
print(ENGLISH_STOP_WORDS)

318
frozenset({'as', 'amount', 'us', 'among', 'one', 'had', 'hereupon', 'most', 'or', 'namely', 'alone', 'up', 'find', 'mostly', 'nevertheless', 'for', 'onto', 'thence', 'hasnt', 'these', 'in', 'few', 'afterwards', 'call', 'throughout', 'six', 'eight', 'bill', 'interest', 'sincere', 'the', 'hereby', 'etc', 'couldnt', 'name', 'cant', 'become', 'please', 'less', 'off', 'whoever', 'against', 'became', 'eleven', 'inc', 'well', 'each', 'that', 'also', 'without', 'seeming', 'nobody', 'whom', 'beyond', 'such', 'do', 're', 'your', 'whatever', 'anyway', 'meanwhile', 'is', 'who', 'mill', 'below', 'under', 'further', 'everything', 'seems', 'whenever', 'into', 'what', 'were', 'still', 'every', 'we', 'his', 'on', 'again', 'it', 'whereby', 'being', 'many', 'several', 'above', 'therein', 'otherwise', 'they', 'myself', 'forty', 'while', 'yet', 'elsewhere', 'herein', 'twenty', 'ten', 'thus', 'until', 'himself', 'no', 'wherein', 'often', 'others', 'within', 'least', 'hers', 'during', 'noone', 'except', 

In [11]:
x = x.str.lower()
x

0         is there some scripture you could quote me? i'...
1         good. now we just need people to dislike commi...
2                                 this was driving me nuts!
3                                thank you for your advice!
4         some do. some don't. blanket generalizations a...
                                ...                        
190097    they needed to insert the phrase “over mainten...
190098    back in the seventies and eighties we all did ...
190099    6lbs is a lap dog, if someone shoots that caus...
190100                this gets much worse on the 2nd loop.
190101    any food shortages will be caused by these idi...
Name: text, Length: 190102, dtype: object

In [12]:
x = x.to_list()

In [13]:
print(len(x))
print(x[:5])

190102
["is there some scripture you could quote me? i'd like to read up on it just to be sure for myself", 'good. now we just need people to dislike commies more than they do now', 'this was driving me nuts!', 'thank you for your advice!', "some do. some don't. blanket generalizations are almost always false and unhelpful."]


In [14]:
x = list(map(lambda i: i.split(), x))

In [15]:
x

[['is',
  'there',
  'some',
  'scripture',
  'you',
  'could',
  'quote',
  'me?',
  "i'd",
  'like',
  'to',
  'read',
  'up',
  'on',
  'it',
  'just',
  'to',
  'be',
  'sure',
  'for',
  'myself'],
 ['good.',
  'now',
  'we',
  'just',
  'need',
  'people',
  'to',
  'dislike',
  'commies',
  'more',
  'than',
  'they',
  'do',
  'now'],
 ['this', 'was', 'driving', 'me', 'nuts!'],
 ['thank', 'you', 'for', 'your', 'advice!'],
 ['some',
  'do.',
  'some',
  "don't.",
  'blanket',
  'generalizations',
  'are',
  'almost',
  'always',
  'false',
  'and',
  'unhelpful.'],
 ['those', 'are', 'separate', 'issues', 'from', 'the', 'sample', 'size.'],
 ['my',
  'fur',
  'son',
  'was',
  'the',
  'first',
  'i',
  'came',
  'out',
  'to.',
  'most',
  'important',
  'also'],
 ['really',
  'appreciate',
  'this',
  'post',
  'and',
  'article.',
  'i',
  'just',
  'subscribed',
  'and',
  'between',
  'this',
  'and',
  'the',
  'sidebar',
  "i'm",
  'feeling',
  'pumped',
  'about',
  'start

In [16]:
characters = {'isalnum': 0}
for text in x:
    for word in text:
        for c in word:
            if c.isalnum():
                characters['isalnum'] += 1
            elif c in characters:
                characters[c] += 1
            else:
                characters[c] = 0

In [17]:
print(len(characters))
keys = list(characters.keys())
keys.sort()
print(keys)
characters

464
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'isalnum', '{', '|', '}', '~', '¡', '£', '¦', '©', '«', '¬', '¯', '°', '´', '·', '»', '̕', '̖', '̗', '̘', '̙', '̜', '̝', '̞', '̟', '̠', '̢', '̣', '̤', '̥', '̦', '̧', '̨', '̪', '̫', '̬', '̭', '̮', '̯', '̰', '̱', '̲', '̳', '̶', '̷', '̹', '̺', '̻', '̼', 'ͅ', '͈', '͍', '͎', '͏', '͓', '͔', '͕', '͖', '͘', '͙', '͚', '͜', '͝', '͞', '͟', '͠', '͡', '͢', '׳', '\u200d', '–', '—', '‘', '’', '“', '”', '„', '•', '…', '\u202a', '\u202c', '‽', '€', '™', '√', '≈', '≠', '▀', '▫', '☁', '☂', '☆', '☕', '☝', '☠', '☹', '☺', '♀', '♂', '♡', '♥', '♪', '♫', '♭', '♾', '♿', '⚔', '⚡', '⚰', '⛏', '⛑', '✊', '✋', '✌', '✔', '✨', '❄', '❣', '❤', '⠀', '⠁', '⠃', '⠄', '⠇', '⠈', '⠉', '⠊', '⠋', '⠏', '⠓', '⠘', '⠙', '⠚', '⠛', '⠞', '⠟', '⠢', '⠦', '⠫', '⠳', '⠴', '⠶', '⠸', '⠹', '⠻', '⠾', '⠿', '⡀', '⡄', '⡆', '⡇', '⡏', '⡜', '⡟', '⡶', '⡷', '⡼', '⡾', '⡿', '⢀', '⢠', '⢣', '⢤', '⢧', '⢰', '⢸', '⢹'

{'isalnum': 10318511,
 '?': 23258,
 "'": 65641,
 '.': 207020,
 '!': 37673,
 '’': 29723,
 '[': 34053,
 ']': 34038,
 ',': 62975,
 '"': 10806,
 ':': 6846,
 ')': 5248,
 '-': 8630,
 '💩': 2,
 '/': 5537,
 '*': 8644,
 '(': 3894,
 '^': 1509,
 '>': 2991,
 '~': 999,
 '”': 2183,
 '😒': 18,
 '£': 50,
 '“': 2205,
 '🙄': 92,
 '🍻': 7,
 '—': 112,
 '😂': 1143,
 '=': 421,
 '$': 760,
 '😍': 107,
 '😪': 14,
 ';': 880,
 '&': 356,
 '😭': 226,
 '❤': 355,
 '️': 723,
 '%': 930,
 '͡': 49,
 '͜': 32,
 '🅱': 33,
 '😔': 47,
 '+': 516,
 '🤕': 7,
 '@': 122,
 '🤣': 319,
 '#': 755,
 '🙌': 46,
 '😇': 10,
 '🎶': 50,
 '💃': 22,
 '<': 484,
 '\u200d': 223,
 '♀': 143,
 '☺': 57,
 '😢': 79,
 '😦': 1,
 '😜': 12,
 '♡': 3,
 '_': 609,
 '‘': 340,
 '💪': 26,
 '🤔': 154,
 '😣': 12,
 '😊': 100,
 '🐢': 2,
 '😄': 26,
 '😐': 32,
 '😤': 273,
 '😞': 14,
 '¦': 2,
 '¯': 83,
 '😱': 24,
 '👍': 129,
 '🍑': 2,
 '😹': 2,
 '👌': 84,
 '🏾': 11,
 '😕': 19,
 '♂': 76,
 '🏻': 128,
 '–': 13,
 '💕': 37,
 '❣': 10,
 '😎': 86,
 '😡': 48,
 '🤬': 11,
 '😴': 5,
 '€': 32,
 '👏': 173,
 '™': 31,
 '🤢': 5

In [18]:
def remove_special_chars(word: str):
    if word.isalnum():
        return word, []
    specials = list(filter(lambda c: (c in ('?', '!') or c > '\u202c') and c.isprintable(), filter(lambda c: not c.isalnum(), word)))
    return ''.join(filter(lambda c: c.isalnum(), word)), specials
    

for text in tqdm(x):
    specials = []
    for i, word in enumerate(text):
        text[i], special_word = remove_special_chars(word)
        specials.extend(special_word)
    text.extend(specials)

  0%|          | 0/190102 [00:00<?, ?it/s]

In [19]:
x

[['is',
  'there',
  'some',
  'scripture',
  'you',
  'could',
  'quote',
  'me',
  'id',
  'like',
  'to',
  'read',
  'up',
  'on',
  'it',
  'just',
  'to',
  'be',
  'sure',
  'for',
  'myself',
  '?'],
 ['good',
  'now',
  'we',
  'just',
  'need',
  'people',
  'to',
  'dislike',
  'commies',
  'more',
  'than',
  'they',
  'do',
  'now'],
 ['this', 'was', 'driving', 'me', 'nuts', '!'],
 ['thank', 'you', 'for', 'your', 'advice', '!'],
 ['some',
  'do',
  'some',
  'dont',
  'blanket',
  'generalizations',
  'are',
  'almost',
  'always',
  'false',
  'and',
  'unhelpful'],
 ['those', 'are', 'separate', 'issues', 'from', 'the', 'sample', 'size'],
 ['my',
  'fur',
  'son',
  'was',
  'the',
  'first',
  'i',
  'came',
  'out',
  'to',
  'most',
  'important',
  'also'],
 ['really',
  'appreciate',
  'this',
  'post',
  'and',
  'article',
  'i',
  'just',
  'subscribed',
  'and',
  'between',
  'this',
  'and',
  'the',
  'sidebar',
  'im',
  'feeling',
  'pumped',
  'about',
  's

# Vectorize the dataset

In [20]:
df = pd.DataFrame(dtype=bool)
for i, text in enumerate(tqdm(x[:5])):
    for word in text:
        df.loc[i, word] = True
df.fillna(False, inplace=True)

  0%|          | 0/5 [00:00<?, ?it/s]

  df.fillna(False, inplace=True)


In [21]:
df

Unnamed: 0,is,there,some,scripture,you,could,quote,me,id,like,...,advice,dont,blanket,generalizations,are,almost,always,false,and,unhelpful
0,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False,False,False,False,...,False,True,True,True,True,True,True,True,True,True


In [22]:
tensor_index = {}
for i, text in enumerate(tqdm(x)):
    for word in text:
        tensor_index[word] = tensor_index.get(word, len(tensor_index))

  0%|          | 0/190102 [00:00<?, ?it/s]

In [23]:
print('unique tokens=', len(tensor_index))
tensor_index

unique tokens= 33525


{'is': 0,
 'there': 1,
 'some': 2,
 'scripture': 3,
 'you': 4,
 'could': 5,
 'quote': 6,
 'me': 7,
 'id': 8,
 'like': 9,
 'to': 10,
 'read': 11,
 'up': 12,
 'on': 13,
 'it': 14,
 'just': 15,
 'be': 16,
 'sure': 17,
 'for': 18,
 'myself': 19,
 '?': 20,
 'good': 21,
 'now': 22,
 'we': 23,
 'need': 24,
 'people': 25,
 'dislike': 26,
 'commies': 27,
 'more': 28,
 'than': 29,
 'they': 30,
 'do': 31,
 'this': 32,
 'was': 33,
 'driving': 34,
 'nuts': 35,
 '!': 36,
 'thank': 37,
 'your': 38,
 'advice': 39,
 'dont': 40,
 'blanket': 41,
 'generalizations': 42,
 'are': 43,
 'almost': 44,
 'always': 45,
 'false': 46,
 'and': 47,
 'unhelpful': 48,
 'those': 49,
 'separate': 50,
 'issues': 51,
 'from': 52,
 'the': 53,
 'sample': 54,
 'size': 55,
 'my': 56,
 'fur': 57,
 'son': 58,
 'first': 59,
 'i': 60,
 'came': 61,
 'out': 62,
 'most': 63,
 'important': 64,
 'also': 65,
 'really': 66,
 'appreciate': 67,
 'post': 68,
 'article': 69,
 'subscribed': 70,
 'between': 71,
 'sidebar': 72,
 'im': 73,
 'fee

In [42]:
vector = torch.zeros(len(x), len(tensor_index), dtype=torch.bool)
for i, text in enumerate(tqdm(x)):
    for word in text:
        vector[i, tensor_index[word]] = True

  0%|          | 0/1000 [00:00<?, ?it/s]

In [43]:
vector

tensor([[ True,  True,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])