In [20]:
import nltk
from nltk.tokenize import RegexpTokenizer

In [42]:
# Load the raw dataset. Each row is split on its LAST comma only, so commas
# inside the sentence text are preserved; the trailing field is bound to `cat`
# (presumably the category label -- TODO confirm against the CSV).
samples = []  # (sentence, cat) pairs in file order, kept for later cells
with open('dataset2.csv') as dataset:
    lines = dataset.readlines()

for line in lines:
    row = line.strip()
    if not row:
        # A blank row has no comma; unpacking it below would raise ValueError.
        continue
    # Rows with text but no comma still raise ValueError on purpose:
    # malformed data should be loud.
    sentence, cat = row.rsplit(',', 1)
    samples.append((sentence, cat))

## Pre-processing the dataset

- convert to lower case
- tokenize (punctuation preserved as tokens)
- remove reviews shorter than 5 words

In [43]:
def tokenize(sentence):
    """Split ``sentence`` into a list of word and punctuation tokens.

    Tokenization is driven by a verbose regular expression passed to NLTK's
    ``RegexpTokenizer``. Case is preserved (no lower-casing happens here).
    Characters that match none of the alternatives, such as whitespace and
    square brackets, are not emitted as tokens.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Tokens in the order they appear in ``sentence``.
    """
    # NOTE(review): alternatives are tried left to right, and the word
    # alternative also matches digits, so plain numbers such as "82%" look like
    # they are consumed as "82" before the currency/percentage alternative is
    # tried. Left unchanged to keep tokenization stable -- confirm intent.
    pattern = r'''(?x)          # set flag to allow verbose regexps
            (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
          | \w+(?:['-]\w+)*     # words with optional internal hyphens, and apostrophe
          | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
          | \.\.\.              # ellipsis
          | [.,;"?():_`-]       # these are separate tokens
        '''
    # Tokenize the argument itself; the previous version read a global `text`.
    return RegexpTokenizer(pattern).tokenize(sentence)

In [44]:
# Smoke test for tokenize(): the sample mixes a contraction ("don't"),
# sentence-final periods, an alphanumeric token ("2nd") and square brackets.
# Square brackets are not in the tokenizer's punctuation class, so they do not
# appear in the resulting token list.
text = 'first time i went there was, i don\'t know. I am sad. 2nd was good[vf]creg?.'
tokenize(text)

['first',
 'time',
 'i',
 'went',
 'there',
 'was',
 ',',
 'i',
 "don't",
 'know',
 '.',
 'I',
 'am',
 'sad',
 '.',
 '2nd',
 'was',
 'good',
 'vf',
 'creg',
 '?',
 '.']

## Model for learning embeddings

Uni-labeled data : ISEAR 

LSTM architecture : input -> embedding layer -> hidden layer -> softmax

Training objective : multinomial cross-entropy loss

Initialisation of word embeddings:
- GloVe
- random initialisation from $N(0, \sigma^2)$ for words not in GloVe

Model Hyperparameters, learners:
- Adam
- lr=0.001
- mini batch-size : 1024

## Classification model for emotion prediction

Compared 2 models: one directly using features from an emotion lexicon, the other using the learnt embeddings as features.

For the word embedding model:
    Input to model -> sentence representation -> average of the word vectors of all words in the sentence (size = 300)
    
For the model with features from the lexicon:
    Input to model -> number of words in the sentence belonging to each category in the lexicon
    
2 classification models :
- L2-regularised multi-class Logistic Regression
- SVM

Evaluation:
- 10-fold cross-validation (10 FCV)
- macro F1-score -> average of the F1 scores over all emotion labels
- F1 -> harmonic mean of precision and recall

### Using learnt emotion-enriched word embeddings

In [55]:
emb_file_path = './Datasets/Embeddings/ewe_uni_embeddings.txt' # 300 dimensional

# Peek at the first 4 entries of the embeddings file.
# The handle gets its own name (`emb_file`) so that `emb` below does not rebind
# the open file inside its own `with` block.
with open(emb_file_path, 'r') as emb_file:
    # zip() stops once range(4) is exhausted, so only the first 4 lines are
    # read instead of loading the whole file into memory.
    lines = [line for _, line in zip(range(4), emb_file)]

for line in lines:
    # Each line is split on single spaces: the first field is taken as the
    # word, the remaining fields as its vector components.
    dims = line.strip().split(" ")
    word = dims[0]
    emb = dims[1:] # embedding list (components are still strings here)