# Emotion classification experiments

## Data loading and preprocessing

In [1]:
import csv
import string

import re
import textacy
from textacy.text_utils import detect_language
from textacy.preprocess import preprocess_text

import os
os.chdir('../../')

EMOJI_PATTERN = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
    "+", flags=re.UNICODE)
MENTIONS_PATTERN = re.compile(u"@[a-z]+")
HASHTAGS_PATTERN = re.compile(u"#[a-z]+")


def preprocess(text):
    new_text = preprocess_text(text, fix_unicode=True, lowercase=True, no_urls=True, 
                    no_emails=True, no_phone_numbers=True, no_numbers=True,
                    no_currency_symbols=True, no_contractions=True,
                    no_accents=True)
    no_mentions_text = re.sub(MENTIONS_PATTERN, u"", new_text)
    no_hashtags_text = re.sub(HASHTAGS_PATTERN, u"", no_mentions_text)
    no_emojis_text = re.sub(EMOJI_PATTERN, u"", no_hashtags_text)
    separated_punctuation_text = no_emojis_text.translate(
        str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    return separated_punctuation_text
    

EMOTION_DATAPATH = 'data/processed/emotions_full.csv'
raw_data = []
with open(EMOTION_DATAPATH) as data_file:
    reader = csv.reader(data_file, quoting=csv.QUOTE_MINIMAL)
    reader.__next__()
    for line in reader:
        preprocessed_line = preprocess(line[1])
        if detect_language(preprocessed_line) == 'en':
            doc = textacy.Doc(preprocessed_line, lang='en_core_web_lg')
            raw_data.append((doc, line[2]))
        
for data in raw_data[:20]:
    print(data[0].text)

the moment when you get another follower and you cheer . 
be the greatest dancer of your life !  practice daily positive habits .    
if ur heart hurts all the time for tht person something is not right where ' s the
i feel awful ,  and it ' s way too freaking early .  now off to leadership highschool .  .  . 
so chuffed for safc fans !  bet me dar comes in mortz from the match
soooo dooowwwn !  !  move on ,  get some sleep .  .  .  me deserve better .   
 " we are sorry ,  but the clip you selected is not available from your location .  please select another clip .  "  no i refuse . 
my heart and soul  _ babebee is leaving me and i can not even see here
chips and curry sauce
soo if i hit youu  ,  i garrentee i will not stopp  .  type to keep going till i make a bitch bleed foreal  ! 
oh and off to work till midnight  -  .  - 
 bahahahaha so many things i could say .  .  .  rt  :  i just shit my pants .  pure  * number *  %  gravy . 
51 morning  :  )  oh what a bad episode to come in t

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

texts, labels = zip(*raw_data)
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
x_train, x_test, y_train, y_test = \
    train_test_split(texts, encoded_labels, shuffle=True, stratify=encoded_labels, 
                     random_state=42, test_size=0.2)

In [3]:
import numpy as np

x_train_vectors = np.array([x.spacy_doc.vector for x in x_train])
x_test_vectors = np.array([x.spacy_doc.vector for x in x_test])

In [4]:
x_train_vectors = [text.to_bag_of_words() for text in x_train]
x_test_vectors = [text.to_bag_of_words() for text in x_test]
vectorizer = textacy.Vectorizer(weighting='tfidf', normalize=True, smooth_idf=True,
min_df=1, max_df=1., max_n_terms=100000)
x_train_vectors = vectorizer.fit_transform(x_train_vectors)
x_test_vectors = vectorizer.transform(x_test_vectors)

## Build testing framework

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import classification_report
MODELS_TEST_RESULTS = 'reports/tune_test_scores.csv'


def hypertune(x, y, model, parameters):
    model_obj = model()
    clf = GridSearchCV(model_obj, parameters, scoring='f1_micro', 
                       n_jobs=-1, cv=10, verbose=1)
    clf.fit(x, y)
    return clf


def try_model(x, y, model, parameters, name):
    print("----Started tuning : " + name + "----")
    tune_result = hypertune(x, y, model, parameters)
    y_pred = tune_result.best_estimator_.predict(x_test_vectors)
    print("Classification report")
    print(classification_report(y_test, y_pred, 
                                labels=range(len(label_encoder.classes_)), 
                                target_names=label_encoder.classes_))
    test_score = tune_result.score(x_test_vectors, y_test)
    print("Testing f1_micro: " + str(test_score))
    save = 'models/' + name + '_emotion.pkl'
    joblib.dump(tune_result.best_estimator_, save)
    print("Saved best estimator to " + save)
    with open(MODELS_TEST_RESULTS, "a") as test_scores_table:
        writer = csv.writer(test_scores_table, quoting=csv.QUOTE_MINIMAL)
        writer.writerow([name, test_score, save, str(tune_result.best_params_)])

In [None]:
parameters = {'criterion': ['gini', 'entropy'],
              'n_estimators': [1000],
              'max_depth': [None],
              'class_weight': [None],
              'min_samples_split': [2]}

try_model(x_train_vectors, y_train, RandomForestClassifier,
          parameters, 'RF')


In [None]:
from sklearn.svm import SVC

parameters = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
try_model(x_train_vectors, y_train, SVC, 
          parameters, 'SVM')

In [None]:
import xgboost as xgb

parameters = {'objective': ['multi:softmax'],
              'n_estimators': [50, 100, 500, 1000],
              'max_depth': [0, 2, 5, 10],
              'colsample_bytree': [0.2, 0.6, 0.8],
              'gamma': [0.1, 0.3, 0.5, 0.9]}
try_model(x_train_vectors, y_train, xgb.XGBClassifier,
          parameters, 'XGB')


In [None]:
from sklearn.linear_model import LogisticRegression

parameters = [{'multi_class': ['ovr'],
               'penalty': ['l1', 'l2']
             'C': [0.1, 0.5, 0.9, 1.],
             'class_weight': [None, 'balanced']},
             {'multi_class': ['multinomial'],
              'solver': ['lbfgs']
             'C': [0.1, 0.5, 0.9, 1.],
             'class_weight': [None, 'balanced']}]
try_model(x_train_vectors, y_train, LogisticRegression, 
          parameters, 'LR')

In [14]:
from sklearn.linear_model import LogisticRegression

parameters = [{'multi_class': ['ovr'],
               'penalty': ['l1', 'l2'],
             'C': [0.1, 0.5, 0.9, 1.]},
             {'multi_class': ['multinomial'],
              'solver': ['lbfgs'],
             'C': [0.1, 0.5, 0.9, 1.]}]
try_model(x_train_vectors, y_train, LogisticRegression, 
          parameters, 'LR-idf')

----Started tuning : LR-idf----
Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.4s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  3.6min finished


Classification report
             precision    recall  f1-score   support

      anger       0.44      0.42      0.43      2157
  happiness       0.53      0.70      0.60      2810
       love       0.64      0.59      0.61      1555
    neutral       0.43      0.30      0.36      1679
    sadness       0.46      0.40      0.43      1927

avg / total       0.50      0.50      0.49     10128

Testing f1_micro: 0.502764612954
Saved best estimator to models/LR-idf_emotion.pkl
Classification report full data
             precision    recall  f1-score   support

      anger       0.44      0.42      0.43      2157
  happiness       0.53      0.70      0.60      2810
       love       0.64      0.59      0.61      1555
    neutral       0.43      0.30      0.36      1679
    sadness       0.46      0.40      0.43      1927

avg / total       0.50      0.50      0.49     10128

Testing f1_micro full data: 0.502764612954


In [9]:
print(x_train_vectors[0])

  (0, 0)	0.254239026704
  (0, 1)	0.166102034462
  (0, 2)	0.309069475657
  (0, 3)	0.266305655528
  (0, 4)	0.351406771388
  (0, 5)	0.199882920765
  (0, 6)	0.0836146295847
  (0, 7)	0.126423577371
  (0, 8)	0.308263845653
  (0, 9)	0.146155809548
  (0, 10)	0.33412349698
  (0, 11)	0.104674221814
  (0, 12)	0.552182534244
  (0, 13)	0.105008980761


In [19]:
from sklearn.tree import DecisionTreeClassifier

parameters = {'criterion': ['gini', 'entropy'],
              'max_depth': [None, 2, 5, 10],
              'class_weight': [None, 'balanced'],
              'min_samples_split': [2, 3, 5]}
try_model(x_train_vectors, y_train, DecisionTreeClassifier, 
          parameters, 'DT')

----Started tuning : DT----
Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 17.8min finished


Classification report
             precision    recall  f1-score   support

      anger       0.35      0.27      0.30      2157
  happiness       0.36      0.72      0.48      2810
       love       0.56      0.39      0.46      1555
    neutral       0.27      0.19      0.22      1679
    sadness       0.49      0.14      0.22      1927

avg / total       0.40      0.38      0.35     10128

Testing f1_micro: 0.375493680885
Saved best estimator to models/DT_emotion.pkl
