# Emotion classification experiments

## Data loading and preprocessing

In [3]:
import csv
import string

import re
import textacy
from textacy.text_utils import detect_language
from textacy.preprocess import preprocess_text

EMOJI_PATTERN = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
    "+", flags=re.UNICODE)
MENTIONS_PATTERN = re.compile(u"@[a-z]+")
HASHTAGS_PATTERN = re.compile(u"#[a-z]+")


def preprocess(text):
    new_text = preprocess_text(text, fix_unicode=True, lowercase=True, no_urls=True, 
                    no_emails=True, no_phone_numbers=True, no_numbers=True,
                    no_currency_symbols=True, no_contractions=True,
                    no_accents=True)
    no_mentions_text = re.sub(MENTIONS_PATTERN, u"", new_text)
    no_hashtags_text = re.sub(HASHTAGS_PATTERN, u"", no_mentions_text)
    no_emojis_text = re.sub(EMOJI_PATTERN, u"", no_hashtags_text)
    separated_punctuation_text = no_emojis_text.translate(
        str.maketrans({key: " {0} ".format(key) for key in string.punctuation}))
    return separated_punctuation_text
    

EMOTION_DATAPATH = 'data/processed/emotions_full.csv'
raw_data = []
with open(EMOTION_DATAPATH) as data_file:
    reader = csv.reader(data_file, quoting=csv.QUOTE_MINIMAL)
    reader.__next__()
    for line in reader:
        preprocessed_line = preprocess(line[1])
        if detect_language(preprocessed_line) == 'en':
            doc = textacy.Doc(preprocessed_line, lang='en_core_web_lg')
            raw_data.append((doc, line[2]))
        
for data in raw_data[:20]:
    print(data[0].text)

thinks that  had a great 50th birthday party  :  ) 
the moment when you get another follower and you cheer . 
be the greatest dancer of your life !  practice daily positive habits .    
if ur heart hurts all the time for tht person something is not right where ' s the
i feel awful ,  and it ' s way too freaking early .  now off to leadership highschool .  .  . 
so chuffed for safc fans !  bet me dar comes in mortz from the match
making art and viewing art are different at their core ! 
soooo dooowwwn !  !  move on ,  get some sleep .  .  .  me deserve better .   
 " we are sorry ,  but the clip you selected is not available from your location .  please select another clip .  "  no i refuse . 
people know they can pull you down and they dont give a care  & 
my heart and soul  _ babebee is leaving me and i can not even see here
chips and curry sauce
soo if i hit youu  ,  i garrentee i will not stopp  .  type to keep going till i make a bitch bleed foreal  ! 
oh and off to work till midni

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

texts, labels = zip(*raw_data)
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
x_train, x_test, y_train, y_test = \
    train_test_split(texts, encoded_labels, shuffle=True, stratify=encoded_labels, 
                     random_state=42, test_size=0.2)

In [7]:
import numpy as np

x_train_vectors = np.array([x.spacy_doc.vector for x in x_train])
x_test_vectors = np.array([x.spacy_doc.vector for x in x_test])

## Build testing framework

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import classification_report
MODELS_TEST_RESULTS = 'reports/tune_test_scores.csv'


def hypertune(x, y, model, parameters):
    model_obj = model()
    clf = GridSearchCV(model_obj, parameters, scoring='f1_micro', 
                       n_jobs=-1, cv=10, verbose=1)
    clf.fit(x, y)
    return clf


def try_model(x, y, model, parameters, name):
    print("----Started tuning : " + name + "----")
    tune_result = hypertune(x, y, model, parameters)
    y_pred = tune_result.best_estimator_.predict(x_test_vectors)
    print("Classification report")
    print(classification_report(y_test, y_pred, 
                                labels=range(len(label_encoder.classes_)), 
                                target_names=label_encoder.classes_))
    test_score = tune_result.score(x_test_vectors, y_test)
    print("Testing f1_micro: " + str(test_score))
    save = 'models/' + name + '_emotion.pkl'
    joblib.dump(tune_result.best_estimator_, save)
    print("Saved best estimator to " + save)
    with open(MODELS_TEST_RESULTS, "a") as test_scores_table:
        writer = csv.writer(test_scores_table, quoting=csv.QUOTE_MINIMAL)
        writer.writerow([name, test_score, save, str(tune_result.best_params_)])

----Started tuning : RF----
Fitting 10 folds for each of 8 candidates, totalling 80 fits




[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   23.9s finished


Classification report
             precision    recall  f1-score   support

      anger       0.00      0.00      0.00       482
       fear       0.60      0.00      0.01       682
  happiness       0.30      0.62      0.40      2810
       love       0.32      0.47      0.38      1555
    neutral       0.21      0.26      0.23      1679
    sadness       0.22      0.16      0.18      1927
   surprise       0.14      0.01      0.02      1067
      worry       0.25      0.02      0.04      1675

avg / total       0.26      0.28      0.22     11877



  'precision', 'predicted', average, warn_for)


Testing f1_micro: 0.275827229098
Saved best estimator to models/RF_emotion.pkl


In [None]:
parameters = {'criterion': ['gini', 'entropy'],
              'n_estimators': [10, 100, 1000, 2000],
              'max_depth': [None, 2, 5, 10],
              'class_weight': [None, 'balanced'],
              'min_samples_split': [2, 3, 5]}

try_model(x_train_vectors, y_train, RandomForestClassifier,
          parameters, 'RF')


In [20]:
from sklearn.svm import SVC

parameters = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
try_model(x_train_vectors, y_train, SVC, 
          parameters, 'SVM')

----Started tuning : SVM----
Fitting 10 folds for each of 4 candidates, totalling 40 fits




[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.3s finished


Classification report
             precision    recall  f1-score   support

      anger       0.22      0.01      0.03       482
       fear       0.33      0.12      0.17       682
  happiness       0.30      0.53      0.38      2810
       love       0.34      0.35      0.35      1555
    neutral       0.20      0.24      0.22      1679
    sadness       0.20      0.25      0.22      1927
   surprise       0.13      0.03      0.05      1067
      worry       0.22      0.04      0.07      1675

avg / total       0.25      0.26      0.23     11877



Testing f1_micro: 0.262440010104
Saved best estimator to models/SVM_emotion.pkl


In [26]:
import xgboost as xgb

parameters = {'objective': ['multi:softmax'],
              'n_estimators': [50, 100, 500, 1000],
              'max_depth': [0, 2, 5, 10],
              'colsample_bytree': [0.2, 0.6, 0.8],
              'gamma': [0.1, 0.3, 0.5, 0.9]}
try_model(x_train_vectors, y_train, xgb.XGBClassifier,
          parameters, 'XGB')


----Started tuning : XGB----
Fitting 10 folds for each of 4 candidates, totalling 40 fits




[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   16.4s finished


Classification report
             precision    recall  f1-score   support

      anger       0.16      0.02      0.03       482
       fear       0.14      0.06      0.09       682
  happiness       0.30      0.40      0.35      2810
       love       0.26      0.36      0.30      1555
    neutral       0.22      0.24      0.23      1679
    sadness       0.20      0.17      0.18      1927
   surprise       0.12      0.12      0.12      1067
      worry       0.17      0.11      0.13      1675

avg / total       0.22      0.23      0.22     11877



Testing f1_micro: 0.233981645197
Saved best estimator to models/XGB_emotion.pkl


In [41]:
import fasttext
from itertools import product
from sklearn.metrics import f1_score

FASTTEXT_INPUT_FILE = 'data/processed/fasttext_input.txt'
MODEL_PATH = 'models/fasttext/model'
label_prefix = '__label__'
with open(FASTTEXT_INPUT_FILE, 'w') as input_file:
    for x, y in zip(x_train, y_train):
        input_file.write(' , '.join([label_prefix + str(y), x.text]) + '\n')

tested_dims = [10, 100, 300]
tested_lrs = [0.1, 0.01, 0.001]
combinations = product(tested_dims, tested_lrs)

epoch = 5
min_count = 1
word_ngrams = 3
thread = 6

best_params = None
best_score = 0
for dim, lr in combinations:
    model = fasttext.supervised(
        FASTTEXT_INPUT_FILE, MODEL_PATH, dim=dim, lr=lr, epoch=epoch,
        min_count=min_count, word_ngrams=word_ngrams,
        thread=thread, label_prefix=label_prefix
    )
    preds = model.predict([l.text for l in x_test])
    preds = [int(pred[0]) for pred in preds]
    score = f1_score(y_test, preds, average='micro')
    if best_score < score:
        best_score = score
        best_params = {"dim": dim, "lr": lr}

with open(MODELS_TEST_RESULTS, "a") as test_scores_table:
        writer = csv.writer(test_scores_table, quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["FT", best_score, '', str(best_params)])
