In [1]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pandas as pd
import numpy as np
import gensim
import nltk

from sklearn import preprocessing, metrics, decomposition, pipeline, dummy
from sklearn import linear_model, svm, ensemble, multioutput

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

import matplotlib.pyplot as plt
%matplotlib inline

2017-06-03 20:52:43,467 : INFO : 'pattern' package not found; tag filters are not available for English


In [2]:
import sys
sys.path.append('../classifier/PatternEmotionClassifier/')
from pattern_classifier import  SimpleClassifier, PatternVectorizer

In [3]:
tweets = pd.read_csv('../classifier/PatternEmotionClassifier/test/annotated/annotated_tweets.tsv', sep='\t', names=['text', 'emo1', 'emo2', 'emo3'], dtype='str')
tweets.emo1.value_counts()

joy             456
anticipation    272
disgust         183
sadness         165
surprise        125
anger           107
trust           102
fear             87
Name: emo1, dtype: int64

In [4]:
train_data = pd.read_csv('../data/training/train_merged_8emo.tsv', sep='\t', names=['emotions', 'text'])
train_data.dropna(inplace=True)

In [5]:
test_data = pd.DataFrame()
test_data['text'] = tweets['text']
test_data['emotions'] = tweets['emo1']

In [6]:
tweets_emotions = [set(emos) & set(train_data.emotions) for emos in tweets[['emo1', 'emo2', 'emo3']].values]
mlb = preprocessing.MultiLabelBinarizer()
y_bin_emotions = mlb.fit_transform(tweets_emotions)

In [7]:
def evaluate_prediction(predictions, target):
    print(metrics.classification_report(target, predictions))
    print('Accuracy:')
    print(metrics.accuracy_score(target, predictions))

In [8]:
def evaluate_prob(prediction, target, binarizer):
    predict_class = binarizer.classes_[prediction.argmax(axis=1)]
    target_class = binarizer.classes_[target.argmax(axis=1)]
    evaluate_prediction(predict_class, target_class)
    
    print('average_precision_score: %f' % metrics.average_precision_score(target, prediction))
    print('label_ranking_average_precision_score: %f' % metrics.label_ranking_average_precision_score(target, prediction))
    print('label_ranking_loss: %f' % metrics.label_ranking_loss(target, prediction))

In [9]:
def predict(train, test, pipeline):
    pipeline.fit(train['text'], train['emotions'])
    predictions = pipeline.predict(test['text'])
    target = test['emotions']
    evaluate_prediction(predictions, target)

In [10]:
def softmax(z):
    assert len(z.shape) == 2
    s = np.max(z, axis=1)
    s = s[:, np.newaxis] # necessary step to do broadcasting
    e_x = np.exp(z - s)
    div = np.sum(e_x, axis=1)
    div = div[:, np.newaxis] # dito
    return e_x / div

## Dummy

In [11]:
dummy_vectoriser = TfidfVectorizer(max_features=5)
dummy_vectoriser.fit(train_data.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
dummy_pipe = pipeline.Pipeline([
    ('vect', TfidfVectorizer(
        max_features=5000)),
    ('cls', dummy.DummyClassifier())
])
predict(train_data, test_data, dummy_pipe)

              precision    recall  f1-score   support

       anger       0.07      0.15      0.09       107
anticipation       0.11      0.04      0.06       272
     disgust       0.14      0.02      0.03       183
        fear       0.05      0.14      0.07        87
         joy       0.29      0.19      0.23       456
     sadness       0.12      0.20      0.15       165
    surprise       0.05      0.05      0.05       125
       trust       0.05      0.09      0.07       102

 avg / total       0.15      0.12      0.12      1497

Accuracy:
0.118904475618


In [13]:
dummy_reg = dummy.DummyRegressor()
dummy_reg.fit(dummy_vectoriser.transform(train_data.text), mlb.transform(train_data.emotions.apply(lambda emo: set([emo])).values))
dummy_prob = dummy_reg.predict(dummy_vectoriser.transform(test_data.text))

In [14]:
evaluate_prob(dummy_prob, y_bin_emotions, mlb)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       anger       0.00      0.00      0.00       251
anticipation       0.00      0.00      0.00       552
     disgust       0.00      0.00      0.00       149
        fear       0.00      0.00      0.00        56
         joy       0.00      0.00      0.00       351
     sadness       0.04      1.00      0.08        62
    surprise       0.00      0.00      0.00        66
       trust       0.00      0.00      0.00        10

 avg / total       0.00      0.04      0.00      1497

Accuracy:
0.0414161656647
average_precision_score: 0.614980
label_ranking_average_precision_score: 0.427554
label_ranking_loss: 0.487380


## Python pattern baseline

In [15]:
pv, cls = SimpleClassifier.load_from_folder('../classifier/PatternEmotionClassifier/test/patterns/8_emos/')

In [16]:
pattern_vect = pv.transform_parrallel(test_data.text.values, n_jobs=-1, n_chunk=500)

In [17]:
pattern_es_prob = cls.get_emotion_prob(pattern_vect)

In [18]:
cls_to_mlb = [np.where(np.array(cls.classes) == emo)[0][0] for emo in mlb.classes_.tolist()]

In [19]:
pattern_es_mlb = pattern_es_prob[:,cls_to_mlb]

In [20]:
evaluate_prob(pattern_es_mlb, y_bin_emotions, mlb)

              precision    recall  f1-score   support

       anger       0.40      0.20      0.27       251
anticipation       0.52      0.38      0.44       552
     disgust       0.14      0.06      0.08       149
        fear       0.11      0.34      0.17        56
         joy       0.27      0.14      0.19       351
     sadness       0.16      0.15      0.15        62
    surprise       0.07      0.39      0.11        66
       trust       0.00      0.00      0.00        10

 avg / total       0.35      0.25      0.28      1497

Accuracy:
0.247828991316
average_precision_score: 0.322029
label_ranking_average_precision_score: 0.529072
label_ranking_loss: 0.389337


## Doc2vec Model

[https://github.com/RaRe-Technologies/movie-plots-by-genre/blob/master/Document%20classification%20with%20word%20embeddings%20tutorial.ipynb](https://github.com/RaRe-Technologies/movie-plots-by-genre/blob/master/Document%20classification%20with%20word%20embeddings%20tutorial.ipynb)

In [11]:
#removing retweet and tweet countaining url
retweet_bool = train_data.text.str.contains('RT |rt ')
http_bool = train_data.text.str.contains('http://') 
https_bool = train_data.text.str.contains('https://')
train_data = train_data[~(retweet_bool | http_bool | https_bool)]

In [12]:
#remove hastag and usermention
train_data.text = train_data.text.str.replace(r"#(\w+)", '<H>')\
    .str.replace(r"@(\w+)", '<M>')\
    .str.replace('"', '')\
    .str.replace("'", '')
train_data = train_data[train_data.text != '']

In [13]:
train_data.dropna(inplace=True)

In [14]:
train_data.drop_duplicates(inplace=True)

In [15]:
train_data

Unnamed: 0,emotions,text
1,joy,More than ready for Spring and Summer !! <H> <H>
2,trust,we wont work if you get jealous easily <H> <H>...
3,trust,But appreciate what your man does for you <H> ...
5,fear,One of my snow globes that hasnt been played i...
6,fear,"<M> i know, tell me about it :p and then there..."
7,joy,"Volunteering with the children this week, and ..."
8,sadness,Couldnt go to the batting cages today <H>
9,fear,starting the walking dead on netflix with <M> ...
10,fear,definitely need to prepare my speech earlier n...
11,sadness,got that shitty feeling <H> <H>


In [16]:
train_data.count()

emotions    1801966
text        1801966
dtype: int64

In [17]:
train_data.emotions.value_counts()

sadness     426413
fear        341707
joy         339877
anger       329212
trust       205040
surprise    159717
Name: emotions, dtype: int64

In [18]:
lower_preprocessor = lambda text: text.lower()

In [19]:
tweet_tokenizer = TweetTokenizer()

## Baseline: bag of words, n-grams, tf-idf
Let's start with some simple baselines before diving into more advanced methods.

### Bag of words
The simplest document feature is just a count of each word occurrence in a document.
We remove stop-words and use NLTK tokenizer then limit our vocabulary to 3k most frequent words.

In [24]:
%%time
# training

bow_pipe = pipeline.Pipeline([
    ('vect', CountVectorizer(
        analyzer="word", 
        tokenizer=tweet_tokenizer.tokenize,
        preprocessor=lower_preprocessor, 
        max_features=100000)),
    ('cls', linear_model.SGDClassifier(n_iter=100, loss='epsilon_insensitive'))
])

predict(train_data, test_data, bow_pipe, le)

             precision    recall  f1-score   support

      anger       0.29      0.42      0.35       107
       fear       0.17      0.41      0.24        87
        joy       0.76      0.42      0.54       456
    sadness       0.39      0.47      0.43       165
   surprise       0.19      0.04      0.07       125
      trust       0.28      0.54      0.37       102

avg / total       0.49      0.39      0.40      1042

Accuracy:
0.39443378119
CPU times: user 10min 34s, sys: 2.96 s, total: 10min 37s
Wall time: 10min 36s


Multi-modal logistic regression is a simple white-box classifier. We will use either logistic regression or KNN throughout this tutorial.

### Character N-grams

A character n-gram is a chunk of a document of length n. It is a poor man's tokenizer but sometimes works well. The parameter n depends on language and the corpus. We choose length between 3 and 6 characters and to only focus on 3k most popular ones.

In [24]:
%%time

ngram_pipe = pipeline.Pipeline([
    ('vect', CountVectorizer(
        analyzer="char",
        ngram_range=([2,5]),
        tokenizer=None,    
        preprocessor=lower_preprocessor,
        max_features=50000
    )),
    ('cls', linear_model.SGDClassifier(n_iter=20))
])

predict(train_data, test_data, ngram_pipe, le)

             precision    recall  f1-score   support

      anger       0.52      0.11      0.18       107
       fear       0.15      0.52      0.23        87
        joy       0.73      0.45      0.56       456
    sadness       0.39      0.46      0.42       165
   surprise       0.18      0.13      0.15       125
      trust       0.31      0.44      0.36       102

avg / total       0.50      0.38      0.40      1042

Accuracy:
0.384836852207
CPU times: user 12min 51s, sys: 1min 35s, total: 14min 26s
Wall time: 53min 40s


### TF-IDF

Term Frequency - Inverse Document Frequency is a little more advanced way to count words in a document. It adjusts for document length, word frequency and most importantly for frequency of a particular word in a particular document.

In [23]:
%%time

tfidf_pipe = pipeline.Pipeline([
    ('vect', TfidfVectorizer(
        min_df=10, 
        max_df=0.3,
        tokenizer=tweet_tokenizer.tokenize,
        ngram_range=(1,3),
        preprocessor=lower_preprocessor)),
    ('cls', linear_model.SGDClassifier(n_iter=20, loss='epsilon_insensitive'))
])

predict(train_data, test_data, tfidf_pipe, le)

             precision    recall  f1-score   support

      anger       0.30      0.47      0.36       107
       fear       0.13      0.43      0.20        87
        joy       0.73      0.46      0.57       456
    sadness       0.47      0.45      0.46       165
   surprise       0.15      0.02      0.04       125
      trust       0.38      0.43      0.41       102

avg / total       0.49      0.40      0.42      1042

Accuracy:
0.404030710173
CPU times: user 7min 41s, sys: 5.01 s, total: 7min 46s
Wall time: 8min 5s


In [22]:
%%time

tfidf_pipe = pipeline.Pipeline([
    ('vect', TfidfVectorizer(
        min_df=10,
        max_df=0.3,
        tokenizer=tweet_tokenizer.tokenize,
        preprocessor=lower_preprocessor)),
    ('cls', linear_model.SGDClassifier(n_iter=20, loss='epsilon_insensitive'))
])

predict(train_data, test_data, tfidf_pipe, le)

             precision    recall  f1-score   support

      anger       0.31      0.37      0.34       107
       fear       0.14      0.41      0.21        87
        joy       0.77      0.45      0.57       456
    sadness       0.43      0.50      0.46       165
   surprise       0.16      0.03      0.05       125
      trust       0.31      0.53      0.39       102

avg / total       0.50      0.40      0.42      1042

Accuracy:
0.404030710173
CPU times: user 4min 17s, sys: 5.55 s, total: 4min 22s
Wall time: 5min 44s


In [20]:
%%time

ngram_tfidf_pipe = pipeline.Pipeline([
    ('vect', TfidfVectorizer(
        min_df=10,
        analyzer="char_wb",
        ngram_range=(2,5),
        tokenizer=tweet_tokenizer.tokenize,    
        preprocessor=lower_preprocessor,
        max_features=50000
    )),
    ('cls', linear_model.SGDClassifier(n_iter=20, loss='epsilon_insensitive'))
])

predict(train_data, test_data, ngram_tfidf_pipe, le)

             precision    recall  f1-score   support

      anger       0.32      0.44      0.37       107
       fear       0.13      0.41      0.19        87
        joy       0.74      0.41      0.52       456
    sadness       0.41      0.42      0.41       165
   surprise       0.19      0.04      0.07       125
      trust       0.28      0.44      0.34       102

avg / total       0.48      0.37      0.39      1042

Accuracy:
0.37236084453
CPU times: user 11min 23s, sys: 42.5 s, total: 12min 5s
Wall time: 37min 54s


In [21]:
%%time

ngram_tfidf_pipe = pipeline.Pipeline([
    ('vect', TfidfVectorizer(
        min_df=10,
        analyzer="char",
        ngram_range=(2,5),
        preprocessor=lower_preprocessor,
        max_features=50000
    )),
    ('cls', linear_model.SGDClassifier(n_iter=20, loss='epsilon_insensitive'))
])

predict(train_data, test_data, ngram_tfidf_pipe, le)

             precision    recall  f1-score   support

      anger       0.38      0.44      0.41       107
       fear       0.13      0.47      0.20        87
        joy       0.76      0.41      0.53       456
    sadness       0.40      0.39      0.40       165
   surprise       0.19      0.04      0.07       125
      trust       0.29      0.48      0.36       102

avg / total       0.49      0.38      0.40      1042

Accuracy:
0.376199616123
CPU times: user 13min 28s, sys: 1min 40s, total: 15min 9s
Wall time: 1h 4min 35s


## Averaging word vectors
Now let's use more complex features rather than just counting words.
A great recent achievement of NLP is the word2vec embedding. See Chris Moody's video for a great introduction to word2vec.
First we load a word2vec model. It has been pre-trained by Google on a 100 billon word Google News corpus. You can play with this model using a fun web-app.
Link to the web-app: http://rare-technologies.com/word2vec-tutorial/#app
Vocabulary size: 3 mln words.
Warning: 3 mins to load, takes 4 GB of RAM.

In [27]:
%%time 
wv = KeyedVectors.load_word2vec_format(
    "/home/renaud/Documents/GoogleNews-vectors-negative300.bin.gz",
    binary=True)
wv.init_sims(replace=True)

2017-05-28 11:35:19,429 : INFO : loading projection weights from /home/renaud/Documents/GoogleNews-vectors-negative300.bin.gz
2017-05-28 12:07:25,109 : INFO : loaded (3000000, 300) matrix from /home/renaud/Documents/GoogleNews-vectors-negative300.bin.gz
2017-05-28 12:07:25,462 : INFO : precomputing L2-norms of word weight vectors


CPU times: user 4min 47s, sys: 40.4 s, total: 5min 27s
Wall time: 36min 29s


In [28]:
wv.vector_size

300

In [29]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, review) for review in text_list ])

In [30]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [31]:
test_tokenized = test_data.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
train_tokenized = train_data[:500000].apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values

In [32]:
len(train_tokenized)

500000

In [33]:
%%time
X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

CPU times: user 50.5 s, sys: 6.94 s, total: 57.5 s
Wall time: 3min 54s


In [34]:
X_train_word_average = X_train_word_average[(X_train_word_average != 0).any(axis=1)]
y_train = train_data[:500000].emotions[(X_train_word_average != 0).any(axis=1)]

In [35]:
%%time
w2v_avg_sgd = linear_model.SGDClassifier(n_iter=20, loss='epsilon_insensitive')
w2v_avg_sgd.fit(X_train_word_average, y_train)

CPU times: user 1min 4s, sys: 256 ms, total: 1min 4s
Wall time: 1min 7s


In [36]:
%%time
predicted = w2v_avg_sgd.predict(X_test_word_average)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 360 ms


In [37]:
evaluate_prediction(predicted, test_data.emotions, le)

             precision    recall  f1-score   support

      anger       0.19      0.21      0.20       107
       fear       0.07      0.32      0.12        87
        joy       0.47      0.53      0.49       456
    sadness       0.00      0.00      0.00       165
   surprise       0.00      0.00      0.00       125
      trust       0.00      0.00      0.00       102

avg / total       0.23      0.28      0.25      1042

Accuracy:
0.280230326296


  'precision', 'predicted', average, warn_for)


## Doc2Vec
A paper by Google suggests a model for document classification called Paragraph Vectors Doc2Vec or Doc2vec in short. It is very similar to word2vec.
It introduces 'a tag' - a word that is in every context in the document.
For our first try we tag every plot with its genre. This makes it 'semi-supervised' learning - the genre labels is just one objective among many.

In [38]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [39]:
train_tagged = train_data.apply(
    lambda r: TaggedDocument(words=tweet_tokenizer.tokenize(r['text']), tags=[r.emotions]), axis=1)

In [40]:
test_tagged = test_data.apply(
    lambda r: TaggedDocument(words=tweet_tokenizer.tokenize(r['text']), tags=[r.emotions]), axis=1)

In [None]:
test_tagged.values[50]

TaggedDocument(words=['cant', 'believe', 'the', 'champions', 'league', 'is', 'back', 'next', 'week', 'and', 'the', 'pool', 'are', 'back', 'in', 'it', '😍', '😍', '😍'], tags=['joy'])

In [None]:
%%time
trainsent = train_tagged.values
testsent = test_tagged.values

# simple gensim doc2vec api
doc2vec_model = Doc2Vec(trainsent, workers=1, size=5, iter=20, dm=1)

train_targets, train_regressors = zip(
    *[(doc.tags[0], doc2vec_model.infer_vector(doc.words, steps=20)) for doc in trainsent])

2017-05-28 13:11:49,552 : INFO : collecting all words and their counts
2017-05-28 13:11:49,553 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-05-28 13:11:51,562 : INFO : PROGRESS: at example #10000, processed 150311 words (75982/s), 12909 word types, 6 tags
2017-05-28 13:11:51,885 : INFO : PROGRESS: at example #20000, processed 301643 words (492005/s), 19437 word types, 6 tags
2017-05-28 13:11:55,296 : INFO : PROGRESS: at example #30000, processed 453521 words (44539/s), 24654 word types, 6 tags
2017-05-28 13:11:55,447 : INFO : PROGRESS: at example #40000, processed 604627 words (1268511/s), 29071 word types, 6 tags
2017-05-28 13:11:55,598 : INFO : PROGRESS: at example #50000, processed 756586 words (1346237/s), 33072 word types, 6 tags
2017-05-28 13:11:55,716 : INFO : PROGRESS: at example #60000, processed 908064 words (1338281/s), 36634 word types, 6 tags
2017-05-28 13:11:55,847 : INFO : PROGRESS: at example #70000, processed 1058425 words (12941

Interesting thing about doc2vec is that we need to run gradient descent during prediction to infer the vector for an unseen document. An unseen document is initially assigned a random vector and then this vector fit by gradient descent. Because of this randomness we get different vectors on re-runs of the next cell.

Consequently, the accuracy of logistic regression changes when the test set vectors change.

In [None]:
%%time
test_targets, test_regressors = zip(
    *[(doc.tags[0], doc2vec_model.infer_vector(doc.words, steps=20)) for doc in testsent])

In [None]:
sgd = linear_model.SGDClassifier(n_iter=20)
sgd = sgd.fit(train_regressors, train_targets)
evaluate_prediction(sgd.predict(test_regressors), test_targets, le)

In [None]:
doc2vec_model.docvecs.most_similar('anger')

In [None]:
doc2vec_model.most_similar([doc2vec_model.docvecs['fear']])

## Deep IR

'Deep IR' is a technique developed by “Document Classification by Inversion of Distributed Language Representations”, Matt Taddy. Matt has contributed a gensim tutorial - great source of more in depth information.

In short the algorithm is:

1. Train a word2vec model only on comedy plots.
2. Trian another model only on sci-fi, another on romance etc. Get 6 models - one for each genre.
3. Take a plot and see which model fits it best using Bayes' Theorem

The tokenization is different from other methods. The reason for this is that we are following an original approach in the paper. The purpose of this tutorial is to see how the models behave out of the box.

We just clean non-alphanumeric characters and split by sentences.

In [None]:
import re
contractions = re.compile(r"'|-|\"")
# all non alphanumeric
symbols = re.compile(r'(\W+)', re.U)
# single character removal
singles = re.compile(r'(\s\S\s)', re.I|re.U)
# separators (any whitespace)
seps = re.compile(r'\s+')

# cleaner (order matters)
def clean(text): 
    text = text.lower()
    text = contractions.sub('', text)
    text = symbols.sub(r' \1 ', text)
    text = singles.sub(' ', text)
    text = seps.sub(' ', text)
    return text

# sentence splitter
alteos = re.compile(r'([!\?])')
def sentences(l):
    l = alteos.sub(r' \1 .', l).rstrip("(\.)*\n")
    return l.split(".")

In [None]:
def plots(label):
    my_df = None
    if label=='training':
        my_df = train_data
    else:
        my_df = test_data
    for i, row in my_df.iterrows():
        yield {'y':row['emotions'],\
        'x':[clean(s).split() for s in sentences(row['text'])]}

In [None]:
%%time
# The corpus is small so can be read into memory
revtrain = list(plots("training"))
revtest = list(plots("test"))

In [None]:
# shuffle training set for unbiased word2vec training
np.random.shuffle(revtrain)

In [None]:
my_tags = le.classes_

In [None]:
def tag_sentences(reviews, stars=my_tags):  
    for r in reviews:
        if r['y'] in stars:
            for s in r['x']:
                yield s

In [None]:
next(tag_sentences(revtrain, my_tags[0]))

In [None]:
%%time 
## training
from gensim.models import Word2Vec
import multiprocessing

## create a w2v learner 
basemodel = Word2Vec(
    workers=multiprocessing.cpu_count(), # use your cores
    iter=100, # iter = sweeps of SGD through the data; more is better
    hs=1, negative=0, # we only have scoring for the hierarchical softmax setup
    
    )
print(basemodel)
basemodel.build_vocab(tag_sentences(revtrain)) 
from copy import deepcopy
genremodels = [deepcopy(basemodel) for i in range(len(my_tags))]
for i in range(len(my_tags)):
    slist = list(tag_sentences(revtrain, my_tags[i]))
    print(my_tags[i], "emotion (", len(slist), ")")
    genremodels[i].train(slist, total_examples=len(slist), epochs=100 )
# get the probs (note we give docprob a list of lists of words, plus the models)

Now we will compute most likely class for a plot using Bayes' Theorem formula.

For any new sentence we can obtain its likelihood (lhd; actually, the composite likelihood approximation; see the paper) using the score function in the word2vec class. We get the likelihood for each sentence in the first test review, then convert to a probability over star ratings. Every sentence in the review is evaluated separately and the final star rating of the review is an average vote of all the sentences. This is all in the following handy wrapper. (from the original tutorial by Matt Taddy.

In [None]:
"""
docprob takes two lists
* docs: a list of documents, each of which is a list of sentences
* models: the candidate word2vec models (each potential class)

it returns the array of class probabilities.  Everything is done in-memory.
"""


def docprob(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    return prob

In [None]:
%%time
## predict
probs = docprob( [r['x'] for r in revtest], genremodels )  
predictions = probs.idxmax(axis=1).apply(lambda x: my_tags[x])

In [None]:
tag_index = 0
col_name = "out-of-sample prob positive for " + my_tags[tag_index]
probpos = pd.DataFrame({col_name:probs[[tag_index]].sum(axis=1), 
                        "true genres": [r['y'] for r in revtest]})
probpos.boxplot(col_name,by="true genres", figsize=(12,5))

In [None]:
target = [r['y'] for r in revtest]

In [None]:
evaluate_prediction(predictions, target, le)

# Fasttext

https://github.com/salestock/fastText.py

### Train models

For training the models yourself, you'll need to have both Gensim and FastText set up on your machine.

In [20]:
import fasttext

In [21]:
def mlb_order(pred_prob, mlb_classes):
    res = np.zeros(len(mlb_classes))
    for prob in pred_prob:
        res[np.where(mlb_classes == prob[0])] = prob[1]
    return res.tolist()

In [22]:
target = test_data.emotions.values

In [26]:
train_data['label'] = ['__label__' + str(label) for label in train_data.emotions]
test_data['label'] = ['__label__' + str(label) for label in test_data.emotions]

In [27]:
train_data[['label', 'text']].to_csv('./fasttext/train.txt', index=False, header=None, sep=' ')
test_data[['label', 'text']].to_csv('./fasttext/test.txt', index=False, header=None, sep=' ')

In [63]:
classifier = fasttext.supervised('fasttext/train.txt', 'model', epoch=10, dim=300)

In [64]:
result = classifier.predict(test_data.text.values)

In [65]:
predictions = [res[0] for res in result]

In [66]:
evaluate_prediction(predictions, target, le)

             precision    recall  f1-score   support

      anger       0.41      0.31      0.35       107
       fear       0.18      0.43      0.25        87
        joy       0.72      0.36      0.48       456
    sadness       0.36      0.53      0.43       165
   surprise       0.24      0.17      0.20       125
      trust       0.27      0.50      0.35       102

avg / total       0.48      0.38      0.39      1042

Accuracy:
0.379078694818


In [67]:
fasttext_pred_prob = classifier.predict_proba(test_data.text.values, len(emotions))

In [68]:
fasttext_pred_prob[0]

[('sadness', 0.380859),
 ('trust', 0.333984),
 ('joy', 0.169922),
 ('fear', 0.0742188),
 ('surprise', 0.0214844),
 ('anger', 0.0136719)]

In [69]:
mlb.classes_

array(['anger', 'fear', 'joy', 'sadness', 'surprise', 'trust'], dtype=object)

In [70]:
fasttext_prob_mlb = [mlb_order(pred_prob, mlb.classes_) for pred_prob in fasttext_pred_prob]

In [71]:
average_precision_score(y_bin_emotions, fasttext_prob_mlb)

0.43511258556124094

In [72]:
label_ranking_average_precision_score(y_bin_emotions, fasttext_prob_mlb)

0.64302889741949409

In [73]:
label_ranking_loss(y_bin_emotions, fasttext_prob_mlb)

0.30066645340157822

In [74]:
target_full = [set(t) - {np.nan} for t in tweets[['emo1','emo2','emo3']].values]

In [75]:
pred = [set(pred) for pred in classifier.predict(test_data.text.values, 1)]
sum([len(target_full[i] & pred[i])>0 for i in range(len(pred))]) / len(test_data)

0.491362763915547

In [76]:
pred = [set(pred) for pred in classifier.predict(test_data.text.values, 2)]
sum([len(target_full[i] & pred[i])>0 for i in range(len(pred))]) / len(test_data)

0.7072936660268714

In [77]:
pred = [set(pred) for pred in classifier.predict(test_data.text.values, 3)]
sum([len(target_full[i] & pred[i])>0 for i in range(len(pred))]) / len(test_data)

0.8272552783109405

In [78]:
def prob_validation(pred, target):
    if pred[0] in target:
        return pred[1]
    return 0

In [23]:
pred = [pred[0] for pred in classifier.predict_proba(test_data.text.values, 1)]

NameError: name 'classifier' is not defined

In [80]:
sum([prob_validation(pred[i], target[i]) for i in range(len(pred))]) / sum(list(zip(*pred))[1])

0.41039154375911274

```
~/fastText/fasttext supervised -input train.txt -output output -minn 3 
-maxn 6
~/fastText/fasttext predict-prob output.bin test.txt 6 > res.txt
```

In [24]:
custom_results = pd.read_csv('./fasttext/res.txt', names=['pred_pob'])
custom_fasttext_prob = custom_results.pred_pob.apply(lambda pred: [tuple(p.split(' ')[:2]) for p in pred.split('__label__')[1:]]).tolist()
custom_pred = [pred[0][0] for pred in custom_fasttext_prob]
evaluate_prediction(custom_pred, target, le)
custom_prob_mlb = [mlb_order(pred, mlb.classes_) for pred in custom_fasttext_prob]
print('average_precision_score: %f' % average_precision_score(y_bin_emotions, custom_prob_mlb))
print('label_ranking_average_precision_score: %f' % label_ranking_average_precision_score(y_bin_emotions, custom_prob_mlb))
print('label_ranking_loss: %f' %label_ranking_loss(y_bin_emotions, custom_prob_mlb))

             precision    recall  f1-score   support

      anger       0.42      0.33      0.37       107
       fear       0.16      0.41      0.23        87
        joy       0.74      0.39      0.51       456
    sadness       0.35      0.51      0.42       165
   surprise       0.26      0.19      0.22       125
      trust       0.30      0.46      0.36       102

avg / total       0.50      0.39      0.41      1042

Accuracy:
0.389635316699
average_precision_score: 0.449240
label_ranking_average_precision_score: 0.656872
label_ranking_loss: 0.282600
