# Basic example using the pattern classifier

In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics
from modules.utils import settings
from sklearn import preprocessing
import joblib
from modules.pattern_classifier import  SimpleClassifier, PatternVectorizer

In [2]:
# Read the annotated tweets from a header-less TSV file; the column names are supplied explicitly.
filename = 'data/datasets/annotated_tweets.tsv'
col = ["text", "emo1", "emo2", "emo3"]  # tweet text followed by three emotion-label columns
tweets = pd.read_table(filename, names=col, header=None)

## 1. Load Vectorizer and Classifier Instance

Load a `PatternVectorizer` instance (`emo_pv`) and a `SimpleClassifier` instance (`emo_clf`) from a folder containing, for each class, files of patterns with their corresponding scores.

In [3]:
# Build the vectorizer (emo_pv) and classifier (emo_clf) pair from the pattern files in this folder.
emo_pv, emo_clf = SimpleClassifier.load_from_folder(
    'data/datasets/emotion_patterns/8_emos/'
)

## Optional: Persist Classifier Instance to disk

In [4]:
# Persist the classifier and the vectorizer to the paths configured in settings, then reload them.
joblib.dump(emo_clf, settings.EMO_CLF, compress=True)
joblib.dump(emo_pv, settings.EMO_PV, compress=True)
# NOTE: joblib.load unpickles its input; only load files that come from a trusted source.
new_clf = joblib.load(settings.EMO_CLF)
new_pv = joblib.load(settings.EMO_PV)
# To reuse previously saved models, comment out the two joblib.dump calls above and keep the loads.
# The following cells use the reloaded objects new_clf and new_pv (not emo_clf / emo_pv).

## 2. Vectorize the tweets

[tweet1, tweet2 ...] -> [[O11, O12, ...], [O21, O22, ...] ...]
Oij represents the number of occurrences of the j-th pattern in the i-th tweet.

In [5]:
# Turn every tweet into its vector of pattern counts using the reloaded vectorizer.
documentPatternVectors = new_pv.transform(tweets['text'])
documentPatternVectors[0:5]  # preview the first five rows

array([[3, 2, 0, ..., 0, 0, 0],
       [4, 0, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [2, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0]], dtype=int32)

## 3. Classify tweets

In [6]:
# Single-label prediction: for each tweet, keep the emotion with the smallest score (rank).
Y_GUESS_1 = new_clf.get_min_score_class(documentPatternVectors)
Y_GUESS_1[0:5]  # preview the first five predictions

['anticipation', 'trust', 'trust', 'anticipation', 'anticipation']

In [7]:
# Two-label prediction: for each tweet, keep the two emotions with the smallest rank (ascending order).
Y_GUESS_2 = new_clf.get_top_classes(documentPatternVectors, n=2, ascending=True)
Y_GUESS_2[0:5]  # preview the first five pairs of predictions

[['anticipation', 'trust'],
 ['trust', 'joy'],
 ['trust', 'anticipation'],
 ['anticipation', 'joy'],
 ['anticipation', 'joy']]

## 4. Evaluate your model

In [9]:
# Map emotion names to integer ids; fitting on the classifier's class list fixes the label set.
le = preprocessing.LabelEncoder()
le.fit(new_clf.classes)  # fit() returns the encoder itself, which is what this cell displays

LabelEncoder()

In [10]:
# Encode the gold labels (first annotated emotion, emo1) as integer ids.
Y = le.transform(tweets.emo1.values.tolist())
# Encode the single-guess predictions, re-deriving them from the classifier rather than
# re-encoding Y_GUESS_1 in place: once Y_GUESS_1 holds integer ids, calling
# le.transform(Y_GUESS_1) again raises ValueError (unseen labels), so the in-place form
# could only be executed once per kernel session. This version can be re-run safely.
Y_GUESS_1 = le.transform(new_clf.get_min_score_class(documentPatternVectors))

In [11]:
# Per-class precision / recall / F1 for the single-guess predictions, followed by overall accuracy.
report = sklearn.metrics.classification_report(Y, Y_GUESS_1, target_names=le.classes_)
print(report)
print('Accuracy:', sklearn.metrics.accuracy_score(Y, Y_GUESS_1), sep='\n')

              precision    recall  f1-score   support

       anger       0.24      0.28      0.26       107
anticipation       0.27      0.40      0.32       272
     disgust       0.25      0.09      0.13       183
        fear       0.13      0.26      0.18        87
         joy       0.59      0.24      0.34       456
     sadness       0.44      0.15      0.23       165
    surprise       0.14      0.46      0.22       125
       trust       0.25      0.25      0.25       102

 avg / total       0.36      0.26      0.27      1497

Accuracy:
0.2625250501


In [12]:
# Average Jaccard similarity between the two guesses and the annotated labels of each tweet
def _jaccard(gold, guess):
    """Return the size of the intersection of two sets divided by the size of their union."""
    return len(gold & guess) / len(gold | guess)

tweets = tweets.fillna('None')  # missing labels become the placeholder string 'None'
# Collect each tweet's annotated emotions into a set, dropping the 'None' placeholder.
tweets['emotions'] = tweets.apply(lambda t: {t.emo1, t.emo2, t.emo3} - {'None'}, axis=1)
nb_tweets = len(tweets.emotions.values)
sum(_jaccard(tweets.emotions[i], set(Y_GUESS_2[i])) for i in range(nb_tweets)) / nb_tweets

0.2592963705188151

In [13]:
# Two-guess accuracy: a tweet counts as correct when its first label (emo1) is one of the two guesses
sum(1 for i in range(nb_tweets) if tweets.emo1[i] in set(Y_GUESS_2[i])) / nb_tweets

0.42752171008684037

In [14]:
# Relaxed accuracy: a tweet counts as correct when at least one of the two guesses is among its labels
sum(1 for i in range(nb_tweets) if not tweets.emotions[i].isdisjoint(Y_GUESS_2[i])) / nb_tweets

0.5784903139612558