# Basic Example using patterns classifier

In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics

from sklearn import preprocessing
from sklearn.externals import joblib

from scipy.sparse import csr_matrix

#import PatternVectorizer and SimpleClassifier
from pattern_classifier import  SimpleClassifier, PatternVectorizer

In [2]:
# Read the annotated tweets: a headerless, tab-separated file holding the
# tweet text followed by three emotion-label columns.
col = ["text", "emo1", "emo2", "emo3"]
filename = 'test/annotated/annotated_tweets.tsv'
tweets = pd.read_csv(filename, sep='\t', header=None, names=col)

## 1. Load Vectorizer and Classifier Instance

Load a `PatternVectorizer` instance (`pv`) and a `SimpleClassifier` instance (`cls`) from a folder containing the files for each class, which list the patterns and their corresponding scores.

In [3]:
# Build the vectorizer/classifier pair from the pattern files stored in this folder
# (the folder name suggests an 8-emotion pattern set — confirm against the data).
pv, cls = SimpleClassifier.load_from_folder('test/patterns/8_emos/')

## Optional: Persist Classifier Instance to disk

In [4]:
# Destination files for the serialized classifier and vectorizer
cls_persistence = 'test/simple_classifier_model.pkl.compressed'
pv_persistence = 'test/pattern_vectorizer.pkl.compressed'

# Write both objects to disk (compressed), then read them back into new variables
joblib.dump(cls, cls_persistence, compress=True)
joblib.dump(pv, pv_persistence, compress=True)
# NOTE: joblib.load unpickles its input, so only load files you created or trust
new_cls = joblib.load(cls_persistence)
new_pv = joblib.load(pv_persistence)
# To reuse the persisted models in a later session, comment out the two dump calls above
# and use new_cls / new_pv wherever the rest of the notebook refers to cls / pv.

## 2. Vectorize the tweets

[tweet1, tweet2 ...] -> [[O11, O12, ...], [O21, O22, ...] ...]
Oij represents the number of occurrences of the jth pattern in the ith tweet.

In [5]:
%%time
# Vectorize every tweet text in a single sequential call; %%time reports how long it takes
documentPatternVectors = pv.transform(tweets.text)

CPU times: user 6min 6s, sys: 1.66 s, total: 6min 8s
Wall time: 6min 16s


The transformation can also be performed in parallel.

In [6]:
%%time
# Same vectorization through the parallel entry point.
# NOTE(review): n_jobs=-1 presumably means "use all available cores" and n_chunk=400 the number
# of chunks the tweets are split into — confirm against PatternVectorizer.transform_parrallel.
documentPatternVectors_parrallel = pv.transform_parrallel(tweets.text, n_jobs=-1, n_chunk=400)

CPU times: user 1.92 s, sys: 392 ms, total: 2.31 s
Wall time: 3min 12s


In [7]:
# Sanity check: the parallel and sequential results are equal when their
# difference has no stored (non-zero) entries
(documentPatternVectors_parrallel - documentPatternVectors).nnz == 0

True

The occurrence matrix can be serialized.

In [8]:
def save_sparse_csr(filename, array):
    """Store the CSR components of ``array`` in a NumPy ``.npz`` archive.

    The archive holds four entries: ``data``, ``indices``, ``indptr`` and
    ``shape``, which is everything needed to rebuild the matrix.

    Parameters
    ----------
    filename : str
        Destination path; ``np.savez`` appends ``.npz`` when it is missing.
    array : scipy.sparse.csr_matrix
        Matrix to serialize.
    """
    csr_components = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    # the .npz extension is added by np.savez itself
    np.savez(filename, **csr_components)

def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an ``.npz`` archive written by ``save_sparse_csr``.

    Parameters
    ----------
    filename : str
        Path of the archive, with or without the trailing ``.npz``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix built from the archive's ``data``, ``indices``, ``indptr`` and
        ``shape`` entries.

    Raises
    ------
    IOError / FileNotFoundError
        If the archive does not exist.
    KeyError
        If one of the four expected entries is missing from the archive.
    """
    # np.savez appends '.npz' only when the name lacks it, so apply the same
    # rule here; otherwise a name saved as 'x.npz' would be looked up as 'x.npz.npz'.
    path = filename if filename.endswith('.npz') else filename + '.npz'
    # NpzFile keeps the underlying zip file open until closed; the arrays are
    # fully read on access, so they remain valid after the with-block exits.
    with np.load(path) as loader:
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=tuple(loader['shape']))

In [9]:
# Round-trip the occurrence matrix through disk (np.savez writes 'test/occurence_matrix.npz')
save_sparse_csr('test/occurence_matrix', documentPatternVectors)
documentPatternVectors_loaded = load_sparse_csr('test/occurence_matrix')

# check that the reloaded matrix equals the original: their difference has no stored entries
(documentPatternVectors_loaded - documentPatternVectors).nnz == 0

True

## 3. Classify tweets

In [10]:
# Predict a single emotion per tweet: the class with the minimum score
# (presumably the smallest rank is the best match — confirm in SimpleClassifier)
Y_GUESS_1 = cls.get_min_score_class(documentPatternVectors)
# preview the first five predictions
Y_GUESS_1[:5]

['anticipation', 'trust', 'trust', 'anticipation', 'anticipation']

In [11]:
# Predict two emotions per tweet: the n=2 top classes in ascending order
# (presumably the two with the smallest rank — confirm in SimpleClassifier)
Y_GUESS_2 = cls.get_top_classes(documentPatternVectors, ascending=True, n=2)
# preview the first five predictions
Y_GUESS_2[:5]

[['anticipation', 'trust'],
 ['trust', 'joy'],
 ['trust', 'anticipation'],
 ['anticipation', 'joy'],
 ['anticipation', 'joy']]

## 4. Evaluate your model

### Classification evaluation

In [12]:
# Score the single-emotion guess against each tweet's first annotated label (emo1)
primary_labels = tweets.emo1.values
print(sklearn.metrics.classification_report(primary_labels, Y_GUESS_1))
print('Accuracy:')
print(sklearn.metrics.accuracy_score(primary_labels, Y_GUESS_1))

              precision    recall  f1-score   support

       anger       0.24      0.28      0.26       107
anticipation       0.27      0.40      0.32       272
     disgust       0.25      0.09      0.13       183
        fear       0.13      0.26      0.18        87
         joy       0.59      0.24      0.34       456
     sadness       0.44      0.15      0.23       165
    surprise       0.14      0.46      0.22       125
       trust       0.25      0.25      0.25       102

 avg / total       0.36      0.26      0.27      1497

Accuracy:
0.2625250501


In [13]:
# Average Jaccard similarity between the two guessed emotions and the annotated label set
def _jaccard(labels, guesses):
    """Return the size of the intersection divided by the size of the union of two sets."""
    return len(labels & guesses) / len(labels | guesses)

tweets = tweets.fillna('None')
# gather each tweet's annotated emotions into a set, dropping the 'None' placeholder
tweets['emotions'] = tweets.apply(lambda row: {row.emo1, row.emo2, row.emo3} - {'None'}, axis=1)
nb_tweets = len(tweets.emotions.values)
jaccard_per_tweet = [_jaccard(tweets.emotions[i], set(Y_GUESS_2[i])) for i in range(nb_tweets)]
sum(jaccard_per_tweet) / nb_tweets

0.2592963705188151

In [14]:
# Two-guess accuracy: share of tweets whose first label (emo1) is one of the two guesses
primary_hits = [tweets.emo1[i] in set(Y_GUESS_2[i]) for i in range(nb_tweets)]
sum(primary_hits) / nb_tweets

0.42752171008684037

In [15]:
# Accuracy counting a tweet as correct when the two guesses and its label set
# share at least one emotion
overlap_hits = [len(tweets.emotions[i] & set(Y_GUESS_2[i])) > 0 for i in range(nb_tweets)]
sum(overlap_hits) / nb_tweets

0.5784903139612558

### Ranking evaluation

In [16]:
# For every tweet keep only the annotated emotions that are classifier classes
known_classes = set(cls.classes)
label_rows = tweets[['emo1', 'emo2', 'emo3']].values
tweets_emotions = [set(row) & known_classes for row in label_rows]
# Turn the label sets into a binary multi-label indicator matrix
mlb = preprocessing.MultiLabelBinarizer()
y_bin_emotions = mlb.fit_transform(tweets_emotions)

In [17]:
# Per-class scores for every tweet, used below as ranking scores
# (the method name suggests probabilities, one column per entry of cls.classes — confirm in SimpleClassifier)
y_guess_emotion_prob = cls.get_emotion_prob(documentPatternVectors)

In [18]:
# For each binarizer class, find its position in the classifier's class list,
# so the classifier predictions can be put in the binarizer's order
classifier_classes = np.array(cls.classes)
cls_to_mlb = [np.where(classifier_classes == emo)[0][0] for emo in mlb.classes_.tolist()]

In [19]:
# Reorder the prediction columns with the cls_to_mlb index list so they follow mlb.classes_ order
y_guess_emotion_prob_mlb = y_guess_emotion_prob[:,cls_to_mlb]

In [20]:
# Ranking metrics comparing the binarized labels with the reordered per-class scores;
# each entry pairs the output template with the metric function to evaluate
ranking_reports = [
    ('coverage_error: %f', sklearn.metrics.coverage_error),
    ('average_precision_score: %f', sklearn.metrics.average_precision_score),
    ('label_ranking_average_precision_score: %f', sklearn.metrics.label_ranking_average_precision_score),
    ('label_ranking_loss: %f', sklearn.metrics.label_ranking_loss),
]
for template, metric in ranking_reports:
    print(template % metric(y_bin_emotions, y_guess_emotion_prob_mlb))

coverage_error: 4.925852
average_precision_score: 0.322029
label_ranking_average_precision_score: 0.529072
label_ranking_loss: 0.389337
