# Basic Example using patterns classifier

In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics

from sklearn import preprocessing

#import PatternVectorizer and SimpleClassifier
from pattern_classifier import  SimpleClassifier, PatternVectorizer

In [2]:
# Read the annotated tweets from a headerless, tab-separated file whose
# columns are the tweet text followed by three emotion-label columns.
filename = 'data/annotated/annotated_tweets.tsv'
col = ["text", "emo1", "emo2", "emo3"]
tweets = pd.read_table(
    filename,
    names=col,
    header=None,
)

## 1. Load Vectorizer and Classifier Instance

Load a PatternVectorizer instance `pv` and a SimpleClassifier instance `cls` from a folder containing files for each class with the patterns and their corresponding scores.

In [None]:
# Build the vectorizer (pv) and the classifier (cls) from the per-class
# pattern/score files stored in this folder.
pv, cls = SimpleClassifier.load_from_folder('data/patterns/8_emos/')

## 2. Vectorize the tweets

[tweet1, tweet2, ...] -> [[O11, O12, ...], [O21, O22, ...], ...]
Oij represents the number of occurrences of the jth pattern in the ith tweet.

In [None]:
# Convert the text of every tweet into its pattern vector, then preview
# the first five vectors.
documentPatternVectors = pv.transform(tweets['text'])
documentPatternVectors[:5]

## 3. Classify tweets

In [None]:
# Single guess per tweet: the class returned by get_min_score_class
# (presumably the class with the smallest score/rank -- confirm in pattern_classifier).
Y_GUESS_1 = cls.get_min_score_class(documentPatternVectors)
Y_GUESS_1[:5]  # preview the first five guesses

In [None]:
# Two guesses per tweet: the top n=2 classes taken in ascending order
# (presumably the two classes with the smallest rank -- confirm in pattern_classifier).
Y_GUESS_2 = cls.get_top_classes(documentPatternVectors, ascending=True, n=2)
Y_GUESS_2[:5]  # preview the guesses for the first five tweets

## 4. Evaluate your model

In [None]:
# Map class names to integer ids; fitting on cls.classes fixes the set of
# labels the encoder is able to transform.
le = preprocessing.LabelEncoder()
le.fit(cls.classes)

In [None]:
# Reference labels: the first annotated emotion (emo1) of every tweet,
# encoded as integer ids.
gold_labels = tweets['emo1'].values.tolist()
Y = le.transform(gold_labels)

In [None]:
# Encode the single-guess predictions with the same encoder as the reference labels.
# NOTE: this rebinds Y_GUESS_1 to the encoded values, so re-running this cell
# without first re-running the classification cell feeds already-encoded ids
# back into le.transform.
Y_GUESS_1 = le.transform(Y_GUESS_1)

In [None]:
# Per-class report for the single-guess predictions, followed by the
# overall accuracy.
report = sklearn.metrics.classification_report(Y, Y_GUESS_1, target_names=le.classes_)
print(report)
print('Accuracy:')
print(sklearn.metrics.accuracy_score(Y, Y_GUESS_1))

In [None]:
# Guess-2 accuracy: fraction of tweets whose first annotated label (emo1)
# is one of the two guessed classes.
# The tweet count comes from the DataFrame itself: the frame has no
# `emotion` column (only text/emo1/emo2/emo3), so `tweets.emotion` raised
# AttributeError.
nb_tweets = len(tweets)
sum(label in set(Y_GUESS_2[i]) for i, label in enumerate(tweets.emo1)) / nb_tweets

In [None]:
# Average Jaccard similarity between the two guesses and the set of
# annotated labels of each tweet.
tweets = tweets.fillna('None')  # missing values become the placeholder string 'None'
tweets['emotions'] = tweets.apply(
    lambda row: {row.emo1, row.emo2, row.emo3} - {'None'},  # drop the placeholder
    axis=1,
)


def _jaccard(i):
    # |labels & guesses| / |labels | guesses| for the i-th tweet
    annotated = tweets.emotions[i]
    guessed = set(Y_GUESS_2[i])
    return len(annotated & guessed) / len(annotated | guessed)


sum(_jaccard(i) for i in range(nb_tweets)) / nb_tweets

In [None]:
# Lenient accuracy: a tweet counts as correct when at least one of the two
# guesses appears among its annotated labels.
sum(1 for i in range(nb_tweets) if tweets.emotions[i] & set(Y_GUESS_2[i])) / nb_tweets