# Data Preprocessing

### Remove empty rows

In [None]:
import pandas as pd
# Load the training data from `train.csv` (path is relative to the working directory).
df = pd.read_csv('train.csv')

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Time,data,label
0,0,[00:12.55,Another day wasted out of time,Angry
1,1,[00:14.69,I can't get out of this,Angry
2,2,[00:16.03,Altered state of mind,Angry
3,3,[00:17.41,I'm going overboard,Angry
4,4,[00:18.75,My conscience meets decline,Angry


In [None]:
# Number of rows in the frame as loaded, before rows with missing values are dropped.
len(df)

20115

In [None]:
# Drop every row that contains at least one missing value.
# `thresh` is deliberately not passed: recent pandas releases treat `how` and
# `thresh` as mutually exclusive and raise a TypeError when both are supplied,
# even if `thresh` is None. `subset=None` is the default and is omitted too.
# Rebinding `df` (instead of `inplace=True`) gives later cells the same cleaned
# frame without mutating the object returned by `read_csv`.
df = df.dropna(axis=0, how='any')

In [None]:
# Number of rows left after the rows with missing values were dropped.
len(df)

17674

### Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder
import random
import pickle
import numpy as np

# Feature texts and their string labels, pulled out of the frame as NumPy arrays.
X_train = df['data'].values
y_train = df['label'].values

print('before: %s ...' %y_train[:5])

# Learn the label -> integer mapping and apply it in a single call;
# `le` stays available for mapping the integers back to label names.
le = LabelEncoder()
y_train = le.fit_transform(y_train)

print('after: %s ...' %y_train[:5])

before: ['Angry' 'Angry' 'Angry' 'Angry' 'Angry'] ...
after: [0 0 0 0 0] ...


### Porter Stemmer

In [None]:
import nltk
import string
import re

porter_stemmer = nltk.stem.porter.PorterStemmer()

def porter_tokenizer(text, stemmer=porter_stemmer):
    """
    Split a sentence into lower-cased word tokens and stem each of them.

    The text is lower-cased, split with `nltk.wordpunct_tokenize`, and every
    token is passed through `stemmer.stem`. Stems that do not consist solely
    of ASCII letters (punctuation, digits, mixed tokens) are discarded.
    Single-letter stems are kept (e.g. the 't' of "don't").

    Parameters
    ----------

    text : `str`.
      A sentence that is to be split into words.

    stemmer : object exposing a `stem(token)` method, optional.
      Stemmer applied to every token. Defaults to the Porter stemmer
      instantiated next to this function.

    Returns
    ----------

    no_punct : `list` of `str`.
      The stemmed tokens, in order, with non-alphabetic tokens removed.

    """
    lower_txt = text.lower()
    tokens = nltk.wordpunct_tokenize(lower_txt)
    # Use the stemmer that was passed in; the previous body always used the
    # global `porter_stemmer`, silently ignoring the `stemmer` argument.
    stems = [stemmer.stem(t) for t in tokens]
    # Keep only stems made up entirely of ASCII letters.
    no_punct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return no_punct

In [None]:
# Sanity check of the tokenizer on a sentence with a contraction,
# punctuation-only tokens and an inflected verb.
porter_tokenizer("Don't !!! --- want swimming. ")

['don', 't', 'want', 'swim']

### Stop words

In [None]:
# Read the stop-word list from `stopwords.txt`: one entry per line,
# line terminators stripped by `splitlines()`.
with open('stopwords.txt', 'r') as infile:
    stop_words = infile.read().splitlines()
# Show the first few entries as a quick check of what was loaded.
print('stop words %s ...' %stop_words[:5])

stop words ["a's", 'able', 'about', 'above', 'according'] ...


### Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: raw term counts (`binary=False`) over unigrams,
# tokenized and stemmed by `porter_tokenizer`, with the entries of
# `stop_words` filtered out. Undecodable bytes are replaced and accents are
# stripped before tokenization.
vec = CountVectorizer(
            encoding='utf-8',
            decode_error='replace',
            strip_accents='unicode',
            analyzer='word',
            binary=False,
            stop_words=stop_words,
            tokenizer=porter_tokenizer,
            ngram_range=(1,1)
    )

In [None]:
# Toy corpus used only to check what the vectorizer keeps.
vocab = ["123 1 The\n swimmer likes swimming so he swims. Don't didn`t"]

vec = vec.fit(vocab)

sentence1 = vec.transform([u'The swimmer likes swimming.'])
sentence2 = vec.transform(['The\nswimmer \nswims.'])

# `get_feature_names()` was removed in scikit-learn 1.2. The fitted
# `vocabulary_` mapping (term -> column index) exists in every release, so
# sorting its terms by index yields the same list on old and new versions.
feature_names = sorted(vec.vocabulary_, key=vec.vocabulary_.get)

print('TEST:')
print('Vocabulary: %s' %feature_names)
print('Sentence 1: %s' %sentence1.toarray())
print('Sentence 2: %s' %sentence2.toarray())

TEST:
Vocabulary: ['didn', 'don', 'swim', 'swimmer', 't']
Sentence 1: [[0 0 1 1 0]]
Sentence 2: [[0 0 1 1 0]]


  "The parameter 'token_pattern' will not be used"
  % sorted(inconsistent)


In [None]:
# Fit the vectorizer on the full training text column.
X_train = df['data'].values
vec = vec.fit(X_train.ravel())
# `get_feature_names()` was removed in scikit-learn 1.2; the size of the
# fitted `vocabulary_` mapping is the same number and works on every release.
print('Vocabulary size: %s' %len(vec.vocabulary_))

Vocabulary size: 4333


In [None]:
# Same configuration as the unigram vectorizer, but counting bigrams only.
# Note that this rebinds `vec`.
vec = CountVectorizer(
            encoding='utf-8',
            decode_error='replace',
            strip_accents='unicode',
            analyzer='word',
            binary=False,
            stop_words=stop_words,
            tokenizer=porter_tokenizer,
            ngram_range=(2,2)
    )
# N-grams = 2
vocab = ["123 1 The\n swimmer likes swimming so he swims. Don't didn`t"]

vec = vec.fit(vocab)

sentence1 = vec.transform([u'The swimmer likes swimming.'])
sentence2 = vec.transform(['The\nswimmer \nswims.'])

# `get_feature_names()` was removed in scikit-learn 1.2. The fitted
# `vocabulary_` mapping (n-gram -> column index) exists in every release, so
# sorting its keys by index yields the same list on old and new versions.
feature_names = sorted(vec.vocabulary_, key=vec.vocabulary_.get)

print('TEST:')
print('Vocabulary: %s' %feature_names)
print('Sentence 1: %s' %sentence1.toarray())
print('Sentence 2: %s' %sentence2.toarray())

TEST:
Vocabulary: ['didn t', 'don t', 'swim don', 'swim swim', 'swimmer swim', 't didn']
Sentence 1: [[0 0 0 0 1 0]]
Sentence 2: [[0 0 0 0 1 0]]


### Tfidf Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with the same text handling as the count vectorizer:
# `porter_tokenizer` for tokenizing/stemming and `stop_words` for filtering.
# `ngram_range` is not passed, so the library default applies.
tfidf = TfidfVectorizer(
            encoding='utf-8',
            decode_error='replace',
            strip_accents='unicode',
            analyzer='word',
            binary=False,
            stop_words=stop_words,
            tokenizer=porter_tokenizer
    )

In [None]:
# Toy corpus used only to check what the TF-IDF vectorizer keeps.
vocab = ["123 1 The\n swimmer likes swimming so he swims. Don't didn`t"]

tfidf = tfidf.fit(vocab)

sentence1 = tfidf.transform([u'The swimmer likes swimming.'])
sentence2 = tfidf.transform(['The\nswimmer \nswims.'])

# `get_feature_names()` was removed in scikit-learn 1.2. The fitted
# `vocabulary_` mapping (term -> column index) exists in every release, so
# sorting its terms by index yields the same list on old and new versions.
feature_names = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)

print('TEST:')
print('Vocabulary: %s' %feature_names)
print('Sentence 1: %s' %sentence1.toarray())
print('Sentence 2: %s' %sentence2.toarray())

TEST:
Vocabulary: ['didn', 'don', 'swim', 'swimmer', 't']
Sentence 1: [[0.         0.         0.70710678 0.70710678 0.        ]]
Sentence 2: [[0.         0.         0.70710678 0.70710678 0.        ]]


In [None]:
# Refit the TF-IDF vectorizer on the training texts.
tfidf = tfidf.fit(X_train.ravel())

# `get_feature_names()` was removed in scikit-learn 1.2; the size of the
# fitted `vocabulary_` mapping is the same number and works on every release.
print('Vocabulary size: %s' %len(tfidf.vocabulary_))

Vocabulary size: 4333


In [None]:
# Print the learned terms in column order. `get_feature_names()` was removed in
# scikit-learn 1.2; sorting the fitted `vocabulary_` (term -> column index) by
# index gives the same list on every release.
print(sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get))

['a', 'aaa', 'aaaaaaah', 'aaaaahhhh', 'aaaaahhhhh', 'aaahhh', 'abandon', 'abl', 'abort', 'abound', 'abouta', 'abov', 'absent', 'absolut', 'absorb', 'abstract', 'absurd', 'abus', 'accent', 'accept', 'accus', 'ach', 'achiev', 'achin', 'acid', 'ackowledg', 'acquir', 'act', 'actin', 'action', 'actual', 'ad', 'add', 'addict', 'address', 'adelita', 'adio', 'admir', 'admit', 'admittedli', 'adolesc', 'ador', 'adrenalin', 'advanc', 'advantag', 'adventur', 'advic', 'aero', 'aeroplan', 'aerorplan', 'affect', 'afford', 'afraid', 'afterglow', 'afternoon', 'age', 'agenda', 'aggrav', 'aggress', 'ago', 'agoni', 'agre', 'ah', 'ahahahahah', 'ahead', 'ahh', 'ahhh', 'ahhhh', 'aid', 'aight', 'aim', 'ain', 'aint', 'air', 'airplan', 'ak', 'al', 'alaina', 'alanti', 'alarm', 'album', 'alcohol', 'alien', 'alison', 'aliv', 'alla', 'alley', 'alli', 'allright', 'alon', 'alongsid', 'alreadi', 'alright', 'alrightwith', 'alter', 'alway', 'amaz', 'amazingb', 'ambul', 'amen', 'america', 'american', 'amidst', 'ammunit',

# Model

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Scorer handed to the grid searches: macro-averaged F1, i.e. F1 is computed
# per class and the unweighted mean is taken. No `pos_label` is passed, so no
# single class is treated as the positive one.

f1_scorer = metrics.make_scorer(metrics.f1_score, greater_is_better=True, average = 'macro')

In [None]:
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Grid search 1: binary bag-of-words features fed to a Bernoulli naive Bayes model.
pipeline_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', BernoulliNB())
])

# Vectorizer settings to try: with/without the stop-word list, with/without the
# stemming tokenizer, and uni-/bi-/tri-grams (1 x 2 x 2 x 3 = 12 candidates).
parameters_1 = dict(
    vect__binary=[True],
    vect__stop_words=[stop_words, None],
    vect__tokenizer=[porter_tokenizer, None],
    vect__ngram_range=[(1,1), (2,2), (3,3)],
)

# 10-fold cross-validation, scored with the macro-F1 scorer, run in a single process.
grid_search_1 = GridSearchCV(pipeline_1, 
                           parameters_1, 
                           n_jobs=1, 
                           verbose=1,
                           scoring=f1_scorer,
                           cv=10
                )


print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_1.steps])
print("parameters:")
pprint(parameters_1, depth=2)
grid_search_1.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search_1.best_score_)
print("Best parameters set:")
# get_params() returns every parameter of the best pipeline; only the ones
# that were part of the search grid are printed.
best_parameters_1 = grid_search_1.best_estimator_.get_params()
for param_name in sorted(parameters_1.keys()):
    print("\t%s: %r" % (param_name, best_parameters_1[param_name]))

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
{'vect__binary': [True],
 'vect__ngram_range': [(...), (...), (...)],
 'vect__stop_words': [[...], None],
 'vect__tokenizer': [<function porter_tokenizer at 0x7f446d347710>, None]}
Fitting 10 folds for each of 12 candidates, totalling 120 fits


  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)


Best score: 0.339
Best parameters set:
	vect__binary: True
	vect__ngram_range: (1, 1)
	vect__stop_words: None
	vect__tokenizer: <function porter_tokenizer at 0x7f446d347710>


In [None]:
# Grid search 3: term-count features fed to a multinomial naive Bayes model.
pipeline_3 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])

# Vectorizer settings to try: with/without the stop-word list, with/without the
# stemming tokenizer, and uni-/bi-/tri-grams (1 x 2 x 2 x 3 = 12 candidates).
parameters_3 = dict(
    vect__binary=[False],
    vect__stop_words=[stop_words, None],
    vect__tokenizer=[porter_tokenizer, None],
    vect__ngram_range=[(1,1), (2,2), (3,3)],
)

# 10-fold cross-validation, scored with the macro-F1 scorer, run in a single process.
grid_search_3 = GridSearchCV(pipeline_3, 
                           parameters_3, 
                           n_jobs=1, 
                           verbose=1,
                           scoring=f1_scorer,
                           cv=10
                )


print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_3.steps])
print("parameters:")
pprint(parameters_3, depth=2)
grid_search_3.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search_3.best_score_)
print("Best parameters set:")
# get_params() returns every parameter of the best pipeline; only the ones
# that were part of the search grid are printed.
best_parameters_3 = grid_search_3.best_estimator_.get_params()
for param_name in sorted(parameters_3.keys()):
    print("\t%s: %r" % (param_name, best_parameters_3[param_name]))

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
{'vect__binary': [False],
 'vect__ngram_range': [(...), (...), (...)],
 'vect__stop_words': [[...], None],
 'vect__tokenizer': [<function porter_tokenizer at 0x7f446d347710>, None]}
Fitting 10 folds for each of 12 candidates, totalling 120 fits


  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)


Best score: 0.341
Best parameters set:
	vect__binary: False
	vect__ngram_range: (1, 1)
	vect__stop_words: None
	vect__tokenizer: None


In [None]:
from sklearn.model_selection import GridSearchCV

# Grid search 4: TF-IDF features fed to a multinomial naive Bayes model.
pipeline_4 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Vectorizer settings to try: with/without the stop-word list, with/without the
# stemming tokenizer, and uni-/bi-/tri-grams (1 x 2 x 2 x 3 = 12 candidates).
parameters_4 = dict(
    vect__binary=[False],
    vect__stop_words=[stop_words, None],
    vect__tokenizer=[porter_tokenizer, None],
    vect__ngram_range=[(1,1), (2,2), (3,3)],
)

# 10-fold cross-validation, scored with the macro-F1 scorer, run in a single process.
grid_search_4 = GridSearchCV(pipeline_4, 
                           parameters_4, 
                           n_jobs=1, 
                           verbose=1,
                           scoring=f1_scorer,
                           cv=10
                )


print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_4.steps])
print("parameters:")
pprint(parameters_4, depth=2)
grid_search_4.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search_4.best_score_)
print("Best parameters set:")
# get_params() returns every parameter of the best pipeline; only the ones
# that were part of the search grid are printed.
best_parameters_4 = grid_search_4.best_estimator_.get_params()
for param_name in sorted(parameters_4.keys()):
    print("\t%s: %r" % (param_name, best_parameters_4[param_name]))

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
{'vect__binary': [False],
 'vect__ngram_range': [(...), (...), (...)],
 'vect__stop_words': [[...], None],
 'vect__tokenizer': [<function porter_tokenizer at 0x7f446d347710>, None]}
Fitting 10 folds for each of 12 candidates, totalling 120 fits


  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)
  % sorted(inconsistent)


Best score: 0.334
Best parameters set:
	vect__binary: False
	vect__ngram_range: (1, 1)
	vect__stop_words: None
	vect__tokenizer: None


In [None]:
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import StratifiedKFold
from scipy import interp

sns.set()
sns.set_style("whitegrid")

clf_1 = Pipeline([
                  ('vect', CountVectorizer(
                                           binary=True,
                                           stop_words=stop_words,
                                           tokenizer=porter_tokenizer,
                                           ngram_range=(1,1),
                                           )
                 ),
                 ('clf', BernoulliNB()),
                 ])

clf_2 = Pipeline([
                  ('vect', CountVectorizer(
                                           binary=False,
                                           stop_words=stop_words,
                                           tokenizer=porter_tokenizer,
                                           ngram_range=(1,1),
                                           )
                 ),
                 ('clf', MultinomialNB()),
                 ])

clf_3 = Pipeline([
                  ('vect', TfidfVectorizer(
                                           binary=False,
                                           stop_words=stop_words,
                                           tokenizer=porter_tokenizer,
                                           ngram_range=(1,1),
                                           )
                 ),
                 ('clf', MultinomialNB()),
                 ])

colors = ['#1947D1', '#CC3300', 'k']
linestyles = ['-', '--', '-.']
classifiers = [clf_1, clf_2, clf_3]
labels = ['1: MV Bernoulli NB, stop words, porter stemmer, \nuni-gram, df', 
          '2: Multinomial NB, stop words, porter stemmer, \nuni-gram, tf',
          '3: Multinomial NB, stop words, porter stemmer, \nuni-gram, tf-idf',
          ]

for clf,col,ls,lab in zip(classifiers, colors, linestyles, labels):
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cv = StratifiedKFold(n_splits = 10)

    for i, (train, test) in enumerate(cv.get_n_splits(y_train)):
        probas_ = clf.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y_train[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, 
             mean_tpr, 
             color=col, 
             linestyle=ls,
             label='%s (ROC AUC = %0.2f)' % (lab, mean_auc), 
             lw=2
    )

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')    
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.savefig('./images/roc_gridsearch_1.eps', dpi=300)
plt.show()

TypeError: ignored