## Retrieve parsed collection

In [97]:
import pandas as pd
import joblib
from modules.db import mongo_base
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

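# Original MongoDB load, kept commented out for provenance. The notebook now
# reads the DataFrame from the joblib persistence file loaded further down.
# (Presumably "twitter" is the database and "crowdflower_features" the collection.)
# connection_params = ["twitter", "crowdflower_features"]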
# client = mongo_base.connect()
# db_name = connection_params[0]
# connection_params.insert(0, client)

# query = {}
# query["filter"] = {}
# query["projection"] = {}
# query["limit"] = 0
# query["skip"] = 0
# query["no_cursor_timeout"] = True
# cursor = mongo_base.finder(connection_params, query, False)
# df = pd.DataFrame(list(cursor))
# df.head()

In [2]:
# Path of the joblib-compressed pickle that holds the feature DataFrame.
crowdflower_persistence = 'data/persistence/crowdflower_features.pkl.compressed'
# Export that writes `df` to the path above; left disabled, presumably so that
# re-running the notebook does not overwrite the persisted file.
# joblib.dump(df, crowdflower_persistence, compress=True)

In [5]:
# Load the persisted DataFrame from disk. joblib.load unpickles the file
# (which can execute arbitrary code), so only load files from a trusted source.
df = joblib.load(crowdflower_persistence)

In [6]:
# Peek at the first row to see which feature columns are available.
df.head(1)

Unnamed: 0,_id,annotation_label,avg_token_length,bigrams,brown_cluster_ids,char_pentagrams,char_quadgrams,char_trigrams,comment_length,dep_bigrams,...,hashtags,hs_keyword_count,hs_keyword_matches,noun_chunks,text,tokens,trigrams,unknown_words,unknown_words_count,uppercase_token_count
0,58c659be6541913eb7f119dd,not_offensive,4.0,"[Warning penny, penny boards, boards will, wil...","[966, 228, 989, 333, 442, 4618, 602, 19]","[penny, warni, arnin, aggot, board, rning, fag...","[aggo, warn, arni, penn, oard, ards, enny, nin...","[pen, war, nin, mak, enn, you, agg, ggo, fag, ...",9,[warning ROOT NN warning | boards compound NN ...,...,[],1,[faggot],"[{'root': 'warning', 'text': 'warning'}, {'roo...",Warning : penny boards will make you a faggot,"[penny, faggot, warning, boards]","[Warning penny boards, penny boards will, boar...",[],0,0


## Setup data split

In [71]:
# Draw the same random half of the rows on every run (fixed seed) to work with.
train_test_set = df.sample(n=len(df) // 2, random_state=1965)

# Raw tweet text is the model input; the annotation label is the target.
X, y = train_test_set['text'], train_test_set['annotation_label']

In [28]:
# Report the size of the sample and how many distinct labels it contains.
n_documents = len(X)
n_categories = len(y.value_counts())

print("Predicting the labels of the test set...")
print("%d documents" % n_documents)
print("%d categories" % n_categories)

Predicting the labels of the test set...
1450 documents
2 categories


### Setup naive baseline classification (CountVectorizer)

In [98]:
# Our two ingredients: the character n-gram counter (3 to 5 characters) and the classifier.
vect = CountVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()

# There are just two steps to our process: extracting the ngrams and
# putting them through the classifier. So our Pipeline looks like this:

count_pipeline = Pipeline([
    ('vect', vect),  # extract ngrams from tweet text
    ('clf' , clf),   # feed the output through a classifier
])

def naive_experiment(X, y, pipeline, num_expts=1, random_state=None):
    """Train and evaluate `pipeline` on random train/test splits of (X, y).

    For each experiment the data is split with `train_test_split`, the
    pipeline is fitted on the training part, and the classification report,
    confusion matrix and accuracy on the held-out part are printed. The mean
    accuracy over all experiments is printed at the end.

    Parameters
    ----------
    X : sequence of str
        Documents passed to the pipeline's vectorizer.
    y : sequence
        Label for each document in `X`.
    pipeline : estimator with `fit` and `predict`
        The pipeline to evaluate. It is fitted in place on every experiment.
    num_expts : int, default 1
        Number of independent split/fit/evaluate rounds. Must be >= 1.
    random_state : int or None, default None
        Base seed for the splits. Experiment `i` uses `random_state + i` so
        repeated experiments get different but reproducible splits. With
        None the splits are not reproducible.

    Raises
    ------
    ValueError
        If `num_expts` is smaller than 1.
    """
    if num_expts < 1:
        raise ValueError("num_expts must be at least 1")

    scores = []
    for i in range(num_expts):
        # Offset the seed per round; reusing one seed would repeat the same split.
        seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
        # Fit the pipeline that was passed in, not a notebook-level global.
        model = pipeline.fit(X_train, y_train)        # train the classifier
        y_prediction = model.predict(X_test)          # apply the model to the test data
        report = classification_report(y_test, y_prediction)
        score = accuracy_score(y_test, y_prediction)  # compare the results to the gold standard
        scores.append(score)
        print("Classification Report: CountVectorizer")
        print(report)
        cm = confusion_matrix(y_test, y_prediction)
        print("Confusion matrix:")
        print(cm)
    print(sum(scores) / num_expts)

# Run the classification
naive_experiment(X, y, count_pipeline)

Classification Report: CountVectorizer
               precision    recall  f1-score   support

   hatespeech       0.42      0.41      0.42       304
not_offensive       0.88      0.89      0.88      1510

  avg / total       0.81      0.81      0.81      1814

Confusion matrix:
[[ 126  178]
 [ 173 1337]]
0.806504961411


### Setup naive baseline classification (HashingVectorizer)

In [103]:
# Our two ingredients: the hashed character n-gram features (3 to 5 characters) and the classifier.
vect = HashingVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()

# There are just two steps to our process: extracting the ngrams and
# putting them through the classifier. So our Pipeline looks like this:

hashing_pipeline = Pipeline([
    ('vect', vect),  # extract ngrams from tweet text
    ('clf' , clf),   # feed the output through a classifier
])

def hashing_experiment(X, y, pipeline, num_expts=1, random_state=None):
    """Train and evaluate `pipeline` on random train/test splits of (X, y).

    For each experiment the data is split with `train_test_split`, the
    pipeline is fitted on the training part, and the classification report,
    confusion matrix and accuracy on the held-out part are printed. The mean
    accuracy over all experiments is printed at the end.

    Parameters
    ----------
    X : sequence of str
        Documents passed to the pipeline's vectorizer.
    y : sequence
        Label for each document in `X`.
    pipeline : estimator with `fit` and `predict`
        The pipeline to evaluate. It is fitted in place on every experiment.
    num_expts : int, default 1
        Number of independent split/fit/evaluate rounds. Must be >= 1.
    random_state : int or None, default None
        Base seed for the splits. Experiment `i` uses `random_state + i` so
        repeated experiments get different but reproducible splits. With
        None the splits are not reproducible.

    Raises
    ------
    ValueError
        If `num_expts` is smaller than 1.
    """
    if num_expts < 1:
        raise ValueError("num_expts must be at least 1")

    scores = []
    for i in range(num_expts):
        # Offset the seed per round; reusing one seed would repeat the same split.
        seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
        # Fit the pipeline that was passed in, not a notebook-level global.
        model = pipeline.fit(X_train, y_train)        # train the classifier
        y_prediction = model.predict(X_test)          # apply the model to the test data
        report = classification_report(y_test, y_prediction)
        score = accuracy_score(y_test, y_prediction)  # compare the results to the gold standard
        scores.append(score)
        print("Classification Report : HashingVectorizer")
        print(report)
        cm = confusion_matrix(y_test, y_prediction)
        print("Confusion matrix:")
        print(cm)
    print(sum(scores) / num_expts)

# Run the classification
hashing_experiment(X, y, hashing_pipeline)

Classification Report : HashingVectorizer
               precision    recall  f1-score   support

   hatespeech       0.55      0.39      0.46       282
not_offensive       0.89      0.94      0.92      1532

  avg / total       0.84      0.86      0.85      1814

Confusion matrix:
[[ 111  171]
 [  90 1442]]
0.85611907387


### Setup tf-idf baseline classification

In [106]:
# Our two ingredients: the TF-IDF weighted character n-grams (3 to 5 characters) and the classifier.
vect = TfidfVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()

# There are just two steps to our process: extracting the ngrams and
# putting them through the classifier. So our Pipeline looks like this.
# It gets its own name so that it does not overwrite `hashing_pipeline`.

tfidf_pipeline = Pipeline([
    ('vect', vect),  # extract ngrams from tweet text
    ('clf' , clf),   # feed the output through a classifier
])

def tfidf_experiment(X, y, pipeline, num_expts=1, random_state=None):
    """Train and evaluate `pipeline` on random train/test splits of (X, y).

    For each experiment the data is split with `train_test_split`, the
    pipeline is fitted on the training part, and the classification report,
    confusion matrix and accuracy on the held-out part are printed. The mean
    accuracy over all experiments is printed at the end.

    Parameters
    ----------
    X : sequence of str
        Documents passed to the pipeline's vectorizer.
    y : sequence
        Label for each document in `X`.
    pipeline : estimator with `fit` and `predict`
        The pipeline to evaluate. It is fitted in place on every experiment.
    num_expts : int, default 1
        Number of independent split/fit/evaluate rounds. Must be >= 1.
    random_state : int or None, default None
        Base seed for the splits. Experiment `i` uses `random_state + i` so
        repeated experiments get different but reproducible splits. With
        None the splits are not reproducible.

    Raises
    ------
    ValueError
        If `num_expts` is smaller than 1.
    """
    if num_expts < 1:
        raise ValueError("num_expts must be at least 1")

    scores = []
    for i in range(num_expts):
        # Offset the seed per round; reusing one seed would repeat the same split.
        seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
        # Fit the pipeline that was passed in, not a notebook-level global.
        model = pipeline.fit(X_train, y_train)        # train the classifier
        y_prediction = model.predict(X_test)          # apply the model to the test data
        report = classification_report(y_test, y_prediction)
        score = accuracy_score(y_test, y_prediction)  # compare the results to the gold standard
        scores.append(score)
        print("Classification Report : TfidfVectorizer")
        print(report)
        cm = confusion_matrix(y_test, y_prediction)
        print("Confusion matrix:")
        print(cm)
    print(sum(scores) / num_expts)

# Run the classification
tfidf_experiment(X, y, tfidf_pipeline)

Classification Report : TfidfVectorizer
               precision    recall  f1-score   support

   hatespeech       0.54      0.31      0.39       308
not_offensive       0.87      0.94      0.91      1506

  avg / total       0.81      0.84      0.82      1814

Confusion matrix:
[[  96  212]
 [  83 1423]]
0.837375964719
