# Pipeline Prototype

Best grid-search cross-validation score (`GridSearchCV.best_score_`) per classifier:

- MultinomialNB = 0.768
- SGDClassifier = 0.781

In [22]:
import pandas as pd
import numpy as np
import logging
from pprint import pprint
from time import time
from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [23]:


# Relative paths to the cleaned test / train CSV files.
testPath = '../data/hateval2019_en_test_clean.csv'
trainPath = '../data/hateval2019_en_train_clean.csv'

# Both files are read up front; only the training file feeds the split below.
testSet = pd.read_csv(testPath)
trainSet = pd.read_csv(trainPath)

# Model input is the `text` column, the target is the `HS` column.
x = trainSet['text']
y = trainSet['HS']

# Carve a validation split out of the training file (default split
# proportions); random_state=1 pins the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

# Quick sanity check of the resulting sizes, one shape per line.
print(x_train.shape, x_test.shape, y_train.shape, sep='\n')

(6750,)
(2250,)
(6750,)


In [31]:
# Three-stage text pipeline: raw token counts -> (optional) IDF re-weighting
# -> multinomial naive Bayes classifier.
pipe = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

# Hyper-parameter grid for the search; each key is `<step name>__<parameter>`.
# Value order inside each tuple is kept as-is because it determines the order
# in which candidates are tried.
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0, 0.9),
    vect__stop_words=('english',),
    # Integers are absolute document counts, floats are document proportions.
    vect__min_df=(2, 0.1, 3, 0.2, 4),
    vect__ngram_range=((1, 1), (1, 2)),
    tfidf__use_idf=(True, False),
    # Disabled grid entries, kept for reference. The `clf__*` ones are not
    # MultinomialNB parameters -- presumably they belong to an SGDClassifier
    # variant of this pipeline (TODO confirm before re-enabling).
    #   tfidf__norm=('l1', 'l2'),
    #   clf__max_iter=(100000,),
    #   clf__penalty=('l2', 'elasticnet'),
    #   clf__max_iter=(10, 100, 500, 1000),
    #   clf__early_stopping=(True, False),
)


# Run an exhaustive grid search of `parameters` over `pipe` on the training
# split, then report the elapsed time, the best cross-validated score and the
# winning value of every searched parameter.
if __name__ == "__main__":
    # n_jobs=4 runs the fits in four parallel workers; verbose=1 prints progress.
    grid_pipeline = GridSearchCV(pipe, parameters, n_jobs=4, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipe.steps])
    # Derive the label from the pipeline's final step so the log always names
    # the estimator that is actually being tuned (a hard-coded label here
    # reported SGDClassifier while the pipeline contained MultinomialNB).
    print("classifier: %s()" % type(pipe.steps[-1][1]).__name__)
    print("parameters:")
    print(parameters)
    t0 = time()
    grid_pipeline.fit(x_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_pipeline.best_score_)
    print("Best parameters set:")
    # get_params() returns every parameter of the refit best pipeline; only
    # the ones that were part of the search grid are printed below.
    best_parameters = grid_pipeline.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
classifier: SGDClassifier()
parameters:
{'vect__max_df': (0.5, 0.75, 1.0, 0.9), 'vect__stop_words': ('english',), 'vect__min_df': (2, 0.1, 3, 0.2, 4), 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False)}
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   17.0s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:   32.5s finished


done in 32.882s

Best score: 0.768
Best parameters set:
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__min_df: 4
	vect__ngram_range: (1, 2)
	vect__stop_words: 'english'
