# Picking Out the HyperParameters in Bag of Words

Using the Bag of Words method of vectorising documents, there are several ways of manipulating how Bag of Words will interpret the words found within the document.

## CountVectorizer & TF-IDF
The options that can be found are:

- Stop Words
- N-gram Range
- Min-DF

(Question for Self: Why have I short-listed these?)

## Metrics

Using Naive-Bayes Classifier
Calculating the F1 Score of Each Set

In [1]:
import pandas as pd
import numpy as np
import logging
from pprint import pprint
from time import time
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
trainPath = '../data/hateval2019_en_train_clean.csv'
testPath = '../data/hateval2019_en_test_clean.csv'

trainSet = pd.read_csv(trainPath)
testSet = pd.read_csv(testPath)

In [3]:
trainText = trainSet.text
trainHate = trainSet.HS
trainTarget = trainSet.TR
trainAggressive = trainSet.AG

In [4]:
def pipeSetUp(clf):
    pipe = Pipeline(steps=[('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', clf)])
    return pipe

In [5]:
def runPipe(training_text, training_score, parameters, pipe):
    if __name__ == "__main__":
        grid_pipeline = GridSearchCV(pipe,parameters,n_jobs=4,verbose=1, scoring='f1')

        print("Performing grid search...")
        print("pipeline:", [name for name, _ in pipe.steps])
        print("parameters:")
        pprint(parameters)
        t0 = time()
        grid_pipeline.fit(training_text, training_score)
        print("done in %0.3fs" % (time() - t0))
        print("scoring paramater: f1")

        print("Best score: %0.3f" % grid_pipeline.best_score_)
        F1 = grid_pipeline.best_score_
        print("Best parameters set:")
        best_parameters = grid_pipeline.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
        return F1;

In [6]:
pipe = pipeSetUp(MultinomialNB())

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0, 0.9),
    'vect__stop_words': ('english',),
    'vect__min_df': (2, 0.1, 3, 0.2, 4),
    'vect__ngram_range': ((1, 1), (1, 2),),  
    'tfidf__use_idf': (True, False),
}

print('Getting Hate Score...')
hate_F1 = runPipe(trainText, trainHate, parameters, pipe)
print('Getting Target Score...')
target_F1 = runPipe(trainText, trainTarget, parameters, pipe)
print('Getting Aggressive Score...')
aggressive_F1 = runPipe(trainText, trainAggressive, parameters, pipe)

overall_F1 = (hate_F1 + target_F1 + aggressive_F1)/3

print("Overall F1 Score : %0.3f" % (overall_F1))

Getting Hate Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   15.3s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:   29.8s finished


done in 30.163s
scoring paramater: f1
Best score: 0.654
Best parameters set:
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__min_df: 4
	vect__ngram_range: (1, 1)
	vect__stop_words: 'english'
Getting Target Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   13.3s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:   27.9s finished


done in 28.306s
scoring paramater: f1
Best score: 0.312
Best parameters set:
	tfidf__use_idf: True
	vect__max_df: 0.75
	vect__min_df: 4
	vect__ngram_range: (1, 2)
	vect__stop_words: 'english'
Getting Aggressive Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   13.3s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:   27.7s finished


done in 28.141s
scoring paramater: f1
Best score: 0.236
Best parameters set:
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__min_df: 4
	vect__ngram_range: (1, 2)
	vect__stop_words: 'english'
Overall F1 Score : 0.400


In [7]:
pipe = pipeSetUp(BernoulliNB())

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0, 0.9),
    'vect__stop_words': ('english',),
    'vect__min_df': (2, 0.1, 3, 0.2, 4),
    'vect__ngram_range': ((1, 1), (1, 2),),  
    'tfidf__use_idf': (True, False),
}

print('Getting Hate Score...')
hate_F1 = runPipe(trainText, trainHate, parameters, pipe)
print('Getting Target Score...')
target_F1 = runPipe(trainText, trainTarget, parameters, pipe)
print('Getting Aggressive Score...')
aggressive_F1 = runPipe(trainText, trainAggressive, parameters, pipe)

overall_F1 = (hate_F1 + target_F1 + aggressive_F1)/3

print("Overall F1 Score : %0.3f" % (overall_F1))

Getting Hate Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   13.5s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:   27.9s finished


done in 28.162s
scoring paramater: f1
Best score: 0.680
Best parameters set:
	tfidf__use_idf: True
	vect__max_df: 0.75
	vect__min_df: 3
	vect__ngram_range: (1, 1)
	vect__stop_words: 'english'
Getting Target Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   13.2s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:   27.7s finished


done in 28.002s
scoring paramater: f1
Best score: 0.558
Best parameters set:
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__min_df: 4
	vect__ngram_range: (1, 1)
	vect__stop_words: 'english'
Getting Aggressive Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   13.7s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:   28.4s finished


done in 28.855s
scoring paramater: f1
Best score: 0.386
Best parameters set:
	tfidf__use_idf: True
	vect__max_df: 0.75
	vect__min_df: 4
	vect__ngram_range: (1, 2)
	vect__stop_words: 'english'
Overall F1 Score : 0.541


In [8]:
pipe = pipeSetUp(SGDClassifier())

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0, 0.9),
    'vect__stop_words': ('english',),
    'vect__min_df': (2, 0.1, 3, 0.2, 4),
    'vect__ngram_range': ((1, 1), (1, 2),),  
    'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1','l2'),
    'clf__max_iter': (10000,100000,),
    'clf__penalty': ('l1','l2', 'elasticnet'),
    'clf__alpha': (0.0001,0.00001,0.0002,0.00002),
}

print('Getting Hate Score...')
hate_F1 = runPipe(trainText, trainHate, parameters, pipe)
print('Getting Target Score...')
target_F1 = runPipe(trainText, trainTarget, parameters, pipe)
print('Getting Aggressive Score...')
aggressive_F1 = runPipe(trainText, trainAggressive, parameters, pipe)

overall_F1 = (hate_F1 + target_F1 + aggressive_F1)/3

print("Overall F1 Score : %0.3f" % (overall_F1))

Getting Hate Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (0.0001, 1e-05, 0.0002, 2e-05),
 'clf__max_iter': (10000, 100000),
 'clf__penalty': ('l1', 'l2', 'elasticnet'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 1920 candidates, totalling 9600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   17.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   39.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  3.9min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  5.1min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  6.6min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  8.1min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:  9.6min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 11.4min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed: 13.4min
[Parallel(n_jobs=4)]: Done 9600 out of 9600 | elapsed: 15.3min finished


done in 916.191s
scoring paramater: f1
Best score: 0.689
Best parameters set:
	clf__alpha: 0.0001
	clf__max_iter: 10000
	clf__penalty: 'l2'
	tfidf__use_idf: False
	vect__max_df: 0.9
	vect__min_df: 2
	vect__ngram_range: (1, 2)
	vect__stop_words: 'english'
Getting Target Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (0.0001, 1e-05, 0.0002, 2e-05),
 'clf__max_iter': (10000, 100000),
 'clf__penalty': ('l1', 'l2', 'elasticnet'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 1920 candidates, totalling 9600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   16.9s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   39.6s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  4.8min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  6.2min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  7.6min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:  9.2min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 10.9min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed: 12.8min
[Parallel(n_jobs=4)]: Done 9600 out of 9600 | elapsed: 14.6min finished


done in 877.191s
scoring paramater: f1
Best score: 0.535
Best parameters set:
	clf__alpha: 0.0001
	clf__max_iter: 10000
	clf__penalty: 'l2'
	tfidf__use_idf: False
	vect__max_df: 0.9
	vect__min_df: 2
	vect__ngram_range: (1, 2)
	vect__stop_words: 'english'
Getting Aggressive Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (0.0001, 1e-05, 0.0002, 2e-05),
 'clf__max_iter': (10000, 100000),
 'clf__penalty': ('l1', 'l2', 'elasticnet'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 1920 candidates, totalling 9600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   17.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   39.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  3.6min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  4.7min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  7.5min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:  9.0min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 10.7min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed: 12.6min
[Parallel(n_jobs=4)]: Done 9600 out of 9600 | elapsed: 14.4min finished


done in 862.100s
scoring paramater: f1
Best score: 0.396
Best parameters set:
	clf__alpha: 2e-05
	clf__max_iter: 10000
	clf__penalty: 'l2'
	tfidf__use_idf: False
	vect__max_df: 0.75
	vect__min_df: 2
	vect__ngram_range: (1, 2)
	vect__stop_words: 'english'
Overall F1 Score : 0.540


In [9]:
pipe = pipeSetUp(LogisticRegression())

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0, 0.9),
    'vect__stop_words': ('english',),
    'vect__min_df': (2, 0.1, 3, 0.2, 4),
    'vect__ngram_range': ((1, 1), (1, 2),),  
    'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1','l2'),
    'clf__max_iter': (10000 ,100000,),
#    'clf__penalty': ('l1','l2', 'elasticnet'),
#    'clf__alpha': (0.0001,0.00001,0.0002,0.00002),
}

print('Getting Hate Score...')
hate_F1 = runPipe(trainText, trainHate, parameters, pipe)
print('Getting Aggressive Score...')
aggressive_F1 = runPipe(trainText, trainAggressive, parameters, pipe)

pipe = pipeSetUp(GradientBoostingClassifier)

print('Getting Target Score...')
target_F1 = runPipe(trainText, trainTarget, parameters, pipe)

overall_F1 = (hate_F1 + target_F1 + aggressive_F1)/3

print("Overall F1 Score : %0.3f" % (overall_F1))

Getting Hate Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__max_iter': (10000, 100000),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   19.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   45.6s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:  1.4min finished


done in 82.090s
scoring paramater: f1
Best score: 0.683
Best parameters set:
	clf__max_iter: 10000
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__min_df: 4
	vect__ngram_range: (1, 1)
	vect__stop_words: 'english'
Getting Aggressive Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__max_iter': (10000, 100000),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   19.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   44.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:  1.4min finished


done in 83.734s
scoring paramater: f1
Best score: 0.299
Best parameters set:
	clf__max_iter: 10000
	tfidf__use_idf: False
	vect__max_df: 0.75
	vect__min_df: 4
	vect__ngram_range: (1, 2)
	vect__stop_words: 'english'
Getting Target Score...
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__max_iter': (10000, 100000),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


TypeError: get_params() missing 1 required positional argument: 'self'

In [None]:
pipe = pipeSetUp(GradientBoostingClassifier())

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0, 0.9),
    'vect__stop_words': ('english',),
    'vect__min_df': (2, 0.1, 3, 0.2, 4),
    'vect__ngram_range': ((1, 1), (1, 2),),  
    'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1','l2'),
#    'clf__max_iter': (100000,),
#    'clf__penalty': ('l1','l2', 'elasticnet'),
#    'clf__alpha': (0.0001,0.00001,0.0002,0.00002),
    'clf__n_estimators':(100, 1000),
    'clf__learning_rate':(0.5, 1.0 , 1.5),
    'clf__max_depth':(0, 1),
    'clf__random_state':(0, 1)
}

print('Getting Target Score...')
target_F1 = runPipe(trainText, trainTarget, parameters, pipe)



In [None]:
vect = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=4, max_df=0.75)
# do comparison with one where you don't filter out HS
target_train_set = trainSet
target_test_set = testSet

x_train_dtm = vect.fit_transform(target_train_set.text)
x_test_dtm = vect.transform(target_test_set.text)

#bernoulli_nb = BernoulliNB()
lr = LogisticRegression(max_iter=100000)

#bernoulli_nb.fit(x_train_dtm, target_train_set.AG)

#y_pred_class_bernoulli_nb = bernoulli_nb.predict(x_test_dtm)
y_pred_class_lr = lr.predict(x_test_dtm)

#bernoulli_nb_acc = metrics.accuracy_score(target_test_set.AG, y_pred_class_bernoulli_nb)
#bernoulli_nb_acc

print("Aggressive Score")
#print(classification_report(target_test_set.AG, y_pred_class_bernoulli_nb, labels=[0,1]))
print(classification_report(target_test_set.AG, y_pred_class_lr, labels=[0,1]))

In [None]:
vect = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=4, max_df=0.75)
# do comparison with one where you don't filter out HS
target_train_set = trainSet[(trainSet["HS"]==1)]
target_test_set = testSet[(testSet["HS"]==1)]

x_train_dtm = vect.fit_transform(target_train_set.text)
x_test_dtm = vect.transform(target_test_set.text)

# grid pipeline is missing these variables which is one of the reasons why the F1-score i slow
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.5,max_depth=2, random_state=0)

gb.fit(x_train_dtm, target_train_set.TR)

y_pred_class_gb = gb.predict(x_test_dtm)

gb_acc = metrics.accuracy_score(target_test_set.TR, y_pred_class_gb)
gb_acc

print("Target Score")
print(classification_report(target_test_set.TR, y_pred_class_gb, labels=[0,1]))

Target Score
              precision    recall  f1-score   support

           0       0.94      0.84      0.89       731
           1       0.81      0.93      0.87       529

    accuracy                           0.88      1260
   macro avg       0.88      0.89      0.88      1260
weighted avg       0.89      0.88      0.88      1260

Target Score
              precision    recall  f1-score   support

           0       0.86      0.91      0.88       731
           1       0.86      0.80      0.83       529

    accuracy                           0.86      1260
   macro avg       0.86      0.85      0.86      1260
weighted avg       0.86      0.86      0.86      1260




In [None]:
target_test_set[(y_pred_class_gb==1) & (target_test_set.TR==0)]

# Overall F1 Scores

F1 = (F1(HS) + F1(AR) + F1(TR))/3

MultinomialNB Score: 0.400

BernoulliNB Score: 0.541

Logistic Regression: 0.489

SGDClassifier Score: 0.539

