### binary-svc.py

In [None]:
from pandas.io.parsers import TextFileReader
from utils import DataLoader

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, balanced_accuracy_score, confusion_matrix

In [None]:
# Fetch the NLTK 'wordnet' and 'stopwords' corpora into the local nltk_data
# directory. The last call's return value is what the cell displays.
# NOTE(review): presumably required by the text preprocessing inside
# utils.DataLoader — confirm against that module.
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the binary Ethos dataset and print a quick sanity summary:
# sample/label counts, the class balance, and the first example.
# NOTE(review): the absolute path ties this cell to one machine layout.
dl = DataLoader('/work/hatespeech-detection/data/Ethos_Dataset_Binary.csv')
X, y = dl.get_data()

n_samples, n_labels = len(X), len(y)
n_positive = sum(y)  # labels are summed, so label-1 rows are counted here
print(n_samples, n_labels)
print(f'{n_labels-n_positive} (label : 0) + {n_positive} (label : 1) = {n_labels}')

first_text, first_label = X[0], y[0]
print(first_text)
print(first_label)

# Display names for the two classes, listed in label order 0, 1.
class_names = ['not-hate-speech', 'hate-speech']

Loaded file : Ethos_Dataset_Binary.csv
998 998
565 (label : 0) + 433 (label : 1) = 998
they must be realli suffer if they are throw food all over the ground for the pigeon
0


In [None]:
# ML Model
# 10-fold cross-validation of an RBF-kernel SVM on word-level TF-IDF
# n-gram features. For every fold the weighted F1 score and the confusion
# matrix of the held-out part are printed.
kf = KFold(n_splits=10)

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # The vectorizer is fitted on the training fold only, so the vocabulary
    # and IDF weights never see the held-out texts.
    vec = TfidfVectorizer(
        analyzer='word', ngram_range=(1, 5), max_features=50000)
    X_tr = vec.fit_transform(X_train)
    X_te = vec.transform(X_test)

    svm = SVC(kernel='rbf')
    svm.fit(X_tr, y_train)

    y_predict = svm.predict(X_te)
    print('F1 : ', f1_score(y_test, y_predict, average='weighted'))
    print('CF-Mat : \n', confusion_matrix(y_test, y_predict))


F1 :  0.49746808790630337
CF-Mat : 
 [[54  3]
 [38  5]]
F1 :  0.5963825363825364
CF-Mat : 
 [[57  2]
 [32  9]]
F1 :  0.4433622986907658
CF-Mat : 
 [[45  2]
 [45  8]]
F1 :  0.6202666666666665
CF-Mat : 
 [[59  2]
 [30  9]]
F1 :  0.5278853046594982
CF-Mat : 
 [[59  1]
 [36  4]]
F1 :  0.5694545454545455
CF-Mat : 
 [[54  4]
 [33  9]]
F1 :  0.5441580041580041
CF-Mat : 
 [[55  3]
 [35  7]]
F1 :  0.5509393161238004
CF-Mat : 
 [[57  3]
 [34  6]]
F1 :  0.4501224906053586
CF-Mat : 
 [[47  1]
 [44  7]]
F1 :  0.5732323232323232
CF-Mat : 
 [[54  3]
 [33  9]]


## binary-classics-setA.py

In [None]:
"""
In these experiments we try logistic regression, SVM, ridge, decision tree, naive Bayes and random forest classifiers across a wide range of hyper-parameters for each algorithm, and evaluate them with nested cross-validation.
"""

from utils import DataLoader, nested_cross_val
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score

# https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/tree/master/ethos

In [None]:
# Load the binary Ethos dataset used by every experiment cell below.
dl = DataLoader('data/Ethos_Dataset_Binary.csv')
X, y = dl.get_data()

# (Re)create the results file with a header row and a separator line.
# "w+" truncates any previous content; the context manager guarantees the
# handle is closed even if one of the writes raises.
with open("res/setA.txt", "w+") as f:
    f.write("{: <7} | {: <7} {: <7} {: <7} {: <7} {: <7} {: <7} {: <7} {: <7} \n"
            .format('Method', 'Duration', 'scoreTi', 'F1', 'Prec.', 'Recall', 'Acc.', 'Spec.', 'Sens.'))
    f.write("=========================================================================\n")

Loaded file : Ethos_Dataset_Binary.csv


In [None]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
                            Run Naive Bayes
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Multinomial Naive Bayes on word-level TF-IDF features.
vec = TfidfVectorizer(analyzer='word')
mNB = MultinomialNB()
pipe = Pipeline(steps=[('vec', vec), ('mNB', mNB)])

# One grid: vectorizer settings crossed with the NB smoothing parameter
# (3 * 4 * 2 * 6 = 144 candidates).
parameters = [dict(
    vec__ngram_range=[(1, 1), (1, 2), (1, 5)],
    vec__max_features=[5000, 10000, 50000, 100000],
    vec__stop_words=['english', None],
    mNB__alpha=[1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
)]

nested_cross_val(
    pipe, parameters, X, y, "MultiNB",
    n_jobs=18, filename='setA.txt',
)


Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
{'fit_time': [52.19337773323059, 40.227173805236816, 41.05148124694824, 41.993287563323975, 40.82895588874817, 42.34011101722717, 41.18693542480469, 40.73002338409424, 41.2088828086853, 41.69671821594238], 'score_time': [0.004857063293457031, 0.00477910041809082, 0.0037631988525390625, 0.0063207149505615234, 0.00845479965209961, 0.003782033920288086, 0.005002260208129

In [None]:
# Bernoulli Naive Bayes on word-level TF-IDF features; the estimator is
# configured to binarize its inputs at a 0.5 threshold.
vec = TfidfVectorizer(analyzer='word')
bNB = BernoulliNB(binarize=0.5)
pipe = Pipeline(steps=[
    ('vec', vec),
    ('bNB', bNB),
])

# Vectorizer settings crossed with the smoothing parameter
# (3 * 4 * 2 * 6 = 144 candidates).
parameters = [dict(
    vec__ngram_range=[(1, 1), (1, 2), (1, 5)],
    vec__max_features=[5000, 10000, 50000, 100000],
    vec__stop_words=['english', None],
    bNB__alpha=[1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
)]

nested_cross_val(pipe, parameters, X, y, "BernouNB")

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 144 candidates, totalling 432 fits
{'fit_time': [41.276970624923706, 41.482919216156006, 42.26413941383362, 42.18300271034241, 41.987168073654175, 41.598944425582886, 41.87973093986511, 40.74082922935486, 41.292404651641846, 41.44654202461243], 'score_time': [0.010764598846435547, 0.0067751407623291016, 0.005350589752197266, 0.00328826904296875, 0.006110191345214844, 0.005009889602661133, 0.00563907623

In [None]:
# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#                     Run Logistic Regression
# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Logistic regression on word-level TF-IDF features.
log = LogisticRegression(random_state=0, max_iter=1000, n_jobs=-1)
vec = TfidfVectorizer(analyzer='word')
pipe = Pipeline(steps=[('vec', vec), ('log', log)])

# Settings shared by both sub-grids below.
vec_grid = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 5)],
    'vec__max_features': [5000, 10000, 50000, 100000],
    'vec__stop_words': ['english', None],
}
c_values = [0.5, 1, 3, 5, 10, 1000]

# Two sub-grids because the penalty restricts which solvers are tried:
# l2 with newton-cg / lbfgs / sag, and l1 with saga only
# (432 + 144 = 576 candidates in total).
parameters = [
    {
        **vec_grid,
        'log__C': c_values,
        'log__solver': ['newton-cg', 'lbfgs', 'sag'],
        'log__penalty': ['l2'],
    },
    {
        **vec_grid,
        'log__C': c_values,
        'log__solver': ['saga'],
        'log__penalty': ['l1'],
    },
]
nested_cross_val(pipe, parameters, X, y, "LogReg")

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
{'fit_time': [940.0284416675568, 945.3995358943939, 949.4848158359528, 981.4002497196198, 1331.0336298942566, 1324.063908815384, 989.1675162315369, 943.0378432273865, 1229.0421719551086, 1326.9282703399658], 'score_time': [0.011990785598754883, 0.01048135757446289, 0.006036996841430664, 0.012997865676879883, 0.019150257110595703, 0.007010221481323242, 0.0057

In [None]:
# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#                             Run RidgeClassifier
# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Ridge classifier (no intercept term) on word-level TF-IDF features.
vec = TfidfVectorizer(analyzer='word')
ridge = RidgeClassifier(random_state=0, fit_intercept=False)
pipe = Pipeline(steps=[('vec', vec), ('ridge', ridge)])

# Vectorizer settings crossed with solver and regularisation strength
# (3 * 4 * 2 * 4 * 6 = 576 candidates). Note the grid includes alpha = 0,
# i.e. no regularisation at all.
parameters = [dict(
    vec__ngram_range=[(1, 1), (1, 2), (1, 5)],
    vec__max_features=[5000, 10000, 50000, 100000],
    vec__stop_words=['english', None],
    ridge__solver=['cholesky', 'lsqr', 'sparse_cg', 'saga'],
    ridge__alpha=[1, 0.1, 0.01, 0.001, 0.0001, 0],
)]

nested_cross_val(pipe, parameters, X, y, "Ridge")

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Fitting 3 folds for each of 576 candidates, totalling 1728 fits
{'fit_time': [327.88445520401, 283.4724087715149, 222.31396222114563, 223.00946736335754, 208.81837940216064, 221.33240866661072, 218.72612380981445, 215.41228675842285, 217.70665574073792, 210.70419836044312], 'score_time': [0.00557255744934082, 0.016356945037841797, 0.005475759506225586, 0.007506370544433594, 0.005076885223388672, 0.006999015808105469, 0.0

In [1]:
# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#                             Run DecisionTree
# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Decision tree on word-level TF-IDF features.
# The imports cell must have been executed in the current kernel first;
# otherwise DecisionTreeClassifier is undefined when this cell runs.
vec = TfidfVectorizer(analyzer='word')
dTree = DecisionTreeClassifier(random_state=0)
pipe = Pipeline(steps=[('vec', vec), ('dTree', dTree)])

# Vectorizer settings crossed with the tree's structural hyper-parameters.
# This expands to 3 * 4 * 2 * 2 * 10 * 7 * 5 * 5 = 84,000 candidates, so
# expect a long run.
parameters = [dict(
    vec__ngram_range=[(1, 1), (1, 2), (1, 5)],
    vec__max_features=[5000, 10000, 50000, 100000],
    vec__stop_words=['english', None],
    dTree__criterion=['gini', 'entropy'],
    dTree__max_depth=[1, 2, 3, 4, 5, 10, 25, 50, 100, 200],
    dTree__max_features=[2, 3, 4, 5, 'sqrt', 'log2', None],
    dTree__min_samples_leaf=[1, 2, 3, 4, 5],
    dTree__min_samples_split=[2, 4, 8, 10, 12],
)]

nested_cross_val(pipe, parameters, X, y, "DTree")



NameError: name 'DecisionTreeClassifier' is not defined

In [None]:
# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#                             Run RandomForest
# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Random forest on word-level TF-IDF features.
vec = TfidfVectorizer(analyzer='word')
randFor = RandomForestClassifier(random_state=0, n_jobs=-1)
pipe = Pipeline(steps=[('vec', vec), ('randFor', randFor)])

# Vectorizer settings crossed with forest size, depth, feature sampling
# and bootstrapping (3 * 4 * 2 * 5 * 3 * 2 * 4 = 2,880 candidates).
parameters = [dict(
    vec__ngram_range=[(1, 1), (1, 2), (1, 5)],
    vec__max_features=[5000, 10000, 50000, 100000],
    vec__stop_words=['english', None],
    randFor__max_depth=[1, 10, 50, 100, 200],
    randFor__max_features=['sqrt', 'log2', None],
    randFor__bootstrap=[True, False],
    randFor__n_estimators=[10, 100, 500, 1000],
)]

nested_cross_val(pipe, parameters, X, y, "RandomForest")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=82b9920f-8f69-4e4d-89ee-551e9484231d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>