In [16]:
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, partial, space_eval
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score

### Data Preparing

In [17]:
df = pd.read_csv('../data/Ethos_Dataset_Binary.csv', on_bad_lines='skip', sep=';')

In [18]:
df.shape

(998, 2)

In [19]:
df.head()

Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1.0
1,You look like Sloth with deeper Down’s syndrome,1.0
2,You look like Russian and speak like Indian. B...,1.0
3,"Women deserve to be abused, I guess.",1.0
4,Women are made for making babies and cooking d...,1.0


In [20]:
df.isHate = df.isHate.apply(lambda x: 1 if x > 0.5 else 0)

In [21]:
df.to_csv('../data/Ethos_Dataset_Binary.csv', index=False, sep=';')

In [8]:
df.isHate.value_counts()

0    639
1    359
Name: isHate, dtype: int64

In [9]:
df_train, df_test = train_test_split(df, stratify=df.isHate, test_size=0.3)

In [10]:
df_train.shape, df_test.shape

((698, 2), (300, 2))

### Feature extraction

In [22]:
word_vect = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='word',
            stop_words='english',
            ngram_range=(1, 2),
            max_features=2000)

In [23]:
word_vect.fit(df_train['comment'])
train_word_features  = word_vect.transform(df_train['comment'])
test_word_features  = word_vect.transform(df_test['comment'])

In [24]:
X_train = train_word_features.tocsr()
X_test = test_word_features.tocsr()
y_train = df_train.isHate
y_test = df_test.isHate

In [25]:
X_train.shape, X_test.shape

((698, 2000), (300, 2000))

### Model creation and evaluation

In [26]:
clf = LGBMClassifier(random_state=42) 

In [27]:
clf.fit(X_train, y_train)

LGBMClassifier(random_state=42)

In [28]:
probas_train = clf.predict_proba(X_train)
probas_test = clf.predict_proba(X_test)
predicts_train = clf.predict(X_train)
predicts_test = clf.predict(X_test)

In [29]:
metrics_train = {"roc_auc": roc_auc_score(y_train, probas_train[:, 1]),
                 "precision_macro": precision_score(y_train, predicts_train, average='macro'),
                 "recall_macro": recall_score(y_train, predicts_train, average='macro'),
                 "f1_macro": f1_score(y_train, predicts_train, average='macro')
                }

metrics_test = {"roc_auc": roc_auc_score(y_test, probas_test[:, 1]),
                "precision_macro": precision_score(y_test, predicts_test, average='macro'),
                "recall_macro": recall_score(y_test, predicts_test, average='macro'),
                "f1_macro": f1_score(y_test, predicts_test, average='macro')
                }

In [30]:
metrics_before_tuning = pd.DataFrame([metrics_train, metrics_test], index=['train', 'test']).T

In [31]:
metrics_before_tuning

Unnamed: 0,train,test
roc_auc,0.789696,0.610918
precision_macro,0.77088,0.618731
recall_macro,0.708736,0.584491
f1_macro,0.721081,0.581395


### Hyperparameters optimization

In [32]:
space = {"max_depth": hp.choice('max_depth', np.arange(4, 20, 2, dtype=int)),
         "learning_rate": hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
         "feature_fraction": hp.uniform('feature_fraction', 0.5, 1),
         "num_leaves": hp.choice('num_leaves', np.arange(2**4, 2**10, 10, dtype=int))
         }


In [33]:
def objective(params):
    lgbm = LGBMClassifier(**params)
    cv = StratifiedKFold(3)
    score = cross_val_score(lgbm, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1).mean()
    return 1 - score

In [34]:
best = fmin(objective, space, algo=tpe.suggest, max_evals=5)

100%|███████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.75s/trial, best loss: 0.404428516238171]


In [35]:
best_params = space_eval(space, best)
best_params

{'feature_fraction': 0.7119758188269941,
 'learning_rate': 0.011279011841610266,
 'max_depth': 12,
 'num_leaves': 796}

### Model after tuning

In [36]:
clf = LGBMClassifier(**best_params)

In [37]:
clf.fit(X_train, y_train)



LGBMClassifier(feature_fraction=0.7119758188269941,
               learning_rate=0.011279011841610266, max_depth=12,
               num_leaves=796)

In [38]:
probas_train = clf.predict_proba(X_train)
probas_test = clf.predict_proba(X_test)
predicts_train = clf.predict(X_train)
predicts_test = clf.predict(X_test)

In [39]:
metrics_train_after_tuning = {"roc_auc": roc_auc_score(y_train, probas_train[:, 1]),
                 "precision_macro": precision_score(y_train, predicts_train, average='macro'),
                 "recall_macro": recall_score(y_train, predicts_train, average='macro'),
                 "f1_macro": f1_score(y_train, predicts_train, average='macro')
                }

metrics_test_after_tuning = {"roc_auc": roc_auc_score(y_test, probas_test[:, 1]),
                "precision_macro": precision_score(y_test, predicts_test, average='macro'),
                "recall_macro": recall_score(y_test, predicts_test, average='macro'),
                "f1_macro": f1_score(y_test, predicts_test, average='macro')
                }

In [40]:
metrics_after_tuning = pd.DataFrame([metrics_train_after_tuning, metrics_test_after_tuning], index=['train', 'test']).T

In [41]:
metrics_after_tuning

Unnamed: 0,train,test
roc_auc,0.72501,0.614439
precision_macro,0.72464,0.702976
recall_macro,0.601135,0.569734
f1_macro,0.58788,0.538787


In [42]:
metrics_after_tuning - metrics_before_tuning

Unnamed: 0,train,test
roc_auc,-0.064685,0.00352
precision_macro,-0.046241,0.084244
recall_macro,-0.107601,-0.014757
f1_macro,-0.133201,-0.042609
