In [1]:
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, partial, space_eval
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score

### Data Preparing

In [2]:
df = pd.read_csv('../data/Ethos_Dataset_Binary.csv', on_bad_lines='skip', sep=';')

In [3]:
df.shape

(998, 2)

In [4]:
df.head()

Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1
1,You look like Sloth with deeper Down’s syndrome,1
2,You look like Russian and speak like Indian. B...,1
3,"Women deserve to be abused, I guess.",1
4,Women are made for making babies and cooking d...,1


In [5]:
# df.isHate = df.isHate.apply(lambda x: 1 if x > 0.5 else 0)

In [6]:
# df.to_csv('../data/Ethos_Dataset_Binary.csv', index=False, sep=';')

In [5]:
df.isHate.value_counts()

0    639
1    359
Name: isHate, dtype: int64

In [6]:
df_train, df_test = train_test_split(df, stratify=df.isHate, test_size=0.3)

In [7]:
df_train.shape, df_test.shape

((698, 2), (300, 2))

### Feature extraction

In [8]:
word_vect = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='word',
            stop_words='english',
            ngram_range=(1, 2),
            max_features=2000)

In [9]:
word_vect.fit(df['comment'])
train_word_features  = word_vect.transform(df_train['comment'])
test_word_features  = word_vect.transform(df_test['comment'])

In [10]:
X_train = train_word_features.tocsr()
X_test = test_word_features.tocsr()
y_train = df_train.isHate
y_test = df_test.isHate

In [11]:
X_train.shape, X_test.shape

((698, 2000), (300, 2000))

### Model creation and evaluation

In [12]:
clf = LGBMClassifier(random_state=42) 

In [13]:
clf.fit(X_train, y_train)

LGBMClassifier(random_state=42)

In [14]:
probas_train = clf.predict_proba(X_train)
probas_test = clf.predict_proba(X_test)
predicts_train = clf.predict(X_train)
predicts_test = clf.predict(X_test)

In [15]:
metrics_train = {"roc_auc": roc_auc_score(y_train, probas_train[:, 1]),
                 "precision_macro": precision_score(y_train, predicts_train, average='macro'),
                 "recall_macro": recall_score(y_train, predicts_train, average='macro'),
                 "f1_macro": f1_score(y_train, predicts_train, average='macro')
                }

metrics_test = {"roc_auc": roc_auc_score(y_test, probas_test[:, 1]),
                "precision_macro": precision_score(y_test, predicts_test, average='macro'),
                "recall_macro": recall_score(y_test, predicts_test, average='macro'),
                "f1_macro": f1_score(y_test, predicts_test, average='macro')
                }

In [16]:
metrics_before_tuning = pd.DataFrame([metrics_train, metrics_test], index=['train', 'test']).T

In [17]:
metrics_before_tuning

Unnamed: 0,train,test
roc_auc,0.788288,0.577305
precision_macro,0.78196,0.591662
recall_macro,0.724672,0.578414
f1_macro,0.737773,0.579037


### Hyperparameters optimization

In [18]:
space = {"max_depth": hp.choice('max_depth', np.arange(4, 12, 2, dtype=int)),
         "learning_rate": hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
         "feature_fraction": hp.uniform('feature_fraction', 0.5, 1),
         "num_leaves": hp.choice('num_leaves', np.arange(16, 256, 2, dtype=int))
         }


In [19]:
def objective(params):
    lgbm = LGBMClassifier(**params)
    cv = StratifiedKFold(3)
    score = cross_val_score(lgbm, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1).mean()
    return 1 - score

In [20]:
best = fmin(objective, space, algo=tpe.suggest, max_evals=5)

100%|███████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.67s/trial, best loss: 0.367389019163904]


In [21]:
best_params = space_eval(space, best)
best_params

{'feature_fraction': 0.8943702879024744,
 'learning_rate': 0.15311124281446548,
 'max_depth': 4,
 'num_leaves': 64}

### Model after tuning

In [22]:
clf = LGBMClassifier(random_state = 42, **best_params)

In [23]:
clf.fit(X_train, y_train)



LGBMClassifier(feature_fraction=0.8943702879024744,
               learning_rate=0.15311124281446548, max_depth=4, num_leaves=64,
               random_state=42)

In [24]:
probas_train = clf.predict_proba(X_train)
probas_test = clf.predict_proba(X_test)
predicts_train = clf.predict(X_train)
predicts_test = clf.predict(X_test)

In [25]:
metrics_train_after_tuning = {"roc_auc": roc_auc_score(y_train, probas_train[:, 1]),
                 "precision_macro": precision_score(y_train, predicts_train, average='macro'),
                 "recall_macro": recall_score(y_train, predicts_train, average='macro'),
                 "f1_macro": f1_score(y_train, predicts_train, average='macro')
                }

metrics_test_after_tuning = {"roc_auc": roc_auc_score(y_test, probas_test[:, 1]),
                "precision_macro": precision_score(y_test, predicts_test, average='macro'),
                "recall_macro": recall_score(y_test, predicts_test, average='macro'),
                "f1_macro": f1_score(y_test, predicts_test, average='macro')
                }

In [26]:
metrics_after_tuning = pd.DataFrame([metrics_train_after_tuning, metrics_test_after_tuning], index=['train', 'test']).T

In [27]:
metrics_after_tuning

Unnamed: 0,train,test
roc_auc,0.753986,0.600019
precision_macro,0.757307,0.615556
recall_macro,0.697412,0.594039
f1_macro,0.708617,0.595078


In [28]:
metrics_after_tuning - metrics_before_tuning

Unnamed: 0,train,test
roc_auc,-0.034301,0.022714
precision_macro,-0.024653,0.023893
recall_macro,-0.02726,0.015625
f1_macro,-0.029156,0.016041
