In [1]:
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, partial, space_eval
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score

### Data Preparing

In [2]:
df = pd.read_csv('../data/Ethos_Dataset_Binary.csv', on_bad_lines='skip', sep=';')

In [3]:
df.shape

(998, 2)

In [4]:
df.head()

Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1.0
1,You look like Sloth with deeper Down’s syndrome,1.0
2,You look like Russian and speak like Indian. B...,1.0
3,"Women deserve to be abused, I guess.",1.0
4,Women are made for making babies and cooking d...,1.0


In [5]:
df.isHate = df.isHate.apply(lambda x: 1 if x > 0.5 else 0)

In [6]:
df.isHate.value_counts()

0    639
1    359
Name: isHate, dtype: int64

In [7]:
word_vect = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='word',
            stop_words='english',
            ngram_range=(1, 2),
            max_features=2000)

In [8]:
df_train, df_test = train_test_split(df, stratify=df.isHate, test_size=0.3)

In [9]:
df_train.shape, df_test.shape

((698, 2), (300, 2))

In [10]:
word_vect.fit(df_train['comment'])
train_word_features  = word_vect.transform(df_train['comment'])
test_word_features  = word_vect.transform(df_test['comment'])

In [11]:
X_train = train_word_features.tocsr()
X_test = test_word_features.tocsr()
y_train = df_train.isHate
y_test = df_test.isHate

In [12]:
X_train.shape, X_test.shape

((698, 2000), (300, 2000))

### Model creation and evaluation

In [13]:
clf = LGBMClassifier(random_state=42) 

In [14]:
clf.fit(X_train, y_train)

LGBMClassifier(random_state=42)

In [15]:
probas_train = clf.predict_proba(X_train)
probas_test = clf.predict_proba(X_test)
predicts_train = clf.predict(X_train)
predicts_test = clf.predict(X_test)

In [16]:
metrics_train = {"roc_auc": roc_auc_score(y_train, probas_train[:, 1]),
                 "precision_macro": precision_score(y_train, predicts_train, average='macro'),
                 "recall_macro": recall_score(y_train, predicts_train, average='macro'),
                 "f1_macro": f1_score(y_train, predicts_train, average='macro')
                }

metrics_test = {"roc_auc": roc_auc_score(y_test, probas_test[:, 1]),
                "precision_macro": precision_score(y_test, predicts_test, average='macro'),
                "recall_macro": recall_score(y_test, predicts_test, average='macro'),
                "f1_macro": f1_score(y_test, predicts_test, average='macro')
                }

In [17]:
metrics_before_tuning = pd.DataFrame([metrics_train, metrics_test], index=['train', 'test']).T

In [18]:
metrics_before_tuning

Unnamed: 0,train,test
roc_auc,0.787575,0.597584
precision_macro,0.76504,0.636582
recall_macro,0.697902,0.586227
f1_macro,0.709569,0.57979


### Hyperparameters optimization

In [56]:
space = {"max_depth": hp.choice('max_depth', np.arange(4, 20, 2, dtype=int)),
         "learning_rate": hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
         "feature_fraction": hp.uniform('feature_fraction', 0.5, 1),
         "num_leaves": hp.choice('num_leaves', np.arange(2**4, 2**10, 10, dtype=int))
         }


In [57]:
def objective(params):
    lgbm = LGBMClassifier(**params)
    cv = StratifiedKFold(3)
    score = cross_val_score(lgbm, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1).mean()
    return 1 - score

In [58]:
best = fmin(objective, space, algo=tpe.suggest, max_evals=5)

100%|█████████████████████████████████████████████████| 5/5 [00:00<00:00, 12.91trial/s, best loss: 0.39483838197147814]


In [59]:
best_params = space_eval(space, best)
best_params

{'feature_fraction': 0.7456381857537797,
 'learning_rate': 0.018354543638902678,
 'max_depth': 8,
 'num_leaves': 66}

### Model after tuning

In [60]:
clf = LGBMClassifier(**best_params)

In [61]:
clf.fit(X_train, y_train)



LGBMClassifier(feature_fraction=0.7456381857537797,
               learning_rate=0.018354543638902678, max_depth=8, num_leaves=66)

In [62]:
probas_train = clf.predict_proba(X_train)
probas_test = clf.predict_proba(X_test)
predicts_train = clf.predict(X_train)
predicts_test = clf.predict(X_test)

In [63]:
metrics_train_after_tuning = {"roc_auc": roc_auc_score(y_train, probas_train[:, 1]),
                 "precision_macro": precision_score(y_train, predicts_train, average='macro'),
                 "recall_macro": recall_score(y_train, predicts_train, average='macro'),
                 "f1_macro": f1_score(y_train, predicts_train, average='macro')
                }

metrics_test_after_tuning = {"roc_auc": roc_auc_score(y_test, probas_test[:, 1]),
                "precision_macro": precision_score(y_test, predicts_test, average='macro'),
                "recall_macro": recall_score(y_test, predicts_test, average='macro'),
                "f1_macro": f1_score(y_test, predicts_test, average='macro')
                }

In [64]:
metrics_after_tuning = pd.DataFrame([metrics_train_after_tuning, metrics_test_after_tuning], index=['train', 'test']).T

In [65]:
metrics_after_tuning

Unnamed: 0,train,test
roc_auc,0.723018,0.620129
precision_macro,0.706724,0.650744
recall_macro,0.632793,0.58941
f1_macro,0.634796,0.581602


In [66]:
metrics_after_tuning - metrics_before_tuning

Unnamed: 0,train,test
roc_auc,-0.064556,0.022545
precision_macro,-0.058316,0.014162
recall_macro,-0.065109,0.003183
f1_macro,-0.074773,0.001812


In [69]:
df.to_csv('../data/ethos_data_updated.csv', index=False)