In [1]:
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, partial, space_eval
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score

### Data Preparing

In [2]:
df = pd.read_csv('../data/Ethos_Dataset_Binary.csv', on_bad_lines='skip', sep=';')

In [3]:
df.shape

(998, 2)

In [4]:
df.head()

Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1.0
1,You look like Sloth with deeper Down’s syndrome,1.0
2,You look like Russian and speak like Indian. B...,1.0
3,"Women deserve to be abused, I guess.",1.0
4,Women are made for making babies and cooking d...,1.0


In [5]:
df.isHate = df.isHate.apply(lambda x: 1 if x > 0.5 else 0)

In [6]:
df.isHate.value_counts()

0    639
1    359
Name: isHate, dtype: int64

In [7]:
word_vect = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='word',
            stop_words='english',
            ngram_range=(1, 2),
            max_features=2000)

In [8]:
df_train, df_test = train_test_split(df, stratify=df.isHate, test_size=0.3)

In [9]:
df_train.shape, df_test.shape

((698, 2), (300, 2))

In [10]:
word_vect.fit(df_train['comment'])
train_word_features  = word_vect.transform(df_train['comment'])
test_word_features  = word_vect.transform(df_test['comment'])

In [11]:
X_train = train_word_features.tocsr()
X_test = test_word_features.tocsr()
y_train = df_train.isHate
y_test = df_test.isHate

In [12]:
X_train.shape, X_test.shape

((698, 2000), (300, 2000))

### Model creation and evaluation

In [13]:
clf = LGBMClassifier(random_state=42) 

In [14]:
clf.fit(X_train, y_train)

LGBMClassifier(random_state=42)

In [15]:
probas_train = clf.predict_proba(X_train)
probas_test = clf.predict_proba(X_test)

In [16]:
train_auc = roc_auc_score(y_train, probas_train[:, 1])
test_auc = roc_auc_score(y_test, probas_test[:, 1])

In [17]:
train_auc, test_auc

(0.7958412435270106, 0.5913628472222223)

### Hyperparameters optimization

In [18]:
space = {"max_depth": hp.randint("max_depth", 15),
         "learning_rate": hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
         "feature_fraction": hp.uniform('feature_fraction', 0.5, 1),
         "num_leaves": hp.choice('num_leaves', np.arange(16, 200, 10, dtype=int))
         }


In [19]:
def objective(params):
    lgbm= LGBMClassifier(**params)
    cv = StratifiedKFold(5)
    score = cross_val_score(lgbm, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1).mean()
    return -score

In [20]:
best = fmin(objective, space, algo=tpe.suggest, max_evals=5)

100%|█████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.32s/trial, best loss: -0.6143082470441359]


In [21]:
best_params = space_eval(space, best)
best_params

{'feature_fraction': 0.5224473675625827,
 'learning_rate': 0.06286942009633371,
 'max_depth': 7,
 'num_leaves': 56}

### Model after tuning

In [22]:
clf = LGBMClassifier(**best_params)

In [23]:
clf.fit(X_train, y_train)



LGBMClassifier(feature_fraction=0.5224473675625827,
               learning_rate=0.06286942009633371, max_depth=7, num_leaves=56)

In [24]:
probas_train = clf.predict_proba(X_train)
probas_test = clf.predict_proba(X_test)

In [25]:
train_auc = roc_auc_score(y_train, probas_train[:, 1])
test_auc = roc_auc_score(y_test, probas_test[:, 1])

In [26]:
train_auc, test_auc

(0.7598242377247163, 0.6057581018518519)