In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import sys

# Print arrays in full instead of eliding the middle with "...".
# NaN is not a valid threshold (current NumPy raises ValueError for it);
# sys.maxsize is the supported way to request an untruncated repr.
np.set_printoptions(threshold=sys.maxsize)

In [5]:
# --- Configuration -----------------------------------------------------------
TRAIN_DATA_PATH = "../../train.csv"
TEST_DATA_PATH = "../../test.csv"
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
comment_col = 'comment_text'

# Fixed seed so the random split below is identical on every
# "Restart Kernel -> Run All"; without it each run trains and scores on
# different rows and the reported metrics are not reproducible.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

# --- Load data ---------------------------------------------------------------
train = pd.read_csv(TRAIN_DATA_PATH)
# `test` is the separate file at TEST_DATA_PATH; it is distinct from `test_`
# (the hold-out slice carved out of `train` below).
test = pd.read_csv(TEST_DATA_PATH)

# Vectorizer with default settings; it is fitted later on the training comments.
v = TfidfVectorizer()

# --- Split `train` into train_ / valid_ / test_ ------------------------------
# Each row is assigned independently by a uniform draw, so the proportions are
# approximate: ~70% training, remainder kept in `x`.
msk = split_rng.rand(len(train)) < 0.7
x = train[~msk]
train_ = train[msk]

# The remaining ~30% is halved at random into validation and hold-out parts.
msk = split_rng.rand(len(x)) < 0.5
test_ = x[~msk]
valid_ = x[msk]

In [9]:
# Raw comment text for the two splits used below.
train_comments = train_[comment_col]
valid_comments = valid_[comment_col]

# Fit the TF-IDF vocabulary on the training comments only, then reuse that
# fitted vocabulary to encode the validation comments.
X_train = v.fit_transform(train_comments)
# NOTE: despite the name, X_test / Y_test are built from `valid_`, not `test_`.
X_test = v.transform(valid_comments)

# Label matrices: one column per entry of `label_cols`, in that order.
Y_train = train_[label_cols].values
Y_test = valid_[label_cols].values

In [22]:
# One LightGBM Dataset per label, keyed by label name. Every label shares the
# same feature matrix; only the target column (selected by position in
# `label_cols`) differs.
lgb_train = {
    label: lgb.Dataset(X_train, Y_train[:, col_idx])
    for col_idx, label in enumerate(label_cols)
}

# Matching evaluation Datasets, each tied to its training Dataset through
# `reference`.
lgb_test = {
    label: lgb.Dataset(X_test, Y_test[:, col_idx], reference=lgb_train[label])
    for col_idx, label in enumerate(label_cols)
}

In [23]:
# Per-label result containers, keyed by label name.
# `auc_xgb` and `y_pred` are filled by the training loop; `acc_xgb` is only
# initialised here. The "_xgb" suffix is historical: the models trained in this
# notebook are LightGBM boosters.
auc_xgb, acc_xgb, y_pred = {}, {}, {}

In [30]:
# LightGBM training configuration shared by every per-label model.
params = dict(
    # Objective and boosting shape
    learning_rate=0.2,
    application='binary',
    num_leaves=31,
    # Logging / evaluation
    verbosity=-1,
    metric='auc',
    # Randomness and sub-sampling
    data_random_seed=2,
    # NOTE(review): no `bagging_freq` is set alongside `bagging_fraction`;
    # check the LightGBM parameter docs to confirm bagging is really active.
    bagging_fraction=0.8,
    feature_fraction=0.6,
    # Parallelism
    nthread=4,
    # L1 / L2 regularisation
    lambda_l1=1,
    lambda_l2=1,
)
# Train, save and score one binary model per label.
for col_idx, label in enumerate(label_cols):
    label_name = str(label)
    # Ground-truth column for this label in the evaluation split.
    y_true = Y_test[:, col_idx]

    print("Start Training for "+label_name)
    # At most 20 boosting rounds; stop early when the AUC on the evaluation
    # Dataset has not improved for 5 consecutive rounds.
    gbm = lgb.train(
        params,
        lgb_train[label],
        num_boost_round=20,
        valid_sets=lgb_test[label],
        early_stopping_rounds=5,
    )

    print("Saving the model")
    gbm.save_model('model_'+label_name+'.txt')

    print("Getting Predictions")
    # Predict with the best iteration recorded during training.
    y_pred[label] = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    # Both metrics are computed on the same split that drove early stopping.
    rmse = mean_squared_error(y_true, y_pred[label]) ** 0.5
    print('The rmse of prediction is:', rmse)
    auc_xgb[label] = roc_auc_score(y_true, y_pred[label])
    print("AUC for "+label_name+" is "+str(auc_xgb[label]))
    print("\n\n")

Start Training for toxic
[1]	valid_0's auc: 0.762788
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.828874
[3]	valid_0's auc: 0.866898
[4]	valid_0's auc: 0.892643
[5]	valid_0's auc: 0.894773
[6]	valid_0's auc: 0.903066
[7]	valid_0's auc: 0.906259
[8]	valid_0's auc: 0.908955
[9]	valid_0's auc: 0.912861
[10]	valid_0's auc: 0.917946
[11]	valid_0's auc: 0.918471
[12]	valid_0's auc: 0.920555
[13]	valid_0's auc: 0.922369
[14]	valid_0's auc: 0.9241
[15]	valid_0's auc: 0.926662
[16]	valid_0's auc: 0.928649
[17]	valid_0's auc: 0.930418
[18]	valid_0's auc: 0.929728
[19]	valid_0's auc: 0.93138
[20]	valid_0's auc: 0.932296
Did not meet early stopping. Best iteration is:
[20]	valid_0's auc: 0.932296
Saving the model
Getting Predictions
The rmse of prediction is: 0.20528318256635625
AUC for toxic is 0.9322964139760639



Start Training for severe_toxic
[1]	valid_0's auc: 0.854463
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.9071

Took inspiration from:
1. [LightGBM's sample code on GitHub](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py)
2. [Analytics Vidhya: LightGBM vs. XGBoost](https://www.analyticsvidhya.com/blog/2017/06/which-algorithm-takes-the-crown-light-gbm-vs-xgboost/#comment-152378)
3. The NBSVM notebook
4. [Kaggle discussion thread](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52645)