In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score

np.set_printoptions(threshold=np.nan)

In [2]:
TRAIN_DATA_PATH = "../../train.csv"
TEST_DATA_PATH = "../../test.csv"
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
comment_col = 'comment_text'
train = pd.read_csv(TRAIN_DATA_PATH)
test = pd.read_csv(TEST_DATA_PATH)
v = TfidfVectorizer()
msk = np.random.rand(len(train)) < 0.8
valid_ = train[~msk]
train_ = train[msk]

In [3]:
X_train = v.fit_transform(train_[comment_col]).toarray()
Y_train = train_[label_cols].values
X_test = v.transform(valid_[comment_col]).toarray()
Y_test = valid_[label_cols].values

In [4]:
v.vocabulary_

{'congratulations': 36335,
 'from': 59872,
 'me': 93036,
 'as': 17008,
 'well': 158040,
 'use': 153295,
 'the': 144986,
 'tools': 147453,
 'talk': 142826,
 'your': 162898,
 'vandalism': 154134,
 'to': 147012,
 'matt': 92309,
 'shirvington': 132212,
 'article': 16755,
 'has': 67483,
 'been': 21725,
 'reverted': 124063,
 'please': 113027,
 'don': 47036,
 'do': 46633,
 'it': 77322,
 'again': 10958,
 'or': 106254,
 'you': 162836,
 'will': 159820,
 'be': 21375,
 'banned': 20416,
 'sorry': 136064,
 'if': 72574,
 'word': 160726,
 'nonsense': 102866,
 'was': 157294,
 'offensive': 104926,
 'anyway': 14978,
 'not': 103182,
 'intending': 75677,
 'write': 161213,
 'anything': 14965,
 'in': 73667,
 'wow': 161003,
 'they': 145589,
 'would': 160981,
 'jump': 80004,
 'on': 105613,
 'for': 58595,
 'merely': 94030,
 'requesting': 123250,
 'that': 144938,
 'more': 97364,
 'encyclopedic': 51425,
 'so': 135188,
 'one': 105647,
 'can': 28916,
 'school': 129178,
 'reference': 121600,
 'have': 67719,
 'select

In [5]:
lgb_train = {}
lgb_test = {}
for i,j in enumerate(label_cols):
    lgb_train[j] = lgb.Dataset(X_train,Y_train[i])
    lgb_test[j] = lgb.Dataset(X_test,Y_test[i],reference=lgb_train[j])

In [6]:
auc_xgb = {}
y_pred = {}

In [None]:
# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}
for j,i in enumerate(label_cols):
    print("Start Training for "+str(i))
    gbm = lgb.train(params, lgb_train[i], num_boost_round=20, valid_sets=lgb_test[i], early_stopping_rounds=5)
    print("Saving the model")
    gbm.save_model('model_'+str(i)+'.txt')
    print("Getting Predictions")
    y_pred[i] = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    print('The rmse of prediction is:', mean_squared_error(Y_test[j], y_pred[i]) ** 0.5)
    auc_xgb[i] =  roc_auc_score(Y_test[j], y_pred[i])
    print(auc_xgb[i])
    print("\n\n")

Start Training for toxic


Took Inspiration from:
1. [github documentation's sample code](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py)
2. [Analytics Vidhya](https://www.analyticsvidhya.com/blog/2017/06/which-algorithm-takes-the-crown-light-gbm-vs-xgboost/#comment-152378)
3. NBSVM notebook
4. [Kaggle](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52645)