In [17]:
import pandas as pd
import numpy as np
import os
import re
import lightgbm as lgb
import warnings
warnings.filterwarnings(action='ignore', module='sklearn')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from feature_engineering import *

In [22]:
# Project directories, all resolved relative to the current working directory:
#   data_path - competition CSVs, meta_path - model metadata,
#   sub_path  - base-model predictions and written submissions.
root_path = os.getcwd()
data_path, meta_path, sub_path = (
    os.path.join(root_path, folder) for folder in ("data", "meta", "submissions")
)

In [24]:
def get_subs(nums):
    """Load test-set and out-of-fold predictions of the numbered base models.

    For every number in ``nums`` this reads ``submission<num>.csv`` and
    ``oof_train<num>.csv`` from ``sub_path``, keeps only the ``LABELS``
    columns of each file, and concatenates the models side by side.

    Parameters
    ----------
    nums : iterable of int
        Identifiers of the base-model runs to load. The argument itself is
        used (not the notebook-level ``subnums`` variable), so the function
        also works on a fresh kernel and for any selection passed in.

    Returns
    -------
    (subs, oofs) : tuple of numpy.ndarray
        ``subs`` holds the test predictions and ``oofs`` the out-of-fold
        train predictions; each has ``len(nums) * len(LABELS)`` columns,
        ordered model by model.

    Raises
    ------
    ValueError
        If ``nums`` is empty (there is nothing to concatenate).
    """
    nums = list(nums)
    if not nums:
        raise ValueError("get_subs needs at least one submission number")

    def _load(prefix):
        # One (n_rows, n_labels) block per model, stacked column-wise.
        return np.hstack([
            np.array(pd.read_csv(os.path.join(sub_path, prefix + str(num) + ".csv"))[LABELS])
            for num in nums
        ])

    subs = _load("submission")
    oofs = _load("oof_train")
    return subs, oofs

In [25]:
# Name of the text column that the handcrafted features are computed from.
INPUT_COLUMN = "comment_text"

# Train and test tables; missing values are replaced by a single space.
train, test = (
    pd.read_csv(os.path.join(data_path, csv_name)).fillna(' ')
    for csv_name in ("train.csv", "test.csv")
)

# Submission template whose label columns are filled in by the stacker.
sub = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))

# Target columns: everything from the third column of train.csv onward
# (presumably the first two are the id and the comment text - verify).
LABELS = train.columns[2:]

#### Handcrafted features

In [26]:
# Per-comment feature extractors; all but the builtin `len` come from the
# star import of feature_engineering.
feature_functions = [
    len,
    asterix_freq,
    uppercase_freq,
    line_change_freq,
    rep_freq,
    question_freq,
    has_ip,
    has_talk_tag,
    link_count,
    starts_with_i,
    starts_with_you,
    about_image,
]

# Function names, used later to select the feature columns.
features = list(map(lambda fn: fn.__name__, feature_functions))

# Apply every extractor to the comment text of both splits.
# NOTE(review): engineer_features is assumed to return a DataFrame with one
# column per function name - confirm in feature_engineering.
F_train, F_test = (
    engineer_features(frame[INPUT_COLUMN], feature_functions)
    for frame in (train, test)
)

#### Get out-of-fold predictions 

In [41]:
# Earlier base-model selections, kept for reference:
# subnums = [21,22,29,33,37,44,45,51,52,57,59,62,66,68,74,76,79,84,86]
# subnums = [21,22,29,33,37,44,45,51,52,57,59,62,66,68,74,76]

# Current selection of base-model run numbers fed to the stacker.
subnums = [
    21, 22, 29, 33, 37, 44, 45, 51,
    52, 57, 59, 62, 66, 68, 74, 95,
]

# Test predictions (subs) and out-of-fold train predictions (oofs).
subs, oofs = get_subs(subnums)

#### Include Abhishek's models

In [42]:
# Metadata table for Abhishek's models, read from the meta directory.
# get_best_abh_models sorts it on the "cv" column and reads the "model" column.
abh_model_info = pd.read_csv(os.path.join(meta_path, "abh_models.csv"))

In [43]:
def get_best_abh_models(num):
    """Return the names of the ``num`` models with the highest "cv" value.

    The notebook-level ``abh_model_info`` table is ordered by its "cv"
    column in descending order; the "model" entries of the first ``num``
    rows are returned as a plain list.
    """
    ranked = abh_model_info.sort_values("cv", ascending=False)
    top = ranked.iloc[:num]
    return list(top["model"])

In [8]:
# Select the 62 models with the highest "cv" value.
# NOTE(review): the following cell reassigns abh_filenames to a hand-picked
# list, so on a top-to-bottom run this selection is not used downstream.
abh_filenames = get_best_abh_models(62)

In [44]:
# Hand-picked subset of Abhishek's models; the names are the file-name
# prefixes of the corresponding *_train.csv / *_test.csv prediction files.
abh_filenames = [
    "gru_lstm_cnn_preprocess_fasttext_crawl_v2_500",
    "gru_lstm_preprocess_fasttext_crawl_lr",
    "pooled_dict_preprocess_bidir_gru_fasttext_wiki_100k_shuffle10",
    "cnn_cnn_gru_preprocess_fasttext_crawl_shuffle10",
]

In [45]:
# Display the model names that will be loaded.
abh_filenames

['gru_lstm_cnn_preprocess_fasttext_crawl_v2_500',
 'gru_lstm_preprocess_fasttext_crawl_lr',
 'pooled_dict_preprocess_bidir_gru_fasttext_wiki_100k_shuffle10',
 'cnn_cnn_gru_preprocess_fasttext_crawl_shuffle10']

In [46]:
# File names of each selected model's train-set and test-set predictions:
# the model name followed by the split suffix.
abh_trainfiles, abh_testfiles = (
    [name + suffix for name in abh_filenames]
    for suffix in ("_train.csv", "_test.csv")
)

In [47]:
# Read every prediction file from the submissions directory as a raw array.
# The files are parsed with header=None, i.e. the first line is data.
abh_train = [
    np.array(pd.read_csv(os.path.join(sub_path, fname), header=None))
    for fname in abh_trainfiles
]
abh_test = [
    np.array(pd.read_csv(os.path.join(sub_path, fname), header=None))
    for fname in abh_testfiles
]

In [48]:
# Append the columns of Abhishek's models to the right of the predictions
# already loaded. A single hstack per split gives the same column order as
# appending one model at a time, without re-copying the whole matrix on every
# iteration and without leaving a stray loop variable in the namespace.
# NOTE: oofs/subs are extended in place, so re-running this cell appends the
# same columns again; re-run the get_subs cell first.
oofs = np.hstack([oofs] + list(abh_train))
subs = np.hstack([subs] + list(abh_test))

#### Stack everything together 

In [49]:
# Final design matrices: handcrafted feature columns first, then the stacked
# base-model predictions (out-of-fold for train, test-set for test).
# `.values` replaces DataFrame.as_matrix(), which was deprecated in
# pandas 0.23 and removed in pandas 1.0; both return the same ndarray.
X_train = np.hstack([F_train[features].values, oofs])
X_test = np.hstack([F_test[features].values, subs])

In [50]:
# Sanity check: (rows, columns) of the stacked training matrix.
X_train.shape

(159571, 132)

In [51]:
# Second-level model: a small gradient-boosted tree classifier trained on the
# handcrafted features plus the base-model predictions.
# `feature_fraction` was dropped because it is an alias of `colsample_bytree`
# and was given the same value (0.45); passing both only triggers a warning.
stacker = lgb.LGBMClassifier(
    boosting_type="gbdt",
    metric="auc",
    n_estimators=125,
    learning_rate=0.1,
    max_depth=3,
    num_leaves=10,
    colsample_bytree=0.45,
    bagging_fraction=0.8,
    bagging_freq=5,
    reg_lambda=0.2,
)

# Unshuffled stratified 5-fold split. The original passed random_state=42
# without shuffle=True, which has no effect on the folds and raises a
# ValueError on scikit-learn >= 0.24; omitting it keeps the same folds.
# The splitter is identical for every label, so it is built once.
cv = StratifiedKFold(n_splits=5)

scores = []
for label in LABELS:
    print(label)
    # Cross-validated ROC AUC of the stacker for this label.
    score = cross_val_score(stacker, X_train, train[label], cv=cv, scoring='roc_auc')
    print("AUC:", score)
    scores.append(np.mean(score))
    # Refit on all training rows and store the positive-class probability
    # for the test set in the submission frame.
    stacker.fit(X_train, train[label])
    sub[label] = stacker.predict_proba(X_test)[:, 1]

# Mean over labels of the per-label mean fold AUC.
print("CV score:", np.mean(scores))

toxic
AUC: [0.98896157 0.98883888 0.98878462 0.98905488 0.9884646 ]
severe_toxic
AUC: [0.99229864 0.99214421 0.99088384 0.99210799 0.99292307]
obscene
AUC: [0.99553505 0.99529337 0.99602365 0.99553795 0.99512791]
threat
AUC: [0.99045136 0.99636158 0.99173286 0.99269199 0.99512093]
insult
AUC: [0.9902852  0.98942226 0.9909165  0.98984552 0.98998047]
identity_hate
AUC: [0.99239905 0.98883953 0.99108728 0.9913979  0.99337036]
CV score: 0.9918627673507275


In [52]:
# Write the stacked predictions to the submissions directory as
# submission96.csv, without the DataFrame index column.
sub.to_csv(os.path.join(sub_path,"submission96.csv"), index=False)

In [16]:
# submission90 AUC: 0.992135625004547
# submission91 AUC: 0.9922507801414321
# submission92 AUC: 0.991882486097003
# submission93 AUC: 0.9920944709837135
# submission94 AUC: 0.9922206603538549
# submission96 AUC: 0.9918627673507275