In [3]:
import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
from time import time
import datetime
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss
warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

In [13]:
import os
import os
import pandas as pd
from sklearn import model_selection
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def read_data(test_file, target_file, train_file, path='D:\\Dataset\\MOA\\',):
    
    df_train=pd.read_csv(os.path.join(path, train_file))
    df_test=pd.read_csv(os.path.join(path, test_file))
    df_target=pd.read_csv(os.path.join(path, target_file))
    
    print(df_train.shape, df_test.shape, df_target.shape, df_test.shape)
    
    return df_test, df_target, df_train


def process_data(train, test, target):
    df1 = train[train.cp_type!='ctl_vehicle'].reset_index(drop=True)
    df2 = test[test.cp_type!='ctl_vehicle'].reset_index(drop=True)
    
    #Handle categorical variables
    con_df = pd.concat([df1, df2], axis=0, ignore_index=True)
    con_df = pd.get_dummies(con_df, columns=['cp_type','cp_time','cp_dose']) 
    
    #Train test splitter
    train_df = con_df.iloc[:df1.shape[0],:]
    test_df   = con_df.iloc[df1.shape[0]:,:]
    
    #Train data combiner
    final_train = pd.merge(train_df, target, on='sig_id', how='left')

    
    print(final_train.shape)
    
    return final_train, test_df


def create_folds(train, target, path='D:\\Dataset\\MOA\\'):
    df=train.copy()
    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True)
    y = target.values
    skf = MultilabelStratifiedKFold(n_splits=5,shuffle=True,random_state=42)
    for fold,(idxT,idxV) in enumerate(skf.split(X=df, y=target)):
        df.loc[idxV, "kfold"] = fold
    print(df.kfold.value_counts())
    df.to_csv(os.path.join(path, "train_folds.csv"), index=False)
    

def Multi_log_loss(y_true, y_pred):

    score = []
    for i in range(y_true.shape[1]):
        
        log_score = metrics.log_loss(y_true[:,i].astype(float) , 
                                     y_pred[:,i].astype(float),
                                     labels=sorted(np.unique(y_true))
                                    
                                    )
        score.append(log_score)  
        
    return np.mean(np.array(score))



In [14]:
test, train_target,train = read_data('test_features.csv',  'train_targets_scored.csv','train_features.csv')

(23814, 876) (3982, 876) (23814, 207) (3982, 876)


In [15]:
test

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_0004d9e33,trt_cp,24,D1,-0.5458,0.1306,-0.5135,0.4408,1.5500,-0.1644,...,0.0981,0.7978,-0.1430,-0.2067,-0.2303,-0.1193,0.0210,-0.0502,0.1510,-0.7750
1,id_001897cda,trt_cp,72,D1,-0.1829,0.2320,1.2080,-0.4522,-0.3652,-0.3319,...,-0.1190,-0.1852,-1.0310,-1.3670,-0.3690,-0.5382,0.0359,-0.4764,-1.3810,-0.7300
2,id_002429b5b,ctl_vehicle,24,D1,0.1852,-0.1404,-0.3911,0.1310,-1.4380,0.2455,...,-0.2261,0.3370,-1.3840,0.8604,-1.9530,-1.0140,0.8662,1.0160,0.4924,-0.1942
3,id_00276f245,trt_cp,24,D2,0.4828,0.1955,0.3825,0.4244,-0.5855,-1.2020,...,0.1260,0.1570,-0.1784,-1.1200,-0.4325,-0.9005,0.8131,-0.1305,0.5645,-0.5809
4,id_0027f1083,trt_cp,48,D1,-0.3979,-1.2680,1.9130,0.2057,-0.5864,-0.0166,...,0.4965,0.7578,-0.1580,1.0510,0.5742,1.0900,-0.2962,-0.5313,0.9931,1.8380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,trt_cp,24,D1,0.4571,-0.5743,3.3930,-0.6202,0.8557,1.6240,...,-1.1790,-0.6422,-0.4367,0.0159,-0.6539,-0.4791,-1.2680,-1.1280,-0.4167,-0.6600
3978,id_ff925dd0d,trt_cp,24,D1,-0.5885,-0.2548,2.5850,0.3456,0.4401,0.3107,...,0.0210,0.5780,-0.5888,0.8057,0.9312,1.2730,0.2614,-0.2790,-0.0131,-0.0934
3979,id_ffb710450,trt_cp,72,D1,-0.3985,-0.1554,0.2677,-0.6813,0.0152,0.4791,...,0.4418,0.9153,-0.1862,0.4049,0.9568,0.4666,0.0461,0.5888,-0.4205,-0.1504
3980,id_ffbb869f2,trt_cp,48,D2,-1.0960,-1.7750,-0.3977,1.0160,-1.3350,-0.2207,...,0.3079,-0.4473,-0.8192,0.7785,0.3133,0.1286,-0.2618,0.5074,0.7430,-0.0484


In [16]:
# One-Hot encoding
for feature in ['cp_time', 'cp_type', 'cp_dose']:
    concat = pd.concat([train[feature], test[feature]], ignore_index=True)
    dummies = pd.get_dummies(concat, dummy_na=True, dtype=np.uint8, prefix=feature)
    train = pd.concat([train, dummies.iloc[:train.shape[0]]], axis=1)
    test = pd.concat([test, dummies.iloc[:test.shape[0]]], axis=1)

In [17]:
targets = [col for col in train_target.columns if col != 'sig_id']
print('Number of different labels:', len(targets))

Number of different labels: 206


In [18]:
features = [col for col in train.columns if col not in ['sig_id', 'cp_time', 'cp_type', 'cp_dose']]
print('Number of features:', len(features))

Number of features: 882


In [19]:
X = train[features]


In [20]:
params = {'num_leaves': 491,
          'min_child_weight': 0.03,
          'feature_fraction': 0.3,
          'bagging_fraction': 0.4,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.01,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'binary_logloss',
          "verbosity": 0,
          'reg_alpha': 0.4,
          'reg_lambda': 0.6,
          'random_state': 47
         }

In [22]:
sub = pd.read_csv(r'D:/Dataset/MOA/sample_submission.csv')

In [23]:
accumulative_loss = 0
skf = StratifiedKFold(n_splits=3, random_state=47, shuffle=True)

print('Execution time | Model number | logloss | new logloss | best coeff')
# 206 different models. One for each label
for model, target in enumerate(targets, 1):
    y = train_target[target]
    start_time = time()
    preds = np.zeros(test.shape[0])
    oof = np.zeros(X.shape[0])

    for trn_idx, test_idx in skf.split(X, y):
        
        trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(X.iloc[test_idx], label=y.iloc[test_idx])
        clf = lgb.train(params, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=0, early_stopping_rounds=20)
        oof[test_idx] = clf.predict(X.iloc[test_idx])
        preds += clf.predict(test[features]) / skf.n_splits

    loss = log_loss(y, oof)
    
    # Hacking the metric
    coeffs = [3, 2, 1.5, 1.4, 1.3, 1.2, 1.1, 1.0, 0.9, 0.8, 0.7]
    best_coeff = 0
    best_loss = loss
    for coeff in coeffs:
        new_oof = oof.copy()
        new_oof[new_oof < new_oof.mean() / coeff] = 0
        new_loss = log_loss(y, new_oof)
        if new_loss < loss:
            preds[preds < preds.mean() / coeff] = 0
            best_coeff = coeff
            best_loss = new_loss
    
    if best_coeff:
        preds[preds < preds.mean() / best_coeff] = 0
    # End of metric hacking
    sub[target] = preds

    accumulative_loss += best_loss
    print('{}\t\t{}\t{:.5f}\t\t{:.5f}\t\t{}'.format(str(datetime.timedelta(seconds=time() - start_time))[:7], model, loss, best_loss, best_coeff))
    del preds, oof, start_time, y, loss, best_loss, new_oof
    gc.collect();

Execution time | Model number | logloss | new logloss | best coeff
0:00:19		1	0.00571		0.00564		1.2
0:00:11		2	0.00613		0.00604		1.4
0:00:09		3	0.00796		0.00796		0
0:00:19		4	0.04608		0.04599		2
0:00:23		5	0.06681		0.06663		2
0:00:14		6	0.02062		0.02061		1.4
0:00:10		7	0.01603		0.01603		0
0:00:13		8	0.02601		0.02601		0
0:00:15		9	0.00302		0.00293		0.7
0:00:25		10	0.05886		0.05861		2
0:00:25		11	0.07658		0.07615		2
0:00:16		12	0.01505		0.01499		2
0:00:15		13	0.00193		0.00186		0.7
0:00:15		14	0.01151		0.01150		2
0:00:09		15	0.00432		0.00432		0
0:00:09		16	0.00432		0.00430		1.2
0:00:11		17	0.01444		0.01444		1.2
0:00:11		18	0.02456		0.02456		0
0:00:12		19	0.02229		0.02229		0
0:00:10		20	0.01127		0.01127		0
0:00:09		21	0.01154		0.01154		0
0:00:15		22	0.01980		0.01980		0
0:00:09		23	0.00234		0.00234		0
0:00:14		24	0.01248		0.01248		1.5
0:00:09		25	0.00433		0.00433		0
0:00:11		26	0.00451		0.00449		1.4
0:00:09		27	0.00431		0.00431		0
0:00:10		28	0.00615		0.00615		1.5
0:00:13		29	0.02057		0.020

In [26]:
sub.to_csv('lgbm_moa.csv', index=False)