In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import lightgbm as lgb
import itertools
import pickle, gzip
import glob
from sklearn.preprocessing import StandardScaler
from tsfresh.feature_extraction import extract_features
np.warnings.filterwarnings('ignore')
import dask.dataframe as dd
import missingno as msno
from pandasql import sqldf
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import KFold
import matplotlib.gridspec as gridspec
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [2]:
#Always seed the randomness of this universe
np.random.seed(51)

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
%%time
train_metadata_kaggle = dd.read_csv('mydata_train_metadata.csv')
test_metadata_kaggle = dd.read_csv('mydata_test_metadata.csv')
train_metadata_kaggle = train_metadata_kaggle.compute()
test_metadata_kaggle = test_metadata_kaggle.compute()
print(train_metadata_kaggle.shape,test_metadata_kaggle.shape)

(7848, 100) (3492890, 99)
CPU times: user 2min 24s, sys: 7.97 s, total: 2min 32s
Wall time: 25.7 s


In [5]:
"""%%time
f1_train = dd.read_csv('cesium_train_embeddings.csv')
f1_test = dd.read_csv('cesium_test_embeddings.csv')
f1_train = f1_train.compute()
f1_test = f1_test.compute()
print(f1_train.shape,f1_test.shape)"""

"%%time\nf1_train = dd.read_csv('cesium_train_embeddings.csv')\nf1_test = dd.read_csv('cesium_test_embeddings.csv')\nf1_train = f1_train.compute()\nf1_test = f1_test.compute()\nprint(f1_train.shape,f1_test.shape)"

In [6]:
"""%%time
train_metadata_kaggle = train_metadata_kaggle.merge(f1_train,how='left',on = 'object_id')
test_metadata_kaggle = test_metadata_kaggle.merge(f1_test,how='left',on = 'object_id')
print(train_metadata_kaggle.shape,test_metadata_kaggle.shape)"""

"%%time\ntrain_metadata_kaggle = train_metadata_kaggle.merge(f1_train,how='left',on = 'object_id')\ntest_metadata_kaggle = test_metadata_kaggle.merge(f1_test,how='left',on = 'object_id')\nprint(train_metadata_kaggle.shape,test_metadata_kaggle.shape)"

In [7]:
"""%%time
f2_train = dd.read_csv('myfeatures_train_embeddings.csv')
f2_test = dd.read_csv('myfeatures_test_embeddings.csv')
f2_train = f2_train.compute()
f2_test = f2_test.compute()
print(f2_train.shape,f2_test.shape)"""

"%%time\nf2_train = dd.read_csv('myfeatures_train_embeddings.csv')\nf2_test = dd.read_csv('myfeatures_test_embeddings.csv')\nf2_train = f2_train.compute()\nf2_test = f2_test.compute()\nprint(f2_train.shape,f2_test.shape)"

In [8]:
"""%%time
train_metadata_kaggle = train_metadata_kaggle.merge(f2_train,how='left',on = 'object_id')
test_metadata_kaggle = test_metadata_kaggle.merge(f2_test,how='left',on = 'object_id')
print(train_metadata_kaggle.shape,test_metadata_kaggle.shape)"""

"%%time\ntrain_metadata_kaggle = train_metadata_kaggle.merge(f2_train,how='left',on = 'object_id')\ntest_metadata_kaggle = test_metadata_kaggle.merge(f2_test,how='left',on = 'object_id')\nprint(train_metadata_kaggle.shape,test_metadata_kaggle.shape)"

In [9]:
test_id = test_metadata_kaggle['object_id']

In [10]:
def multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds
    # Trasform y_true in dummies
    y_ohe = pd.get_dummies(y_true)
    # Normalize rows and limit y_preds to 1e-15, 1-1e-15
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Get the log for ones, .values is used to drop the index of DataFrames
    # Exclude class 99 for now, since there is no class99 in the training set
    # we gave a special process for that class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return loss


def lgb_multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds.reshape(y_true.shape[0], len(classes), order='F')

    # Trasform y_true in dummies
    y_ohe = pd.get_dummies(y_true)
    # Normalize rows and limit y_preds to 1e-15, 1-1e-15
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Get the log for ones, .values is used to drop the index of DataFrames
    # Exclude class 99 for now, since there is no class99 in the training set
    # we gave a special process for that class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return 'wloss', loss, False

In [11]:
"""%%time
cesium_drop = ['__flux_percentile_ratio_mid50___5_', '__flux_percentile_ratio_mid65___2_', '__median_absolute_deviation___2_',
  '__qso_log_chi2_qsonu___0_', '__stetson_k___1_', '__freq1_signif___2_', '__stetson_k___2_', '__freq3_amplitude1___1_',
   '__median_absolute_deviation___2_', '__percent_close_to_median___2_',
   '__freq_varrat___5_','__freq_varrat___4_','__qso_log_chi2_qsonu___3_','__qso_log_chi2_qsonu___1_',
'__qso_log_chi2_qsonu___5_','__std___4_', '__freq_varrat___3_','__amplitude___2_']
columns_from_my_data = ['A0_sum_flux', 'A0_mean_flux', 'A0_std_detected', 'A1_mean_detected', 'A2_sum_detected', 'A4_mean_detected',
 'A5_std_detected', 'A5_mean_detected', 'percent_p2_region_minus_1', 'A2_min_flux', 'A5_sum_detected']
train_metadata_kaggle.drop(cesium_drop,inplace=True,axis=1)
test_metadata_kaggle.drop(cesium_drop,inplace=True,axis=1)
train_metadata_kaggle.drop(columns_from_my_data,inplace=True,axis=1)
test_metadata_kaggle.drop(columns_from_my_data,inplace=True,axis=1)"""

"%%time\ncesium_drop = ['__flux_percentile_ratio_mid50___5_', '__flux_percentile_ratio_mid65___2_', '__median_absolute_deviation___2_',\n  '__qso_log_chi2_qsonu___0_', '__stetson_k___1_', '__freq1_signif___2_', '__stetson_k___2_', '__freq3_amplitude1___1_',\n   '__median_absolute_deviation___2_', '__percent_close_to_median___2_',\n   '__freq_varrat___5_','__freq_varrat___4_','__qso_log_chi2_qsonu___3_','__qso_log_chi2_qsonu___1_',\n'__qso_log_chi2_qsonu___5_','__std___4_', '__freq_varrat___3_','__amplitude___2_']\ncolumns_from_my_data = ['A0_sum_flux', 'A0_mean_flux', 'A0_std_detected', 'A1_mean_detected', 'A2_sum_detected', 'A4_mean_detected',\n 'A5_std_detected', 'A5_mean_detected', 'percent_p2_region_minus_1', 'A2_min_flux', 'A5_sum_detected']\ntrain_metadata_kaggle.drop(cesium_drop,inplace=True,axis=1)\ntest_metadata_kaggle.drop(cesium_drop,inplace=True,axis=1)\ntrain_metadata_kaggle.drop(columns_from_my_data,inplace=True,axis=1)\ntest_metadata_kaggle.drop(columns_from_my_data,inpl

In [12]:
train_metadata_kaggle.head()

Unnamed: 0,object_id,flux_min,flux_max,flux_mean,flux_median,flux_std,flux_skew,flux_err_min,flux_err_max,flux_err_mean,flux_err_median,flux_err_std,flux_err_skew,detected_mean,flux_ratio_sq_sum,flux_ratio_sq_skew,flux_by_flux_ratio_sq_sum,flux_by_flux_ratio_sq_skew,flux_w_mean,flux_diff1,flux_diff2,flux_diff3,"0__fft_coefficient__coeff_0__attr_""abs""","0__fft_coefficient__coeff_1__attr_""abs""",0__kurtosis,0__skewness,"1__fft_coefficient__coeff_0__attr_""abs""","1__fft_coefficient__coeff_1__attr_""abs""",1__kurtosis,1__skewness,"2__fft_coefficient__coeff_0__attr_""abs""","2__fft_coefficient__coeff_1__attr_""abs""",2__kurtosis,2__skewness,"3__fft_coefficient__coeff_0__attr_""abs""","3__fft_coefficient__coeff_1__attr_""abs""",3__kurtosis,3__skewness,"4__fft_coefficient__coeff_0__attr_""abs""","4__fft_coefficient__coeff_1__attr_""abs""",4__kurtosis,4__skewness,"5__fft_coefficient__coeff_0__attr_""abs""","5__fft_coefficient__coeff_1__attr_""abs""",5__kurtosis,5__skewness,flux__length,flux__longest_strike_above_mean,flux__longest_strike_below_mean,flux__mean_abs_change,flux__mean_change,flux_by_flux_ratio_sq__longest_strike_above_mean,flux_by_flux_ratio_sq__longest_strike_below_mean,mjd__mean_abs_change,mjd__mean_change,mjd_diff_det,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,haversine,latlon1,hostgal_photoz_certain,A0_sum_flux,A0_mean_flux,A0_std_detected,A1_mean_detected,A2_sum_detected,A4_mean_detected,A5_std_detected,A5_mean_detected,percent_p2_region_minus_1,A2_min_flux,A5_sum_detected,__flux_percentile_ratio_mid50___5_,__flux_percentile_ratio_mid65___2_,__median_absolute_deviation___2_,__qso_log_chi2_qsonu___0_,__stetson_k___1_,__freq1_signif___2_,__stetson_k___2_,__freq3_amplitude1___1_,__median_absolute_deviation___2_.1,__percent_close_to_median___2_,__freq_varrat___5_,__freq_varrat___4_,__qso_log_chi2_qsonu___3_,__qso_log_chi2_qsonu___1_,__qso_log_chi2_qsonu___5_,__std___4_,__freq_varrat___3_,__amplitude___2_,outlierScore,hipd,lipd,highEnergy_transitory_1.0_TF,highEnergy_transitory_1.5_TF,lowEnergy_transitory_1.0_TF,lowEnergy_transitory_1.5_TF
0,615,-1100.440063,660.626343,-123.096998,-89.477524,394.109851,-0.34954,2.13051,12.845472,4.482743,3.835269,1.744747,1.62374,0.946023,2929669.0,0.812722,-960176600.0,-1.414322,-327.742307,1761.066406,-14.306331,-5.373326,205.036926,1628.427737,-1.475181,0.128917,22370.594834,2806.374162,-1.255123,0.41558,7780.500807,2805.598113,-1.409885,0.339918,7024.003068,2536.068846,-1.449858,0.293128,3245.366349,2741.539785,-1.548319,0.200096,2704.641265,2893.344217,-1.59282,0.125268,352.0,19.0,29.0,202.114067,1.999688,35.0,4.0,2.631898,2.631898,873.7903,0.0,0.0,,0.017,92,0.319006,-1.528827,0.0,-205.03693,-3.254554,0.3528,0.9653,57,0.983,0.2854,0.912,0.362,-682.0,52,5.56223e-26,6.71941e-20,368.129,6.21789,1.09173,5.49891,1.05349,114.465,368.129,0.172414,0.401664,0.129578,9.16612,9.50875,7.34498,289.277,0.110785,646.922,0.0,1.0,1.0,0,0,0,0
1,713,-14.735178,14.770886,-1.423351,-0.873033,6.471144,0.014989,0.639458,9.115748,2.35962,1.998217,1.509888,1.633246,0.171429,5886.068,3.439423,-28750.87,-3.454554,-4.884564,29.506064,-20.730002,-6.040676,190.427851,299.586559,-1.014003,0.260052,57.109047,192.539229,-1.09717,-0.087865,44.477327,191.057528,-1.188472,-0.022678,55.270113,212.522263,-1.142896,-0.167176,50.414646,203.892482,-1.190245,-0.064134,100.473776,143.963093,-0.797047,0.218182,350.0,50.0,73.0,2.935177,-0.050944,199.0,8.0,14.352571,14.352571,846.8017,1.6267,0.2552,45.4063,0.007,88,1.698939,3.258921,2.099614,-190.42786,-2.720398,0.3525,0.2678,15,0.0893,0.0,0.0,0.25,-10.07,0,0.0211907,0.0824318,5.10035,2.18719,1.0661,3.95669,1.08818,0.851103,5.10035,0.178571,0.369518,0.166179,2.79753,3.12481,0.659762,6.34953,0.111883,10.2985,0.875,1.909016,2.0,1,0,1,1
2,730,-19.159811,47.310059,2.267434,0.409172,8.022239,3.177854,0.695106,11.281384,2.471061,1.990851,1.721134,1.823726,0.069697,4124.452,5.480405,104650.2,5.989138,25.37311,66.46987,29.315018,2.619697,3.46179,4.729538,0.474215,0.35691,7.334944,13.515895,0.976374,0.471342,124.84525,119.500254,5.13129,2.385066,168.280524,162.799417,7.125665,2.662075,219.745132,202.532898,6.081065,2.537802,231.509177,199.28637,3.58313,1.680352,330.0,13.0,32.0,4.227614,-0.008131,4.0,222.0,3.580623,3.580623,78.7737,0.2262,0.0157,40.2561,0.021,42,1.81803,3.128522,0.229779,-3.46179,-0.04808,0.0,0.0,7,0.098,0.2715,0.0784,0.769,-2.85,4,0.000194228,0.55118,1.04253,-0.307228,0.933091,4.61663,0.634723,0.454918,1.04253,0.769231,0.500549,0.318256,3.04833,0.127758,1.66943,10.6048,0.292954,11.9218,0.0,1.0,1.0,0,0,0,0
3,745,-15.494463,220.795212,8.909206,1.035895,27.558208,4.979826,0.56717,55.892746,2.555576,1.819875,3.537324,10.741655,0.173789,94161.65,9.611274,14391250.0,11.141069,152.835617,236.289675,26.521968,1.546038,129.421659,123.298327,4.629801,2.023211,320.174052,280.440312,50.86888,7.007099,543.845781,491.54827,36.088137,5.688194,807.123762,710.721942,16.392533,3.751603,735.528417,680.05528,13.747434,3.47642,591.037583,523.503586,12.134629,3.170857,351.0,19.0,115.0,7.065548,0.008044,4.0,201.0,2.061453,2.061453,123.6872,0.2813,1.1523,40.7951,0.007,90,0.495223,6.893743,0.890445,129.42166,1.797523,0.1655,0.125,16,0.2322,0.3364,0.1273,0.768,-2.16,7,0.0084016,0.546369,1.41645,1.4322,0.295163,3.96789,0.394683,3.59567,1.41645,0.892857,0.489589,0.360868,6.06886,5.84082,2.82044,32.7725,0.290652,111.477,0.0,1.0,1.0,0,0,0,0
4,1124,-16.543753,143.600189,7.145702,1.141288,20.051722,4.406298,0.695277,11.38369,2.753004,2.214854,1.933837,1.794938,0.173295,34324.18,7.868462,3015599.0,7.908174,87.85639,160.143942,22.411225,1.822792,41.639721,32.987125,0.822496,-0.332169,268.808929,207.812015,6.112295,2.377222,594.150153,498.50982,10.343254,3.075437,643.020183,555.512641,14.095862,3.603208,574.553907,524.107264,16.377058,3.904008,393.114268,357.907185,14.43447,3.657305,352.0,19.0,158.0,6.727352,0.012543,10.0,231.0,2.231855,2.231855,133.9113,0.2415,0.0176,40.4166,0.024,90,0.395162,-1.928064,0.245788,41.63972,0.660948,0.0,0.2241,18,0.1724,0.2578,0.0702,0.707,-2.084,4,0.0270226,0.601811,1.33779,-0.064359,0.674119,5.24444,0.560453,1.20558,1.33779,0.741379,0.673592,0.382847,5.07231,3.4079,2.31292,26.6333,0.250639,54.3781,0.375,1.0,1.0,0,0,1,0


In [13]:
%%time
final_dict = {}

loss_list = []
temp = train_metadata_kaggle.copy()
#temp = temp.merge(train_metadata[['object_id',column_]],on = 'object_id',how = 'left')
y = temp['target']
del temp['target']
classes = sorted(y.unique())

# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
class_weight = {
    c: 1 for c in classes
}
for c in [64, 15]:
    class_weight[c] = 2

#print('Unique classes : ', classes)

train_id = temp['object_id']
del temp['object_id']
# Compute weights
w = y.value_counts()
weights = {i : np.sum(w) / w[i] for i in w.index}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
clfs = []
importances = pd.DataFrame()
lgb_params = {
'random_state':51,
'device': 'cpu', 
'objective': 'multiclass', 
'num_class': 14, 
'boosting_type': 'gbdt', 
'n_jobs': -1, 
'max_depth': 7, 
'n_estimators': 1000, 
'subsample_freq': 2, 
'subsample_for_bin': 5000, 
'min_data_per_group': 100, 
'max_cat_to_onehot': 4, 
'cat_l2': 1.0, 
'cat_smooth': 59.5, 
'max_cat_threshold': 32, 
'metric_freq': 10, 
'verbosity': -1, 
'metric': 'multi_logloss', 
'xgboost_dart_mode': False, 
'uniform_drop': False, 
'colsample_bytree': 0.5, 
'drop_rate': 0.173, 
'learning_rate': 0.0267, 
'max_drop': 5, 
'min_child_samples': 10,
'min_child_weight': 200.0, 
#'min_child_weight': 100.0, 
'min_split_gain': 0.1, 
'num_leaves': 7, 
#'reg_alpha': 0.1,
'reg_alpha': 0.0, 
'reg_lambda': 0.00023, 
'skip_drop': 0.44, 
'subsample': 0.75}
oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
    val_x, val_y = temp.iloc[val_], y.iloc[val_]

    clf = lgb.LGBMClassifier(**lgb_params)
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=lgb_multi_weighted_logloss,
        verbose=False,
        early_stopping_rounds=50,
        sample_weight=trn_y.map(weights)
    )
    oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
    loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
    #loss_list.append(loss_oof)
    print(fold_,loss_oof)

    imp_df = pd.DataFrame()
    imp_df['feature'] = temp.columns
    imp_df['gain'] = clf.feature_importances_
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)

    clfs.append(clf)
print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
#final_dict[column_] = loss_list

0 0.6275658544463559
1 0.5990572385367394
2 0.6562404553921372
3 0.6097834282043219
4 0.6101227441786252
MULTI WEIGHTED LOG LOSS : 0.62060 
CPU times: user 8min 36s, sys: 1.28 s, total: 8min 38s
Wall time: 1min 7s


In [14]:
#modify to work with kfold
#def smoteAdataset(Xig, yig, test_size=0.2, random_state=0):
def smoteAdataset(Xig_train, yig_train, Xig_test, yig_test):
    
        
    sm=SMOTE(random_state=51)
    Xig_train_res, yig_train_res = sm.fit_sample(Xig_train, yig_train.ravel())

        
    return Xig_train_res, pd.Series(yig_train_res), Xig_test, pd.Series(yig_test)

In [15]:
%%time
final_dict = {}

loss_list = []
temp = train_metadata_kaggle.copy()
temp.fillna(0, inplace=True)
#temp = temp.merge(train_metadata[['object_id',column_]],on = 'object_id',how = 'left')
y = temp['target']
del temp['target']
classes = sorted(y.unique())

# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
class_weight = {
    c: 1 for c in classes
}
for c in [64, 15]:
    class_weight[c] = 2

#print('Unique classes : ', classes)

train_id = temp['object_id']
del temp['object_id']
# Compute weights
w = y.value_counts()
weights = {i : np.sum(w) / w[i] for i in w.index}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
clfs = []
importances = pd.DataFrame()
lgb_params = {
'random_state':51,
'device': 'cpu', 
'objective': 'multiclass', 
'num_class': 14, 
'boosting_type': 'gbdt', 
'n_jobs': -1, 
'max_depth': 7, 
'n_estimators': 1000, 
'subsample_freq': 2, 
'subsample_for_bin': 5000, 
'min_data_per_group': 100, 
'max_cat_to_onehot': 4, 
'cat_l2': 1.0, 
'cat_smooth': 59.5, 
'max_cat_threshold': 32, 
'metric_freq': 10, 
'verbosity': -1, 
'metric': 'multi_logloss', 
'xgboost_dart_mode': False, 
'uniform_drop': False, 
'colsample_bytree': 0.5, 
'drop_rate': 0.173, 
'learning_rate': 0.0267, 
'max_drop': 5, 
'min_child_samples': 10, 
'min_child_weight': 200.0, 
'min_split_gain': 0.1, 
'num_leaves': 7, 
'reg_alpha': 0.0, 
'reg_lambda': 0.00023, 
'skip_drop': 0.44, 
'subsample': 0.75}
oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
    val_x, val_y = temp.iloc[val_], y.iloc[val_]

    trn_xa, trn_y, val_xa, val_y=smoteAdataset(trn_x.values, trn_y.values, val_x.values, val_y.values)
    trn_x=pd.DataFrame(data=trn_xa, columns=trn_x.columns)
    val_x=pd.DataFrame(data=val_xa, columns=val_x.columns)
    
    print(trn_x.shape,trn_y.shape,val_x.shape,val_y.shape)
    
    clf = lgb.LGBMClassifier(**lgb_params)
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=lgb_multi_weighted_logloss,
        verbose=False,
        early_stopping_rounds=50,
        sample_weight=trn_y.map(weights)
    )
    oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
    loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
    #loss_list.append(loss_oof)
    print(fold_,loss_oof)

    imp_df = pd.DataFrame()
    imp_df['feature'] = temp.columns
    imp_df['gain'] = clf.feature_importances_
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)

    clfs.append(clf)
print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
#final_dict[column_] = loss_list

(25900, 98) (25900,) (1574, 98) (1574,)
0 0.5773895028501574
(25900, 98) (25900,) (1572, 98) (1572,)
1 0.6010813522091171
(25900, 98) (25900,) (1571, 98) (1571,)
2 0.6550692442902669
(25914, 98) (25914,) (1567, 98) (1567,)
3 0.5967314617978657
(25914, 98) (25914,) (1564, 98) (1564,)
4 0.6124223750584648
MULTI WEIGHTED LOG LOSS : 0.60843 
CPU times: user 22min 31s, sys: 1.74 s, total: 22min 33s
Wall time: 3min 9s


In [16]:
temp_test = test_metadata_kaggle.copy()

In [17]:
del temp_test['object_id']

In [18]:
temp_test.fillna(0.0,inplace = True)

In [19]:
print(temp.shape,temp_test.shape)

(7848, 98) (3492890, 98)


In [20]:
list(temp.columns) == list(temp_test.columns)

True

In [21]:
%%time
test_pred0 = pd.DataFrame()
test_pred1 = pd.DataFrame()
test_pred2 = pd.DataFrame()
test_pred3 = pd.DataFrame()
test_pred4 = pd.DataFrame()

list_of_df = [test_pred0,test_pred1,test_pred2,test_pred3,test_pred4]

for num,c in enumerate(clfs):
    print(num)
    for k in range(0,len(temp_test),500000):
        test_pred = pd.DataFrame(c.predict_proba(temp_test[ k:k+500000] ))
        list_of_df[num] = pd.concat([list_of_df[num],test_pred],axis=0)
        del test_pred

0
1
2
3
4
CPU times: user 3h 8min 22s, sys: 23.2 s, total: 3h 8min 45s
Wall time: 24min 26s


In [22]:
test_pred2 = pd.DataFrame()
test_pred2 = (list_of_df[0] + list_of_df[1] + list_of_df[2] + list_of_df[3] + list_of_df[4])/5

In [23]:
print(test_pred2.shape)

(3492890, 14)


In [24]:
#test_pred2 = pd.DataFrame(np.random.rand(10,14))

In [25]:
test_pred2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.000259,0.00105,0.000127,0.613339,0.099597,0.000296,0.222354,3.3e-05,0.000115,0.002852,0.000276,0.058215,0.000156,0.00133
1,0.000163,0.008886,0.000166,0.167974,0.051766,0.000279,0.077005,0.000154,0.0006,0.040682,0.00169,0.638965,0.000169,0.011499
2,0.000257,0.012684,0.000259,0.135556,0.090075,0.000408,0.037354,0.0023,0.000548,0.160885,0.003061,0.544588,0.000174,0.011851
3,0.000281,0.001986,0.000299,0.051622,0.021756,0.000398,0.087818,0.005267,0.000436,0.49654,0.001185,0.291865,0.000163,0.040384
4,6.2e-05,0.001839,5.8e-05,0.09222,0.196609,0.000139,0.02104,2.2e-05,0.000102,0.016157,0.0001,0.671488,5.4e-05,0.00011


In [26]:
temp_columns = ['object_id','class_6','class_15','class_16','class_42','class_52','class_53','class_62','class_64','class_65','class_67','class_88','class_90','class_92','class_95','class_99']

In [27]:
test_pred2.columns = temp_columns[1:15]

In [28]:
def getUnknown(data):
    return ((((((data["mymedian"]) + (((data["mymean"]) / 2.0)))/2.0)) + (((((1.0) - (((data["mymax"]) * (((data["mymax"]) * (data["mymax"]))))))) / 2.0)))/2.0)

feats = ['class_6', 'class_15', 'class_16', 'class_42', 'class_52', 'class_53',
         'class_62', 'class_64', 'class_65', 'class_67', 'class_88', 'class_90',
         'class_92', 'class_95']

In [29]:
klm = pd.DataFrame()
klm['mymean'] = test_pred2[feats].mean(axis=1)
klm['mymedian'] = test_pred2[feats].median(axis=1)
klm['mymax'] = test_pred2[feats].max(axis=1)

In [30]:
test_pred2['class_99'] = getUnknown(klm)

In [31]:
test_pred2.tail()

Unnamed: 0,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
492885,0.000221,0.386889,0.000399,0.145173,0.249667,0.000421,0.096654,0.00015,0.002551,0.014446,0.003441,0.099421,0.000201,0.000367,0.2452
492886,0.000315,0.012543,0.000334,0.051576,0.17561,0.000374,0.02151,0.447852,0.000694,0.030112,0.001282,0.253879,0.000317,0.003602,0.23849
492887,0.001076,0.038638,0.000232,0.854522,0.016455,0.000608,0.070924,6.8e-05,0.000353,0.001631,0.000889,0.012521,0.000292,0.001793,0.103272
492888,8.4e-05,0.729113,0.000109,0.134453,0.007923,0.000109,0.011622,0.086651,0.000358,0.000482,0.000135,0.028598,7.1e-05,0.000291,0.162133
492889,0.000166,0.039832,0.000185,0.289172,0.517296,0.000309,0.036418,6.5e-05,0.000349,0.01475,0.000566,0.100198,0.000186,0.000507,0.224456


In [32]:
test_pred2 = test_pred2.reset_index(drop=True)

In [33]:
print(test_pred2.shape,test_id.shape)

(3492890, 15) (3492890,)


In [34]:
test_id.tail()

16860    130787966
16861    130787971
16862    130787974
16863    130788053
16864    130788054
Name: object_id, dtype: int64

In [35]:
test_id = test_id.reset_index(drop=True)

In [36]:
test_id.index == test_pred2.index

array([ True,  True,  True, ...,  True,  True,  True])

In [37]:
%%time
test_pred = pd.concat([test_id,test_pred2],axis=1)

CPU times: user 102 ms, sys: 120 ms, total: 222 ms
Wall time: 221 ms


In [38]:
test_pred = test_pred[temp_columns]

In [39]:
test_pred.head()

Unnamed: 0,object_id,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
0,13,0.000259,0.00105,0.000127,0.613339,0.099597,0.000296,0.222354,3.3e-05,0.000115,0.002852,0.000276,0.058215,0.000156,0.00133,0.201415
1,14,0.000163,0.008886,0.000166,0.167974,0.051766,0.000279,0.077005,0.000154,0.0006,0.040682,0.00169,0.638965,0.000169,0.011499,0.195032
2,17,0.000257,0.012684,0.000259,0.135556,0.090075,0.000408,0.037354,0.0023,0.000548,0.160885,0.003061,0.544588,0.000174,0.011851,0.220414
3,23,0.000281,0.001986,0.000299,0.051622,0.021756,0.000398,0.087818,0.005267,0.000436,0.49654,0.001185,0.291865,0.000163,0.040384,0.22923
4,34,6.2e-05,0.001839,5.8e-05,0.09222,0.196609,0.000139,0.02104,2.2e-05,0.000102,0.016157,0.0001,0.671488,5.4e-05,0.00011,0.183267


In [40]:
print(test_pred.shape)

(3492890, 16)


In [41]:
%%time
test_pred.to_csv('test_pred_20.csv',index=False)

CPU times: user 1min 26s, sys: 960 ms, total: 1min 27s
Wall time: 1min 27s


In [42]:
!kaggle competitions submit -c PLAsTiCC-2018 -f test_pred_20.csv -m "Message"

100%|███████████████████████████████████████| 1.06G/1.06G [40:34<00:00, 469kB/s]
Successfully submitted to PLAsTiCC Astronomical Classification