In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import lightgbm as lgb
import itertools
import pickle, gzip
import glob
from sklearn.preprocessing import StandardScaler
from tsfresh.feature_extraction import extract_features
np.warnings.filterwarnings('ignore')
import dask.dataframe as dd
import missingno as msno
from pandasql import sqldf
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
import matplotlib.gridspec as gridspec

In [2]:
%%time
train_metadata_kaggle = dd.read_csv('klm_train.csv')
test_metadata_kaggle = dd.read_csv('klm_test.csv')
train_metadata_kaggle = train_metadata_kaggle.compute()
test_metadata_kaggle = test_metadata_kaggle.compute()
print(train_metadata_kaggle.shape,test_metadata_kaggle.shape)

(7848, 64) (3492890, 63)
CPU times: user 1min 46s, sys: 6.72 s, total: 1min 53s
Wall time: 20 s


In [3]:
#Always seed the randomness of this universe
np.random.seed(51)

In [10]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
%%time
train_metadata = dd.read_csv('cesium_train.csv')
test_metadata = dd.read_csv('cesium_test.csv')
train_metadata = train_metadata.compute()
test_metadata = test_metadata.compute()
print(train_metadata.shape,test_metadata.shape)

(7848, 288) (3492890, 283)
CPU times: user 6min 22s, sys: 23.6 s, total: 6min 45s
Wall time: 1min 30s


In [5]:
test_metadata.columns = [x.replace("('","__").replace("',","___").replace(")","_").replace(' ','') for x in test_metadata.columns]

In [6]:
drop_from_train_metadata = [x for x in train_metadata.columns if x not in test_metadata.columns]

In [7]:
%%time
train_metadata.drop(drop_from_train_metadata,axis=1,inplace=True)

CPU times: user 3.36 ms, sys: 0 ns, total: 3.36 ms
Wall time: 4.24 ms


In [8]:
print(train_metadata.shape,test_metadata.shape)

(7848, 283) (3492890, 283)


In [11]:
train_metadata.head()

Unnamed: 0,object_id,__amplitude___0_,__amplitude___1_,__amplitude___2_,__amplitude___3_,__amplitude___4_,__amplitude___5_,__flux_percentile_ratio_mid20___0_,__flux_percentile_ratio_mid20___1_,__flux_percentile_ratio_mid20___2_,__flux_percentile_ratio_mid20___3_,__flux_percentile_ratio_mid20___4_,__flux_percentile_ratio_mid20___5_,__flux_percentile_ratio_mid35___0_,__flux_percentile_ratio_mid35___1_,__flux_percentile_ratio_mid35___2_,__flux_percentile_ratio_mid35___3_,__flux_percentile_ratio_mid35___4_,__flux_percentile_ratio_mid35___5_,__flux_percentile_ratio_mid50___0_,__flux_percentile_ratio_mid50___1_,__flux_percentile_ratio_mid50___2_,__flux_percentile_ratio_mid50___3_,__flux_percentile_ratio_mid50___4_,__flux_percentile_ratio_mid50___5_,__flux_percentile_ratio_mid65___0_,__flux_percentile_ratio_mid65___1_,__flux_percentile_ratio_mid65___2_,__flux_percentile_ratio_mid65___3_,__flux_percentile_ratio_mid65___4_,__flux_percentile_ratio_mid65___5_,__flux_percentile_ratio_mid80___0_,__flux_percentile_ratio_mid80___1_,__flux_percentile_ratio_mid80___2_,__flux_percentile_ratio_mid80___3_,__flux_percentile_ratio_mid80___4_,__flux_percentile_ratio_mid80___5_,__max_slope___0_,__max_slope___1_,__max_slope___2_,__max_slope___3_,__max_slope___4_,__max_slope___5_,__linear_trend___0_,__linear_trend___1_,__linear_trend___2_,__linear_trend___3_,__linear_trend___4_,__linear_trend___5_,__median_absolute_deviation___0_,__median_absolute_deviation___1_,__median_absolute_deviation___2_,__median_absolute_deviation___3_,__median_absolute_deviation___4_,__median_absolute_deviation___5_,__percent_close_to_median___0_,__percent_close_to_median___1_,__percent_close_to_median___2_,__percent_close_to_median___3_,__percent_close_to_median___4_,__percent_close_to_median___5_,__percent_difference_flux_percentile___0_,__percent_difference_flux_percentile___1_,__percent_difference_flux_percentile___2_,__percent_difference_flux_percentile___3_,__percent_difference_flux_percentile___4_,__percent_difference_flux_percentile___5_,__percent_amplitude___0_,__percent_amplitude___1_,__percent_amplitude___2_,__percent_amplitude___3_,__percent_amplitude___4_,__percent_amplitude___5_,__qso_log_chi2_qsonu___0_,__qso_log_chi2_qsonu___1_,__qso_log_chi2_qsonu___2_,__qso_log_chi2_qsonu___3_,__qso_log_chi2_qsonu___4_,__qso_log_chi2_qsonu___5_,__qso_log_chi2nuNULL_chi2nu___0_,__qso_log_chi2nuNULL_chi2nu___1_,__qso_log_chi2nuNULL_chi2nu___2_,__qso_log_chi2nuNULL_chi2nu___3_,__qso_log_chi2nuNULL_chi2nu___4_,__qso_log_chi2nuNULL_chi2nu___5_,__skew___0_,__skew___1_,__skew___2_,__skew___3_,__skew___4_,__skew___5_,__std___0_,__std___1_,__std___2_,__std___3_,__std___4_,__std___5_,__stetson_j___0_,__stetson_j___1_,__stetson_j___2_,__stetson_j___3_,__stetson_j___4_,__stetson_j___5_,__stetson_k___0_,__stetson_k___1_,__stetson_k___2_,__stetson_k___3_,__stetson_k___4_,__stetson_k___5_,__freq1_amplitude1___0_,__freq1_amplitude1___1_,__freq1_amplitude1___2_,__freq1_amplitude1___3_,__freq1_amplitude1___4_,__freq1_amplitude1___5_,__freq1_amplitude2___0_,__freq1_amplitude2___1_,__freq1_amplitude2___2_,__freq1_amplitude2___3_,__freq1_amplitude2___4_,__freq1_amplitude2___5_,__freq1_amplitude3___0_,__freq1_amplitude3___1_,__freq1_amplitude3___2_,__freq1_amplitude3___3_,__freq1_amplitude3___4_,__freq1_amplitude3___5_,__freq1_amplitude4___0_,__freq1_amplitude4___1_,__freq1_amplitude4___2_,__freq1_amplitude4___3_,__freq1_amplitude4___4_,__freq1_amplitude4___5_,__freq1_freq___0_,__freq1_freq___1_,__freq1_freq___2_,__freq1_freq___3_,__freq1_freq___4_,__freq1_freq___5_,__freq1_rel_phase2___0_,__freq1_rel_phase2___1_,__freq1_rel_phase2___2_,__freq1_rel_phase2___3_,__freq1_rel_phase2___4_,__freq1_rel_phase2___5_,__freq1_rel_phase3___0_,__freq1_rel_phase3___1_,__freq1_rel_phase3___2_,__freq1_rel_phase3___3_,__freq1_rel_phase3___4_,__freq1_rel_phase3___5_,__freq1_rel_phase4___0_,__freq1_rel_phase4___1_,__freq1_rel_phase4___2_,__freq1_rel_phase4___3_,__freq1_rel_phase4___4_,__freq1_rel_phase4___5_,__freq2_amplitude1___0_,__freq2_amplitude1___1_,__freq2_amplitude1___2_,__freq2_amplitude1___3_,__freq2_amplitude1___4_,__freq2_amplitude1___5_,__freq2_amplitude2___0_,__freq2_amplitude2___1_,__freq2_amplitude2___2_,__freq2_amplitude2___3_,__freq2_amplitude2___4_,__freq2_amplitude2___5_,__freq2_amplitude3___0_,__freq2_amplitude3___1_,__freq2_amplitude3___2_,__freq2_amplitude3___3_,__freq2_amplitude3___4_,__freq2_amplitude3___5_,__freq2_amplitude4___0_,__freq2_amplitude4___1_,__freq2_amplitude4___2_,__freq2_amplitude4___3_,__freq2_amplitude4___4_,__freq2_amplitude4___5_,__freq2_freq___0_,__freq2_freq___1_,__freq2_freq___2_,__freq2_freq___3_,__freq2_freq___4_,__freq2_freq___5_,__freq2_rel_phase2___0_,__freq2_rel_phase2___1_,__freq2_rel_phase2___2_,__freq2_rel_phase2___3_,__freq2_rel_phase2___4_,__freq2_rel_phase2___5_,__freq2_rel_phase3___0_,__freq2_rel_phase3___1_,__freq2_rel_phase3___2_,__freq2_rel_phase3___3_,__freq2_rel_phase3___4_,__freq2_rel_phase3___5_,__freq2_rel_phase4___0_,__freq2_rel_phase4___1_,__freq2_rel_phase4___2_,__freq2_rel_phase4___3_,__freq2_rel_phase4___4_,__freq2_rel_phase4___5_,__freq3_amplitude1___0_,__freq3_amplitude1___1_,__freq3_amplitude1___2_,__freq3_amplitude1___3_,__freq3_amplitude1___4_,__freq3_amplitude1___5_,__freq3_amplitude2___0_,__freq3_amplitude2___1_,__freq3_amplitude2___2_,__freq3_amplitude2___3_,__freq3_amplitude2___4_,__freq3_amplitude2___5_,__freq3_amplitude3___0_,__freq3_amplitude3___1_,__freq3_amplitude3___2_,__freq3_amplitude3___3_,__freq3_amplitude3___4_,__freq3_amplitude3___5_,__freq3_amplitude4___0_,__freq3_amplitude4___1_,__freq3_amplitude4___2_,__freq3_amplitude4___3_,__freq3_amplitude4___4_,__freq3_amplitude4___5_,__freq3_freq___0_,__freq3_freq___1_,__freq3_freq___2_,__freq3_freq___3_,__freq3_freq___4_,__freq3_freq___5_,__freq3_rel_phase2___0_,__freq3_rel_phase2___1_,__freq3_rel_phase2___2_,__freq3_rel_phase2___3_,__freq3_rel_phase2___4_,__freq3_rel_phase2___5_,__freq3_rel_phase3___0_,__freq3_rel_phase3___1_,__freq3_rel_phase3___2_,__freq3_rel_phase3___3_,__freq3_rel_phase3___4_,__freq3_rel_phase3___5_,__freq3_rel_phase4___0_,__freq3_rel_phase4___1_,__freq3_rel_phase4___2_,__freq3_rel_phase4___3_,__freq3_rel_phase4___4_,__freq3_rel_phase4___5_,__freq1_signif___0_,__freq1_signif___1_,__freq1_signif___2_,__freq1_signif___3_,__freq1_signif___4_,__freq1_signif___5_,__freq_signif_ratio_21___0_,__freq_signif_ratio_21___1_,__freq_signif_ratio_21___2_,__freq_signif_ratio_21___3_,__freq_signif_ratio_21___4_,__freq_signif_ratio_21___5_,__freq_signif_ratio_31___0_,__freq_signif_ratio_31___1_,__freq_signif_ratio_31___2_,__freq_signif_ratio_31___3_,__freq_signif_ratio_31___4_,__freq_signif_ratio_31___5_,__freq_varrat___0_,__freq_varrat___1_,__freq_varrat___2_,__freq_varrat___3_,__freq_varrat___4_,__freq_varrat___5_,__freq_y_offset___0_,__freq_y_offset___1_,__freq_y_offset___2_,__freq_y_offset___3_,__freq_y_offset___4_,__freq_y_offset___5_
0,615,121.048,880.533,646.922,488.191,402.069,400.502,4.8645999999999995e-26,,8.33558e-109,2.1604e-86,1.00114e-78,6.43418e-93,1.1188e-20,,5.68011e-71,7.988229999999999e-48,1.24266e-45,2.8183800000000003e-62,1.16563e-10,,2.91488e-49,1.23322e-27,3.2272500000000003e-28,5.56223e-26,2e-05,,6.71941e-20,5.39884e-16,6.27374e-11,8.41541e-12,0.076025,,4e-06,9e-06,0.000174,0.007144,184.864,594.696,423.361,280.055,205.476,226.112,0.014427,0.095119,0.076147,0.078759,0.098386,0.20255,79.9587,509.249,368.129,314.089,276.594,291.95,0.15873,0.172414,0.172414,0.12069,0.155172,0.087719,1.96435e+41,inf,1.67921e+161,7.16994e+138,1.19726e+122,4.7286799999999996e+132,5.743800000000001e+42,inf,1.44602e+163,1.2399600000000002e+142,5.62023e+124,8.24736e+134,6.21789,9.50875,9.66446,9.16612,8.69763,7.34498,0.00897,0.230622,0.233882,0.083655,0.072571,0.031911,0.125827,0.404755,0.331063,0.285492,0.194883,0.121948,83.2758,596.577,451.181,332.521,289.277,292.182,745.681,5272.27,4007.53,2986.45,2606.34,2670.6,1.10948,1.09173,1.05349,1.11205,1.09943,1.12579,79.3142,404.103,426.907,324.045,261.344,129.553,2.4075,11.6994,13.9477,10.8481,6.77662,1.17588,0.147666,0.963328,2.76486,1.57327,0.67464,0.078317,0.047313,0.207639,0.530471,0.300601,0.254217,0.04715,3.08163,3.08156,3.08165,3.08165,3.08163,3.08169,-0.610732,-0.24967,-0.153939,-0.143364,-0.164394,0.118765,-0.675397,-0.336587,-0.214368,-0.405304,0.24848,1.21865,-0.448204,0.231825,0.017253,0.23567,-0.185805,-1.831,25.855,216.734,145.984,101.853,90.0104,84.6006,2.22024,9.02133,12.9104,9.03108,3.56772,1.08759,0.135884,1.32826,1.56763,1.16613,0.159939,0.083851,0.025398,0.136938,0.454945,0.189758,0.062346,0.039286,3.08173,3.08174,3.08174,3.08174,3.08182,3.08169,-0.68875,-0.44112,-0.277367,-0.201044,-0.09322,-0.090472,-0.285188,-0.337403,-0.227115,-0.373399,-0.189909,1.09237,-0.045816,-0.272616,-0.18958,-0.029936,-0.368443,-1.39124,11.1155,114.465,62.9608,42.9175,30.671,55.3669,0.405033,9.60237,2.7483,2.08305,2.63502,1.06301,0.057778,0.820395,0.319242,0.416029,0.101993,0.090237,0.019303,0.172244,0.26293,0.162004,0.090753,0.036506,6.16321,3.08174,6.16316,6.16325,27.2912,3.08169,0.26876,-0.316482,0.481503,1.38472,-2.0818,-0.215651,-0.241832,-0.535901,-1.66387,2.45692,-0.040614,1.04295,-2.66711,-0.063957,2.577,1.14662,0.925868,-0.995395,5.82526,5.02398,5.49891,5.5646,5.52345,4.52328,0.926934,0.94451,0.910704,0.919127,0.936241,0.980085,0.75043,0.798199,0.734428,0.731502,0.628905,0.915124,0.12755,0.269031,0.139362,0.110785,0.129578,0.401664,4.17331,-29.9084,18.7479,17.0908,20.412,12.1033
1,713,14.6225,10.4224,10.2985,11.8625,11.0574,14.491,0.0032378,0.001148,0.00188816,0.000140974,0.00191385,0.00160392,0.0297076,0.00831,0.0152164,0.00470506,0.00644686,0.00362693,0.0612078,0.051731,0.0303614,0.0145945,0.0136236,0.0211907,0.133755,0.128372,0.0824318,0.13861,0.0261937,0.108496,0.467636,0.547241,0.576291,0.361841,0.26958,0.319883,11.4187,1.73872,1.79864,2.73098,3.60766,6.12855,-0.019145,-0.017998,-0.016854,-0.019238,-0.017949,-0.01381,6.0546,4.69685,5.10035,5.48466,5.39618,5.70267,0.2,0.214286,0.178571,0.214286,0.160714,0.232143,3949.15,3783.86,6086.62,21997.5,11340.6,5094.89,40959.5,28087.1,9544.76,79258.7,39354.9,49421.8,2.18719,3.12481,3.15745,2.79753,2.12321,0.659762,0.12935,0.13155,0.286294,0.1118,0.080834,0.047643,0.254446,-0.085494,-0.022066,-0.162664,-0.062403,0.212294,7.06252,5.6611,5.71898,6.39256,6.34953,7.03045,62.8249,49.2606,50.4821,55.5883,56.4266,60.5598,1.09433,1.0661,1.08818,1.0789,1.10441,1.0593,2.4984,1.73275,1.78302,2.38189,2.59002,2.22385,0.265881,0.113988,0.280791,0.448027,0.368645,0.081285,0.034889,0.020348,0.024745,0.024587,0.050264,0.024074,0.015289,0.008581,0.009528,0.018124,0.006124,0.004096,9.00842,1.64114,1.65595,27.0399,1.64111,26.8973,-2.87888,-0.496265,1.22391,2.15327,1.65126,2.59844,0.214456,-1.50693,2.18614,-1.07818,-2.94819,2.52163,1.96166,-1.11793,-2.77159,-2.25502,-1.18923,0.369706,1.47058,1.22756,1.32617,1.50372,1.75498,1.68082,0.144053,0.058997,0.149601,0.083873,0.057065,0.054624,0.035268,0.016318,0.036273,0.028811,0.008559,0.02654,0.008795,0.001335,0.01042,0.014983,0.014894,0.007755,26.7317,28.4804,1.00944,14.0287,3.39931,8.37859,-0.624735,-1.85562,-1.01909,0.726941,2.48807,2.17215,2.6944,-1.59493,-0.176924,-0.430721,-1.49078,-2.57704,2.61207,-0.184375,2.99495,1.78742,2.37369,0.387926,1.78305,0.851103,0.829273,1.00101,1.58332,1.40961,0.114723,0.027591,0.054406,0.02445,0.043501,0.080486,0.005384,0.017424,0.013791,0.008728,0.018724,0.010559,0.002682,0.004102,0.006092,0.003539,0.005916,0.001995,0.998591,18.6353,26.5534,30.5712,5.97741,10.4357,-2.48973,-1.80732,0.015872,2.01652,3.02428,-1.38804,-2.80317,2.88679,-2.21147,-2.23555,-2.26244,-2.66987,-0.228342,1.67845,-0.920652,-0.126719,-2.25308,0.975023,4.06254,3.93278,3.95669,3.87299,3.78313,3.12284,0.80132,0.922938,0.895489,0.911566,0.942109,0.914162,0.834175,0.880634,0.801262,0.745575,0.866079,0.908687,0.113341,0.076214,0.086381,0.111883,0.166179,0.369518,0.535781,-0.255908,0.167986,0.680787,-0.218376,0.177517
2,730,4.70106,4.54309,11.9218,19.504,23.4981,33.2349,0.057271,0.033826,0.198029,0.0737937,0.0183605,1.24242e-05,0.123925,0.070197,0.265188,0.137245,0.0268019,3.93676e-05,0.171722,0.145034,0.425685,0.238248,0.0828896,0.000194228,0.415834,0.196468,0.55118,0.365292,0.280242,0.000941282,0.518963,0.409725,0.684219,0.585415,0.447971,0.024009,5.98781,2.43638,2.12803,3.72768,3.04558,5.9355,0.000452,0.000451,0.005154,0.007257,0.009337,0.022044,1.20585,0.995237,1.04253,2.01892,2.59619,5.69311,0.347222,0.5,0.769231,0.75,0.745098,0.568627,16.5705,18.1331,5.17796,17.688,73.4317,710340.0,23.6952,25.6541,20.6812,272.18,543.836,479715000.0,-0.307228,0.127758,2.93791,3.04833,3.00151,1.66943,0.19992,0.843385,0.302487,0.16924,0.028269,0.036756,0.349431,0.457635,2.31571,2.58466,2.46254,1.63052,1.81613,1.78977,5.50577,8.11284,10.6048,13.2014,14.5321,13.2787,30.8417,44.7415,57.4819,86.1979,0.996887,0.933091,0.634723,0.6402,0.641965,0.803801,0.898659,0.866224,7.47363,9.42453,9.99862,4.91671,0.016067,0.062706,1.51367,1.05349,1.20213,0.391449,0.015337,0.012959,0.366884,0.30918,0.280361,0.033597,0.006208,0.001437,0.057968,0.058596,0.039559,0.0119,30.1466,25.9923,0.002389,0.002483,0.002389,0.998304,-0.446542,0.284457,-2.19579,-2.52231,-2.4994,-0.804208,1.39649,1.10804,0.536474,0.606319,0.482152,2.25798,0.446571,1.37936,-1.83875,-2.34759,-1.98127,2.03298,1.00078,0.692884,3.93084,5.62856,5.6894,3.87182,0.077102,0.04719,0.381978,0.324125,0.682501,0.233841,0.032435,0.011543,0.127333,0.17459,0.137838,0.037536,0.008215,0.005527,0.040418,0.026825,0.017936,0.00168,12.3068,23.1681,0.001359,0.001359,0.998491,3.34234,1.52306,-2.82837,-2.47946,-2.16773,-1.19731,-2.31136,2.48965,0.098445,2.07332,1.91696,-3.10885,3.0202,2.45444,-2.27677,-1.81007,-1.26201,1.85523,-0.127533,0.643219,0.454918,1.05798,2.42474,3.7427,3.115,0.025988,0.023981,0.048426,0.183671,0.259183,0.04677,0.001565,0.005983,0.013966,0.045993,0.020493,0.028501,0.003445,0.002012,0.004898,0.016015,0.020978,0.003903,17.1986,23.193,27.5776,0.006231,0.319838,1.68249,3.12616,-0.735528,1.72637,-1.83653,-0.844921,-1.822,-1.49852,-2.50535,-2.21634,-2.25661,-2.162,-2.40752,-1.10017,-1.12716,1.74949,2.13798,2.09964,1.43128,3.33742,3.26171,4.61663,4.29916,4.09952,3.28863,1.17422,0.93558,1.02141,0.947548,1.02669,0.959602,0.969243,0.90363,0.82051,0.868992,0.920047,0.935167,0.758565,0.657343,0.204607,0.292954,0.318256,0.500549,-0.003923,0.211586,4.26353,5.71062,5.22649,-0.328019
3,745,10.9442,97.9314,111.477,104.097,99.5638,75.8813,0.0274764,0.07847,0.120848,0.0811019,0.0275933,0.000650935,0.0723564,0.220118,0.206468,0.121448,0.0388521,0.00518877,0.136321,0.310797,0.383269,0.191418,0.0739154,0.0084016,0.233305,0.424514,0.546369,0.368361,0.119664,0.0611463,0.35971,0.677855,0.800911,0.550913,0.283257,0.304134,13.5672,5.68226,4.86764,3.39333,3.22781,5.74937,-0.015354,-0.033786,-0.076955,-0.093522,-0.021154,-0.002782,1.72086,1.47119,1.41645,2.48301,2.38592,6.9776,0.583333,0.964286,0.892857,0.803571,0.857143,0.818182,41.7734,9.51891,6.63858,26.6277,73.1466,12893.2,92.8324,62.473,9.8055,331.78,4966920.0,158334.0,1.4322,5.84082,6.28535,6.06886,5.19069,2.82044,0.389783,0.72795,0.678761,0.180778,0.169103,0.3351,1.98081,6.81799,5.53468,3.65036,3.3826,3.08372,4.34396,25.7318,31.6714,34.6541,32.7725,25.8221,27.4434,62.1886,105.067,152.639,147.496,139.802,0.780128,0.295163,0.394683,0.510399,0.5203,0.642618,5.34481,14.4192,39.1614,50.1168,29.8,11.1069,0.556102,1.26364,3.05019,4.65041,3.65238,0.462575,0.13914,0.14447,1.86219,1.60299,0.735421,0.095047,0.057704,0.089458,1.12456,0.559455,0.191006,0.030171,1.00067,0.002109,0.001827,0.001827,0.001546,0.001546,0.495239,-2.72278,2.98164,-2.98993,-1.47095,-0.772286,-0.435034,1.83925,1.85286,1.86314,2.99902,-2.54788,-1.96658,0.628324,0.615321,0.673682,0.492975,1.92973,2.11606,4.26651,5.89367,11.7279,14.7688,5.77185,0.431063,0.189199,0.112212,1.70792,1.22995,0.399518,0.050483,0.041642,0.043772,0.269398,0.157096,0.084593,0.031696,0.01609,0.028623,0.08433,0.092573,0.027132,0.005537,0.001171,6.81787,25.6514,0.005201,0.000984,-1.91597,-0.711611,-1.37894,-1.55264,-1.51264,-1.46837,2.65631,-2.06788,2.50783,2.3765,-1.7114,-2.63585,-0.950885,2.5952,1.37746,0.748846,-2.5787,2.34365,0.9625,3.59567,3.85219,7.9925,9.33756,5.22332,0.056695,0.237613,0.331656,0.642495,0.497023,0.308552,0.015447,0.030773,0.030854,0.149847,0.071572,0.068668,0.002857,0.014542,0.011446,0.044537,0.014846,0.012457,14.4975,21.1682,15.3964,12.1605,1.00943,1.00793,0.16914,-0.737316,-1.64391,-1.38569,-1.74786,-0.729708,-2.4627,-2.78195,2.74585,2.9651,2.50162,-1.51659,-1.90781,2.22665,1.24284,1.5722,-2.18001,-2.25253,4.8993,2.2185,3.96789,4.81607,4.54482,4.02172,0.739592,0.825534,0.550192,0.626753,0.787122,0.706331,0.602989,0.711901,0.444723,0.549658,0.728291,0.705632,0.432147,0.793649,0.472495,0.290652,0.360868,0.489589,-0.930442,-0.175522,3.69561,4.48271,8.05597,4.17963
4,1124,6.06781,19.8961,54.3781,71.3093,80.072,60.0091,0.0317943,0.097834,0.170602,0.0963301,0.00551608,0.000277267,0.0507556,0.175489,0.290774,0.142121,0.0104615,0.000647547,0.100814,0.208365,0.392313,0.215281,0.0128407,0.0270226,0.161302,0.287111,0.601811,0.355955,0.0367944,0.128478,0.402783,0.414888,0.748329,0.56697,0.067562,0.608458,6.99273,2.12799,4.64707,7.22341,8.31856,6.67651,0.002552,0.008771,0.019272,0.019936,0.020095,0.019435,1.36537,1.69396,1.33779,2.08656,2.79877,5.05844,0.428571,0.724138,0.741379,0.810345,0.844828,0.859649,27.877,11.9952,5.36398,14.4389,391.721,5298.0,899.103,31.4085,14.4412,32.5544,20626400.0,135995.0,-0.064359,3.4079,5.3235,5.07231,4.44342,2.31292,0.3484,0.368181,0.407897,0.311666,0.256263,0.185791,-0.324207,2.3153,2.99532,3.50934,3.80231,3.56035,2.34128,8.03733,21.1353,26.0432,26.6333,21.2458,17.803,49.3371,104.873,116.831,112.844,99.6337,0.950639,0.674119,0.560453,0.516332,0.498085,0.567516,1.18626,11.7039,31.678,31.8963,23.5874,5.86325,0.064179,1.24074,3.80271,3.2148,2.11189,0.333689,0.023237,0.327161,1.09182,0.804713,0.456689,0.03749,0.002445,0.04372,0.138317,0.090076,0.067011,0.017527,13.492,0.002426,0.002426,0.002426,0.002426,0.002157,-2.73976,-2.57099,-2.36258,-2.57421,-2.7214,-2.35492,0.853739,0.51026,0.647402,0.574655,0.594187,1.64038,-1.31236,-2.64555,-2.38981,-2.1964,-1.86403,0.146706,1.06169,3.87226,9.92417,6.20088,7.49599,4.69888,0.053707,0.074926,0.352754,1.00437,0.88566,0.143223,0.010556,0.059627,0.205195,0.115795,0.147971,0.033483,0.005396,0.02772,0.092518,0.068684,0.067053,0.003758,3.12745,0.001328,0.001328,1.00738,1.00737,0.001973,-1.7544,-0.670355,-1.00285,-0.829034,-1.72877,-1.06273,1.05784,1.87107,2.56324,-2.49848,3.11141,-1.93777,2.37857,1.90333,1.58433,2.47369,1.11508,-1.5266,0.708762,1.20558,3.11993,4.73949,8.08215,3.99527,0.066682,0.101515,0.027359,0.345118,1.29959,0.347997,0.009251,0.031889,0.063553,0.053266,0.125172,0.035751,0.001932,0.00053,0.00901,0.012508,0.058966,0.014144,7.30264,0.653961,1.01031,0.659454,7.38992,0.002157,0.81084,-1.22023,0.503234,-1.65366,-1.49048,-2.43895,-0.845473,2.07781,2.98733,2.1378,-2.56805,1.0943,-0.98179,0.066219,0.444604,0.506634,2.33714,-0.590458,3.14047,5.28804,5.24444,4.87454,4.29543,2.80431,1.13056,0.856655,0.743433,0.571537,0.617427,0.9688,0.993363,0.612514,0.606935,0.551963,0.636599,0.997381,0.559273,0.122167,0.147048,0.250639,0.382847,0.673592,-0.021837,8.16335,21.5958,21.4253,15.3577,3.31896


In [12]:
test_id = test_metadata_kaggle['object_id']

In [13]:
cesium_features = pd.read_csv('cesium_features.csv')
my_features = pd.read_csv('my_features.csv')

In [14]:
def multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds
    # Trasform y_true in dummies
    y_ohe = pd.get_dummies(y_true)
    # Normalize rows and limit y_preds to 1e-15, 1-1e-15
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Get the log for ones, .values is used to drop the index of DataFrames
    # Exclude class 99 for now, since there is no class99 in the training set
    # we gave a special process for that class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return loss


def lgb_multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds.reshape(y_true.shape[0], len(classes), order='F')

    # Trasform y_true in dummies
    y_ohe = pd.get_dummies(y_true)
    # Normalize rows and limit y_preds to 1e-15, 1-1e-15
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Get the log for ones, .values is used to drop the index of DataFrames
    # Exclude class 99 for now, since there is no class99 in the training set
    # we gave a special process for that class
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weight average and divide by the number of positives
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return 'wloss', loss, False

In [15]:
cesium_features['fold_sum'].value_counts()

2    121
1    105
3     50
0      6
Name: fold_sum, dtype: int64

In [16]:
columns_0 = list(cesium_features[cesium_features['fold_sum'] == 0]['column_name'])

In [18]:
columns_0.append('__stetson_k___2_')
columns_0.append('__freq3_amplitude1___1_')
columns_0.append('__median_absolute_deviation___2_')
columns_0.append('__percent_close_to_median___2_')

In [19]:
print(columns_0)

['__flux_percentile_ratio_mid50___5_', '__flux_percentile_ratio_mid65___2_', '__median_absolute_deviation___2_', '__qso_log_chi2_qsonu___0_', '__stetson_k___1_', '__freq1_signif___2_', '__stetson_k___2_', '__freq3_amplitude1___1_', '__median_absolute_deviation___2_', '__percent_close_to_median___2_']


In [20]:
cesium_imps = ['object_id','target','flux_max','flux_mean','flux_median','flux_skew','flux_err_min',
               'flux_err_mean','detected_mean','flux_by_flux_ratio_sq_skew','flux_w_mean','flux_diff1',
              'flux_diff2','flux_diff3','0__fft_coefficient__coeff_0__attr_"abs"',
               '0__fft_coefficient__coeff_1__attr_"abs"','0__skewness',
               '1__fft_coefficient__coeff_0__attr_"abs"','1__fft_coefficient__coeff_1__attr_"abs"',
               '1__kurtosis','1__skewness','2__fft_coefficient__coeff_0__attr_"abs"',
               '2__fft_coefficient__coeff_1__attr_"abs"','2__kurtosis','2__skewness',
               '3__fft_coefficient__coeff_0__attr_"abs"','3__fft_coefficient__coeff_1__attr_"abs"',
               '3__kurtosis','3__skewness','4__fft_coefficient__coeff_0__attr_"abs"',
               '4__fft_coefficient__coeff_1__attr_"abs"','4__kurtosis','4__skewness',
               '5__fft_coefficient__coeff_0__attr_"abs"','5__fft_coefficient__coeff_1__attr_"abs"',
               '5__kurtosis','5__skewness','flux__longest_strike_above_mean','flux__longest_strike_below_mean',
               'flux_by_flux_ratio_sq__longest_strike_above_mean','flux_by_flux_ratio_sq__longest_strike_below_mean',
               'mjd__mean_abs_change','mjd__mean_change','mjd_diff_det','hostgal_photoz','hostgal_photoz_err',
               'distmod','hostgal_photoz_certain','__amplitude___0_','__amplitude___1_',
               '__amplitude___2_','__amplitude___3_','__amplitude___4_','__amplitude___5_',
               '__flux_percentile_ratio_mid20___1_','__flux_percentile_ratio_mid20___2_',
               '__flux_percentile_ratio_mid35___1_','__flux_percentile_ratio_mid35___2_',
               '__max_slope___1_','__max_slope___2_','__max_slope___3_','__max_slope___5_',
               '__median_absolute_deviation___0_','__median_absolute_deviation___1_',
               '__median_absolute_deviation___2_','__median_absolute_deviation___3_',
               '__median_absolute_deviation___4_','__median_absolute_deviation___5_',
               '__percent_close_to_median___2_','__percent_close_to_median___3_',
               '__percent_close_to_median___4_','__percent_close_to_median___5_',
               '__percent_difference_flux_percentile___1_','__percent_difference_flux_percentile___2_',
               '__percent_difference_flux_percentile___3_','__percent_difference_flux_percentile___5_',
               '__percent_amplitude___1_','__percent_amplitude___2_','__qso_log_chi2_qsonu___0_',
               '__qso_log_chi2_qsonu___1_','__qso_log_chi2_qsonu___3_','__qso_log_chi2_qsonu___4_',
               '__qso_log_chi2_qsonu___5_','__skew___0_','__skew___1_','__skew___2_',
               '__skew___3_','__skew___4_','__skew___5_',
               '__std___0_','__std___1_','__std___2_','__std___3_',
               '__std___4_','__std___5_','__stetson_j___0_','__stetson_j___1_','__stetson_j___2_',
               '__stetson_j___3_','__stetson_j___4_','__stetson_j___5_','__stetson_k___0_',
               '__stetson_k___2_','__stetson_k___3_','__stetson_k___4_','__stetson_k___5_',
               '__freq1_amplitude1___1_','__freq1_amplitude1___5_','__freq1_amplitude3___3_',
               '__freq1_amplitude3___4_','__freq1_amplitude4___5_','__freq2_amplitude1___5_',
               '__freq2_amplitude2___5_','__freq3_amplitude1___0_','__freq_varrat___0_',
               '__freq_varrat___1_','__freq_varrat___2_','__freq_varrat___3_','__freq_varrat___4_',
               '__freq_varrat___5_'
              ]

In [21]:
cesium_imps2 = [x for x in cesium_imps if x not in columns_0 + ['object_id','target']]

In [22]:
cesium_imps2 = [x for x in cesium_imps2 if x in train_metadata.columns]

In [23]:
print(len(cesium_imps2))

68


In [24]:
temp = train_metadata_kaggle.copy()
temp = temp.merge(train_metadata[['object_id'] + columns_0 ],on = 'object_id',how = 'left')

In [25]:
%%time
final_dict = {}

loss_list = []
#temp = train_metadata_kaggle.copy()
#temp = temp.merge(train_metadata[['object_id',column_]],on = 'object_id',how = 'left')
y = temp['target']
del temp['target']
classes = sorted(y.unique())

# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
class_weight = {
    c: 1 for c in classes
}
for c in [64, 15]:
    class_weight[c] = 2

#print('Unique classes : ', classes)

train_id = temp['object_id']
del temp['object_id']
# Compute weights
w = y.value_counts()
weights = {i : np.sum(w) / w[i] for i in w.index}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
clfs = []
importances = pd.DataFrame()
lgb_params = {
'random_state':51,
'device': 'cpu', 
'objective': 'multiclass', 
'num_class': 14, 
'boosting_type': 'gbdt', 
'n_jobs': -1, 
'max_depth': 7, 
'n_estimators': 500, 
'subsample_freq': 2, 
'subsample_for_bin': 5000, 
'min_data_per_group': 100, 
'max_cat_to_onehot': 4, 
'cat_l2': 1.0, 
'cat_smooth': 59.5, 
'max_cat_threshold': 32, 
'metric_freq': 10, 
'verbosity': -1, 
'metric': 'multi_logloss', 
'xgboost_dart_mode': False, 
'uniform_drop': False, 
'colsample_bytree': 0.5, 
'drop_rate': 0.173, 
'learning_rate': 0.0267, 
'max_drop': 5, 
'min_child_samples': 10, 
'min_child_weight': 100.0, 
'min_split_gain': 0.1, 
'num_leaves': 7, 
'reg_alpha': 0.1, 
'reg_lambda': 0.00023, 
'skip_drop': 0.44, 
'subsample': 0.75}
oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
    val_x, val_y = temp.iloc[val_], y.iloc[val_]

    clf = lgb.LGBMClassifier(**lgb_params)
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=lgb_multi_weighted_logloss,
        verbose=False,
        early_stopping_rounds=50,
        sample_weight=trn_y.map(weights)
    )
    oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
    loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
    #loss_list.append(loss_oof)
    print(fold_,loss_oof)

    imp_df = pd.DataFrame()
    imp_df['feature'] = temp.columns
    imp_df['gain'] = clf.feature_importances_
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)

    clfs.append(clf)
print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
#final_dict[column_] = loss_list

0 0.662155061168403
1 0.6191581770694822
2 0.6793342588613245
3 0.6112001870680623
4 0.6139900864584982
MULTI WEIGHTED LOG LOSS : 0.63737 
CPU times: user 8min 15s, sys: 830 ms, total: 8min 16s
Wall time: 1min 6s


In [28]:
%%time
final_dict = {}
for column_ in cesium_imps2:
    temp = train_metadata_kaggle.copy()
    temp = temp.merge(train_metadata[['object_id'] + columns_0 + [column_] ],on = 'object_id',how = 'left')
    

    loss_list = []
    #temp = train_metadata_kaggle.copy()
    #temp = temp.merge(train_metadata[['object_id',column_]],on = 'object_id',how = 'left')
    y = temp['target']
    del temp['target']
    classes = sorted(y.unique())

    # Taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    class_weight = {
        c: 1 for c in classes
    }
    for c in [64, 15]:
        class_weight[c] = 2

    #print('Unique classes : ', classes)

    train_id = temp['object_id']
    del temp['object_id']
    # Compute weights
    w = y.value_counts()
    weights = {i : np.sum(w) / w[i] for i in w.index}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
    clfs = []
    importances = pd.DataFrame()
    lgb_params = {
    'random_state':51,
    'device': 'cpu', 
    'objective': 'multiclass', 
    'num_class': 14, 
    'boosting_type': 'gbdt', 
    'n_jobs': -1, 
    'max_depth': 7, 
    'n_estimators': 500, 
    'subsample_freq': 2, 
    'subsample_for_bin': 5000, 
    'min_data_per_group': 100, 
    'max_cat_to_onehot': 4, 
    'cat_l2': 1.0, 
    'cat_smooth': 59.5, 
    'max_cat_threshold': 32, 
    'metric_freq': 10, 
    'verbosity': -1, 
    'metric': 'multi_logloss', 
    'xgboost_dart_mode': False, 
    'uniform_drop': False, 
    'colsample_bytree': 0.5, 
    'drop_rate': 0.173, 
    'learning_rate': 0.0267, 
    'max_drop': 5, 
    'min_child_samples': 10, 
    'min_child_weight': 100.0, 
    'min_split_gain': 0.1, 
    'num_leaves': 7, 
    'reg_alpha': 0.1, 
    'reg_lambda': 0.00023, 
    'skip_drop': 0.44, 
    'subsample': 0.75}
    oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
        val_x, val_y = temp.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,
            early_stopping_rounds=50,
            sample_weight=trn_y.map(weights)
        )
        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
        loss_list.append(loss_oof)
        #print(fold_,loss_oof)

        imp_df = pd.DataFrame()
        imp_df['feature'] = temp.columns
        imp_df['gain'] = clf.feature_importances_
        imp_df['fold'] = fold_ + 1
        importances = pd.concat([importances, imp_df], axis=0, sort=False)

        clfs.append(clf)
    print(column_,'MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
    final_dict[column_] = loss_list

__amplitude___0_ MULTI WEIGHTED LOG LOSS : 0.63561 
__amplitude___1_ MULTI WEIGHTED LOG LOSS : 0.63490 
__amplitude___2_ MULTI WEIGHTED LOG LOSS : 0.63352 
__amplitude___3_ MULTI WEIGHTED LOG LOSS : 0.63475 
__amplitude___4_ MULTI WEIGHTED LOG LOSS : 0.63502 
__amplitude___5_ MULTI WEIGHTED LOG LOSS : 0.63566 
__flux_percentile_ratio_mid20___1_ MULTI WEIGHTED LOG LOSS : 0.63634 
__flux_percentile_ratio_mid20___2_ MULTI WEIGHTED LOG LOSS : 0.63560 
__flux_percentile_ratio_mid35___1_ MULTI WEIGHTED LOG LOSS : 0.63602 
__flux_percentile_ratio_mid35___2_ MULTI WEIGHTED LOG LOSS : 0.63603 
__max_slope___1_ MULTI WEIGHTED LOG LOSS : 0.63657 
__max_slope___2_ MULTI WEIGHTED LOG LOSS : 0.63425 
__max_slope___3_ MULTI WEIGHTED LOG LOSS : 0.63431 
__max_slope___5_ MULTI WEIGHTED LOG LOSS : 0.63524 
__median_absolute_deviation___0_ MULTI WEIGHTED LOG LOSS : 0.63572 
__median_absolute_deviation___1_ MULTI WEIGHTED LOG LOSS : 0.63686 
__median_absolute_deviation___3_ MULTI WEIGHTED LOG LOSS : 0.636

In [37]:
final_dict2 = pd.Series(final_dict)

In [38]:
final_dict2 = pd.DataFrame(final_dict2)

In [39]:
final_dict2['fold1'] = final_dict2[0].apply(lambda x: x[0])
final_dict2['fold2'] = final_dict2[0].apply(lambda x: x[1])
final_dict2['fold3'] = final_dict2[0].apply(lambda x: x[2])
final_dict2['fold4'] = final_dict2[0].apply(lambda x: x[3])
final_dict2['fold5'] = final_dict2[0].apply(lambda x: x[4])

In [40]:
del final_dict2[0]

In [41]:
final_dict2 = final_dict2.reset_index(drop=False)

In [42]:
final_dict2 = final_dict2.rename(columns={'index':'column_name'})

In [43]:
final_dict2['fold1_1'] = (final_dict2['fold1'] > 0.66215) * 1
final_dict2['fold2_1'] = (final_dict2['fold2'] > 0.61915) * 1
final_dict2['fold3_1'] = (final_dict2['fold3'] > 0.67933) * 1
final_dict2['fold4_1'] = (final_dict2['fold4'] > 0.61120) * 1
final_dict2['fold5_1'] = (final_dict2['fold5'] > 0.61399) * 1
final_dict2['fold_sum'] = final_dict2['fold1_1']+ final_dict2['fold2_1'] + final_dict2['fold3_1'] + final_dict2['fold4_1'] + final_dict2['fold5_1']

In [44]:
final_dict2.head()

Unnamed: 0,column_name,fold1,fold2,fold3,fold4,fold5,fold1_1,fold2_1,fold3_1,fold4_1,fold5_1,fold_sum
0,__amplitude___0_,0.656316,0.621495,0.682503,0.611011,0.605606,0,1,1,0,0,2
1,__amplitude___1_,0.655486,0.622676,0.678,0.609506,0.60761,0,1,0,0,0,1
2,__amplitude___2_,0.654242,0.621004,0.677131,0.608331,0.605653,0,1,0,0,0,1
3,__amplitude___3_,0.658166,0.623345,0.678774,0.607894,0.604255,0,1,0,0,0,1
4,__amplitude___4_,0.659924,0.620617,0.676092,0.608964,0.608068,0,1,0,0,0,1


In [45]:
final_dict2.to_csv('cesium_features_2.csv',index=False)

In [48]:
final_dict2[final_dict2['fold_sum'] == 0]

Unnamed: 0,column_name,fold1,fold2,fold3,fold4,fold5,fold1_1,fold2_1,fold3_1,fold4_1,fold5_1,fold_sum
66,__freq_varrat___4_,0.648047,0.61772,0.677394,0.605851,0.612149,0,0,0,0,0,0


# GOOD FEATURES

In [83]:
good_features = ['__freq_varrat___5_',
                 '__freq_varrat___4_',
                 '__qso_log_chi2_qsonu___3_',
                '__qso_log_chi2_qsonu___1_',
                 '__qso_log_chi2_qsonu___5_',
                '__std___4_',
                 '__freq_varrat___3_',
                 '__amplitude___2_'
                ]

In [84]:
#good_features = ['__freq_varrat___4_','__freq_varrat___5_']

In [85]:
%%time
final_dict = {}

loss_list = []
#temp = train_metadata_kaggle.copy()
#temp = temp.merge(train_metadata[['object_id',column_]],on = 'object_id',how = 'left')

temp = train_metadata_kaggle.copy()
temp = temp.merge(train_metadata[['object_id'] + columns_0 + good_features  ],on = 'object_id',how = 'left')
y = temp['target']
del temp['target']
classes = sorted(y.unique())

print(temp.shape)

# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
class_weight = {
    c: 1 for c in classes
}
for c in [64, 15]:
    class_weight[c] = 2

#print('Unique classes : ', classes)

train_id = temp['object_id']
del temp['object_id']
# Compute weights
w = y.value_counts()
weights = {i : np.sum(w) / w[i] for i in w.index}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
clfs = []
importances = pd.DataFrame()
lgb_params = {
'random_state':51,
'device': 'cpu', 
'objective': 'multiclass', 
'num_class': 14, 
'boosting_type': 'gbdt', 
'n_jobs': -1, 
'max_depth': 7, 
'n_estimators': 500, 
'subsample_freq': 2, 
'subsample_for_bin': 5000, 
'min_data_per_group': 100, 
'max_cat_to_onehot': 4, 
'cat_l2': 1.0, 
'cat_smooth': 59.5, 
'max_cat_threshold': 32, 
'metric_freq': 10, 
'verbosity': -1, 
'metric': 'multi_logloss', 
'xgboost_dart_mode': False, 
'uniform_drop': False, 
'colsample_bytree': 0.5, 
'drop_rate': 0.173, 
'learning_rate': 0.0267, 
'max_drop': 5, 
'min_child_samples': 10, 
'min_child_weight': 100.0, 
'min_split_gain': 0.1, 
'num_leaves': 7, 
'reg_alpha': 0.1, 
'reg_lambda': 0.00023, 
'skip_drop': 0.44, 
'subsample': 0.75}
oof_preds = np.zeros((len(temp), np.unique(y).shape[0]))
for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
    val_x, val_y = temp.iloc[val_], y.iloc[val_]

    clf = lgb.LGBMClassifier(**lgb_params)
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=lgb_multi_weighted_logloss,
        verbose=False,
        early_stopping_rounds=50,
        sample_weight=trn_y.map(weights)
    )
    oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
    loss_oof = multi_weighted_logloss(val_y, oof_preds[val_, :])
    #loss_list.append(loss_oof)
    print(fold_,loss_oof)

    imp_df = pd.DataFrame()
    imp_df['feature'] = temp.columns
    imp_df['gain'] = clf.feature_importances_
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)

    clfs.append(clf)
print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))
#final_dict[column_] = loss_list

(7848, 81)
0 0.6462657114002985
1 0.607965791018667
2 0.6748395712796345
3 0.601918405841696
4 0.6131804779219846
MULTI WEIGHTED LOG LOSS : 0.62891 
CPU times: user 7min 38s, sys: 824 ms, total: 7min 39s
Wall time: 1min


In [None]:
#temp = temp[cesium_imps2]