In [23]:
import pandas as pd
import numpy as np
import random as rnd
import os
import datetime
import pickle

from scipy import stats
from scipy.special import boxcox1p

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook

from itertools import product

import gc

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error,r2_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

# Raise pandas' display limits so long/wide frames are rendered without truncation.
pd.options.display.max_rows = 600
pd.options.display.max_columns = 200

# Suppress every warning category for the remainder of the session.
import warnings
warnings.filterwarnings(action='ignore')

In [24]:
# Read the metadata tables for the train and test splits.
meta_train, meta_test = map(
    pd.read_csv,
    ('../input/metadata_train.csv', '../input/metadata_test.csv'),
)

In [25]:
# Load the saved feature arrays for the training set, one .npy file per feature.

# Peak counts.
numpeaks, numpospeaks, numnegpeaks = map(np.load, (
    '../features/numpeaks.npy',
    '../features/numpospeaks.npy',
    '../features/numnegpeaks.npy',
))

# Peak width and amplitude extrema / means.
maxpeakwidth, minpeakwidth, meanpeakwidth, maxamp, minamp, meanamp = map(np.load, (
    '../features/maxpeakwidth.npy',
    '../features/minpeakwidth.npy',
    '../features/meanpeakwidth.npy',
    '../features/maxamp.npy',
    '../features/minamp.npy',
    '../features/meanamp.npy',
))

# Standard deviations over the whole signal.
stdampall, stdposall, stdwidthall = map(np.load, (
    '../features/stdampall.npy',
    '../features/stdposall.npy',
    '../features/stdwidthall.npy',
))

# Standard deviations for the q2 / q3 / q4 feature files.
stdampq2, stdposq2, stdwidthq2 = map(np.load, (
    '../features/stdampq2.npy',
    '../features/stdposq2.npy',
    '../features/stdwidthq2.npy',
))
stdampq3, stdposq3, stdwidthq3 = map(np.load, (
    '../features/stdampq3.npy',
    '../features/stdposq3.npy',
    '../features/stdwidthq3.npy',
))
stdampq4, stdposq4, stdwidthq4 = map(np.load, (
    '../features/stdampq4.npy',
    '../features/stdposq4.npy',
    '../features/stdwidthq4.npy',
))


In [26]:
# Assemble the loaded arrays into one frame; keyword order fixes the column order.
train_features = pd.DataFrame(dict(
    NumPeaks=numpeaks,
    NumPosPeaks=numpospeaks,
    NumNegPeaks=numnegpeaks,
    MaxWidth=maxpeakwidth,
    MinWidth=minpeakwidth,
    MeanWidth=meanpeakwidth,
    MaxAmp=maxamp,
    MinAmp=minamp,
    MeanAmp=meanamp,
    StdAmpAll=stdampall,
    StdPosAll=stdposall,
    StdWidthAll=stdwidthall,
    StdAmpQ2=stdampq2,
    StdPosQ2=stdposq2,
    StdWidthQ2=stdwidthq2,
    StdAmpQ3=stdampq3,
    StdPosQ3=stdposq3,
    StdWidthQ3=stdwidthq3,
    StdAmpQ4=stdampq4,
    StdPosQ4=stdposq4,
    StdWidthQ4=stdwidthq4,
))

In [27]:
# Join metadata and features column-wise. pd.concat(axis=1) aligns on the index, so a
# row-count mismatch would be padded with NaN rows instead of raising; fail loudly here.
assert len(meta_train) == len(train_features), (
    'metadata_train has {} rows but train_features has {}'.format(
        len(meta_train), len(train_features)))
train = pd.concat([meta_train, train_features], axis=1)

In [28]:
# Preview the first rows of the combined metadata + feature table.
train.head()

Unnamed: 0,signal_id,id_measurement,phase,target,NumPeaks,NumPosPeaks,NumNegPeaks,MaxWidth,MinWidth,MeanWidth,MaxAmp,MinAmp,MeanAmp,StdAmpAll,StdPosAll,StdWidthAll,StdAmpQ2,StdPosQ2,StdWidthQ2,StdAmpQ3,StdPosQ3,StdWidthQ3,StdAmpQ4,StdPosQ4,StdWidthQ4
0,0,0,0,0,105,48,57,41741,1,3676.451923,46.676929,-21.931899,-0.000125,11.866878,77424.493902,7099.85749,11.214203,25011.751218,6165.933632,12.479814,22686.469829,2801.657114,8.081718,22092.125444,8984.882428
1,1,0,1,0,23,14,9,69734,1,10977.954545,13.454715,-12.666357,9.4e-05,8.66317,76017.661321,16116.21244,9.545417,8608.0,0.0,8.286657,19963.709475,6986.037612,5.719954,15027.735166,9284.827461
2,2,0,2,0,121,67,54,58988,1,3175.458333,26.176798,-30.415865,0.000112,11.285462,74713.050192,7570.433485,14.920124,24129.37109,7265.204158,11.041417,20894.19756,1152.704984,11.427326,21303.210731,5779.776683
3,3,1,0,1,217,99,118,44999,1,1820.212963,91.14762,-38.193195,0.000239,11.473998,131660.062871,4198.542517,5.535926,25928.72229,12482.994099,5.538974,22936.531117,3615.659712,5.086507,34060.095196,2594.629391
4,4,1,1,1,141,68,73,37017,1,2803.257143,14.246623,-9.247347,-6.5e-05,4.289404,126355.390058,5433.900865,4.720264,21722.232143,7837.139326,3.000449,25632.115546,9773.675906,3.425179,29998.890319,4142.901139


In [29]:
# Pull the signal identifiers out of `train` (the column is removed in place) so they
# are not used as a model feature.
signal_id = train.pop('signal_id').values

In [30]:
# Separate the label from the feature table (the column is removed from `train` in place).
target = train.pop('target').values

In [31]:
from sklearn.metrics import matthews_corrcoef

In [32]:
# 4-fold stratified splitter; shuffling with a fixed seed keeps the folds reproducible.
skf = StratifiedKFold(shuffle=True, random_state=123, n_splits=4)

In [34]:
# Cross-validated LightGBM training: one booster per fold, scored by MCC on its held-out part.
clf = []      # trained booster for each fold
val_mcc = []  # Matthews correlation coefficient on each validation fold
lgb_params = {
               'feature_fraction': 0.8,
               'metric': 'binary_logloss',
               'nthread':8, 
               'learning_rate': 0.1, 
               'objective': 'binary', 
               'num_leaves': 2**4,
               'verbose':0, 
               'seed':123
              }

# NOTE(review): `train` still contains `id_measurement`, and StratifiedKFold splits
# individual rows, so signals sharing an id_measurement can land in both the training
# and the validation part of a fold. If labels are correlated within a measurement this
# would inflate val_mcc -- consider a group-aware split or dropping the column. TODO confirm.
for train_idx, val_idx in skf.split(train, target):
    X_tr, y_tr = train.iloc[train_idx, :], target[train_idx]
    X_val, y_val = train.iloc[val_idx, :], target[val_idx]

    # Early stopping is requested through the callback API: the `early_stopping_rounds`
    # keyword of lgb.train() is deprecated/removed in newer LightGBM releases, while the
    # callback is the form that keeps working across versions.
    model_lgb = lgb.train(
        lgb_params,
        lgb.Dataset(X_tr, label=y_tr),
        num_boost_round=500,
        valid_sets=[lgb.Dataset(X_val, label=y_val)],
        callbacks=[lgb.early_stopping(30)],
    )

    # Probabilities are turned into hard labels at 0.5 before computing MCC.
    pred_lgb = model_lgb.predict(X_val)
    val_mcc.append(matthews_corrcoef(y_val, (pred_lgb>0.5).astype(int)))
    clf.append(model_lgb)

[1]	valid_0's binary_logloss: 0.193931
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's binary_logloss: 0.181205
[3]	valid_0's binary_logloss: 0.171893
[4]	valid_0's binary_logloss: 0.164794
[5]	valid_0's binary_logloss: 0.158944
[6]	valid_0's binary_logloss: 0.154606
[7]	valid_0's binary_logloss: 0.151593
[8]	valid_0's binary_logloss: 0.148168
[9]	valid_0's binary_logloss: 0.14553
[10]	valid_0's binary_logloss: 0.143149
[11]	valid_0's binary_logloss: 0.140537
[12]	valid_0's binary_logloss: 0.138668
[13]	valid_0's binary_logloss: 0.137481
[14]	valid_0's binary_logloss: 0.136696
[15]	valid_0's binary_logloss: 0.135746
[16]	valid_0's binary_logloss: 0.13459
[17]	valid_0's binary_logloss: 0.132824
[18]	valid_0's binary_logloss: 0.131348
[19]	valid_0's binary_logloss: 0.13074
[20]	valid_0's binary_logloss: 0.130459
[21]	valid_0's binary_logloss: 0.130309
[22]	valid_0's binary_logloss: 0.130155
[23]	valid_0's binary_logloss: 0.12973
[24]	valid_0's binary_logloss: 

[117]	valid_0's binary_logloss: 0.128638
[118]	valid_0's binary_logloss: 0.128967
[119]	valid_0's binary_logloss: 0.128764
[120]	valid_0's binary_logloss: 0.128709
[121]	valid_0's binary_logloss: 0.128985
[122]	valid_0's binary_logloss: 0.12894
[123]	valid_0's binary_logloss: 0.129115
[124]	valid_0's binary_logloss: 0.129188
[125]	valid_0's binary_logloss: 0.129421
[126]	valid_0's binary_logloss: 0.129468
[127]	valid_0's binary_logloss: 0.129255
[128]	valid_0's binary_logloss: 0.129282
[129]	valid_0's binary_logloss: 0.129065
[130]	valid_0's binary_logloss: 0.129532
[131]	valid_0's binary_logloss: 0.129581
[132]	valid_0's binary_logloss: 0.129644
[133]	valid_0's binary_logloss: 0.129962
[134]	valid_0's binary_logloss: 0.129936
[135]	valid_0's binary_logloss: 0.129694
[136]	valid_0's binary_logloss: 0.129715
[137]	valid_0's binary_logloss: 0.129927
[138]	valid_0's binary_logloss: 0.12979
[139]	valid_0's binary_logloss: 0.12976
[140]	valid_0's binary_logloss: 0.130046
Early stopping, bes

In [35]:
# Validation MCC of each fold, in the order the folds were trained.
val_mcc

[0.5259937621920572,
 0.5190263078112791,
 0.5306633314339549,
 0.5173957411456653]

In [56]:
# `pred_lgb` and `val_idx` are left over from the final iteration of the CV loop, so this
# shows the predicted probabilities for the last fold's validation rows only.
pd.DataFrame(pred_lgb, index=val_idx)

Unnamed: 0,0
2,0.027527
4,0.018875
5,0.067489
6,0.008316
11,0.001188
17,0.004191
19,0.008317
20,0.001827
25,0.015846
26,0.046588


In [31]:
# Accumulate gain-based feature importance over all fold models.
feature_imp = np.zeros(train.shape[1])
for model in clf:
    feature_imp+=model.feature_importance(importance_type='gain')

# Report the per-model mean, most important first. Divide by the number of models that
# were actually trained instead of a hard-coded 4, so the average stays correct if the
# number of folds changes (max(..., 1) avoids dividing by zero when `clf` is empty).
pd.DataFrame(feature_imp/max(len(clf), 1), index=train.columns).sort_values(by=0, ascending=False)

Unnamed: 0,0
NumNegPeaks,4197.729534
MaxAmp,874.203
MinAmp,860.397295
NumPosPeaks,695.708682
NumPeaks,561.339129
id_measurement,546.572717
StdAmpQ2,526.164296
StdAmpAll,473.767522
StdAmpQ3,449.386954
StdAmpQ4,434.159136


In [62]:
# Pair the negative-peak count with the label (the unnamed label Series becomes column 0),
# then display only the rows whose label equals 1.
negtar = pd.concat(
    (train['NumNegPeaks'], pd.Series(target)),
    axis=1,
)
negtar.loc[negtar[0].eq(1)]

Unnamed: 0,NumNegPeaks,0
3,118,1
4,73,1
5,133,1
201,98,1
202,15,1
228,0,1
229,8,1
230,50,1
270,90,1
271,16,1


In [64]:
# Spot-check a slice of rows by position; the rendered output below includes rows with
# few or no detected peaks, where several of the Std* features are missing (NaN).
train.iloc[2322:2346, :]

Unnamed: 0,id_measurement,phase,NumPeaks,NumPosPeaks,NumNegPeaks,MaxWidth,MinWidth,MeanWidth,MaxAmp,MinAmp,MeanAmp,StdAmpAll,StdPosAll,StdWidthAll,StdAmpQ2,StdPosQ2,StdWidthQ2,StdAmpQ3,StdPosQ3,StdWidthQ3,StdAmpQ4,StdPosQ4,StdWidthQ4
2322,774,0,0,0,0,0,0,0.0,0.0,0.0,0.0,,,,,,,,,,,,
2323,774,1,1,0,1,0,0,0.0,0.0,-30.44127,-7.6e-05,0.0,0.0,,,,,,,,0.0,0.0,
2324,774,2,2,2,0,1,1,1.0,19.084034,0.0,9e-05,1.048194,0.5,0.0,,,,,,,1.048194,0.5,0.0
2325,775,0,16,8,8,130520,1,23989.533333,21.210753,-51.611196,-0.000155,20.69558,116842.46072,33895.500732,0.0,0.0,,15.109178,21660.259699,13113.380835,22.042148,18901.750056,17818.383821
2326,775,1,11,9,2,106074,2,35954.7,64.383449,-72.162144,0.000221,40.747387,131386.71763,35971.917138,0.0,0.0,,0.0,0.0,,20.490371,26717.511788,14073.834573
2327,775,2,39,19,20,50594,1,9141.526316,34.6478,-29.679071,-5.9e-05,18.841521,109997.427269,14695.650794,17.927309,31581.430885,15545.042111,19.319627,21391.966408,7919.083548,17.811874,25196.37148,13819.290148
2328,776,0,23,12,11,139127,1,17486.863636,29.719951,-31.171995,2.3e-05,19.785435,88319.204753,36655.465444,0.0,0.0,,0.0,0.0,,0.0,0.0,
2329,776,1,1,0,1,0,0,0.0,0.0,-13.396848,-3.3e-05,0.0,0.0,,0.0,0.0,,,,,,,
2330,776,2,9,3,6,16111,1,4592.5,36.05559,-26.889329,-0.000167,24.293204,13731.811663,5656.798808,,,,,,,,,
2331,777,0,2,2,0,85,85,85.0,20.531934,0.0,7.1e-05,6.375201,42.5,0.0,,,,6.375201,42.5,0.0,,,


In [14]:
# Output column name -> stem of the saved feature file ('../features/<stem>_test.npy').
# The insertion order is the column order of the resulting frame and mirrors the order
# used for the training features.
_TEST_FEATURE_STEMS = {
    'NumPeaks': 'numpeaks',
    'NumPosPeaks': 'numpospeaks',
    'NumNegPeaks': 'numnegpeaks',
    'MaxWidth': 'maxpeakwidth',
    'MinWidth': 'minpeakwidth',
    'MeanWidth': 'meanpeakwidth',
    'MaxAmp': 'maxamp',
    'MinAmp': 'minamp',
    'MeanAmp': 'meanamp',
    'StdAmpAll': 'stdampall',
    'StdPosAll': 'stdposall',
    'StdWidthAll': 'stdwidthall',
    'StdAmpQ2': 'stdampq2',
    'StdPosQ2': 'stdposq2',
    'StdWidthQ2': 'stdwidthq2',
    'StdAmpQ3': 'stdampq3',
    'StdPosQ3': 'stdposq3',
    'StdWidthQ3': 'stdwidthq3',
    'StdAmpQ4': 'stdampq4',
    'StdPosQ4': 'stdposq4',
    'StdWidthQ4': 'stdwidthq4',
}

# Load every test-set array straight into the frame. The training-set variables
# (`numpeaks`, `maxamp`, ...) are deliberately NOT rebound to test data, so re-running
# the cell that builds `train_features` after this one cannot pick up test arrays.
test_features = pd.DataFrame({
    column: np.load('../features/{}_test.npy'.format(stem))
    for column, stem in _TEST_FEATURE_STEMS.items()
})

In [15]:
# Join metadata and features column-wise. pd.concat(axis=1) aligns on the index, so a
# row-count mismatch would be padded with NaN rows instead of raising; fail loudly here.
assert len(meta_test) == len(test_features), (
    'metadata_test has {} rows but test_features has {}'.format(
        len(meta_test), len(test_features)))
test = pd.concat([meta_test, test_features], axis=1)

In [18]:
# Keep the signal identifiers for the submission file and remove the column from `test`
# in place, so only the remaining columns are passed to the models.
test_id = test.pop('signal_id').values

In [39]:
# Equal-weight average of the fold models' predicted probabilities on the test set.
if not clf:
    # Previously this fell through to `None / 4` and raised an opaque TypeError.
    raise ValueError('`clf` is empty: train the fold models before predicting')

preds = None
for model in clf:
    fold_pred = model.predict(test)
    preds = fold_pred if preds is None else preds + fold_pred

# Divide by the number of models actually trained rather than a hard-coded 4, so the
# average stays correct if the number of folds changes.
preds = preds/len(clf)

In [40]:
# Submission table: signal id plus the hard label (1 when the averaged probability
# exceeds 0.5, else 0).
submission = pd.DataFrame(
    dict(
        signal_id=test_id,
        target=(preds > 0.5).astype(int),
    )
)

In [44]:
# Write the submission to disk, omitting the DataFrame index column.
submission.to_csv(path_or_buf='../output/submission_1.csv', index=False)