In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
%%html

<style>
table {float:left}
</style>

In [3]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 500
pd.options.display.max_rows = 50

# Training Data

In [4]:
train_data = pd.read_csv('../input/car_breakdown_train.tsv', sep='\t', header=0)
train_data.head()

Unnamed: 0,vehicleId,days,ecoMode,cityMode,sportMode,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100,39.06,23.419
1,1,2,0.0019,-0.0003,100,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100,39.0,23.4236
2,1,3,-0.0043,0.0003,100,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100,38.95,23.3442
3,1,4,0.0007,0.0,100,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100,38.88,23.3739
4,1,5,-0.0019,-0.0002,100,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100,38.9,23.4044


## How the training data is arranged.

|**Field**|**Description**|
|:---------|:---------------|
|**vechicleId**|unique id of the vehicle in the fleet|
|**days**|number of days passed so far|
|**ecoMode**|eco mode nob setting used for the day|
|**cityMode**|city mode nob setting used for the day|
|**sportMode**|sport mode nob setting used for the day|
|**s1**|reading form sensor 1|
|**s2**|reading form sensor 2|
|**s3**|reading form sensor 3|
| ... | ... |
|**s20**|reading form sensor 20|
|**s21**|reading form sensor 21|

The data is arranged as per above columns. Rows are grouped by **vehicleId**, with **days** in increasing order, representing the state of the car on that day, in a time series manner.

The last day of for the particular **vehicleId** is the day, when the state of it was so bad, that it broke down.
e.g. in the following case, **vehicleId** = 1, broke down on 192nd day

In [5]:
train_data[train_data["vehicleId"] == 1]

Unnamed: 0,vehicleId,days,ecoMode,cityMode,sportMode,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100,518.67,641.82,1589.70,1400.60,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100,39.06,23.4190
1,1,2,0.0019,-0.0003,100,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100,39.00,23.4236
2,1,3,-0.0043,0.0003,100,518.67,642.35,1587.99,1404.20,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100,38.95,23.3442
3,1,4,0.0007,0.0000,100,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100,38.88,23.3739
4,1,5,-0.0019,-0.0002,100,518.67,642.37,1582.85,1406.22,14.62,21.61,554.00,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100,38.90,23.4044
5,1,6,-0.0043,-0.0001,100,518.67,642.10,1584.47,1398.37,14.62,21.61,554.67,2388.02,9049.68,1.3,47.16,521.68,2388.03,8132.85,8.4108,0.03,391,2388,100,38.98,23.3669
6,1,7,0.0010,0.0001,100,518.67,642.48,1592.32,1397.77,14.62,21.61,554.34,2388.02,9059.13,1.3,47.36,522.32,2388.03,8132.32,8.3974,0.03,392,2388,100,39.10,23.3774
7,1,8,-0.0034,0.0003,100,518.67,642.56,1582.96,1400.97,14.62,21.61,553.85,2388.00,9040.80,1.3,47.24,522.47,2388.03,8131.07,8.4076,0.03,391,2388,100,38.97,23.3106
8,1,9,0.0008,0.0001,100,518.67,642.12,1590.98,1394.80,14.62,21.61,553.69,2388.05,9046.46,1.3,47.29,521.79,2388.05,8125.69,8.3728,0.03,392,2388,100,39.05,23.4066
9,1,10,-0.0033,0.0001,100,518.67,641.71,1591.24,1400.46,14.62,21.61,553.59,2388.05,9051.70,1.3,47.03,521.79,2388.06,8129.38,8.4286,0.03,393,2388,100,38.95,23.4694


# creating some features

1. Deltas from last day
2. Deltas from last 7 days
3. Deltas from last 14 days

# rolling measures of averages
4. avg s for days ran till now, std
5. mode s for days till now
6. median s for days till now

# measures of averages
4. avg s for days ran till now, std
5. mode s for days till now
6. median s for days till now

In [6]:
'''
# creating some features

1. Deltas from last day
2. Deltas from last 7 days
3. Deltas from last 14 days
'''
def create_difference_columns(df):
    dfcopy = pd.DataFrame()
    periods = [1,7,14,28,35]
    for period in periods:
        tmp = df.groupby('vehicleId').diff(periods=period)
        del tmp['days']
        tmp.columns = [x+"_delta_"+str(period) for x in df.columns[2:]]
        dfcopy = pd.concat([dfcopy,tmp],axis=1)
    return dfcopy.fillna(0)

In [7]:
'''
# rolling measures of averages for n days
4. avg s for days ran , std
5. mode s for days 
6. median s for days
'''

def create_rolling_columns(df):
    dfcopy = pd.DataFrame()
    periods = [5,10,15]
    for period in periods:
        tmp_mean = df.groupby('vehicleId').rolling(period).mean()[df.columns[2:]].reset_index()[df.columns[2:]]
        tmp_mean.columns = [x+"_roll_mean_"+str(period) for x in df.columns[2:]]
        tmp_std = df.groupby('vehicleId').rolling(period).std()[df.columns[2:]].reset_index()[df.columns[2:]]
        tmp_std.columns = [x+"_roll_std_"+str(period) for x in df.columns[2:]]
        tmp_median = df.groupby('vehicleId').rolling(period).median()[df.columns[2:]].reset_index()[df.columns[2:]]
        tmp_median.columns = [x+"_roll_median_"+str(period) for x in df.columns[2:]]
        dfcopy = pd.concat([dfcopy,tmp_mean,tmp_std,tmp_median],axis=1)
    return dfcopy.fillna(0)


In [8]:
'''
# cumulative measures of averages
4. avg s for days ran till now, std
5. mode s for days till now
6. median s for days till now
'''
def create_cumulative_columns(df):
    dfcopy = pd.DataFrame()
    tmp_mean = df.groupby('vehicleId').expanding(min_periods=5).mean()[df.columns[2:]].reset_index()[df.columns[2:]]
    tmp_mean.columns = [x+"_cum_mean_"+str(5) for x in df.columns[2:]]
    tmp_std = df.groupby('vehicleId').expanding(min_periods=5).std()[df.columns[2:]].reset_index()[df.columns[2:]]
    tmp_std.columns = [x+"_cum_std_"+str(5) for x in df.columns[2:]]
    tmp_median = df.groupby('vehicleId').expanding(min_periods=5).median()[df.columns[2:]].reset_index()[df.columns[2:]]
    tmp_median.columns = [x+"_cum_median_"+str(5) for x in df.columns[2:]]
    dfcopy = pd.concat([dfcopy,tmp_mean,tmp_std,tmp_median],axis=1)
    return dfcopy.fillna(0)


In [9]:
diffcols = create_difference_columns(train_data)
rollcols = create_rolling_columns(train_data)
cumcols = create_cumulative_columns(train_data)

In [10]:
traindf = pd.concat([train_data,diffcols,rollcols,cumcols],axis=1)

In [11]:
traindf.head()

Unnamed: 0,vehicleId,days,ecoMode,cityMode,sportMode,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21,ecoMode_delta_1,cityMode_delta_1,sportMode_delta_1,s1_delta_1,s2_delta_1,s3_delta_1,s4_delta_1,s5_delta_1,s6_delta_1,s7_delta_1,s8_delta_1,s9_delta_1,s10_delta_1,s11_delta_1,s12_delta_1,s13_delta_1,s14_delta_1,s15_delta_1,s16_delta_1,s17_delta_1,s18_delta_1,s19_delta_1,s20_delta_1,s21_delta_1,ecoMode_delta_7,cityMode_delta_7,sportMode_delta_7,s1_delta_7,s2_delta_7,s3_delta_7,s4_delta_7,s5_delta_7,s6_delta_7,s7_delta_7,s8_delta_7,s9_delta_7,s10_delta_7,s11_delta_7,s12_delta_7,s13_delta_7,s14_delta_7,s15_delta_7,s16_delta_7,s17_delta_7,s18_delta_7,s19_delta_7,s20_delta_7,s21_delta_7,ecoMode_delta_14,cityMode_delta_14,sportMode_delta_14,s1_delta_14,s2_delta_14,s3_delta_14,s4_delta_14,s5_delta_14,s6_delta_14,s7_delta_14,s8_delta_14,s9_delta_14,s10_delta_14,s11_delta_14,s12_delta_14,s13_delta_14,s14_delta_14,s15_delta_14,s16_delta_14,s17_delta_14,s18_delta_14,s19_delta_14,s20_delta_14,s21_delta_14,ecoMode_delta_28,cityMode_delta_28,sportMode_delta_28,s1_delta_28,s2_delta_28,s3_delta_28,s4_delta_28,s5_delta_28,s6_delta_28,s7_delta_28,s8_delta_28,s9_delta_28,s10_delta_28,s11_delta_28,s12_delta_28,s13_delta_28,s14_delta_28,s15_delta_28,s16_delta_28,s17_delta_28,s18_delta_28,s19_delta_28,s20_delta_28,s21_delta_28,ecoMode_delta_35,cityMode_delta_35,sportMode_delta_35,s1_delta_35,s2_delta_35,s3_delta_35,s4_delta_35,s5_delta_35,s6_delta_35,s7_delta_35,s8_delta_35,s9_delta_35,s10_delta_35,s11_delta_35,s12_delta_35,s13_delta_35,s14_delta_35,s15_delta_35,s16_delta_35,s17_delta_35,s18_delta_35,s19_delta_35,s20_delta_35,s21_delta_35,ecoMode_roll_mean_5,cityMode_roll_mean_5,sportMode_roll_mean_5,s1_roll_mean_5,s2_roll_mean_5,s3_roll_mean_5,s4_roll_mean_5,s5_roll_mean_5,s6_roll_mean_5,s7_roll_mean_5,s8_roll_mean_5,s9_roll_mean_5,s10_roll_mean_5,s11_roll_mean_5,s12_roll_mean_5,s13_roll_mean_5,s14_roll_mean_5,s15_roll_mean_5,s16_roll_mean_5,s17_roll_mean_5,s18_roll_mean_5,s19_roll_mean_5,s20_roll_mean_5,s21_roll_mean_5,ecoMode_roll_std_5,cityMode_roll_std_5,sportMode_roll_std_5,s1_roll_std_5,s2_roll_std_5,s3_roll_std_5,s4_roll_std_5,s5_roll_std_5,s6_roll_std_5,s7_roll_std_5,s8_roll_std_5,s9_roll_std_5,s10_roll_std_5,s11_roll_std_5,s12_roll_std_5,s13_roll_std_5,s14_roll_std_5,s15_roll_std_5,s16_roll_std_5,s17_roll_std_5,s18_roll_std_5,s19_roll_std_5,s20_roll_std_5,s21_roll_std_5,ecoMode_roll_median_5,cityMode_roll_median_5,sportMode_roll_median_5,s1_roll_median_5,s2_roll_median_5,s3_roll_median_5,s4_roll_median_5,s5_roll_median_5,s6_roll_median_5,s7_roll_median_5,s8_roll_median_5,s9_roll_median_5,s10_roll_median_5,s11_roll_median_5,s12_roll_median_5,s13_roll_median_5,s14_roll_median_5,s15_roll_median_5,s16_roll_median_5,s17_roll_median_5,s18_roll_median_5,s19_roll_median_5,s20_roll_median_5,s21_roll_median_5,ecoMode_roll_mean_10,cityMode_roll_mean_10,sportMode_roll_mean_10,s1_roll_mean_10,s2_roll_mean_10,s3_roll_mean_10,s4_roll_mean_10,s5_roll_mean_10,s6_roll_mean_10,s7_roll_mean_10,s8_roll_mean_10,s9_roll_mean_10,s10_roll_mean_10,s11_roll_mean_10,s12_roll_mean_10,s13_roll_mean_10,s14_roll_mean_10,s15_roll_mean_10,s16_roll_mean_10,s17_roll_mean_10,s18_roll_mean_10,s19_roll_mean_10,s20_roll_mean_10,s21_roll_mean_10,ecoMode_roll_std_10,cityMode_roll_std_10,sportMode_roll_std_10,s1_roll_std_10,s2_roll_std_10,s3_roll_std_10,s4_roll_std_10,s5_roll_std_10,s6_roll_std_10,s7_roll_std_10,s8_roll_std_10,s9_roll_std_10,s10_roll_std_10,s11_roll_std_10,s12_roll_std_10,s13_roll_std_10,s14_roll_std_10,s15_roll_std_10,s16_roll_std_10,s17_roll_std_10,s18_roll_std_10,s19_roll_std_10,s20_roll_std_10,s21_roll_std_10,ecoMode_roll_median_10,cityMode_roll_median_10,sportMode_roll_median_10,s1_roll_median_10,s2_roll_median_10,s3_roll_median_10,s4_roll_median_10,s5_roll_median_10,s6_roll_median_10,s7_roll_median_10,s8_roll_median_10,s9_roll_median_10,s10_roll_median_10,s11_roll_median_10,s12_roll_median_10,s13_roll_median_10,s14_roll_median_10,s15_roll_median_10,s16_roll_median_10,s17_roll_median_10,s18_roll_median_10,s19_roll_median_10,s20_roll_median_10,s21_roll_median_10,ecoMode_roll_mean_15,cityMode_roll_mean_15,sportMode_roll_mean_15,s1_roll_mean_15,s2_roll_mean_15,s3_roll_mean_15,s4_roll_mean_15,s5_roll_mean_15,s6_roll_mean_15,s7_roll_mean_15,s8_roll_mean_15,s9_roll_mean_15,s10_roll_mean_15,s11_roll_mean_15,s12_roll_mean_15,s13_roll_mean_15,s14_roll_mean_15,s15_roll_mean_15,s16_roll_mean_15,s17_roll_mean_15,s18_roll_mean_15,s19_roll_mean_15,s20_roll_mean_15,s21_roll_mean_15,ecoMode_roll_std_15,cityMode_roll_std_15,sportMode_roll_std_15,s1_roll_std_15,s2_roll_std_15,s3_roll_std_15,s4_roll_std_15,s5_roll_std_15,s6_roll_std_15,s7_roll_std_15,s8_roll_std_15,s9_roll_std_15,s10_roll_std_15,s11_roll_std_15,s12_roll_std_15,s13_roll_std_15,s14_roll_std_15,s15_roll_std_15,s16_roll_std_15,s17_roll_std_15,s18_roll_std_15,s19_roll_std_15,s20_roll_std_15,s21_roll_std_15,ecoMode_roll_median_15,cityMode_roll_median_15,sportMode_roll_median_15,s1_roll_median_15,s2_roll_median_15,s3_roll_median_15,s4_roll_median_15,s5_roll_median_15,s6_roll_median_15,s7_roll_median_15,s8_roll_median_15,s9_roll_median_15,s10_roll_median_15,s11_roll_median_15,s12_roll_median_15,s13_roll_median_15,s14_roll_median_15,s15_roll_median_15,s16_roll_median_15,s17_roll_median_15,s18_roll_median_15,s19_roll_median_15,s20_roll_median_15,s21_roll_median_15,ecoMode_cum_mean_5,cityMode_cum_mean_5,sportMode_cum_mean_5,s1_cum_mean_5,s2_cum_mean_5,s3_cum_mean_5,s4_cum_mean_5,s5_cum_mean_5,s6_cum_mean_5,s7_cum_mean_5,s8_cum_mean_5,s9_cum_mean_5,s10_cum_mean_5,s11_cum_mean_5,s12_cum_mean_5,s13_cum_mean_5,s14_cum_mean_5,s15_cum_mean_5,s16_cum_mean_5,s17_cum_mean_5,s18_cum_mean_5,s19_cum_mean_5,s20_cum_mean_5,s21_cum_mean_5,ecoMode_cum_std_5,cityMode_cum_std_5,sportMode_cum_std_5,s1_cum_std_5,s2_cum_std_5,s3_cum_std_5,s4_cum_std_5,s5_cum_std_5,s6_cum_std_5,s7_cum_std_5,s8_cum_std_5,s9_cum_std_5,s10_cum_std_5,s11_cum_std_5,s12_cum_std_5,s13_cum_std_5,s14_cum_std_5,s15_cum_std_5,s16_cum_std_5,s17_cum_std_5,s18_cum_std_5,s19_cum_std_5,s20_cum_std_5,s21_cum_std_5,ecoMode_cum_median_5,cityMode_cum_median_5,sportMode_cum_median_5,s1_cum_median_5,s2_cum_median_5,s3_cum_median_5,s4_cum_median_5,s5_cum_median_5,s6_cum_median_5,s7_cum_median_5,s8_cum_median_5,s9_cum_median_5,s10_cum_median_5,s11_cum_median_5,s12_cum_median_5,s13_cum_median_5,s14_cum_median_5,s15_cum_median_5,s16_cum_median_5,s17_cum_median_5,s18_cum_median_5,s19_cum_median_5,s20_cum_median_5,s21_cum_median_5
0,1,1,-0.0007,-0.0004,100,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100,39.06,23.419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,0.0019,-0.0003,100,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100,39.0,23.4236,0.0026,0.0001,0.0,0.0,0.33,2.12,2.54,0.0,0.0,-0.61,-0.02,-2.12,0.0,0.02,0.62,0.05,-7.13,0.0123,0.0,0.0,0.0,0.0,-0.06,0.0046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,3,-0.0043,0.0003,100,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100,38.95,23.3442,-0.0062,0.0006,0.0,0.0,0.2,-3.83,1.06,0.0,0.0,0.51,0.04,8.87,0.0,-0.22,0.14,-0.04,1.74,-0.014,0.0,-2.0,0.0,0.0,-0.05,-0.0794,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,4,0.0007,0.0,100,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100,38.88,23.3739,0.005,-0.0003,0.0,0.0,0.0,-5.2,-2.33,0.0,0.0,0.19,0.03,-3.46,0.0,-0.14,0.44,0.05,0.6,-0.0496,0.0,2.0,0.0,0.0,-0.07,0.0297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,5,-0.0019,-0.0002,100,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100,38.9,23.4044,-0.0026,-0.0002,0.0,0.0,0.02,0.06,4.35,0.0,0.0,-0.45,-0.05,5.67,0.0,0.15,-0.67,-0.04,-0.03,0.0612,0.0,1.0,0.0,0.0,0.02,0.0305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.00086,-0.00012,100.0,518.67,642.208,1587.03,1403.206,14.62,21.61,554.164,2388.07,9049.566,1.3,47.328,522.282,2388.048,8134.194,8.41334,0.03,391.8,2388.0,100.0,38.958,23.39302,0.002397,0.000277,0.0,0.0,0.234776,4.075678,2.15944,0.0,0.0,0.286234,0.026458,4.587366,0.0,0.151063,0.432574,0.025884,2.651326,0.025953,0.0,1.095445,0.0,0.0,0.073621,0.033498,-0.0007,-0.0002,100.0,518.67,642.35,1587.99,1403.14,14.62,21.61,554.26,2388.06,9049.48,1.3,47.28,522.28,2388.04,8133.8,8.4195,0.03,392.0,2388.0,100.0,38.95,23.4044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.00086,-0.00012,100.0,518.67,642.208,1587.03,1403.206,14.62,21.61,554.164,2388.07,9049.566,1.3,47.328,522.282,2388.048,8134.194,8.41334,0.03,391.8,2388.0,100.0,38.958,23.39302,0.002397,0.000277,0.0,0.0,0.234776,4.075678,2.15944,0.0,0.0,0.286234,0.026458,4.587366,0.0,0.151063,0.432574,0.025884,2.651326,0.025953,0.0,1.095445,0.0,0.0,0.073621,0.033498,-0.0007,-0.0002,100.0,518.67,642.35,1587.99,1403.14,14.62,21.61,554.26,2388.06,9049.48,1.3,47.28,522.28,2388.04,8133.8,8.4195,0.03,392.0,2388.0,100.0,38.95,23.4044


# Lets populate the target first in the train file

In [12]:
train_gby_vid = train_data.groupby(['vehicleId']).agg({'days':'max'}).reset_index().rename(columns = {'days':'breakdown_day'})
traindf = pd.merge(traindf,train_gby_vid,on='vehicleId',how='left')

In [13]:
# breakdown in next 30 days
traindf['target'] = traindf.apply(lambda x : 1 if x['breakdown_day']-x['days']<=30 else 0,axis=1)

In [14]:
del traindf['breakdown_day']

In [15]:
len(traindf.columns)

435

# train validation split

In [16]:
# Iam going to be splitting the dataset using vehicle IDs
print(traindf['vehicleId'].unique())

[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100]


In [17]:
# get random 20 numbers from a list

import random
random.seed(1)
val_ids = random.sample(list(traindf['vehicleId'].unique()), 20)
train = traindf[traindf['vehicleId'].apply(lambda x: x not in val_ids)]
val = traindf[traindf['vehicleId'].apply(lambda x: x  in val_ids)]

In [18]:
train.head()

Unnamed: 0,vehicleId,days,ecoMode,cityMode,sportMode,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21,ecoMode_delta_1,cityMode_delta_1,sportMode_delta_1,s1_delta_1,s2_delta_1,s3_delta_1,s4_delta_1,s5_delta_1,s6_delta_1,s7_delta_1,s8_delta_1,s9_delta_1,s10_delta_1,s11_delta_1,s12_delta_1,s13_delta_1,s14_delta_1,s15_delta_1,s16_delta_1,s17_delta_1,s18_delta_1,s19_delta_1,s20_delta_1,s21_delta_1,ecoMode_delta_7,cityMode_delta_7,sportMode_delta_7,s1_delta_7,s2_delta_7,s3_delta_7,s4_delta_7,s5_delta_7,s6_delta_7,s7_delta_7,s8_delta_7,s9_delta_7,s10_delta_7,s11_delta_7,s12_delta_7,s13_delta_7,s14_delta_7,s15_delta_7,s16_delta_7,s17_delta_7,s18_delta_7,s19_delta_7,s20_delta_7,s21_delta_7,ecoMode_delta_14,cityMode_delta_14,sportMode_delta_14,s1_delta_14,s2_delta_14,s3_delta_14,s4_delta_14,s5_delta_14,s6_delta_14,s7_delta_14,s8_delta_14,s9_delta_14,s10_delta_14,s11_delta_14,s12_delta_14,s13_delta_14,s14_delta_14,s15_delta_14,s16_delta_14,s17_delta_14,s18_delta_14,s19_delta_14,s20_delta_14,s21_delta_14,ecoMode_delta_28,cityMode_delta_28,sportMode_delta_28,s1_delta_28,s2_delta_28,s3_delta_28,s4_delta_28,s5_delta_28,s6_delta_28,s7_delta_28,s8_delta_28,s9_delta_28,s10_delta_28,s11_delta_28,s12_delta_28,s13_delta_28,s14_delta_28,s15_delta_28,s16_delta_28,s17_delta_28,s18_delta_28,s19_delta_28,s20_delta_28,s21_delta_28,ecoMode_delta_35,cityMode_delta_35,sportMode_delta_35,s1_delta_35,s2_delta_35,s3_delta_35,s4_delta_35,s5_delta_35,s6_delta_35,s7_delta_35,s8_delta_35,s9_delta_35,s10_delta_35,s11_delta_35,s12_delta_35,s13_delta_35,s14_delta_35,s15_delta_35,s16_delta_35,s17_delta_35,s18_delta_35,s19_delta_35,s20_delta_35,s21_delta_35,ecoMode_roll_mean_5,cityMode_roll_mean_5,sportMode_roll_mean_5,s1_roll_mean_5,s2_roll_mean_5,s3_roll_mean_5,s4_roll_mean_5,s5_roll_mean_5,s6_roll_mean_5,s7_roll_mean_5,s8_roll_mean_5,s9_roll_mean_5,s10_roll_mean_5,s11_roll_mean_5,s12_roll_mean_5,s13_roll_mean_5,s14_roll_mean_5,s15_roll_mean_5,s16_roll_mean_5,s17_roll_mean_5,s18_roll_mean_5,s19_roll_mean_5,s20_roll_mean_5,s21_roll_mean_5,ecoMode_roll_std_5,cityMode_roll_std_5,sportMode_roll_std_5,s1_roll_std_5,s2_roll_std_5,s3_roll_std_5,s4_roll_std_5,s5_roll_std_5,s6_roll_std_5,s7_roll_std_5,s8_roll_std_5,s9_roll_std_5,s10_roll_std_5,s11_roll_std_5,s12_roll_std_5,s13_roll_std_5,s14_roll_std_5,s15_roll_std_5,s16_roll_std_5,s17_roll_std_5,s18_roll_std_5,s19_roll_std_5,s20_roll_std_5,s21_roll_std_5,ecoMode_roll_median_5,cityMode_roll_median_5,sportMode_roll_median_5,s1_roll_median_5,s2_roll_median_5,s3_roll_median_5,s4_roll_median_5,s5_roll_median_5,s6_roll_median_5,s7_roll_median_5,s8_roll_median_5,s9_roll_median_5,s10_roll_median_5,s11_roll_median_5,s12_roll_median_5,s13_roll_median_5,s14_roll_median_5,s15_roll_median_5,s16_roll_median_5,s17_roll_median_5,s18_roll_median_5,s19_roll_median_5,s20_roll_median_5,s21_roll_median_5,ecoMode_roll_mean_10,cityMode_roll_mean_10,sportMode_roll_mean_10,s1_roll_mean_10,s2_roll_mean_10,s3_roll_mean_10,s4_roll_mean_10,s5_roll_mean_10,s6_roll_mean_10,s7_roll_mean_10,s8_roll_mean_10,s9_roll_mean_10,s10_roll_mean_10,s11_roll_mean_10,s12_roll_mean_10,s13_roll_mean_10,s14_roll_mean_10,s15_roll_mean_10,s16_roll_mean_10,s17_roll_mean_10,s18_roll_mean_10,s19_roll_mean_10,s20_roll_mean_10,s21_roll_mean_10,ecoMode_roll_std_10,cityMode_roll_std_10,sportMode_roll_std_10,s1_roll_std_10,s2_roll_std_10,s3_roll_std_10,s4_roll_std_10,s5_roll_std_10,s6_roll_std_10,s7_roll_std_10,s8_roll_std_10,s9_roll_std_10,s10_roll_std_10,s11_roll_std_10,s12_roll_std_10,s13_roll_std_10,s14_roll_std_10,s15_roll_std_10,s16_roll_std_10,s17_roll_std_10,s18_roll_std_10,s19_roll_std_10,s20_roll_std_10,s21_roll_std_10,ecoMode_roll_median_10,cityMode_roll_median_10,sportMode_roll_median_10,s1_roll_median_10,s2_roll_median_10,s3_roll_median_10,s4_roll_median_10,s5_roll_median_10,s6_roll_median_10,s7_roll_median_10,s8_roll_median_10,s9_roll_median_10,s10_roll_median_10,s11_roll_median_10,s12_roll_median_10,s13_roll_median_10,s14_roll_median_10,s15_roll_median_10,s16_roll_median_10,s17_roll_median_10,s18_roll_median_10,s19_roll_median_10,s20_roll_median_10,s21_roll_median_10,ecoMode_roll_mean_15,cityMode_roll_mean_15,sportMode_roll_mean_15,s1_roll_mean_15,s2_roll_mean_15,s3_roll_mean_15,s4_roll_mean_15,s5_roll_mean_15,s6_roll_mean_15,s7_roll_mean_15,s8_roll_mean_15,s9_roll_mean_15,s10_roll_mean_15,s11_roll_mean_15,s12_roll_mean_15,s13_roll_mean_15,s14_roll_mean_15,s15_roll_mean_15,s16_roll_mean_15,s17_roll_mean_15,s18_roll_mean_15,s19_roll_mean_15,s20_roll_mean_15,s21_roll_mean_15,ecoMode_roll_std_15,cityMode_roll_std_15,sportMode_roll_std_15,s1_roll_std_15,s2_roll_std_15,s3_roll_std_15,s4_roll_std_15,s5_roll_std_15,s6_roll_std_15,s7_roll_std_15,s8_roll_std_15,s9_roll_std_15,s10_roll_std_15,s11_roll_std_15,s12_roll_std_15,s13_roll_std_15,s14_roll_std_15,s15_roll_std_15,s16_roll_std_15,s17_roll_std_15,s18_roll_std_15,s19_roll_std_15,s20_roll_std_15,s21_roll_std_15,ecoMode_roll_median_15,cityMode_roll_median_15,sportMode_roll_median_15,s1_roll_median_15,s2_roll_median_15,s3_roll_median_15,s4_roll_median_15,s5_roll_median_15,s6_roll_median_15,s7_roll_median_15,s8_roll_median_15,s9_roll_median_15,s10_roll_median_15,s11_roll_median_15,s12_roll_median_15,s13_roll_median_15,s14_roll_median_15,s15_roll_median_15,s16_roll_median_15,s17_roll_median_15,s18_roll_median_15,s19_roll_median_15,s20_roll_median_15,s21_roll_median_15,ecoMode_cum_mean_5,cityMode_cum_mean_5,sportMode_cum_mean_5,s1_cum_mean_5,s2_cum_mean_5,s3_cum_mean_5,s4_cum_mean_5,s5_cum_mean_5,s6_cum_mean_5,s7_cum_mean_5,s8_cum_mean_5,s9_cum_mean_5,s10_cum_mean_5,s11_cum_mean_5,s12_cum_mean_5,s13_cum_mean_5,s14_cum_mean_5,s15_cum_mean_5,s16_cum_mean_5,s17_cum_mean_5,s18_cum_mean_5,s19_cum_mean_5,s20_cum_mean_5,s21_cum_mean_5,ecoMode_cum_std_5,cityMode_cum_std_5,sportMode_cum_std_5,s1_cum_std_5,s2_cum_std_5,s3_cum_std_5,s4_cum_std_5,s5_cum_std_5,s6_cum_std_5,s7_cum_std_5,s8_cum_std_5,s9_cum_std_5,s10_cum_std_5,s11_cum_std_5,s12_cum_std_5,s13_cum_std_5,s14_cum_std_5,s15_cum_std_5,s16_cum_std_5,s17_cum_std_5,s18_cum_std_5,s19_cum_std_5,s20_cum_std_5,s21_cum_std_5,ecoMode_cum_median_5,cityMode_cum_median_5,sportMode_cum_median_5,s1_cum_median_5,s2_cum_median_5,s3_cum_median_5,s4_cum_median_5,s5_cum_median_5,s6_cum_median_5,s7_cum_median_5,s8_cum_median_5,s9_cum_median_5,s10_cum_median_5,s11_cum_median_5,s12_cum_median_5,s13_cum_median_5,s14_cum_median_5,s15_cum_median_5,s16_cum_median_5,s17_cum_median_5,s18_cum_median_5,s19_cum_median_5,s20_cum_median_5,s21_cum_median_5,target
192,2,1,-0.0018,0.0006,100,518.67,641.89,1583.84,1391.28,14.62,21.6,554.53,2388.01,9054.72,1.3,46.93,522.33,2388.06,8137.72,8.3905,0.03,391,2388,100,38.94,23.4585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
193,2,2,0.0043,-0.0003,100,518.67,641.82,1587.05,1393.13,14.62,21.61,554.77,2387.98,9051.31,1.3,47.24,522.7,2387.98,8131.09,8.4167,0.03,392,2388,100,39.06,23.4085,0.0061,-0.0009,0.0,0.0,-0.07,3.21,1.85,0.0,0.01,0.24,-0.03,-3.41,0.0,0.31,0.37,-0.08,-6.63,0.0262,0.0,1.0,0.0,0.0,0.12,-0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
194,2,3,0.0018,0.0003,100,518.67,641.55,1588.32,1398.96,14.62,21.6,555.14,2388.04,9054.24,1.3,47.22,522.58,2387.99,8140.58,8.3802,0.03,391,2388,100,39.11,23.425,-0.0025,0.0006,0.0,0.0,-0.27,1.27,5.83,0.0,-0.01,0.37,0.06,2.93,0.0,-0.02,-0.12,0.01,9.49,-0.0365,0.0,-1.0,0.0,0.0,0.05,0.0165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
195,2,4,0.0035,-0.0004,100,518.67,641.68,1584.15,1396.08,14.62,21.61,554.25,2387.98,9058.01,1.3,47.1,522.49,2387.93,8140.44,8.4018,0.03,391,2388,100,39.13,23.5027,0.0017,-0.0007,0.0,0.0,0.13,-4.17,-2.88,0.0,0.01,-0.89,-0.06,3.77,0.0,-0.12,-0.09,-0.06,-0.14,0.0216,0.0,0.0,0.0,0.0,0.02,0.0777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
196,2,5,0.0005,0.0004,100,518.67,641.73,1579.03,1402.52,14.62,21.6,555.12,2388.03,9058.15,1.3,47.25,522.27,2387.94,8136.67,8.3867,0.03,390,2388,100,39.18,23.4234,-0.003,0.0008,0.0,0.0,0.05,-5.12,6.44,0.0,-0.01,0.87,0.05,0.14,0.0,0.15,-0.22,0.01,-3.77,-0.0151,0.0,-1.0,0.0,0.0,0.05,-0.0793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00166,0.00012,100.0,518.67,641.734,1584.478,1396.394,14.62,21.604,554.762,2388.008,9055.286,1.3,47.148,522.474,2387.98,8137.3,8.39518,0.03,391.0,2388.0,100.0,39.084,23.44362,0.002434,0.000444,0.0,0.0,0.130882,3.590664,4.500487,0.0,0.005477,0.383106,0.027749,2.865559,0.0,0.135904,0.17672,0.051478,3.865081,0.014363,0.0,0.707107,0.0,0.0,0.091269,0.037752,0.0018,0.0003,100.0,518.67,641.73,1584.15,1396.08,14.62,21.6,554.77,2388.01,9054.72,1.3,47.22,522.49,2387.98,8137.72,8.3905,0.03,391.0,2388.0,100.0,39.11,23.425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00166,0.00012,100.0,518.67,641.734,1584.478,1396.394,14.62,21.604,554.762,2388.008,9055.286,1.3,47.148,522.474,2387.98,8137.3,8.39518,0.03,391.0,2388.0,100.0,39.084,23.44362,0.002434,0.000444,0.0,0.0,0.130882,3.590664,4.500487,0.0,0.005477,0.383106,0.027749,2.865559,0.0,0.135904,0.17672,0.051478,3.865081,0.014363,0.0,0.707107,0.0,0.0,0.091269,0.037752,0.0018,0.0003,100.0,518.67,641.73,1584.15,1396.08,14.62,21.6,554.77,2388.01,9054.72,1.3,47.22,522.49,2387.98,8137.72,8.3905,0.03,391.0,2388.0,100.0,39.11,23.425,0


# Feature selection

In [19]:
X = train.copy().sample(frac=1)
y = X['target']
del X['target']
del X['vehicleId']


In [20]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=500), max_features=100)
embeded_rf_selector.fit(X, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

82 selected features


In [21]:
print(embeded_rf_feature)

['days', 's4', 's7', 's9', 's11', 's12', 's9_delta_35', 's14_delta_35', 's2_roll_mean_5', 's3_roll_mean_5', 's4_roll_mean_5', 's7_roll_mean_5', 's9_roll_mean_5', 's11_roll_mean_5', 's12_roll_mean_5', 's14_roll_mean_5', 's15_roll_mean_5', 's17_roll_mean_5', 's21_roll_mean_5', 's2_roll_median_5', 's3_roll_median_5', 's4_roll_median_5', 's7_roll_median_5', 's9_roll_median_5', 's11_roll_median_5', 's12_roll_median_5', 's14_roll_median_5', 's15_roll_median_5', 's17_roll_median_5', 's20_roll_median_5', 's21_roll_median_5', 's2_roll_mean_10', 's3_roll_mean_10', 's4_roll_mean_10', 's9_roll_mean_10', 's11_roll_mean_10', 's12_roll_mean_10', 's15_roll_mean_10', 's17_roll_mean_10', 's20_roll_mean_10', 's21_roll_mean_10', 's2_roll_median_10', 's3_roll_median_10', 's4_roll_median_10', 's9_roll_median_10', 's11_roll_median_10', 's12_roll_median_10', 's14_roll_median_10', 's15_roll_median_10', 's17_roll_median_10', 's21_roll_median_10', 's2_roll_mean_15', 's3_roll_mean_15', 's4_roll_mean_15', 's9_roll

# Train 

### Random hypertuning

In [22]:
train = train[embeded_rf_feature+['target']].sample(frac=1)
val = val[embeded_rf_feature+['target']].sample(frac=1)

X = train.copy()
y = X['target']
del X['target']

In [23]:
val_X = val.copy()
val_y = val_X['target']
del val_X['target']

In [24]:
from lightgbm import LGBMClassifier
from sklearn import metrics

In [25]:
def evaluate_model(X, y, val_X,val_y, params):
    model = LGBMClassifier(objective ='binary',
                            boosting ='gbdt', #dart
                            n_jobs = 4,
                            **params)
        
    model.fit(X,y)
    print("accuracy:",metrics.accuracy_score(model.predict(val_X),val_y))
    return metrics.log_loss(val_y, 
                             model.predict_proba(val_X))

In [26]:
param_grid = {
    'num_leaves': [16, 64, 128, 32],
    'n_estimators': [200, 400, 600, 800], #default class*iteration=2*100
    'bagging_freq': 5,
    'bagging_fraction' : [0.8, 0.9, .99 ],  # subsample
    'feature_fraction' : [0.8, 0.9, .99],  # colsample_bytree
    'reg_alpha': [0.2, 0.6, 0.8],
    'reg_lambda': [0.4, 0.6, 0.8],
    'max_depth' : [2,4,6,8,12],
    'learning_rate' : [.1,.05,.01]
}

print('Tuning begins...')
best_eval_score = 0
for i in range(50):
    params = {k: np.random.choice(v) for k, v in param_grid.items()}
    score = evaluate_model(X, y, val_X,val_y, params)
    
    print(params,score)
    if score < best_eval_score or best_eval_score == 0:
        best_eval_score = score
        best_params = params
print("Best evaluation logloss", best_eval_score)

Tuning begins...
accuracy: 0.9804164600892414
{'num_leaves': 32, 'n_estimators': 800, 'bagging_freq': 0, 'bagging_fraction': 0.8, 'feature_fraction': 0.8, 'reg_alpha': 0.6, 'reg_lambda': 0.8, 'max_depth': 8, 'learning_rate': 0.01} 0.04855312320926555
accuracy: 0.9833911750123946
{'num_leaves': 128, 'n_estimators': 600, 'bagging_freq': 4, 'bagging_fraction': 0.9, 'feature_fraction': 0.99, 'reg_alpha': 0.2, 'reg_lambda': 0.8, 'max_depth': 8, 'learning_rate': 0.05} 0.05182981664970322
accuracy: 0.9826474962816063
{'num_leaves': 32, 'n_estimators': 400, 'bagging_freq': 4, 'bagging_fraction': 0.99, 'feature_fraction': 0.99, 'reg_alpha': 0.6, 'reg_lambda': 0.8, 'max_depth': 6, 'learning_rate': 0.05} 0.04928723222053421
accuracy: 0.981903817550818
{'num_leaves': 128, 'n_estimators': 200, 'bagging_freq': 3, 'bagging_fraction': 0.99, 'feature_fraction': 0.9, 'reg_alpha': 0.8, 'reg_lambda': 0.8, 'max_depth': 6, 'learning_rate': 0.1} 0.05063357809634253
accuracy: 0.9836390679226574
{'num_leaves':

In [27]:
best_params

{'num_leaves': 16,
 'n_estimators': 400,
 'bagging_freq': 4,
 'bagging_fraction': 0.8,
 'feature_fraction': 0.8,
 'reg_alpha': 0.2,
 'reg_lambda': 0.4,
 'max_depth': 2,
 'learning_rate': 0.05}

# run model on whole data with best params

In [28]:
traindf = traindf[embeded_rf_feature+['target']].sample(frac=1)

X = traindf.copy()
y = X['target']
del X['target']

In [29]:
model = LGBMClassifier(objective ='binary',
                            boosting ='gbdt', #dart
                            n_jobs = 4,
                            **best_params)
model.fit(X,y)

LGBMClassifier(bagging_fraction=0.8, bagging_freq=4, boosting='gbdt',
               boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               feature_fraction=0.8, importance_type='split',
               learning_rate=0.05, max_depth=2, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=400,
               n_jobs=4, num_leaves=16, objective='binary', random_state=None,
               reg_alpha=0.2, reg_lambda=0.4, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

# Test Data

In [30]:
test_data = pd.read_csv('../input/car_breakdown_test.tsv', sep='\t', header=0)
test_data.head()

Unnamed: 0,vehicleId,days,ecoMode,cityMode,sportMode,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,0.0023,0.0003,100,518.67,643.02,1585.29,1398.21,14.62,21.61,553.9,2388.04,9050.17,1.3,47.2,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100,38.86,23.3735
1,1,2,-0.0027,-0.0003,100,518.67,641.71,1588.45,1395.42,14.62,21.61,554.85,2388.01,9054.42,1.3,47.5,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100,39.02,23.3916
2,1,3,0.0003,0.0001,100,518.67,642.46,1586.94,1401.34,14.62,21.61,554.11,2388.05,9056.96,1.3,47.5,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100,39.08,23.4166
3,1,4,0.0042,0.0,100,518.67,642.44,1584.12,1406.42,14.62,21.61,554.07,2388.03,9045.29,1.3,47.28,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100,39.0,23.3737
4,1,5,0.0014,0.0,100,518.67,642.51,1587.19,1401.92,14.62,21.61,554.16,2388.01,9044.55,1.3,47.31,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100,38.99,23.413


In [31]:
diffcols = create_difference_columns(test_data)
rollcols = create_rolling_columns(test_data)
cumcols = create_cumulative_columns(test_data)

In [32]:
testdf = pd.concat([test_data,diffcols,rollcols,cumcols],axis=1)

In [33]:
test_truth_data = pd.read_csv('../input/car_breakdown_test_truth.tsv', sep='\t', header=0)
test_truth_data.head()

Unnamed: 0,vehicleId,RUL
0,1,112
1,2,98
2,3,69
3,4,82
4,5,91


In [34]:
testdf = pd.merge(testdf,test_truth_data,on='vehicleId',how='left')

In [35]:
testdf['target'] = testdf.apply(lambda x : 1 if x['RUL']-x['days']<=30 else 0,axis=1)

In [36]:
test = testdf[embeded_rf_feature+['target']]

In [37]:
testX = test.copy()
testy = testX['target']
del testX['target']

In [38]:
print(metrics.accuracy_score(model.predict(testX),testy))

0.3424709835064142


# Reason for bad performance

I got a accuracy of 97% in the validation data(which is splitted using vehicle iDs, so no problem with time component). So why bad performance on test data? Reason below:

- The distribution of breakdown in test file is very different than that in the training file. In training file the cars are prone to breakdown only after 128 days(minimum breakdown day) while in the test file the breakdown could happen on the 7th day after start.

- To deal with this I tried removing the days feature from the model but still the data distribution is totally different. 

- Another confusion - In the description it says that - 

'''Test data has the exactly same schema as the training data. Except the fact that the data doesn't represent when the failure has occurrs, in other words the last row for a given **vehicleId** doesn't represent the day of breakdown(it has happened earlier than the last row).'''

But when I see for vehicleId 1 there are just 30 rows and its RUL is 112 days. BAsed on the description, there should be more than 112 rows for vehicle ID 1.

In [39]:
test_data[test_data['vehicleId']==1]

Unnamed: 0,vehicleId,days,ecoMode,cityMode,sportMode,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,0.0023,0.0003,100,518.67,643.02,1585.29,1398.21,14.62,21.61,553.9,2388.04,9050.17,1.3,47.2,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100,38.86,23.3735
1,1,2,-0.0027,-0.0003,100,518.67,641.71,1588.45,1395.42,14.62,21.61,554.85,2388.01,9054.42,1.3,47.5,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100,39.02,23.3916
2,1,3,0.0003,0.0001,100,518.67,642.46,1586.94,1401.34,14.62,21.61,554.11,2388.05,9056.96,1.3,47.5,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100,39.08,23.4166
3,1,4,0.0042,0.0,100,518.67,642.44,1584.12,1406.42,14.62,21.61,554.07,2388.03,9045.29,1.3,47.28,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100,39.0,23.3737
4,1,5,0.0014,0.0,100,518.67,642.51,1587.19,1401.92,14.62,21.61,554.16,2388.01,9044.55,1.3,47.31,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100,38.99,23.413
5,1,6,0.0012,0.0003,100,518.67,642.11,1579.12,1395.13,14.62,21.61,554.22,2388.0,9050.96,1.3,47.26,521.92,2388.08,8127.46,8.4238,0.03,392,2388,100,38.91,23.3467
6,1,7,0.0,0.0002,100,518.67,642.11,1583.34,1404.84,14.62,21.61,553.89,2388.05,9051.39,1.3,47.31,522.01,2388.06,8134.97,8.3914,0.03,391,2388,100,38.85,23.3952
7,1,8,0.0006,0.0,100,518.67,642.54,1580.89,1400.89,14.62,21.61,553.59,2388.05,9052.86,1.3,47.21,522.09,2388.06,8125.93,8.4213,0.03,393,2388,100,39.05,23.3224
8,1,9,-0.0036,0.0,100,518.67,641.88,1593.29,1412.28,14.62,21.61,554.49,2388.06,9048.55,1.3,47.37,522.03,2388.05,8134.15,8.4353,0.03,391,2388,100,39.1,23.4521
9,1,10,-0.0025,-0.0001,100,518.67,642.07,1585.25,1398.64,14.62,21.61,554.28,2388.04,9051.95,1.3,47.14,522.0,2388.06,8134.08,8.4093,0.03,391,2388,100,38.87,23.382
