In [2]:
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from scipy.sparse import csr_matrix, vstack

%matplotlib inline

# Custom modules
import const
import func

## Load data

In [3]:
# Look-up table with one row per feature: its line / station and, among
# others, `col_dat`, which later cells use as column positions into the
# date matrix.
lut = pd.read_csv(const.LOOK_UP_TABLE)

# Preview the first rows
lut.head(n=3)

Unnamed: 0,line,station,feature_nr,feat_nr_dat,name_dat,name_cat,name_num,col_dat,col_num,col_cat,station_V2,line_V2
0,0,0,0,1.0,L0_S0_D1,,L0_S0_F0,0.0,0.0,,0.0,1.0
1,0,0,2,3.0,L0_S0_D3,,L0_S0_F2,1.0,1.0,,0.0,1.0
2,0,0,4,5.0,L0_S0_D5,,L0_S0_F4,2.0,2.0,,0.0,1.0


In [4]:
# Train file: keep the feature matrix and the sample ids
dat = func.load_data_file(const.TRAIN_FILES[2])
train_part = dat['data']
dat_train = train_part['features']
id_train = train_part['ids']

# Test file; from here on `dat` refers to the test file contents
dat = func.load_data_file(const.TEST_FILES[2])
test_part = dat['data']

# Train rows first, test rows below; ids are concatenated in the same order
# so that row i of `dat_data` belongs to row i of `ids`.
ids = pd.concat([id_train, test_part['ids']], axis=0)
dat_data = vstack([dat_train, test_part['features']], format='csr')

Returning <open file '/Volumes/My Book/kaggle_bosch/train_date.pkl', mode 'rb' at 0x115c50d20>.pkl
Returning <open file '/Volumes/My Book/kaggle_bosch/test_date.pkl', mode 'rb' at 0x115c50d20>.pkl


## Calculate features based on base line definition

In [29]:
# Maximum date value per production line (lines 0-3) for every train and
# test sample, plus the same values of the previous / next sample by Id.
df = pd.DataFrame(columns=['L0max','L1max','L2max','L3max'], index=ids.Id)
for l in range(4):
    # Column positions of this line's date features; look-up rows without a
    # date column carry NaN in col_dat and are skipped.
    col_date = [int(i) for i in lut[lut['line']==l].col_dat.values if not np.isnan(i)]

    # Row-wise max over the selected columns, flattened to a 1-D array
    line_max = pd.Series(dat_data[:, col_date].max(1).todense().A1)

    # A max of 0 is treated as missing (presumably the sample has no date
    # entry on this line). The rounded result is kept explicitly: calling
    # .round(2) without assigning it back has no effect, and an in-place
    # replace on `df[col]` is a chained operation that pandas does not
    # guarantee to write through to `df`.
    line_max = line_max.replace(0, np.nan).round(2)

    # Assign by position (.values) so pandas does not try to align the
    # default integer index of `line_max` with the Id index of `df`.
    df['L{}max'.format(l)] = line_max.values

# First reset turns Id into a column, second reset adds the original row
# position as an 'index' column so the sorting can be checked afterwards.
df = df.reset_index().reset_index()

# Sort by ID
df = df.sort_values(['Id'])

# Snapshot the column list first: the loop adds columns to `df`, and only
# the columns that exist now should get a _prev / _next companion.
for col in list(df.columns):
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)

df = df.set_index('Id')

In [30]:
# Encode, per line, how a sample's max date relates to its neighbour's:
#   2 -> both values are present and equal
#   1 -> both values are missing
#   0 -> anything else
# (NaN == NaN is False, so the two terms never add up.)
feat_cols = []

for l in range(4):
    current = df['L{}max'.format(l)]

    # 'next' is created before 'prev' to keep the column order in df
    for direction in ('next', 'prev'):
        neighbour = df['L{}max_{}'.format(l, direction)]

        is_equal = (current == neighbour).astype(int)
        both_missing = (current.isnull() & neighbour.isnull()).astype(int)

        df['sameL{}_{}'.format(l, direction)] = 2 * is_equal + 1 * both_missing

    feat_cols.extend(['sameL{}_prev'.format(l), 'sameL{}_next'.format(l)])


In [21]:
# Store the same-line features; the Id index is written under the header 'ID'
out_path = os.path.join(const.DATA_PATH, 'feat_set_jayjay_same_L_new.csv')
df[feat_cols].to_csv(out_path, index_label='ID')

## Calculate features based on new line definition

In [32]:
# Distinct values of the new (V2) line definition found in the look-up table
line_V2s = lut['line_V2'].unique()

# Call form of print: identical output on Python 2 for a single argument,
# and valid syntax on Python 3 (the print statement is not).
print(line_V2s)

[ 1.   2.   3.1  3.2  3.3  4.1  4.   4.2  4.3  4.4  5.   6.   7. ]


In [33]:
# Maximum date value per V2 line for every train and test sample, plus the
# same values of the previous / next sample by Id.
df = pd.DataFrame(columns=['L{}_V2_MAX'.format(x) for x in line_V2s], index=ids.Id)
for l in line_V2s:
    # Column positions of this line's date features; look-up rows without a
    # date column carry NaN in col_dat and are skipped.
    col_date = [int(i) for i in lut[lut['line_V2']==l].col_dat.values if not np.isnan(i)]

    # Row-wise max over the selected columns, flattened to a 1-D array
    line_max = pd.Series(dat_data[:, col_date].max(1).todense().A1)

    # A max of 0 is treated as missing (presumably the sample has no date
    # entry on this line). The replacement is done on a standalone Series
    # and assigned back, because an in-place replace on `df[col]` is a
    # chained operation that pandas does not guarantee to write through.
    line_max = line_max.replace(0, np.nan)

    # Assign by position (.values) so pandas does not try to align the
    # default integer index of `line_max` with the Id index of `df`.
    df['L{}_V2_MAX'.format(l)] = line_max.values

# First reset turns Id into a column, second reset adds the original row
# position as an 'index' column so the sorting can be checked afterwards.
df = df.reset_index().reset_index()

# Sort by ID
df = df.sort_values(['Id'])

# Snapshot the column list first: the loop adds columns to `df`, and only
# the columns that exist now should get a _prev / _next companion.
for col in list(df.columns):
    df[col + '_prev'] = df[col].shift(1)
    df[col + '_next'] = df[col].shift(-1)

df = df.set_index('Id')

In [34]:
# Preview: first three rows after sorting by Id; the 'index' column holds the
# row position from before the sort.
df.head(3)

Unnamed: 0_level_0,index,L1.0_V2_MAX,L2.0_V2_MAX,L3.1_V2_MAX,L3.2_V2_MAX,L3.3_V2_MAX,L4.1_V2_MAX,L4.0_V2_MAX,L4.2_V2_MAX,L4.3_V2_MAX,...,L4.3_V2_MAX_prev,L4.3_V2_MAX_next,L4.4_V2_MAX_prev,L4.4_V2_MAX_next,L5.0_V2_MAX_prev,L5.0_V2_MAX_next,L6.0_V2_MAX_prev,L6.0_V2_MAX_next,L7.0_V2_MAX_prev,L7.0_V2_MAX_next
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1183747,,,,,792.77002,,,,,...,,,,,,1045.959961,,1060.069946,,
2,1183748,,,1025.640015,,,,,,,...,,,,,794.719971,704.109985,800.700012,711.080017,,
3,1183749,,,,,671.950012,,,,,...,,,,,1045.959961,,1060.069946,87.290001,,


In [35]:
# Encode, per V2 line, how a sample's max date relates to its neighbour's:
#   2 -> both values are present and equal
#   1 -> both values are missing
#   0 -> anything else
# (NaN == NaN is False, so the two terms never add up.)
feat_cols = []

for l in line_V2s:
    current = df['L{}_V2_MAX'.format(l)]

    # 'next' is created before 'prev' to keep the column order in df
    for direction in ('next', 'prev'):
        neighbour = df['L{}_V2_MAX_{}'.format(l, direction)]

        is_equal = (current == neighbour).astype(int)
        both_missing = (current.isnull() & neighbour.isnull()).astype(int)

        df['sameL{}_V2_{}'.format(l, direction)] = 2 * is_equal + 1 * both_missing

    feat_cols.extend(['sameL{}_V2_prev'.format(l), 'sameL{}_V2_next'.format(l)])


In [36]:
# Distribution of the 0 / 1 / 2 codes of every V2 'next' feature.
# print is written as a call: same output on Python 2 for a single argument,
# and valid syntax on Python 3 (the print statement is not).
for l in line_V2s:
    print(df['sameL{}_V2_next'.format(l)].value_counts())

0    1806883
1     470769
2      89843
Name: sameL1.0_V2_next, dtype: int64
1    1520559
0     817657
2      29279
Name: sameL2.0_V2_next, dtype: int64
1    2274600
0      91436
2       1459
Name: sameL3.1_V2_next, dtype: int64
1    2262296
0     102192
2       3007
Name: sameL3.2_V2_next, dtype: int64
1    1878591
0     477023
2      11881
Name: sameL3.3_V2_next, dtype: int64
1    2154762
0     208479
2       4254
Name: sameL4.1_V2_next, dtype: int64
1    2262677
0     102608
2       2210
Name: sameL4.0_V2_next, dtype: int64
1    2339544
0      27345
2        606
Name: sameL4.2_V2_next, dtype: int64
1    2350903
0      16335
2        257
Name: sameL4.3_V2_next, dtype: int64
1    2298473
0      67518
2       1504
Name: sameL4.4_V2_next, dtype: int64
1    1182329
0    1150975
2      34191
Name: sameL5.0_V2_next, dtype: int64
0    2267265
2      88471
1      11759
Name: sameL6.0_V2_next, dtype: int64
1    2138683
0     223096
2       5716
Name: sameL7.0_V2_next, dtype: int64


In [37]:
# Store the V2 same-line features; the Id index is written under the header 'ID'
out_path_v2 = os.path.join(const.DATA_PATH, 'feat_set_V2_same_L_new.csv')
df[feat_cols].to_csv(out_path_v2, index_label='ID')