## Benchmark for measuring performance of engineered features and models

Prepare data for later analysis

In [1]:
import sys
sys.path.insert(0, '../../bosch_helper')
from bosch_helper import *

%matplotlib inline

  from collections import Sequence


### Analysis of numeric features

In [2]:
# Reuse the cached HDF table when it exists; otherwise build it from the raw
# train/test CSV archives and cache the result.
if os.path.exists('../../data/data.hdf'):
    x = pd.read_hdf('../../data/data.hdf', 'numeric')
else:
    # Read only a few rows to learn the column names, then request float32
    # for every column and int64 for the Id index column.
    col_train = dict.fromkeys(
        pd.read_csv('../../data/train_numeric.csv.zip', index_col=0, nrows=10).columns,
        np.float32,
    )
    col_train['Id'] = np.int64

    col_test = dict.fromkeys(
        pd.read_csv('../../data/test_numeric.csv.zip', index_col=0, nrows=10).columns,
        np.float32,
    )
    col_test['Id'] = np.int64

    train = pd.read_csv('../../data/train_numeric.csv.zip', index_col=0, dtype=col_train)
    test = pd.read_csv('../../data/test_numeric.csv.zip', index_col=0, dtype=col_test)

    # Stack both splits under an outer 'train'/'test' index level.
    x = pd.concat([train, test], keys=['train', 'test'])

    # Release the per-split frames before writing the combined table.
    del train
    del test
    gc.collect()

    x.to_hdf('../../data/data.hdf', 'numeric', complib='blosc:lz4', complevel=9, format='t')

### Load date data, calculate start time, end time, start station, and end station

In [4]:
# Flag is True when the date table still has to be built from the raw CSVs.
Flag = True

if os.path.exists('../../data/data.hdf'):
    # HDFStore.keys() yields node paths such as '/date', so a plain
    # `'date' in store.keys()` test never matches. Membership on the store
    # itself accepts the key with or without the leading slash.
    # The store is opened read-only and closed before any later to_hdf call
    # touches the same file.
    with pd.HDFStore('../../data/data.hdf', mode='r') as store:
        Flag = 'date' not in store

if Flag:

    # Read only a few rows to learn the column names, then request float32
    # for every column and int64 for the Id index column.
    col_train = pd.read_csv('../../data/train_date.csv.zip', index_col=0, nrows=10)
    col_train = {k: np.float32 for k in col_train.columns}
    col_train['Id'] = np.int64

    col_test = pd.read_csv('../../data/test_date.csv.zip', index_col=0, nrows=10)
    col_test = {k: np.float32 for k in col_test.columns}
    col_test['Id'] = np.int64

    date_train = pd.read_csv('../../data/train_date.csv.zip', index_col=0, dtype=col_train)
    date_test = pd.read_csv('../../data/test_date.csv.zip', index_col=0, dtype=col_test)

    # Stack both splits under an outer 'train'/'test' index level.
    date = pd.concat((date_train, date_test), keys=['train', 'test'])

    # Release the per-split frames before writing the combined table.
    del date_train, date_test
    gc.collect()

    date.to_hdf('../../data/data.hdf', 'date', complib='blosc:lz4', complevel=9, format='t')
else:
    date = pd.read_hdf('../../data/data.hdf', 'date')

### Start and end stations and times

In [5]:
def find_first_nonnan(row):
    """Return the position of the first non-NaN entry in `row`.

    Returns -1 when `row` has no non-NaN entry (including an empty row).
    """
    valid_positions = np.asarray(~np.isnan(row)).nonzero()[0]
    return int(valid_positions[0]) if valid_positions.size else -1

def find_last_nonnan(row):
    """Return the position of the last non-NaN entry in `row`.

    Returns -1 when `row` has no non-NaN entry (including an empty row).
    """
    valid_positions = np.asarray(~np.isnan(row)).nonzero()[0]
    return int(valid_positions[-1]) if valid_positions.size else -1

In [6]:
# Build (or load from cache) a four-column summary per row: position of the
# first and last station with a timestamp, and the earliest and latest
# timestamp scaled by 100 and stored as an integer (-1 when no timestamp).
if not os.path.exists('time_station_start.hdf'):

    # Split each column name on '_' and group on the second component
    # (presumably the station id -- confirm against the CSV header).
    date.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) for c in date.columns])
    station_times = date.groupby(level=1, axis=1).min(axis=1)
    # NOTE(review): groupby sorts its keys, so column positions follow the
    # string order of the labels, not necessarily numeric station order.

    # Every summary is derived from the untouched per-station frame. The
    # previous version appended result columns to the frame it was still
    # scanning, so station_end always pointed at the freshly added
    # station_start column and time_end mixed station_start into the max.
    time_station_start = pd.DataFrame(index=station_times.index)
    time_station_start['station_start'] = station_times.apply(find_first_nonnan, axis=1)
    time_station_start['station_end'] = station_times.apply(find_last_nonnan, axis=1)

    tmp = np.around(station_times.min(axis=1)*100)
    tmp = tmp.apply(lambda e: -1 if np.isnan(e) else int(e))
    time_station_start['time_start'] = tmp

    tmp = np.around(station_times.max(axis=1)*100)
    tmp = tmp.apply(lambda e: -1 if np.isnan(e) else int(e))
    time_station_start['time_end'] = tmp

    # The wide per-station frame is no longer needed.
    del station_times

    time_station_start.to_hdf('time_station_start.hdf', 
        'time_station_start', format='table', complib='blosc:lz4', complevel=9)
else:
    time_station_start = pd.read_hdf('time_station_start.hdf', 'time_station_start')

### Join `time_station_start` with `x`

In [10]:
# Order the numeric table by the 'Id' index level ahead of the join.
x = x.sort_index(level='Id')

In [11]:
# Order the station/time summary by the 'Id' index level ahead of the join.
time_station_start = time_station_start.sort_index(level='Id')

In [31]:
# Attach the station/time summary columns to the numeric table and cache the
# result; load the cached table instead when it already exists.
if not os.path.exists('benchmark_8_numeric_features_1.hdf'):
    x = x.join(time_station_start)

    # Free the inputs that are no longer needed before sorting.
    del time_station_start
    gc.collect()

    del date
    gc.collect()

    x.sort_values(['station_start', 'time_start', 'Id'], inplace=True)

    # `complevel` was misspelled as `comlevel`, which is not a valid
    # to_hdf keyword and is not the compression-level option.
    x.to_hdf('benchmark_8_numeric_features_1.hdf', 'x', complib='blosc:lz4', complevel=9, format='t')

else:
    x = pd.read_hdf('benchmark_8_numeric_features_1.hdf', 'x')