#### Start and end stations and times

Save numeric and date data to `'../../data/data.hdf'` with `np.float32` precision.

Create the following features for each part
- ~~Start station~~
- ~~End station~~
- ~~Start time in each station (stored as integers: 6 min is 1 instead of 0.01)~~
- ~~End time in each station~~
- ~~Week number of start time~~
- ~~Week number of end time~~

DataFrames are always sorted by `Id`; the index level names are `set` and `Id`.

#### Hash of all numeric features
- ~~Label encode of hash values~~
- ~~Counts of same hash values~~

In [2]:
import sys
sys.path.insert(0, '../../bosch_helper')
from bosch_helper import *

%matplotlib inline

  from collections import Sequence


### Load original numeric features

NOTE: `x` includes the `Response` column.

In [3]:
# Peek at the first rows only to learn the column names, then request
# float32 for every data column of the full read.
col_train = dict.fromkeys(
    pd.read_csv('../../data/train_numeric.csv.zip', index_col=0, nrows=10).columns,
    np.float32)
# With index_col=0 the index column is not listed in `.columns`, so its
# dtype is registered separately.
col_train.update(Id=np.int64)

col_test = dict.fromkeys(
    pd.read_csv('../../data/test_numeric.csv.zip', index_col=0, nrows=10).columns,
    np.float32)
col_test.update(Id=np.int64)

# Full reads with the dtype maps built above.
train = pd.read_csv('../../data/train_numeric.csv.zip', index_col=0, dtype=col_train)
test = pd.read_csv('../../data/test_numeric.csv.zip', index_col=0, dtype=col_test)

# Stack both sets; `keys` adds an outer index level that tells them apart.
x = pd.concat([train, test], keys=['train', 'test'])

In [6]:
# Name the two index levels: the train/test key and the row identifier.
x.index = x.index.set_names(['set', 'Id'])

In [8]:
# Order rows by the second index level (`Id`), interleaving train and test.
x = x.sort_index(level=1)

In [10]:
# Persist the combined numeric frame under the 'numeric' key.
# `key` is passed by keyword because positional arguments after the path are
# deprecated in recent pandas; 'table' is the long form of the 't' alias and
# matches the other to_hdf calls in this notebook.
x.to_hdf('../../data/data.hdf', key='numeric', format='table',
         complib='blosc:lz4', complevel=9)

In [13]:
# The per-set frames are no longer needed once `x` holds the combined data;
# drop the names and ask the collector to reclaim memory.
del train
del test
gc.collect()

53

#### Load date data

In [14]:
# Peek at the first rows only to learn the column names, then request
# float32 for every data column of the full read.
col_train = dict.fromkeys(
    pd.read_csv('../../data/train_date.csv.zip', index_col=0, nrows=10).columns,
    np.float32)
# With index_col=0 the index column is not listed in `.columns`, so its
# dtype is registered separately.
col_train.update(Id=np.int64)

col_test = dict.fromkeys(
    pd.read_csv('../../data/test_date.csv.zip', index_col=0, nrows=10).columns,
    np.float32)
col_test.update(Id=np.int64)

# Full reads with the dtype maps built above.
date_train = pd.read_csv('../../data/train_date.csv.zip', index_col=0, dtype=col_train)
date_test = pd.read_csv('../../data/test_date.csv.zip', index_col=0, dtype=col_test)

In [15]:
# Stack both date sets; `keys` adds an outer index level that tells them apart.
date = pd.concat([date_train, date_test], keys=['train', 'test'])

In [17]:
# Name the two index levels: the train/test key and the row identifier.
date.index = date.index.set_names(['set', 'Id'])

In [19]:
# Order rows by the second index level (`Id`), interleaving train and test.
date = date.sort_index(level=1)

In [25]:
# Persist the combined date frame under the 'date' key.
# `key` is passed by keyword because positional arguments after the path are
# deprecated in recent pandas; 'table' is the long form of the 't' alias and
# matches the other to_hdf calls in this notebook.
date.to_hdf('../../data/data.hdf', key='date', format='table',
            complib='blosc:lz4', complevel=9)

In [26]:
# The per-set frames are no longer needed once `date` holds the combined
# data; drop the names and ask the collector to reclaim memory.
del date_train
del date_test
gc.collect()

344

In [None]:
# Split every underscore-separated column name into its parts and use the
# parts as levels of a column MultiIndex, so columns can later be grouped
# by a single level.
split_names = [tuple(name.split('_')) for name in date.columns]
date.columns = pd.MultiIndex.from_tuples(split_names)

In [27]:
def find_first_nonnan(row):
    """Return the position of the first non-NaN entry of `row`.

    `row` is a one-dimensional numeric sequence. The result is a plain
    `int` position, or -1 when every entry is NaN (or `row` is empty).
    """
    valid_positions = np.where(~np.isnan(row))[0]
    return int(valid_positions[0]) if len(valid_positions) else -1

def find_last_nonnan(row):
    """Return the position of the last non-NaN entry of `row`.

    `row` is a one-dimensional numeric sequence. The result is a plain
    `int` position, or -1 when every entry is NaN (or `row` is empty).
    """
    valid_positions = np.where(~np.isnan(row))[0]
    return int(valid_positions[-1]) if len(valid_positions) else -1

##### Start time in each station

NOTE: 1 time unit equals 6 minutes

In [87]:
# Earliest timestamp per station: columns are grouped by level 1 of the
# column MultiIndex and each group is reduced with min.
# NOTE(review): groupby sorts group keys by default, so string labels would
# be ordered lexicographically (e.g. 'S10' before 'S2') — confirm this is the
# column order the positional station_start/station_end features should use.
time_station_start = date.groupby(level=1, axis=1).min(axis=1)
# Scale by 100 and round so that one time unit corresponds to 6 minutes.
time_station_start = np.around(time_station_start*100)

##### Start and end stations

In [88]:
# Compute both positions from the per-station columns *before* adding either
# result column. Assigning 'station_start' first would make it the last
# non-NaN entry of every row (it is never NaN), so 'station_end' would always
# equal the position of that helper column instead of a real station.
# Both values are column positions; -1 marks rows without any timestamp.
first_station = time_station_start.apply(find_first_nonnan, axis=1)
last_station = time_station_start.apply(find_last_nonnan, axis=1)
time_station_start['station_start'] = first_station
time_station_start['station_end'] = last_station

##### Week number of start time

In [89]:
# Earliest rounded timestamp of each row over the per-station columns; the
# last two columns (station_start, station_end) are left out.
tmp = time_station_start.iloc[:, :-2].min(axis=1).round()
# Rows without any timestamp become -1; everything else is cast to integer.
tmp = tmp.fillna(-1).astype(np.int64)

In [90]:
# One week is 1680 time units: after the x100 scaling a unit is 6 minutes,
# and 1680 * 6 min = 7 days. Rows without a timestamp carry -1 and therefore
# end up in week -1.
# `np.int` was only an alias of the builtin `int` and has been removed from
# NumPy, so the builtin is used directly.
time_station_start['week_number_start'] = np.floor(tmp / 1680).astype(int)

In [91]:
# Prefix every per-station column; the last three columns are the derived
# features and keep their own names.
station_labels = time_station_start.columns[:-3]
col_names = ['time_start_' + label for label in station_labels]
col_names += ['station_start', 'station_end', 'week_number_start']

In [92]:
# Apply the prefixed names built in the previous cell; the list must contain
# exactly one entry per existing column.
time_station_start.columns = col_names

##### End time in each station

In [93]:
# Latest timestamp per station: columns are grouped by level 1 of the column
# MultiIndex and each group is reduced with max.
# NOTE(review): groupby sorts group keys by default, so string labels would
# be ordered lexicographically (e.g. 'S10' before 'S2') — confirm the
# resulting column order is the intended one.
time_station_end = date.groupby(level=1, axis=1).max(axis=1)
# Scale by 100 and round so that one time unit corresponds to 6 minutes.
time_station_end = np.around(time_station_end*100)

In [94]:
# Prefix every per-station column so the names cannot clash with the
# 'time_start_*' columns when both frames are joined.
col_names = ['time_end_' + station for station in time_station_end.columns]
time_station_end.columns = col_names

In [95]:
# Latest rounded timestamp of each row across all per-station columns.
tmp = time_station_end.max(axis=1).round()
# Rows without any timestamp become -1; everything else is cast to integer.
tmp = tmp.fillna(-1).astype(np.int64)

In [96]:
# One week is 1680 time units: after the x100 scaling a unit is 6 minutes,
# and 1680 * 6 min = 7 days. Rows without a timestamp carry -1 and therefore
# end up in week -1.
# `np.int` was only an alias of the builtin `int` and has been removed from
# NumPy, so the builtin is used directly.
time_station_end['week_number_end'] = np.floor(tmp / 1680).astype(int)

In [97]:
# Put start and end features side by side, aligned on the shared row index
# (left join, which is also the default of `join`).
time_station = time_station_start.join(time_station_end, how='left')

In [104]:
# Store the station/time features in their own HDF file in the working
# directory. `key` is passed by keyword because positional arguments after
# the path are deprecated in recent pandas.
time_station.to_hdf('time_station.hdf', key='time_station', format='table',
                    complib='blosc:lz4', complevel=9)

In [106]:
# Reload the features that were just written, from the same file and key.
time_station = pd.read_hdf('time_station.hdf', key='time_station')

#### Hash value of numeric features

In [108]:
# Keep only the feature columns: the target must not contribute to the
# per-row hash computed next.
numeric = x.drop(columns=['Response'])

In [113]:
# Hash the values of every row in one vectorized pass; the result is a Series
# with one unsigned 64-bit hash per row, aligned with `numeric`'s index.
# This replaces a chunked `hash(tuple(row))` loop, which had three problems:
#   * the slice bounds `(n-1)*10**5 : n*10**5` over `range(25)` began with an
#     empty chunk and silently ignored any rows past 2.4 million;
#   * builtin hashing of NaN is identity-based on Python 3.10+, so identical
#     rows that contain NaN are not guaranteed to get the same hash;
#   * a Python-level apply over every row is very slow.
# `index=False` hashes only the column values, not the (set, Id) index, so
# rows with equal feature values share a hash.
hash_numeric = pd.util.hash_pandas_object(numeric, index=False)

In [115]:
# Wrap the hashes in a DataFrame so further columns can be attached; the raw
# hash column is labelled 0 (it is dropped under that label further down).
hash_numeric = pd.DataFrame(hash_numeric)

In [120]:
# Map each distinct row hash to a compact integer label.
# The single-column frame is flattened first so the encoder receives a 1-D
# array of labels rather than an (n, 1) array that it would have to coerce.
# NOTE(review): LabelEncoder is presumably scikit-learn's, re-exported by
# `bosch_helper` — confirm.
le = LabelEncoder()
hash_numeric['hash_encoded'] = le.fit_transform(hash_numeric.values.ravel())

In [123]:
# The raw hash (column 0) is no longer needed once its encoded label exists.
hash_numeric = hash_numeric.drop(columns=[0])

In [124]:
# Number of rows sharing each encoded hash value.
rows_by_hash = hash_numeric.groupby('hash_encoded')
hash_number_counts = rows_by_hash['hash_encoded'].count()

In [129]:
# Give the counts the name they will carry as a column after the join.
hash_number_counts = hash_number_counts.rename('count_hash')

In [130]:
# Attach to every row the count of its hash label, looked up through the
# 'hash_encoded' column (left join, which is also the default of `join`).
hash_numeric = hash_numeric.join(hash_number_counts, on='hash_encoded', how='left')

In [138]:
# Store the hash features in their own HDF file in the working directory.
# `key` is passed by keyword because positional arguments after the path are
# deprecated in recent pandas.
hash_numeric.to_hdf('hash_numeric.hdf', key='hash_numeric', format='table',
                    complib='blosc:lz4', complevel=9)

#### Count encoding of each value

#### Z-score per week