# Notebook to create feature sets

In [171]:
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix

%matplotlib inline

# Custom modules
import const
import func

## Load data

In [172]:
#date_data = func.load_data_file(const.TRAIN_FILES[2], ftype='csv')

In [173]:
date_data = pd.read_csv(os.path.join(const.BASE_PATH, const.TRAIN_FILES[2] + '.csv'), nrows=10000, index_col=0)
print date_data.shape

(10000, 1156)


In [174]:
# Load feature look-up table to see how many numeric/categorical features there are
date_info = pd.read_csv(os.path.join(const.DATA_PATH, 'date_feat_lut_V2.csv'), index_col='name_dat')
date_info.head()

Unnamed: 0_level_0,line,station,feature_nr,feat_nr_dat,name_cat,name_num,station_V2
name_dat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L0_S0_D1,0,0,0,1.0,,L0_S0_F0,0.0
L0_S0_D3,0,0,2,3.0,,L0_S0_F2,0.0
L0_S0_D5,0,0,4,5.0,,L0_S0_F4,0.0
L0_S0_D7,0,0,6,7.0,,L0_S0_F6,0.0
L0_S0_D9,0,0,8,9.0,,L0_S0_F8,0.0


In [175]:
date_data = date_data.apply(lambda x: x-min(x), axis=1)

## Check whether there are negative time steps?

In [176]:
date_data.fillna(method='ffill').head()

Unnamed: 0_level_0,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,L0_S0_D19,...,L3_S50_D4246,L3_S50_D4248,L3_S50_D4250,L3_S50_D4252,L3_S50_D4254,L3_S51_D4255,L3_S51_D4257,L3_S51_D4259,L3_S51_D4261,L3_S51_D4263
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [177]:
agg = date_data.fillna(method='ffill', axis=1).diff(1, axis=1)

In [178]:
(agg<0).sum(0).sum()

0

Yes!!!

## Check timing per station

In [179]:
date_data = date_data.transpose()
date_data.head()

Id,4,6,7,9,11,13,14,16,18,23,...,19904,19905,19906,19909,19910,19912,19915,19917,19921,19923
L0_S0_D1,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,,,,0.0,0.0,,0.0,0.0,,0.0
L0_S0_D3,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,,,,0.0,0.0,,0.0,0.0,,0.0
L0_S0_D5,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,,,,0.0,0.0,,0.0,0.0,,0.0
L0_S0_D7,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,,,,0.0,0.0,,0.0,0.0,,0.0
L0_S0_D9,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,,,,0.0,0.0,,0.0,0.0,,0.0


In [180]:
date_data = date_data.merge(date_info[['line','station_V2', 'feature_nr']], how='left', left_index=True, right_index=True)

In [181]:
date_data.head()

Id,4,6,7,9,11,13,14,16,18,23,...,19909,19910,19912,19915,19917,19921,19923,line,station_V2,feature_nr
L0_S0_D1,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,0.0,,0.0,0.0,,0.0,0,0.0,0
L0_S0_D11,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,0.0,,0.0,0.0,,0.0,0,0.0,10
L0_S0_D13,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,0.0,,0.0,0.0,,0.0,0,0.0,12
L0_S0_D15,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,0.0,,0.0,0.0,,0.0,0,0.0,14
L0_S0_D17,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,0.0,,0.0,0.0,,0.0,0,0.0,16


In [183]:
date_data.sort_values('feature_nr').index.get_loc('L0_S1_D26')

12

In [169]:
date_data.columns.get_loc(18225)

9164

In [184]:
td = date_data.sort_values('feature_nr').fillna(method='ffill', axis=0).iloc[:, 9164].diff()
td[td<0]

Series([], Name: 18225, dtype: float64)

In [156]:
date_data.sort_values('station_V2').fillna(method='ffill', axis=0).iloc[0:20, [4219, -1]]

Id,8413,station_V2
L0_S0_D1,0.0,0.0
L0_S0_D11,0.0,0.0
L0_S0_D13,0.0,0.0
L0_S0_D15,0.0,0.0
L0_S0_D17,0.0,0.0
L0_S0_D19,0.0,0.0
L0_S0_D21,0.0,0.0
L0_S0_D23,0.0,0.0
L0_S0_D3,0.0,0.0
L0_S0_D5,0.0,0.0


In [125]:
date_data.sort_values('station_V2').index

Index([u'L0_S0_D1', u'L0_S0_D11', u'L0_S0_D13', u'L0_S0_D15', u'L0_S0_D17',
       u'L0_S0_D19', u'L0_S0_D21', u'L0_S0_D23', u'L0_S0_D3', u'L0_S0_D5',
       ...
       u'L3_S50_D4250', u'L3_S50_D4242', u'L3_S50_D4246', u'L3_S50_D4244',
       u'L3_S50_D4248', u'L3_S51_D4261', u'L3_S51_D4255', u'L3_S51_D4257',
       u'L3_S51_D4259', u'L3_S51_D4263'],
      dtype='object', length=1156)

In [150]:
(date_data.sort_values('station_V2').fillna(method='ffill', axis=0).groupby(['line','station_V2']).std()>0).sum().sum()

685

In [151]:
(date_data.sort_values('station_V2').sort_values('station_V2').fillna(method='ffill', axis=0).groupby(['line','station_V2']).std()<0).sum().sum()

0

In [165]:
tmp = date_data.sort_values('station_V2').fillna(method='ffill', axis=0).groupby(['line','station_V2']).apply(lambda x: x.max()-x.min())

In [166]:
(tmp>0).sum(0).sum()

685

In [167]:
(tmp>0).sum(0).sort_values(ascending=False)

Id
5196     1
18225    1
15652    1
15653    1
15654    1
15657    1
11091    1
1620     1
18196    1
8225     1
8219     1
15685    1
3998     1
18161    1
15691    1
1644     1
6303     1
11122    1
13216    1
8203     1
8198     1
3984     1
11137    1
1669     1
13198    1
3977     1
8180     1
13195    1
1695     1
13260    1
        ..
13004    0
13000    0
12998    0
12996    0
12995    0
12993    0
12992    0
12991    0
12989    0
12988    0
12985    0
13011    0
13012    0
13013    0
13027    0
13035    0
13034    0
13033    0
13031    0
13029    0
13028    0
13026    0
13014    0
13024    0
13023    0
13021    0
13018    0
13017    0
13016    0
4        0
dtype: int64