# Notebook to create feature sets

In [1]:
import os
import re
import pickle
import time
import datetime

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix

%matplotlib inline

# Custom modules
import const
import func

## Load data

In [2]:
#date_data = func.load_data_file(const.TRAIN_FILES[2], ftype='csv')

In [3]:
date_data = pd.read_csv(os.path.join(const.BASE_PATH, const.TRAIN_FILES[2] + '.csv'), nrows=200000, index_col=0)
print date_data.shape

(200000, 1156)


In [4]:
# Load feature look-up table to see how many numeric/categorical features there are
date_info = pd.read_csv(os.path.join(const.DATA_PATH, 'date_feat_lut_V2.csv'), index_col='name_dat')
date_info.head()

Unnamed: 0_level_0,line,station,feature_nr,feat_nr_dat,name_cat,name_num,station_V2
name_dat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L0_S0_D1,0,0,0,1.0,,L0_S0_F0,0.0
L0_S0_D3,0,0,2,3.0,,L0_S0_F2,0.0
L0_S0_D5,0,0,4,5.0,,L0_S0_F4,0.0
L0_S0_D7,0,0,6,7.0,,L0_S0_F6,0.0
L0_S0_D9,0,0,8,9.0,,L0_S0_F8,0.0


In [5]:
date_data = date_data.apply(lambda x: x-min(x), axis=1)

## Check whether there are negative time steps?

In [6]:
date_data.head()

Unnamed: 0_level_0,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,L0_S0_D19,...,L3_S50_D4246,L3_S50_D4248,L3_S50_D4250,L3_S50_D4252,L3_S50_D4254,L3_S51_D4255,L3_S51_D4257,L3_S51_D4259,L3_S51_D4261,L3_S51_D4263
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [7]:
# Any negative time steps?
(date_data.diff(1, axis=1)<0).sum().sum()

0

Does not look like it...

## Check timing per station

In [8]:
date_data = date_data.transpose()
date_data.head()

Id,4,6,7,9,11,13,14,16,18,23,...,399884,399886,399888,399890,399892,399894,399896,399897,399898,399899
L0_S0_D1,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,,0.0,0.0,0.0,,,0.0,0.0,
L0_S0_D3,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,,0.0,0.0,0.0,,,0.0,0.0,
L0_S0_D5,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,,0.0,0.0,0.0,,,0.0,0.0,
L0_S0_D7,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,,0.0,0.0,0.0,,,0.0,0.0,
L0_S0_D9,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,,0.0,0.0,0.0,,,0.0,0.0,


In [9]:
date_data = date_data.merge(date_info[['line','station_V2']], how='left', left_index=True, right_index=True)

In [10]:
date_data.head()

Id,4,6,7,9,11,13,14,16,18,23,...,399888,399890,399892,399894,399896,399897,399898,399899,line,station_V2
L0_S0_D1,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,0.0,0.0,,,0.0,0.0,,0,0.0
L0_S0_D11,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,0.0,0.0,,,0.0,0.0,,0,0.0
L0_S0_D13,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,0.0,0.0,,,0.0,0.0,,0,0.0
L0_S0_D15,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,0.0,0.0,,,0.0,0.0,,0,0.0
L0_S0_D17,0.0,,0.0,0.0,0.0,0.0,,,0.0,,...,0.0,0.0,0.0,,,0.0,0.0,,0,0.0


In [11]:
#date_data.drop(435812, axis=0, inplace=True)

In [12]:
tmp = date_data.groupby(['line','station_V2']).apply(lambda x: x.max()-x.min())

In [13]:
tmp.mean(1)[tmp.mean(1)>0]

Series([], dtype: float64)

In [14]:
date_data[date_data.station_V2.isin([36,37])]

Id,4,6,7,9,11,13,14,16,18,23,...,399888,399890,399892,399894,399896,399897,399898,399899,line,station_V2
L3_S36_D3919,,,,4.96,3.38,8.07,,,0.44,,...,,,0.93,,,,,,3,36.0
L3_S36_D3921,,,,4.96,3.38,8.07,,,0.44,,...,,,0.93,,,,,,3,36.0
L3_S36_D3923,,,,4.96,3.38,8.07,,,0.44,,...,,,0.93,,,,,,3,36.0
L3_S36_D3925,,,,4.96,3.38,8.07,,,0.44,,...,,,0.93,,,,,,3,36.0
L3_S36_D3928,,,,4.96,3.38,8.07,,,0.44,,...,,,0.93,,,,,,3,36.0
L3_S36_D3932,,,,4.96,3.38,8.07,,,0.44,,...,,,0.93,,,,,,3,36.0
L3_S36_D3936,,,,4.96,3.38,8.07,,,0.44,,...,,,0.93,,,,,,3,36.0
L3_S36_D3940,,,,4.96,3.38,8.07,,,0.44,,...,,,0.93,,,,,,3,36.0
L3_S37_D3942,5.05,,5.72,4.96,3.38,8.07,,,0.44,,...,3.19,1.08,0.93,,,1.71,1.26,,3,37.0
L3_S37_D3943,5.05,,5.72,4.96,3.38,8.07,,,0.44,,...,3.19,1.08,0.93,,,1.71,1.26,,3,37.0
