# EDA Features per station

In [209]:
import os
import re

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Custom modules
import c

## Understanding the data
- The feature names of numerical data correspond to line and station numbers
- Observations:
    - Per line different amount of stations -> it might make sense to develop a model per line

In [142]:
def get_columns_csv(csv_file):
    ''' gets data columns for csv file identifier '''
    cols = list(pd.read_csv(os.path.join(c.BASE_PATH, csv_file + '.csv'), 
                              nrows=1,
                              index_col=0,
                             dtype=np.float32).columns)
    #print cols
    
    if 'Response' in cols:
        cols.remove('Response')
    
    
    return cols

def get_station_info(csv_file):
    ''' extracts line and station info from list of columns '''
    
    cols = get_columns_csv(csv_file)
    
    station_info = [[int(i) for i in re.findall(r'\d+', col_name)] for col_name in cols]
    station_info = pd.DataFrame(station_info, columns=['line', 'station', 'feature_nr'])
    
    return station_info

In [134]:
c.TRAIN_FILES

['train_numeric', 'train_categorical_to_num', 'train_date']

In [179]:
[num_info, cat_info, date_info] = [get_station_info(f) for f in c.TRAIN_FILES]

# Date features refer to numeric/categorical features
date_info['ref_feat_nr'] = date_info['feature_nr'] - 1

### Analyze line info
- Every line has different amount of features
- Every station has different amount of features
- Number of date features always equal or more than numeric features per station -> how can this be more??

In [139]:
# Number of stations differs per line, so lines are different
num_info.line.value_counts()

1    513
3    245
0    168
2     42
Name: line, dtype: int64

In [140]:
cat_info.line.value_counts()

1    1227
3     431
0     323
2     159
Name: line, dtype: int64

In [141]:
date_info.line.value_counts()

1    621
3    273
0    184
2     78
Name: line, dtype: int64

In [219]:
station_info = pd.concat([num_info.groupby(['line','station']).feature_nr.agg({'n_feat_num':'count'}),
                        cat_info.groupby(['line','station']).feature_nr.agg({'n_feat_cat':'count'}),
                        date_info.groupby(['line','station']).feature_nr.agg({'n_feat_date':'count'})],
                       axis=1).fillna(0)
                        

In [220]:
station_info

Unnamed: 0_level_0,Unnamed: 1_level_0,n_feat_num,n_feat_cat,n_feat_date
line,station,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,12.0,0.0,12
0,1,2.0,4.0,2
0,2,9.0,18.0,9
0,3,9.0,18.0,9
0,4,2.0,6.0,2
0,5,2.0,0.0,2
0,6,3.0,10.0,5
0,7,3.0,0.0,5
0,8,3.0,0.0,4
0,9,12.0,39.0,13


### Why are there sometimes more date features than numeric features?

Take as example line number 0 and station number 6: 

In [299]:
line_nr = 0
stat_nr = 0
print 'Numeric features'
print num_info[ (num_info.line==line_nr) & (num_info.station==stat_nr)]
print 'Categorical features'
print cat_info[ (cat_info.line==line_nr) & (cat_info.station==stat_nr)]
print 'Date features'
print date_info[ (date_info.line==line_nr) & (date_info.station==stat_nr)]

Numeric features
    line  station  feature_nr
0      0        0           0
1      0        0           2
2      0        0           4
3      0        0           6
4      0        0           8
5      0        0          10
6      0        0          12
7      0        0          14
8      0        0          16
9      0        0          18
10     0        0          20
11     0        0          22
Categorical features
Empty DataFrame
Columns: [line, station, feature_nr]
Index: []
Date features
    line  station  feature_nr  ref_feat_nr
0      0        0           1            0
1      0        0           3            2
2      0        0           5            4
3      0        0           7            6
4      0        0           9            8
5      0        0          11           10
6      0        0          13           12
7      0        0          15           14
8      0        0          17           16
9      0        0          19           18
10     0        0     

In [383]:
line_nr = 0
stat_nr = 6
print 'Numeric features'
print num_info[ (num_info.line==line_nr) & (num_info.station==stat_nr)]
print 'Categorical features'
print cat_info[ (cat_info.line==line_nr) & (cat_info.station==stat_nr)]
print 'Date features'
print date_info[ (date_info.line==line_nr) & (date_info.station==stat_nr)]

Numeric features
    line  station  feature_nr
36     0        6         118
37     0        6         122
38     0        6         132
Categorical features
    line  station  feature_nr
46     0        6         119
47     0        6         121
48     0        6         123
49     0        6         125
50     0        6         126
51     0        6         128
52     0        6         129
53     0        6         131
54     0        6         133
55     0        6         135
Date features
    line  station  feature_nr  ref_feat_nr
36     0        6         120          119
37     0        6         124          123
38     0        6         127          126
39     0        6         130          129
40     0        6         134          133


In [190]:
print('Numeric: {} of {} features do not have a timestamp'. \
          format(len(set(num_info.feature_nr.values) - set(date_info.ref_feat_nr.values)),
                 num_info.shape[0]))
print('Categorical: {} of {} features do not have a timestamp'. \
          format(len(set(cat_info.feature_nr.values) - set(date_info.ref_feat_nr.values)),
                 cat_info.shape[0]))
print('Date: {} of {} features do not refer to numeric/categorical features'. \
          format(len(set(date_info.ref_feat_nr.values) - set(cat_info.feature_nr.values) - set(num_info.feature_nr.values)),
                 cat_info.shape[0]))

Numeric: 602 of 968 features do not have a timestamp
Categorical: 1381 of 2140 features do not have a timestamp
Date: 31 of 2140 features do not refer to numeric/categorical features


### Calculate per station amount of features with timestamp

In [377]:
def timestap_coverage_per_station(x, date_info, num_info, cat_info):
    line_nr = x.name[0]
    station_nr = x.name[1]
    
    #print line_nr, station_nr
    feat_with_timestamp = date_info[ (date_info.line==line_nr) & (date_info.station==station_nr)].ref_feat_nr.values
    
    #print feat_with_timestamp
    num_with_timestamp = ((num_info.line==line_nr) & \
                          (num_info.station==station_nr) & \
                          (num_info.feature_nr.isin(feat_with_timestamp)))
    cat_with_timestamp = ((cat_info.line==line_nr) & \
                          (cat_info.station==station_nr) & \
                          (cat_info.feature_nr.isin(feat_with_timestamp)))
    t_without_cat_num = len(set(feat_with_timestamp) - \
                            set(num_info[num_with_timestamp].feature_nr) - \
                            set(cat_info[cat_with_timestamp].feature_nr))
    
    #print x['n_feat_num'],x['n_feat_cat']
    #print num_with_timestamp,cat_with_timestamp
    
    
    r_num = float(num_with_timestamp.sum()) / x['n_feat_num'] if x['n_feat_num']>0 else 0
    r_cat = float(cat_with_timestamp.sum()) / x['n_feat_cat'] if x['n_feat_cat']>0 else 0
    r_no = float(t_without_cat_num) 
    
    #print r_num, r_cat

    return pd.Series({'feat_cat_w_t': r_cat, 'feat_num_w_t': r_num, 'feat_t_wo_cn': r_no}).fillna(0).round(2)

In [378]:
station_info[['feat_cat_w_t','feat_num_w_t','feat_t_wo_ct']] = \
    station_info.apply(lambda x: timestap_coverage_per_station(x, date_info, num_info, cat_info), axis=1)

In [379]:
station_info

Unnamed: 0_level_0,Unnamed: 1_level_0,n_feat_num,n_feat_cat,n_feat_date,test,feat_num_w_t,feat_cat_w_t,feat_t_wo_ct
line,station,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,12.0,0.0,12,,1.0,0.0,0.0
0,1,2.0,4.0,2,,0.0,0.5,0.0
0,2,9.0,18.0,9,,0.0,0.5,0.0
0,3,9.0,18.0,9,,0.0,0.5,0.0
0,4,2.0,6.0,2,,0.0,0.33,0.0
0,5,2.0,0.0,2,,1.0,0.0,0.0
0,6,3.0,10.0,5,0.0,0.0,0.5,0.0
0,7,3.0,0.0,5,,1.0,0.0,2.0
0,8,3.0,0.0,4,,1.0,0.0,1.0
0,9,12.0,39.0,13,,0.0,0.33,0.0


In [381]:
station_info['feat_num_w_t'].value_counts()

1.00    25
0.00    23
0.31     1
0.23     1
0.50     1
0.25     1
Name: feat_num_w_t, dtype: int64

In [393]:
station_info.loc[station_info['n_feat_cat']>0,'feat_cat_w_t'].value_counts().astype(float)/52

0.33    0.269231
0.50    0.211538
0.23    0.057692
0.00    0.057692
0.40    0.019231
0.16    0.019231
0.35    0.019231
Name: feat_cat_w_t, dtype: float64

In [391]:
18./52

0.34615384615384615

Observations numeric features:
 - 48% of stations have all timestamps of numeric features
 - 44% of stations have zero timestamps of numeric features
 - 8% of stations has between 0.23 and 0.5 timestamps

Conclusions numeric features:
 - Group numeric features into with and without timestamps

Observations categorical features:
 - 35% of stations have no categorical features
 - 21% of stations with categorical features have 50% timestamps
 - 5% of stations with categorical features have 0% timestamps

Conclusions categorical features:
 - Group numeric features into with and without timestamps

### Are the feature numbers consistent?

There are no common feature_nr in the three data sets...so that is consistent.

In [194]:
print(len(set(date_info.feature_nr.values) & set(cat_info.feature_nr.values)))
print(len(set(date_info.feature_nr.values) & set(num_info.feature_nr.values)))
print(len(set(num_info.feature_nr.values) & set(cat_info.feature_nr.values)))

0
0
0


The combined feature numbers increase monotomically by 1...also consistent!

In [207]:
all_feat_nr = (set(date_info.feature_nr.values) | set(cat_info.feature_nr.values) | set(num_info.feature_nr.values))

In [208]:
(max(all_feat_nr) - min(all_feat_nr) + 1) / len(all_feat_nr)

1