In [27]:
from functools import reduce

import pandas as pd
from wKit.ML.dprep import fillna_group_mean
from wKit.ML.feature_selection import has_value_thres

from src.constants import fn_features_dc, dir_data, features_for_total
from src.ftr_aggregate import filter_total, filter_year, load_joint_features

In [48]:
def load_features(lts, drop_na_thres=0.1, how='TOTAL', years=(2014,2015,2016,2017)):
    """
    Assemble one feature table, indexed like `lts`, from every file in `fn_features_dc`.

    For each (name, filename) entry the CSV is read from `dir_data` and then:
      1. filtered with `filter_year(..., years=years)`;
      2. filtered with `filter_total(..., how=how)` when `name` is in `features_for_total`
         (total count only vs. divided by type);
      3. summed per index value so each index value has exactly one row;
      4. left-joined onto `lts` so every row of `lts` is present, after which the
         'LTS' column is dropped;
      5. reduced to the columns flagged by `has_value_thres(..., thres=drop_na_thres)`
         (the population is the full `lts` index because of step 4);
      6. NA-filled with `fillna_group_mean` using the segment-type table, only for
         the sources named 'moving', 'parking' and 'crash'.

    The per-source tables are outer-joined on the index and every remaining NA is
    replaced with 0.

    Parameters
    ----------
    lts : DataFrame whose index defines the complete set of rows; must contain an
        'LTS' column (it is dropped after each merge).
    drop_na_thres : threshold forwarded to `has_value_thres`.
    how : mode forwarded to `filter_total`.
    years : years forwarded to `filter_year`.

    Returns
    -------
    DataFrame of joined features with no NA values. When `fn_features_dc` is
    empty, an empty DataFrame on `lts.index` is returned.
    """
    seg_type = pd.read_csv('data/seg_street_type.csv')
    fillna_by_group_names = ('moving', 'parking', 'crash')
    per_source = []
    for name, fn in fn_features_dc.items():
        # A single pre-formatted string prints the same text under the Python 2
        # print statement and the Python 3 print function.
        print('loading %s %s' % (name, fn))
        ftr = pd.read_csv(dir_data + fn, index_col=0)
        ftr = filter_year(ftr, years=years)

        if name in features_for_total:
            ftr = filter_total(ftr, name, how=how)

        # aggregate so that one row matches one segment
        ftr = ftr.groupby(level=0).sum()

        # left-join onto lts to get the complete list of segments
        ftr = lts.merge(ftr, left_index=True, right_index=True, how='left').drop('LTS', axis=1)

        # keep only the columns flagged True by has_value_thres
        keep_mask = has_value_thres(ftr, thres=drop_na_thres)
        keep_col = keep_mask[keep_mask].index.tolist()
        print('all columns #: %s columns pass NA thres: %s \n' % (ftr.shape[1], len(keep_col)))
        ftr = ftr[keep_col]

        # fillna by means of segment types, if applicable
        if name in fillna_by_group_names:
            ftr = fillna_group_mean(ftr, seg_type)

        per_source.append(ftr)

    # reduce() over an empty list raises TypeError; return an empty table instead.
    if not per_source:
        return pd.DataFrame(index=lts.index)

    # reduce is a builtin only on Python 2; it is imported from functools at the
    # top of the notebook so this line also works on Python 3.
    joint_features = reduce(
        lambda x, y: pd.merge(x, y, left_index=True, right_index=True, how='outer'),
        per_source,
    )

    print('fill the rest NA with 0')
    joint_features = joint_features.fillna(0)
    return joint_features

In [8]:
# Base table whose index defines the full set of rows used downstream; it is
# expected to carry an 'LTS' column, which later cells drop after merging.
lts = pd.read_csv('data/feature_lts_dc.csv', index_col=0)

In [49]:
# Build the joined feature table; how='NO_TOTAL' is forwarded to filter_total,
# while drop_na_thres and years keep their defaults.
ftrs = load_features(lts, how='NO_TOTAL')

loading seg_attr feature_seg_attribute_dc.csv
all columns #: 20 columns pass NA thres: 20 

loading crash feature_crash_dc.csv
all columns #: 49 columns pass NA thres: 11 

loading 311 feature_311_dc.csv
all columns #: 72 columns pass NA thres: 72 

loading parking feature_parking_violations_dc.csv
all columns #: 254 columns pass NA thres: 36 

loading v0 feature_vision0_dc.csv
all columns #: 20 columns pass NA thres: 20 

loading moving feature_mov_violations_dc.csv
all columns #: 349 columns pass NA thres: 8 

loading poi feature_poi_dc.csv
all columns #: 10 columns pass NA thres: 10 

loading bk_osm feature_bk_facs_dc.csv
all columns #: 13 columns pass NA thres: 13 

loading net_SaE feature_seg_as_edge_dc.csv
all columns #: 3 columns pass NA thres: 3 

loading net_SaN feature_seg_as_node_dc.csv
all columns #: 20 columns pass NA thres: 20 

loading crime feature_crime_dc.csv
all columns #: 12 columns pass NA thres: 11 

loading bk_opendc feature_bk_opendc_dc.csv
all columns #: 6 colu

In [51]:
# (rows, columns) of the joined feature table
ftrs.shape

(13293, 230)

In [9]:
# Shape check: outer-join lts with the first frame in dfs on the index.
# NOTE(review): dfs is not defined in any visible cell - confirm it still exists
# after Restart Kernel -> Run All.
(
    lts
    .merge(dfs[0], left_index=True, right_index=True, how='outer')
    .shape
)

(13293, 21)

In [11]:
# Settings for the joint-feature load: aggregation mode and the years to keep
# (2014 through 2017 inclusive).
total_or_not = 'TOTAL'
years = tuple(range(2014, 2018))

In [21]:
# load_joint_features is imported in the notebook's top imports cell rather than
# mid-notebook, so all dependencies are visible in one place.
# It returns the feature table and a second value bound to col2code.
ftr, col2code = load_joint_features(years=years, how=total_or_not, na=None)

# Left-join onto lts so every row of lts is present, then drop the 'LTS' column
# so that only feature columns remain.
ftr = lts.merge(ftr, left_index=True, right_index=True, how='left').drop('LTS', axis=1)
ftr.shape

(13293, 65)

In [22]:
# Number of missing values per column, largest first
(
    ftr
    .isnull()
    .sum()
    .sort_values(ascending=False)
)

v0_total                       8542
moving_total                   6096
crash_total                    3812
parking_total                  2812
311_total_not_parking_meter     584
311_total                       584
crime_total                     432
ud_bridge_SgAsEg                  0
d_btw_cntr_SgAsEg                 0
STREETTYPE_ST                     0
SEGMENTTYPE_3                     0
SEGMENTTYPE_2                     0
SEGMENTTYPE_1                     0
ud_btw_cntr_SgAsEg                0
STREETTYPE_WAY                    0
poi_total                         0
d_auth_score_SgAsNd               0
STREETTYPE_TER                    0
bikable_yes                       0
STREETTYPE_RD                     0
d_clo_cntr_SgAsNd                 0
STREETTYPE_PL                     0
STREETTYPE_PKWY                   0
STREETTYPE_OTHER                  0
STREETTYPE_LN                     0
STREETTYPE_DR                     0
STREETTYPE_CT                     0
STREETTYPE_CRES             