In [40]:
from sklearn import linear_model, svm, tree, ensemble
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
import pandas as pd
import geopandas as gp

In [2]:
def fit_pred(model, train_dummies_x, train_y, test_dummies_x, test_y, regression=True):
    def round_reg_pred(pred):
        pred_round = pred.round()
        pred_round[pred_round<1] = 1
        pred_round[pred_round>4] = 4
        return pred_round
    
    model.fit(train_dummies_x, train_y)

    train_pred = model.predict(train_dummies_x)
    test_pred = model.predict(test_dummies_x)
    train_pred_round = round_reg_pred(train_pred) if regression else train_pred
    test_pred_round = round_reg_pred(test_pred) if regression else test_pred
    
    if regression:
        mse_train = mean_squared_error(train_y, train_pred)
        mse_test = mean_squared_error(test_y, test_pred)
        var_train = model.score(train_dummies_x, train_y)
        var_test = model.score(test_dummies_x, test_y)
    
    acc_train = accuracy_score(train_y, train_pred_round)
    acc_test = accuracy_score(test_y, test_pred_round)
    f1_train = f1_score(train_y, train_pred_round,average='weighted')
    f1_test = f1_score(test_y, test_pred_round,average='weighted')

    result = {
        'f1_train': f1_train,
        'f1_test': f1_test,
        'acc_train': acc_train,
        'acc_test': acc_test,
    }
    if regression:
        result.update({
            'mse_train': mse_train,
            'mse_test': mse_test,
            'var_train': var_train,
            'var_test': var_test,
        })
    return result

In [79]:
def get_models(no_svm=False):
    models = [
        ["linear", linear_model.LinearRegression(), True],
        ["ridge", linear_model.Ridge (alpha = .5), True],
        ["lasso", linear_model.Lasso(alpha=.5), True],
        ["DTReg", tree.DecisionTreeRegressor(max_depth=10), True],
        ["RFReg", ensemble.RandomForestRegressor(), True],
        ["AdaReg", ensemble.AdaBoostRegressor(), True],
        ["BagReg", ensemble.BaggingRegressor(), True],
        ["GraReg", ensemble.GradientBoostingRegressor(), True],
        ["logistics", linear_model.LogisticRegression(), False],
        ["DTClsfr", tree.DecisionTreeClassifier(), False],
    ]
    if not no_svm:
        models+=[["SVR", svm.SVR(), True],
                 ["SVM", svm.SVC(), False]]
    return models

In [4]:
def stats2ftr(stats, how='TOTAL', ftr_name=None):
    ftr = stats.drop(['MONTH','YEAR'], axis=1).groupby('index_seg').sum()
    if how=='TOTAL':
        assert ftr_name is not None, 'please provide the feature name for the aggregated feature'
        ftr = ftr.sum(axis=1).to_frame()
        ftr.columns = [ftr_name]
    ftr = ftr.reset_index()
    return ftr

In [132]:
def get_train_test_dummies(train_x, test_x, fillna_dict):
    train_fillna_x = train_x.fillna(fillna_mode_or_mean)
    test_fillna_x = test_x.fillna(fillna_mode_or_mean)
    train_dummies_x = pd.get_dummies(train_fillna_x)
    test_dummies_x = pd.get_dummies(test_fillna_x)
    for x in list(set(train_dummies_x.columns)-set(test_dummies_x.columns)):
        test_dummies_x[x] = 0
    test_dummies_x = test_dummies_x[train_dummies_x.columns]
    return train_dummies_x, test_dummies_x

# load raw data

In [81]:
from src.constants import fn_segments_dc, fn_feature_bk_facs_dc, fn_feature_poi_dc, fn_feature_seg_as_edge_dc, fn_feature_seg_as_node_dc
fn_feature_lts_dc = 'feature_lts_dc.csv'
fn_feature_crime_dc = 'feature_crime_incidences_dc.csv'
fn_feature_mov_dc = 'feature_mov_violations_dc.csv'
fn_feature_parking_dc = 'feature_parking_violations_dc.csv'
data_dir = 'data/'

In [41]:
segs = gp.read_file(data_dir+fn_segments_dc)

ftr_segs_col = ['DIRECTIONALITY', 'STREETTYPE', 'SHAPE_Length']
ftr_segs = segs[ftr_segs_col].reset_index()
ftr_segs.columns = ['index_seg']+list(ftr_segs.columns[1:])
ftr_segs.DIRECTIONALITY = ftr_segs.DIRECTIONALITY.apply(lambda x: 'Bi-direction' if x==2 else 'one-way')


In [6]:
ftr_lts = pd.read_csv(data_dir+fn_feature_lts_dc)
ftr_bk = pd.read_csv(data_dir+fn_feature_bk_facs_dc, index_col=0)
ftr_poi = pd.read_csv(data_dir+fn_feature_poi_dc, index_col=0)
ftr_edge = pd.read_csv(data_dir+fn_feature_seg_as_edge_dc)
ftr_node = pd.read_csv(data_dir+fn_feature_seg_as_node_dc)

In [7]:
crime_stats = pd.read_csv(data_dir+fn_feature_crime_dc)
mov_stats = pd.read_csv(data_dir+fn_feature_mov_dc)
parking_stats = pd.read_csv(data_dir+fn_feature_parking_dc)

ftr_mov = stats2ftr(mov_stats, how='NOT_TOTAL')
ftr_crime = stats2ftr(crime_stats, how='NOT_TOTAL')
ftr_parking = stats2ftr(parking_stats, how='NOT_TOTAL')

ftr_mov_total = stats2ftr(mov_stats, ftr_name='moving_violations_cnt')
ftr_crime_total = stats2ftr(crime_stats, ftr_name='crime_incidents_cnt')
ftr_parking_total = stats2ftr(parking_stats, ftr_name='parking_violations_cnt')

In [133]:
cut_off = 100
dfs = [ftr_lts, ftr_segs, ftr_bk, ftr_poi, ftr_edge, ftr_node, ftr_mov, ftr_crime, ftr_parking]
df = reduce(lambda left,right: pd.merge(left,right, how='left'), dfs)
df = df[~df.LTS.isnull()&(df.LTS!=10)]
df.drop('index_seg', axis=1, inplace=True)
non_na_count = df.count()
keep_col = non_na_count[non_na_count>cut_off].index
df = df[keep_col]
df.shape

(13293, 184)

# get train and test

In [134]:
train = df.sample(frac=0.8, random_state=1)
test = df[~df.index.isin(train.index)]
train.shape, test.shape

((10634, 184), (2659, 184))

In [135]:
train_y = train.LTS.values
train_x = train.drop('LTS', axis=1)
test_y = test.LTS.values
test_x = test.drop('LTS', axis=1)

In [136]:
train_x.shape, test_x.shape

((10634, 183), (2659, 183))

In [137]:
# fillna_mode_or_mean = train_x[['cycle_lane', 'cycle_way', 'side_walk', 'bikable']].mode().T.to_dict()[0]
fillna_mode_or_mean = train_x[['cycle_lane', 'side_walk', 'bikable', 'DIRECTIONALITY', 'STREETTYPE']].mode().T.to_dict()[0]
fillna_mode_or_mean.update(train_x.mean().to_dict())

best with f1 test
- ftr_bk, ftr_poi, ftr_edge, ftr_node:   DTClsfr->0.486618
- ftr_bk, ftr_poi, ftr_edge, ftr_node, ftr_mov_total, ftr_crime_total(one aggregate column): DTClsfr->0.479885
- ftr_bk, ftr_poi, ftr_edge, ftr_node, ftr_mov, ftr_crime, cut_off = 100: DTClsfr->0.495041
- ftr_segs, ftr_bk, ftr_poi, ftr_edge, ftr_node, ftr_mov, ftr_crime, cut_off=100: RFReg->0.569317
- ftr_segs, ftr_bk, ftr_poi, ftr_edge, ftr_node, ftr_mov, ftr_crime, ftr_parking, cut_off=100: BagReg->0.571100 or sometimes RFREG->0.569


In [140]:
train_dummies_x, test_dummies_x = get_train_test_dummies(train_x, test_x, fillna_mode_or_mean)
print 'dummies features =', train_dummies_x.shape

models = get_models(no_svm=False)
results = {}
for name, model, is_regression in models:
    print('model = %s' % name)
    results[name] = fit_pred(model, train_dummies_x, train_y, test_dummies_x, test_y, regression=is_regression)

df_results = pd.DataFrame(results)
df_results.T.sort('f1_test', ascending=False)

dummies features = (10634, 210)
model = linear
model = ridge
model = lasso
model = DTReg
model = RFReg
model = AdaReg
model = BagReg
model = GraReg
model = logistics
model = DTClsfr
model = SVR
model = SVM


Unnamed: 0,acc_test,acc_train,f1_test,f1_train,mse_test,mse_train,var_test,var_train
RFReg,0.555848,0.888095,0.569198,0.890695,0.557774,0.095528,0.522862,0.918518
BagReg,0.540429,0.891198,0.552007,0.893558,0.553546,0.094992,0.526478,0.918975
DTReg,0.541181,0.644536,0.546847,0.649364,0.695903,0.423791,0.404702,0.638519
DTClsfr,0.538548,1.0,0.53871,1.0,,,,
GraReg,0.523881,0.545138,0.534443,0.557736,0.537734,0.48306,0.540004,0.587964
linear,0.422339,0.435396,0.428018,0.435718,0.727979,0.695277,0.377263,0.40695
ridge,0.403159,0.407467,0.402802,0.402541,0.748217,0.714013,0.359951,0.390969
logistics,0.446408,0.443013,0.306645,0.300528,,,,
SVM,0.430613,0.993323,0.262058,0.99331,,,,
AdaReg,0.288454,0.295467,0.1766,0.178236,0.809287,0.793741,0.307709,0.322963


best F1
- ftr_segs, ftr_bk, ftr_poi, ftr_edge, ftr_node, ftr_mov, ftr_crime, ftr_parking, cut_off=100: RFReg->0.553822

In [141]:
fillna_unknown_or_zero = train_x.dtypes.apply(lambda x: 'unknown' if x.name=='object' else 0.0).to_dict()

In [143]:
train_dummies_x, test_dummies_x = get_train_test_dummies(train_x, test_x, fillna_unknown_or_zero)
print 'dummies features =', train_dummies_x.shape


models = get_models()
results = {}
for name, model, is_regression in models:
    print('model = %s' % name)
    results[name] = fit_pred(model, train_dummies_x, train_y, test_dummies_x, test_y, regression=is_regression)

df_results = pd.DataFrame(results)
df_results.T.sort('f1_test', ascending=False)

dummies features = (10634, 210)
model = linear
model = ridge
model = lasso
model = DTReg
model = RFReg
model = AdaReg
model = BagReg
model = GraReg
model = logistics
model = DTClsfr
model = SVR
model = SVM


Unnamed: 0,acc_test,acc_train,f1_test,f1_train,mse_test,mse_train,var_test,var_train
SVR,0.253855,0.858379,0.139016,0.807684,1.166471,0.172517,0.002162,0.852848
lasso,0.253479,0.266033,0.150752,0.15865,0.973118,0.9768,0.167562,0.16682
AdaReg,0.280557,0.286064,0.172581,0.173725,0.814547,0.797884,0.303209,0.31943
SVM,0.430613,0.993323,0.262058,0.99331,,,,
logistics,0.446408,0.443013,0.306645,0.300528,,,,
ridge,0.403159,0.407467,0.402802,0.402541,0.748217,0.714013,0.359951,0.390969
linear,0.422339,0.435396,0.428018,0.435718,0.727979,0.695277,0.377263,0.40695
GraReg,0.523505,0.545138,0.534114,0.557736,0.53825,0.48306,0.539563,0.587964
DTReg,0.538548,0.645759,0.543772,0.650608,0.712428,0.42351,0.390565,0.638759
DTClsfr,0.547574,1.0,0.546798,1.0,,,,
