In [1]:
from datetime import datetime
from functools import reduce

import geopandas as gp
import numpy as np
import pandas as pd
from sklearn import linear_model, svm, tree, ensemble
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
def stats2ftr(stats, how='TOTAL', ftr_name=None):
    """Aggregate a per-period statistics table into one row per segment.

    The ``MONTH`` and ``YEAR`` columns are dropped and the remaining columns
    are summed within each ``index_seg`` group.

    Parameters
    ----------
    stats : pandas.DataFrame
        Must contain ``index_seg``, ``MONTH`` and ``YEAR`` columns.
    how : str
        ``'TOTAL'`` collapses all summed columns into a single column;
        any other value keeps one summed column per input column.
    ftr_name : str or None
        Name of the single output column; required when ``how == 'TOTAL'``.

    Returns
    -------
    pandas.DataFrame
        ``index_seg`` as a regular column followed by the feature column(s).
    """
    per_segment = stats.drop(['MONTH', 'YEAR'], axis=1).groupby('index_seg').sum()

    if how != 'TOTAL':
        return per_segment.reset_index()

    assert ftr_name is not None, 'please provide the feature name for the aggregated feature'
    total = per_segment.sum(axis=1).to_frame()
    total.columns = [ftr_name]
    return total.reset_index()

In [3]:
def get_train_test_dummies(train_x, test_x, fillna_dict):
    """Impute missing values, then one-hot encode train and test consistently.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Raw feature frames.
    fillna_dict : dict
        Mapping of column name -> fill value, applied to both frames via
        ``DataFrame.fillna``.

    Returns
    -------
    (train_dummies_x, test_dummies_x) : tuple of pandas.DataFrame
        The test frame has exactly the training columns, in the same order:
        dummy columns absent from the test data are added as all-zero, and
        dummy columns that only occur in the test data are dropped.
    """
    # Fill with the caller-supplied mapping (not a notebook-level global), so
    # that different imputation strategies really produce different features.
    train_dummies_x = pd.get_dummies(train_x.fillna(fillna_dict))
    test_dummies_x = pd.get_dummies(test_x.fillna(fillna_dict))

    # Align the test columns on the training layout in one step.
    test_dummies_x = test_dummies_x.reindex(columns=train_dummies_x.columns, fill_value=0)
    return train_dummies_x, test_dummies_x

# parameters for gridsearchCV
- linear: tol
- ridge: tol, alpha=0.5
- lasso: tol, alpha=0.5
- DTReg: max_depth=10, max_features=None [float, 'sqrt', 'log2']
- DTClsfr: criterion=gini(or entropy), max_features=None [float, 'sqrt', 'log2'], max_depth=None
- RFReg: n_estimators=10, max_features=None [float, 'sqrt', 'log2']
- BagReg: n_estimators=10, max_features=1.0(float or int), max_samples=1.0(float or int)
- AdaReg: n_estimators=50, max_features=None [float, 'sqrt', 'log2'], learning_rate=1.
- GraReg: n_estimators=100, max_features=None [float, 'sqrt', 'log2'], learning_rate=.1, max_depth=3
- logistics: penalty=l2(or l1), C=1.0(like SVM, smaller->stronger regularization), tol=1e-4
- svr: C=1.0, epsilon=0.1, kernel=rbf (or linear, poly, sigmoid), degree=3 (for poly), tol=1e-3 (C, gamma, and epsilon play a very important role in RBF Gaussian-kernel SVMs)

In [4]:
# --- candidate values per hyper-parameter -----------------------------------
# Regularisation / optimisation settings.
ALPHAs = np.logspace(0, 2, 10)
Cs = np.logspace(-4, 2, 4)
PENALTYs = ['l1', 'l2']
LEARN_RATEs = np.logspace(-4, 1, 5)
# TOLs and EPs are defined here but not referenced by any grid in this cell.
TOLs = [1e-2, 1e-3, 1e-4, 1e-5]
EPs = np.logspace(-4, 2, 4)

# Tree and ensemble settings.
DEPTHs = [3, 5, 10, 30, 50]
N_ESTIMATORs = [10, 30, 50, 100, 256, 500]
MAX_FTRs = [0.1, 0.3, 0.5, 1.]
CRITERIONs = ['gini', 'entropy']

# Kernel settings for the support-vector models.
GAMMAs = [1e-2, 1e-3, 1e-4, 1e-5]
DEGs = [2, 3, 4, 5, 6]

# --- search grids, one per model family -------------------------------------
# Shared by lasso and ridge.
params_alpha = dict(alpha=ALPHAs)
params_logis = dict(C=Cs, penalty=PENALTYs)
params_dt_reg = dict(max_depth=DEPTHs, max_features=MAX_FTRs)
params_dt_cla = dict(max_depth=DEPTHs, max_features=MAX_FTRs, criterion=CRITERIONs)
params_rf_reg = dict(n_estimators=N_ESTIMATORs, max_features=MAX_FTRs)
params_bag_reg = dict(n_estimators=N_ESTIMATORs, max_features=MAX_FTRs)
params_ada_reg = dict(n_estimators=N_ESTIMATORs, learning_rate=LEARN_RATEs)

# Slow to search because of the number of parameter combinations; the two
# largest ensemble sizes are left out of this grid.
params_gra_reg = dict(
    n_estimators=N_ESTIMATORs[:-2],
    max_features=MAX_FTRs,
    learning_rate=LEARN_RATEs,
    max_depth=DEPTHs,
)

# One sub-grid per kernel so that only the relevant parameters are combined.
params_svm = [
    dict(kernel=['linear'], C=Cs),
    dict(kernel=['rbf'], C=Cs, gamma=GAMMAs),
    dict(kernel=['sigmoid'], C=Cs, gamma=GAMMAs),
    dict(kernel=['poly'], C=Cs, gamma=GAMMAs, degree=DEGs),
]

# Same kernel sub-grids for the regressor, kept as a separate list object.
params_svr = [
    dict(kernel=['linear'], C=Cs),
    dict(kernel=['rbf'], C=Cs, gamma=GAMMAs),
    dict(kernel=['sigmoid'], C=Cs, gamma=GAMMAs),
    dict(kernel=['poly'], C=Cs, gamma=GAMMAs, degree=DEGs),
]


In [5]:
def get_models(slow_model=True):
    """Build the list of candidate models to tune.

    Each entry is a 4-item list:
    ``[estimator, tuning_params, name, is_regression]`` where
    ``is_regression`` is True for regressors and False for classifiers.

    The list holds the fast classifiers followed by the fast regressors; when
    ``slow_model`` is true, gradient boosting and the support-vector models
    are appended (classifiers first, then regressors).
    """
    REGRESSION, CLASSIFICATION = True, False

    fast_classifiers = [
        [linear_model.LogisticRegression(), params_logis, "logistics", CLASSIFICATION],
        [tree.DecisionTreeClassifier(), params_dt_cla, "DTClsfr", CLASSIFICATION],
        [ensemble.RandomForestClassifier(), params_rf_reg, "RFclsfr", CLASSIFICATION],
        [ensemble.AdaBoostClassifier(), params_ada_reg, "Adaclsfr", CLASSIFICATION],
        [ensemble.BaggingClassifier(), params_bag_reg, "Bagclsfr", CLASSIFICATION],
    ]
    fast_regressors = [
        [linear_model.LinearRegression(), {}, "linear", REGRESSION],
        [linear_model.Ridge(), params_alpha, "ridge", REGRESSION],
        [linear_model.Lasso(), params_alpha, "lasso", REGRESSION],
        [tree.DecisionTreeRegressor(), params_dt_reg, "DTReg", REGRESSION],
        [ensemble.RandomForestRegressor(), params_rf_reg, "RFReg", REGRESSION],
        [ensemble.AdaBoostRegressor(), params_ada_reg, "AdaReg", REGRESSION],
        [ensemble.BaggingRegressor(), params_bag_reg, "BagReg", REGRESSION],
    ]

    candidates = fast_classifiers + fast_regressors
    if not slow_model:
        return candidates

    # The expensive searches are opt-out and go last.
    candidates.extend([
        [ensemble.GradientBoostingClassifier(), params_gra_reg, "Graclsfr", CLASSIFICATION],
        [svm.SVC(), params_svm, "SVC", CLASSIFICATION],
    ])
    candidates.extend([
        [ensemble.GradientBoostingRegressor(), params_gra_reg, "GraReg", REGRESSION],
        [svm.SVR(), params_svr, "SVR", REGRESSION],
    ])
    return candidates

In [6]:
def CV_models(models, train_x, train_y, n_jobs=4, cv=5):
    """Grid-search every candidate model and collect the best result of each.

    Parameters
    ----------
    models : iterable
        Entries of ``(estimator, tuning_params, name, is_regression)``.
    train_x, train_y
        Training features and targets passed to ``GridSearchCV.fit``.
    n_jobs : int
        Parallel jobs for each grid search.
    cv : int
        Cross-validation setting for each grid search.

    Returns
    -------
    list of dict
        One dict per model with the keys ``run_time``, ``score``,
        ``model_name``, ``is_regression``, ``mean_test``, ``mean_train``,
        ``mean_fit_time``, ``best_params`` and ``best_model``.

    Side effects
    ------------
    Prints progress and writes the full grid results of each model to
    ``cv_<cv>_model_<name>.csv`` in the working directory.
    """
    cv_results = []
    whole_start = datetime.now()
    for model, tuning_params, name, is_regression in models:
        sub_start = datetime.now()
        # Single-argument print calls produce the same text under Python 2 and 3.
        print('CVing %s %s %s' % (name, is_regression, sub_start))
        # Regressors are ranked by (negated) MSE, classifiers by weighted F1.
        scoring = 'neg_mean_squared_error' if is_regression else 'f1_weighted'
        clf = GridSearchCV(model, tuning_params, n_jobs=n_jobs, cv=cv, scoring=scoring)
        clf.fit(train_x, train_y)

        df = pd.DataFrame(clf.cv_results_).sort_values(by='mean_test_score', ascending=False)
        df.to_csv('cv_%d_model_%s.csv' % (cv, name))

        # The first row after sorting is the best-scoring parameter combination.
        best_row = df.iloc[0]
        test_score = best_row['mean_test_score']
        fit_time = best_row['mean_fit_time']
        # scikit-learn reports train scores only when they were computed;
        # fall back to NaN instead of failing with a KeyError.
        if 'mean_train_score' in df.columns:
            train_score = best_row['mean_train_score']
        else:
            train_score = float('nan')

        sub_end = datetime.now()
        print('score: %s, best test = %.3f, train = %.3f, mean_fit_time = %f' % (scoring, test_score, train_score, fit_time))
        print('best params %s' % (clf.best_params_,))
        print('%s %s' % (sub_end, sub_end - sub_start))

        result = {
            'run_time': sub_end - sub_start,
            'score': scoring,
            'model_name': name,
            'is_regression': is_regression,
            'mean_test': test_score,
            'mean_train': train_score,
            'mean_fit_time': fit_time,
            'best_params': clf.best_params_,
            'best_model': clf.best_estimator_,
        }
        cv_results.append(result)
    whole_end = datetime.now()
    print('finish CV %s %s' % (whole_end, whole_end - whole_start))
    return cv_results

In [7]:
def fit_pred(model, train_dummies_x, train_y, test_dummies_x, test_y, regression=True):
    """Fit ``model`` on the training data and score it on both splits.

    For a regression model the raw predictions are scored with MSE and
    ``model.score``; they are then rounded and limited to the range 1..4 so
    that accuracy and weighted F1 can be computed against the labels.
    Classifier predictions are used as they are.

    Returns
    -------
    dict
        ``f1_*`` and ``acc_*`` for ``train`` and ``test``; for regression
        additionally ``mse_*`` and ``var_*`` (the value of ``model.score``).
    """
    def as_labels(raw_pred):
        # Classifier output already consists of labels.
        if not regression:
            return raw_pred
        # Round to the nearest level and keep it within 1..4.
        return raw_pred.round().clip(1, 4)

    model.fit(train_dummies_x, train_y)

    raw_train = model.predict(train_dummies_x)
    raw_test = model.predict(test_dummies_x)
    label_train = as_labels(raw_train)
    label_test = as_labels(raw_test)

    result = {
        'f1_train': f1_score(train_y, label_train, average='weighted'),
        'f1_test': f1_score(test_y, label_test, average='weighted'),
        'acc_train': accuracy_score(train_y, label_train),
        'acc_test': accuracy_score(test_y, label_test),
    }
    if not regression:
        return result

    # Regression-only metrics use the unrounded predictions.
    result['mse_train'] = mean_squared_error(train_y, raw_train)
    result['mse_test'] = mean_squared_error(test_y, raw_test)
    result['var_train'] = model.score(train_dummies_x, train_y)
    result['var_test'] = model.score(test_dummies_x, test_y)
    return result

In [8]:
def test_evaluation(df_cv, train_dummies_x, train_y, test_dummies_x, test_y):
    """Refit each tuned model and tabulate its train/test metrics.

    ``df_cv`` must provide the columns ``best_model``, ``model_name`` and
    ``is_regression``. Every model is passed to ``fit_pred`` with the given
    data; the metrics are returned as a DataFrame with one row per model,
    sorted by ``f1_test`` in descending order.
    """
    per_model = {}
    candidates = df_cv[['best_model', 'model_name', 'is_regression']].values
    for row in candidates:
        estimator, label, regress = row
        print('model = %s' % label)
        per_model[label] = fit_pred(
            estimator, train_dummies_x, train_y, test_dummies_x, test_y,
            regression=regress,
        )

    # Models become rows, metrics become columns.
    by_model = pd.DataFrame(per_model).T
    return by_model.sort_values(by='f1_test', ascending=False)
    

# load raw data

In [9]:
# File names of the segment table and feature tables that are defined in the
# project's constants module.
from src.constants import fn_segments_dc, fn_feature_bk_facs_dc, fn_feature_poi_dc, fn_feature_seg_as_edge_dc, fn_feature_seg_as_node_dc
# File names of the feature tables that are defined locally in this notebook.
fn_feature_lts_dc = 'feature_lts_dc.csv'
fn_feature_crime_dc = 'feature_crime_incidences_dc.csv'
fn_feature_mov_dc = 'feature_mov_violations_dc.csv'
fn_feature_parking_dc = 'feature_parking_violations_dc.csv'
# Directory prefix that is concatenated with a file name when loading data;
# it is relative, so it resolves against the current working directory.
data_dir = 'data/'

In [10]:
# Street segments, read through geopandas.
segs = gp.read_file(data_dir + fn_segments_dc)

# Segment attributes that are used directly as features.
ftr_segs_col = ['DIRECTIONALITY', 'STREETTYPE', 'SHAPE_Length']
segment_attrs = segs[ftr_segs_col]
# Turn the row index into the leading column and call it index_seg.
ftr_segs = segment_attrs.reset_index()
ftr_segs.columns = ['index_seg'] + ftr_segs.columns[1:].tolist()
# Code 2 is labelled 'Bi-direction'; every other value (missing ones
# included) is labelled 'one-way'.
ftr_segs['DIRECTIONALITY'] = ftr_segs['DIRECTIONALITY'].map(
    lambda code: 'Bi-direction' if code == 2 else 'one-way'
)


In [11]:
# Load the pre-computed feature tables. ftr_lts is the first table in the
# later merge and provides the LTS column used as the prediction target.
ftr_lts = pd.read_csv(data_dir+fn_feature_lts_dc)
# These two files are read with their first column as the index.
ftr_bk = pd.read_csv(data_dir+fn_feature_bk_facs_dc, index_col=0)
ftr_poi = pd.read_csv(data_dir+fn_feature_poi_dc, index_col=0)
# Network features (file names suggest segment-as-edge / segment-as-node
# graph representations -- confirm against the feature-generation code).
ftr_edge = pd.read_csv(data_dir+fn_feature_seg_as_edge_dc)
ftr_node = pd.read_csv(data_dir+fn_feature_seg_as_node_dc)

In [12]:
# Raw statistics tables; stats2ftr expects MONTH, YEAR and index_seg columns.
crime_stats = pd.read_csv(data_dir+fn_feature_crime_dc)
mov_stats = pd.read_csv(data_dir+fn_feature_mov_dc)
parking_stats = pd.read_csv(data_dir+fn_feature_parking_dc)

# Per-column sums for each segment (any `how` other than 'TOTAL' keeps the
# individual columns).
ftr_mov = stats2ftr(mov_stats, how='NOT_TOTAL')
ftr_crime = stats2ftr(crime_stats, how='NOT_TOTAL')
ftr_parking = stats2ftr(parking_stats, how='NOT_TOTAL')

# Single total count per segment (default how='TOTAL'). These totals are not
# part of the list of frames merged in the next cell.
ftr_mov_total = stats2ftr(mov_stats, ftr_name='moving_violations_cnt')
ftr_crime_total = stats2ftr(crime_stats, ftr_name='crime_incidents_cnt')
ftr_parking_total = stats2ftr(parking_stats, ftr_name='parking_violations_cnt')

In [13]:
# Columns need more than this many non-null values to be kept.
cut_off = 100
dfs = [ftr_lts, ftr_segs, ftr_bk, ftr_poi, ftr_edge, ftr_node, ftr_mov, ftr_crime, ftr_parking]
# Left-join every feature table onto the LTS table. No `on` is given, so each
# merge joins on whatever columns the two frames have in common.
df = reduce(lambda left, right: pd.merge(left, right, how='left'), dfs)
# Keep rows with a known LTS that is not 10
# (presumably a placeholder code -- confirm against the LTS data source).
df = df[df.LTS.notnull() & (df.LTS != 10)]
# Reassign instead of dropping in place: the frame above is a filtered
# selection, and the segment key must not be used as a feature.
df = df.drop('index_seg', axis=1)
# Discard columns that are too sparsely populated.
non_na_count = df.count()
keep_col = non_na_count[non_na_count > cut_off].index
df = df[keep_col]
df.shape

(13293, 184)

# get train and test

In [14]:
# Reproducible 80% random sample for training (fixed random_state);
# every row whose index label was not sampled forms the test set.
train = df.sample(frac=0.8, random_state=1)
test = df.drop(train.index)
train.shape, test.shape

((10634, 184), (2659, 184))

In [15]:
# Separate the LTS target (as a plain array) from the feature columns.
train_x, train_y = train.drop('LTS', axis=1), train['LTS'].values
test_x, test_y = test.drop('LTS', axis=1), test['LTS'].values

In [16]:
# Sanity check: both feature frames should have one column fewer than `df`.
train_x.shape, test_x.shape

((10634, 183), (2659, 183))

# Fillna with mean or mode

In [17]:
# fillna_mode_or_mean = train_x[['cycle_lane', 'cycle_way', 'side_walk', 'bikable']].mode().T.to_dict()[0]
# Fill values are derived from the training data only.
# The listed columns are filled with their most frequent value (first mode row).
fillna_mode_or_mean = train_x[['cycle_lane', 'side_walk', 'bikable', 'DIRECTIONALITY', 'STREETTYPE']].mode().T.to_dict()[0]
# Numeric columns are filled with their mean. numeric_only=True states
# explicitly that non-numeric columns are skipped, which recent pandas
# versions no longer do silently. Any listed column that is numeric gets its
# mean here, because update() overwrites the mode entry.
fillna_mode_or_mean.update(train_x.mean(numeric_only=True).to_dict())

In [18]:
# Impute with the mean/mode values, then one-hot encode both splits.
train_dummies_x, test_dummies_x = get_train_test_dummies(train_x, test_x, fillna_mode_or_mean)
# Single-argument print call: same output under Python 2 and Python 3.
print('dummies features = %s' % (train_dummies_x.shape,))

dummies features = (10634, 210)


In [None]:
# Number of cross-validation folds; also used in the output file names.
cv=5
# Leave out the slow models (gradient boosting and the support-vector models).
models = get_models(slow_model=False)
# Tune every model on the encoded training features; CV_models also writes
# one CSV of grid results per model.
cv_results = CV_models(models, train_dummies_x, train_y, n_jobs=5, cv=cv)
# One row per model with its best score, parameters and fitted estimator.
df_cv = pd.DataFrame(cv_results)
df_cv.to_csv('cv_%d_best_models.csv' % cv)

CVing logistics False 2017-04-13 10:16:42.354244


  'precision', 'predicted', average, warn_for)


In [None]:
# Refit each tuned model on the first 8000 training rows and score it on the
# remaining training rows. The held-out test_dummies_x / test_y split is not
# used here, so the "test" metrics refer to this slice of the training data.
df_test_result = test_evaluation(df_cv, train_dummies_x[:8000], train_y[:8000], train_dummies_x[8000:], train_y[8000:])
df_test_result.to_csv('test evaluation result.csv')

# Fillna with unknown or zero

In [141]:
# Alternative imputation: columns of dtype object are filled with the string
# 'unknown', all other columns with 0.0.
fillna_unknown_or_zero = train_x.dtypes.apply(lambda x: 'unknown' if x.name=='object' else 0.0).to_dict()

In [None]:
# Impute with 'unknown' / 0.0, then one-hot encode both splits.
train_dummies_x, test_dummies_x = get_train_test_dummies(train_x, test_x, fillna_unknown_or_zero)
# Single-argument print call: same output under Python 2 and Python 3.
print('dummies features = %s' % (train_dummies_x.shape,))

In [21]:
# Tune all models, including the slow ones (slow_model defaults to True).
models = get_models()
# CV_models requires the training features and targets; use the frames
# produced by the most recent encoding cell.
cv_results = CV_models(models, train_dummies_x, train_y)
df_cv = pd.DataFrame(cv_results)

CVing linear True
best test mse = 0.069, train mse = 0.044, mean_fit_time = 0.039400
best params {}

CVing ridge True
best test mse = 0.071, train mse = 0.044, mean_fit_time = 0.075200
best params {'alpha': 1.0}

CVing lasso True
best test mse = 0.802, train mse = 0.411, mean_fit_time = 0.000000
best params {'alpha': 1.0}

CVing DTReg True
best test mse = 0.046, train mse = 0.034, mean_fit_time = 0.003200
best params {'max_features': 0.5, 'max_depth': 3}

CVing RFReg True
best test mse = 0.051, train mse = 0.005, mean_fit_time = 0.119800
best params {'max_features': 0.5, 'n_estimators': 30}

CVing AdaReg True
best test mse = 0.054, train mse = 0.008, mean_fit_time = 1.259600
best params {'n_estimators': 500, 'learning_rate': 0.001}

CVing BagReg True
best test mse = 0.065, train mse = 0.008, mean_fit_time = 0.768400
best params {'max_features': 0.5, 'n_estimators': 256}

CVing GraReg True
best test mse = 0.057, train mse = 0.000, mean_fit_time = 0.009800
best params {'max_features': 'l

Unnamed: 0,best_model,best_params,is_regression,mean_fit_time,mean_test_mse,mean_train_mse,model_name
0,"LinearRegression(copy_X=True, fit_intercept=Tr...",{},True,0.0394,0.069046,0.04418,linear
1,"Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...",{u'alpha': 1.0},True,0.0752,0.070631,0.044479,ridge
2,"Lasso(alpha=1.0, copy_X=True, fit_intercept=Tr...",{u'alpha': 1.0},True,0.0,0.802258,0.411115,lasso
3,"DecisionTreeRegressor(criterion='mse', max_dep...","{u'max_features': 0.5, u'max_depth': 3}",True,0.0032,0.045794,0.033528,DTReg
4,"(DecisionTreeRegressor(criterion='mse', max_de...","{u'max_features': 0.5, u'n_estimators': 30}",True,0.1198,0.050667,0.005298,RFReg
5,"(DecisionTreeRegressor(criterion='mse', max_de...","{u'n_estimators': 500, u'learning_rate': 0.001}",True,1.2596,0.054454,0.00823,AdaReg
6,"(DecisionTreeRegressor(criterion='mse', max_de...","{u'max_features': 0.5, u'n_estimators': 256}",True,0.7684,0.064779,0.008056,BagReg
7,([DecisionTreeRegressor(criterion='friedman_ms...,"{u'max_features': u'log2', u'n_estimators': 50...",True,0.0098,0.057343,0.00029,GraReg
8,"LogisticRegression(C=6.3095734448019298, class...","{u'penalty': u'l1', u'C': 6.3095734448}",False,0.0124,-0.979983,-0.97333,logistics
9,"DecisionTreeClassifier(class_weight=None, crit...","{u'max_features': u'log2', u'criterion': u'gin...",False,0.003,-0.9733,-0.986643,DTClsfr
