In [2]:
from sklearn import linear_model, svm, tree, ensemble
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
import pandas as pd
from src.constants import fn_feature_bk_facs_dc, fn_feature_poi_dc, fn_feature_seg_as_edge_dc, fn_feature_seg_as_node_dc
fn_feature_lts_dc = 'feature_lts_dc.csv'
data_dir = 'data/'

In [3]:
def fit_pred(model, train_dummies_x, train_y, test_dummies_x, test_y, regression=True):
    def round_reg_pred(pred):
        pred_round = pred.round()
        pred_round[pred_round<1] = 1
        pred_round[pred_round>4] = 4
        return pred_round
    
    model.fit(train_dummies_x, train_y)

    train_pred = model.predict(train_dummies_x)
    test_pred = model.predict(test_dummies_x)
    train_pred_round = round_reg_pred(train_pred) if regression else train_pred
    test_pred_round = round_reg_pred(test_pred) if regression else test_pred
    
    if regression:
        mse_train = mean_squared_error(train_y, train_pred)
        mse_test = mean_squared_error(test_y, test_pred)
        var_train = model.score(train_dummies_x, train_y)
        var_test = model.score(test_dummies_x, test_y)
    
    acc_train = accuracy_score(train_y, train_pred_round)
    acc_test = accuracy_score(test_y, test_pred_round)
    f1_train = f1_score(train_y, train_pred_round,average='weighted')
    f1_test = f1_score(test_y, test_pred_round,average='weighted')

    result = {
        'f1_train': f1_train,
        'f1_test': f1_test,
        'acc_train': acc_train,
        'acc_test': acc_test,
    }
    if regression:
        result.update({
            'mse_train': mse_train,
            'mse_test': mse_test,
            'var_train': var_train,
            'var_test': var_test,
        })
    return result

In [4]:
def get_models():
    models = [
        ["linear", linear_model.LinearRegression(), True],
        ["ridge", linear_model.Ridge (alpha = .5), True],
        ["lasso", linear_model.Lasso(alpha=.5), True],
        ["SVR", svm.SVR(), True],
        ["DTReg", tree.DecisionTreeRegressor(max_depth=10), True],
        ["RFReg", ensemble.RandomForestRegressor(), True],
        ["AdaReg", ensemble.AdaBoostRegressor(), True],
        ["BagReg", ensemble.BaggingRegressor(), True],
        ["GraReg", ensemble.GradientBoostingRegressor(), True],
        ["logistics", linear_model.LogisticRegression(), False],
        ["DTClsfr", tree.DecisionTreeClassifier(), False],
        ["SVM", svm.SVC(), False]
    ]
    return models

# load raw data

In [5]:
ftr_bk = pd.read_csv(data_dir+fn_feature_bk_facs_dc, index_col=0)
ftr_poi = pd.read_csv(data_dir+fn_feature_poi_dc, index_col=0)
ftr_edge = pd.read_csv(data_dir+fn_feature_seg_as_edge_dc)
ftr_node = pd.read_csv(data_dir+fn_feature_seg_as_node_dc)
ftr_lts = pd.read_csv(data_dir+fn_feature_lts_dc)

In [6]:
df = ftr_lts.merge(ftr_bk, how='left').merge(ftr_poi, how='left').merge(ftr_edge, how='left').merge(ftr_node, how='left')

In [7]:
df = df.drop('index_seg', axis=1)

In [8]:
df = df[~df.LTS.isnull()&(df.LTS!=10)].copy()

# get train and test

In [18]:
train = df.sample(frac=0.8, random_state=1)
test = df[~df.index.isin(train.index)]

In [19]:
train_y = train.LTS.values
train_x = train.drop('LTS', axis=1)
test_y = test.LTS.values
test_x = test.drop('LTS', axis=1)

In [20]:
fillna_mode_or_mean = train_x[['cycle_lane', 'cycle_way', 'side_walk', 'bikable']].mode().T.to_dict()[0]
fillna_mode_or_mean.update(train_x.mean().to_dict())

In [21]:
train_fillna_x = train_x.fillna(fillna_mode_or_mean)
test_fillna_x = test_x.fillna(fillna_mode_or_mean)
train_dummies_x = pd.get_dummies(train_fillna_x)
test_dummies_x = pd.get_dummies(test_fillna_x)

models = get_models()
results = {}
for name, model, is_regression in models:
    print('model = %s' % name)
    results[name] = fit_pred(model, train_dummies_x, train_y, test_dummies_x, test_y, regression=is_regression)

df_results = pd.DataFrame(results)
df_results.T.sort('f1_test')

model = linear
model = ridge
model = lasso


  'precision', 'predicted', average, warn_for)


model = SVR
model = DTReg
model = RFReg
model = AdaReg
model = BagReg
model = GraReg
model = logistics
model = DTClsfr
model = SVM


Unnamed: 0,acc_test,acc_train,f1_test,f1_train,mse_test,mse_train,var_test,var_train
lasso,0.250846,0.259545,0.143218,0.145388,0.97509,0.993131,0.165876,0.15289
SVR,0.259496,0.854711,0.15063,0.803676,1.159852,0.174664,0.007824,0.851017
ridge,0.267394,0.279857,0.182154,0.193039,0.954743,0.96978,0.183281,0.172808
linear,0.289959,0.303837,0.229721,0.244246,0.929092,0.942083,0.205223,0.196432
AdaReg,0.292591,0.289919,0.242469,0.240685,0.898261,0.898977,0.231597,0.233201
SVM,0.432869,0.989844,0.268753,0.989831,,,,
logistics,0.449417,0.443107,0.308272,0.299029,,,,
GraReg,0.388116,0.402671,0.378085,0.393989,0.785877,0.739148,0.327735,0.36953
BagReg,0.412185,0.818319,0.417033,0.822732,0.804743,0.145713,0.311596,0.875711
RFReg,0.414065,0.823303,0.417814,0.828142,0.824655,0.145532,0.294563,0.875866


In [22]:
fillna_unknown_or_zero = train_x.dtypes.apply(lambda x: 'unknown' if x.name=='object' else 0.0).to_dict()

In [23]:
train_fillna_x = train_x.fillna(fillna_unknown_or_zero)
test_fillna_x = test_x.fillna(fillna_unknown_or_zero)
train_dummies_x = pd.get_dummies(train_fillna_x)
test_dummies_x = pd.get_dummies(test_fillna_x)

models = get_models()
results = {}
for name, model, is_regression in models:
    print('model = %s' % name)
    results[name] = fit_pred(model, train_dummies_x, train_y, test_dummies_x, test_y, regression=is_regression)

df_results = pd.DataFrame(results)
df_results.T.sort('f1_test')

model = linear
model = ridge
model = lasso
model = SVR
model = DTReg
model = RFReg
model = AdaReg
model = BagReg
model = GraReg
model = logistics
model = DTClsfr
model = SVM


Unnamed: 0,acc_test,acc_train,f1_test,f1_train,mse_test,mse_train,var_test,var_train
lasso,0.249718,0.261238,0.144913,0.149172,0.956211,0.975151,0.182026,0.168226
SVR,0.263257,0.853959,0.157679,0.802926,1.158203,0.174592,0.009234,0.851079
AdaReg,0.272283,0.264153,0.198053,0.188561,0.908888,0.902739,0.222507,0.229992
ridge,0.295224,0.311642,0.244425,0.261627,0.92225,0.925926,0.211077,0.210214
linear,0.300489,0.319635,0.256904,0.272905,0.919403,0.910833,0.213512,0.223088
SVM,0.432869,0.989562,0.269357,0.989549,,,,
logistics,0.447161,0.441414,0.305139,0.296776,,,,
DTReg,0.386235,0.46925,0.390997,0.47749,0.881898,0.631002,0.245595,0.461775
GraReg,0.402031,0.418187,0.394143,0.412813,0.772038,0.722093,0.339573,0.384077
RFReg,0.433246,0.829227,0.441381,0.833413,0.773009,0.140248,0.338743,0.880373
