# Model Evaluation

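We look at the six trained models (random forest and neural network regression, each fit for registered voters, 2019 voters, and 2020 voters) and compare their performance under three kinds of aggregation: random samples of increasing size, city, and housing type.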

In [1]:
import pandas as pd
import numpy as np
from cats import dummify
import altair as alt
import pickle
import re
from modeling_functions import aggregate_errors, aggregate_samples, plot_agg_error, aggregate_samples_2, aggregate_errors_2

In [2]:
# Load the pickled homes frame and the saved split indices.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only run this cell against files produced by this project.
# `with` guarantees the handles are closed even if unpickling raises.
with open('home_votes', 'rb') as homesf:
    homes = pickle.load(homesf)
with open('split_indeces', 'rb') as test_indicesf:
    split_indices = pickle.load(test_indicesf)

# One-hot encode the columns listed in `dummify` (imported from `cats`).
homes = pd.get_dummies(homes, columns=dummify)

# Drop every column whose name contains '_0'.
# NOTE(review): this is a substring match, so it also removes any column that
# merely contains '_0' somewhere in its name -- presumably the intent is to
# drop the zero-level dummy of each encoded category; confirm against the
# actual column names.
extra_cats = [x for x in homes.columns if '_0' in x]
# Reassign instead of mutating in place so re-running the cell stays predictable.
homes = homes.drop(columns=extra_cats)

In [14]:
# Columns removed from the feature frame before evaluation.
dropcols = ['v19pu','v20pu','vpu', 'pin']
# Rows of `homes` whose index label is NOT in split_indices[2].
# NOTE(review): presumably split_indices[2] holds the non-test rows, making
# this the held-out set -- confirm against the code that wrote 'split_indeces'.
X_test = homes[~homes.index.isin(split_indices[2])].drop(columns=dropcols)


def _read_predictions(path):
    """Read one saved prediction CSV.

    The file's header row is replaced by the names U / index / t / p, the
    'index' column becomes the frame index, and the leading 'U' column is
    dropped, leaving columns 't' and 'p'.
    NOTE(review): 't' / 'p' are presumably true and predicted values -- confirm
    against the notebook that wrote these files.
    """
    return pd.read_csv(
        path, header=0, names=['U', 'index', 't', 'p'], index_col=['index']
    ).drop(columns='U')


# NOTE(review): the variable names do not match the files they hold
# ('y_pred_rv_19' is read from y_rf19.csv, 'y_pred_nn_rf' from y_nnrv.csv).
# They are left unchanged because later cells reference them.
y_pred_rv_rf = _read_predictions('y_rfrv.csv')
y_pred_rv_19 = _read_predictions('y_rf19.csv')
y_pred_rv_20 = _read_predictions('y_rf20.csv')
y_pred_nn_rf = _read_predictions('y_nnrv.csv')
y_pred_nn_19 = _read_predictions('y_nn19.csv')
y_pred_nn_20 = _read_predictions('y_nn20.csv')

In [16]:
# Map a display label for each model/target pair to its ('p', 't') column pair.
# Insertion order is the order in which the result tables are printed below.
preds = {
    label: (frame['p'], frame['t'])
    for label, frame in (
        ('Random Forest - Registered Voters', y_pred_rv_rf),
        ('Neural Network Regression - Registered Voters', y_pred_nn_rf),
        ('Random Forest - 2019 Voters', y_pred_rv_19),
        ('Neural Network Regression - 2019 Voters', y_pred_nn_19),
        ('Random Forest - 2020 Voters', y_pred_rv_20),
        ('Neural Network Regression - 2020 Voters', y_pred_nn_20),
    )
}

## Accuracy by Sample Size

In [7]:
# Build one set of aggregated samples per group size, smallest first.
# The middle argument (100) is the same for every size.
# NOTE(review): presumably 100 is the number of groups drawn and the last
# argument is the rows per group -- confirm in modeling_functions. If the
# sampling is random, consider seeding it so the tables below are reproducible.
test_10, test_100, test_1k, test_10k, test_100k = [
    aggregate_samples_2(X_test, 100, group_size)
    for group_size in (10, 100, 1000, 10000, 100000)
]

In [17]:
# For every model, report the root-mean-square of the per-group 'Error'
# column at each aggregation size.
for model_label, (p_col, t_col) in preds.items():
    print(model_label)
    rmse_rows = []
    for sample_set in (test_10, test_100, test_1k, test_10k, test_100k):
        err = aggregate_errors_2(p_col, t_col, sample_set)['Error']
        # len(sample_set[0]) is what gets reported as the sample size.
        rmse_rows.append((len(sample_set[0]), np.sqrt((err * err).mean())))
    display(pd.DataFrame(rmse_rows, columns=['Sample Size', 'RMSE']))


Random Forest - Registered Voters


Unnamed: 0,Sample Size,RMSE
0,10,0.122837
1,100,0.040361
2,1000,0.017542
3,10000,0.005498
4,100000,0.002276


Neural Network Regression - Registered Voters


Unnamed: 0,Sample Size,RMSE
0,10,0.163421
1,100,0.066022
2,1000,0.026482
3,10000,0.017696
4,100000,0.016228


Random Forest - 2019 Voters


Unnamed: 0,Sample Size,RMSE
0,10,0.776427
1,100,0.106477
2,1000,0.031964
3,10000,0.010648
4,100000,0.005407


Neural Network Regression - 2019 Voters


Unnamed: 0,Sample Size,RMSE
0,10,0.810373
1,100,0.103831
2,1000,0.036766
3,10000,0.017788
4,100000,0.011457


Random Forest - 2020 Voters


Unnamed: 0,Sample Size,RMSE
0,10,0.195815
1,100,0.054724
2,1000,0.019983
3,10000,0.005612
4,100000,0.00223


Neural Network Regression - 2020 Voters


Unnamed: 0,Sample Size,RMSE
0,10,0.192054
1,100,0.075312
2,1000,0.027421
3,10000,0.017058
4,100000,0.015355


## Accuracy by City

In [9]:
# One-hot city indicator columns (names containing 'districtname_').
city_cols = [col for col in homes.columns if 'districtname_' in col]
# Row labels belonging to each city, taken from X_test rather than the full
# `homes` frame: this matches how housing_index is built and keeps row labels
# that are not in X_test out of the per-city groups.
city_index = {city: X_test[X_test[city] == 1].index for city in city_cols}

In [18]:
# Per-city comparison of aggregated predictions against aggregated actuals,
# one table per model.
for key, val in preds.items():
    print(key)
    # val is the (p, t) column pair stored in `preds`; rows containing NaN
    # are discarded before ranking.
    tr = aggregate_errors_2(val[0], val[1], city_index).dropna().sort_values(by='Error')
    # Each row is a single city aggregate, so this column is simply the
    # magnitude of that row's 'Error' (not a root-mean-square over several
    # samples). The 'RMSE' label is kept so the table header is unchanged.
    # abs() yields the same magnitude as sqrt(Error**2) without the
    # intermediate square.
    tr['RMSE'] = tr['Error'].abs()
    display(tr[['Sample', 'Actual', 'Predicted', 'RMSE']].sort_values(by='RMSE'))

Random Forest - Registered Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
7,districtname_CARNATION,367.235348,366.942134,0.000798
30,districtname_RENTON,15214.029777,15189.021769,0.001644
18,districtname_KING COUNTY,43338.09321,43410.856537,0.001679
32,districtname_SEATTLE,117660.377956,117931.205773,0.002302
20,districtname_LAKE FOREST PARK,2818.081137,2828.401342,0.003662
31,districtname_SAMMAMISH,10413.88364,10366.97272,0.004505
23,districtname_MERCER ISLAND,4986.869272,5013.349436,0.00531
16,districtname_KENMORE,3830.028931,3807.905823,0.005776
3,districtname_BELLEVUE,22154.998328,22008.279007,0.006622
28,districtname_PACIFIC,818.021629,812.339662,0.006946


Neural Network Regression - Registered Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
31,districtname_SAMMAMISH,10413.88364,10407.696229,0.000594
19,districtname_KIRKLAND,15156.363095,15092.915179,0.004186
23,districtname_MERCER ISLAND,4986.869272,5010.14399,0.004667
12,districtname_ENUMCLAW,2004.419345,2014.55566,0.005057
33,districtname_SHORELINE,10249.499559,10302.164995,0.005138
38,districtname_WOODINVILLE,2302.705464,2321.474663,0.008151
15,districtname_ISSAQUAH,6459.950499,6403.245662,0.008778
11,districtname_DUVALL,1340.804945,1326.107749,0.010961
26,districtname_NORMANDY PARK,835.353154,845.113666,0.011684
32,districtname_SEATTLE,117660.377956,115759.012956,0.01616


Random Forest - 2019 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
33,districtname_SHORELINE,4548.921839,4549.935825,0.000223
38,districtname_WOODINVILLE,976.536613,975.50693,0.001054
32,districtname_SEATTLE,56482.047883,56580.891547,0.00175
29,districtname_REDMOND,3232.732934,3240.216242,0.002315
26,districtname_NORMANDY PARK,453.60162,452.444555,0.002551
10,districtname_DES MOINES,1911.233648,1917.138271,0.003089
21,districtname_MAPLE VALLEY,1747.375839,1753.474225,0.00349
9,districtname_COVINGTON,1201.133809,1193.875291,0.006043
17,districtname_KENT,5741.093306,5697.725671,0.007554
36,districtname_SeaTac,1149.971809,1158.7974,0.007675


Neural Network Regression - 2019 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
32,districtname_SEATTLE,56482.047883,56357.938433,0.002197
19,districtname_KIRKLAND,5981.901049,5948.916119,0.005514
6,districtname_BURIEN,3170.886958,3148.453093,0.007075
1,districtname_AUBURN,2916.370787,2887.475708,0.009908
8,districtname_CLYDE HILL,215.5,213.340633,0.01002
33,districtname_SHORELINE,4548.921839,4597.676843,0.010718
30,districtname_RENTON,5217.051728,5160.447127,0.01085
27,districtname_NORTH BEND,559.546428,553.473714,0.010853
12,districtname_ENUMCLAW,728.771333,720.176989,0.011793
18,districtname_KING COUNTY,18490.482467,18270.404964,0.011902


Random Forest - 2020 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
32,districtname_SEATTLE,94819.724612,94845.637403,0.000273
3,districtname_BELLEVUE,17387.04538,17392.525172,0.000315
18,districtname_KING COUNTY,34836.601254,34851.069475,0.000415
17,districtname_KENT,12796.264111,12781.155221,0.001181
39,districtname_YARROW POINT,202.5,202.023631,0.002352
23,districtname_MERCER ISLAND,4159.191538,4147.328104,0.002852
35,districtname_SNOQUALMIE,1822.376111,1829.494625,0.003906
9,districtname_COVINGTON,2559.755308,2571.538437,0.004603
14,districtname_HUNTS POINT,97.0,97.566806,0.005843
27,districtname_NORTH BEND,1070.973686,1079.096418,0.007584


Neural Network Regression - 2020 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
9,districtname_COVINGTON,2559.755308,2560.160345,0.000158
38,districtname_WOODINVILLE,1869.623999,1868.609822,0.000542
1,districtname_AUBURN,6838.713557,6824.689704,0.002051
10,districtname_DES MOINES,3684.184173,3672.286792,0.003229
13,districtname_FEDERAL WAY,9353.429636,9389.842542,0.003893
32,districtname_SEATTLE,94819.724612,94338.020499,0.00508
29,districtname_REDMOND,6401.778671,6448.97074,0.007372
3,districtname_BELLEVUE,17387.04538,17228.137297,0.009139
36,districtname_SeaTac,2310.974167,2286.046722,0.010787
17,districtname_KENT,12796.264111,12597.747139,0.015514


## Accuracy by Housing Type

In [11]:
# Housing-type column names used to pick the matching indicator columns
# out of `homes`.
htypes = [
    'apartment', 'single_family',
    'duplex', 'triplex', 'fourplex',
    'townhouse',
    'senior_housing', 'student_housing',
    'mobile_home',
]

In [12]:
# Housing-type columns that actually exist in `homes`, in the frame's own
# column order (not the order of `htypes`).
housing_cols = [col for col in homes.columns if col in htypes]
# For each housing type, the X_test row labels whose indicator equals 1.
housing_index = dict(
    (col, X_test.loc[X_test[col] == 1].index)
    for col in housing_cols
)

In [19]:
# Per-housing-type comparison of aggregated predictions against aggregated
# actuals, one table per model.
for key, val in preds.items():
    print(key)
    # val is the (p, t) column pair stored in `preds`; rows containing NaN
    # are discarded before ranking.
    tr = aggregate_errors_2(val[0], val[1], housing_index).dropna().sort_values(by='Error')
    # Each row is a single housing-type aggregate, so this column is simply
    # the magnitude of that row's 'Error' (not a root-mean-square over several
    # samples). The 'RMSE' label is kept so the table header is unchanged.
    # abs() yields the same magnitude as sqrt(Error**2) without the
    # intermediate square.
    tr['RMSE'] = tr['Error'].abs()
    display(tr[['Sample', 'Actual', 'Predicted', 'RMSE']].sort_values(by='RMSE'))

Random Forest - Registered Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
6,senior_housing,2652.526647,2655.069881,0.000959
5,apartment,59794.476257,59644.425805,0.002509
0,single_family,249880.849645,250824.78404,0.003778
4,townhouse,7039.35246,7098.74903,0.008438
3,fourplex,2235.358974,2255.890707,0.009185
1,duplex,3961.55753,4010.007816,0.01223
2,triplex,1622.583333,1647.079199,0.015097
8,mobile_home,313.885408,305.809191,0.02573


Neural Network Regression - Registered Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
0,single_family,249880.849645,254935.454005,0.020228
4,townhouse,7039.35246,7223.30999,0.026133
2,triplex,1622.583333,1680.314838,0.03558
3,fourplex,2235.358974,2040.604963,0.087124
8,mobile_home,313.885408,279.840736,0.108462
1,duplex,3961.55753,4400.812103,0.110879
5,apartment,59794.476257,50113.593654,0.161903
6,senior_housing,2652.526647,1900.891696,0.283366


Random Forest - 2019 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
5,apartment,16719.15818,16713.400683,0.000344
3,fourplex,610.369872,610.632775,0.000431
6,senior_housing,1504.545516,1505.64693,0.000732
1,duplex,1476.804021,1478.197362,0.000943
4,townhouse,3030.445188,3017.846543,0.004157
0,single_family,114965.017507,115596.661884,0.005494
2,triplex,577.466667,608.748121,0.05417
8,mobile_home,159.578582,150.770274,0.055197


Neural Network Regression - 2019 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
3,fourplex,610.369872,614.148264,0.00619
0,single_family,114965.017507,113387.295477,0.013723
1,duplex,1476.804021,1504.698392,0.018888
5,apartment,16719.15818,17072.270303,0.02112
2,triplex,577.466667,564.757889,0.022008
4,townhouse,3030.445188,2959.449128,0.023428
6,senior_housing,1504.545516,1453.146299,0.034163
8,mobile_home,159.578582,130.089792,0.184792


Random Forest - 2020 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
6,senior_housing,2032.964824,2033.171594,0.000102
3,fourplex,1567.817949,1569.035525,0.000777
5,apartment,41987.903225,41882.981701,0.002499
0,single_family,202247.673392,202818.624523,0.002823
1,duplex,2966.606277,2991.836718,0.008505
2,triplex,1210.083333,1230.755636,0.017083
4,townhouse,5618.548988,5489.541046,0.022961
8,mobile_home,259.421984,251.634855,0.030017


Neural Network Regression - 2020 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
3,fourplex,1567.817949,1557.45027,0.006613
5,apartment,41987.903225,42298.559992,0.007399
0,single_family,202247.673392,197756.910856,0.022204
1,duplex,2966.606277,3036.821827,0.023669
6,senior_housing,2032.964824,1982.716823,0.024717
2,triplex,1210.083333,1242.919038,0.027135
4,townhouse,5618.548988,5289.885259,0.058496
8,mobile_home,259.421984,219.275274,0.154754
