# Model Evaluation

In [2]:
import pandas as pd
import numpy as np
from cats import dummify
import altair as alt
import pickle
import re
from modeling_functions import aggregate_errors, aggregate_samples, plot_agg_error, aggregate_samples_2, aggregate_errors_2

In [3]:
# Load the pickled homes table and the pickled split indices.
# NOTE: pickle.load can execute arbitrary code while deserializing; only
# point these paths at files produced by this project.
# Context managers guarantee the handles are closed even if unpickling fails.
with open('home_votes', 'rb') as homesf:
    homes = pickle.load(homesf)
with open('split_indeces', 'rb') as test_indicesf:
    split_indices = pickle.load(test_indicesf)

# One-hot encode the columns listed in `dummify`.
homes = pd.get_dummies(homes, columns=dummify)

# Drop every dummy column whose name contains '_0'.
# NOTE(review): this is a substring match, so a name such as 'x_05' would
# also be dropped — confirm that no such column exists.
extra_cats = [x for x in homes.columns if '_0' in x]
homes = homes.drop(columns=extra_cats)

In [6]:
# Columns removed from the feature matrix before evaluation.
dropcols = ['v19pu', 'v20pu', 'vpu', 'pin']

# Keep the rows whose index is NOT listed in split_indices[2].
in_split_2 = homes.index.isin(split_indices[2])
X_test = homes[~in_split_2].drop(columns=dropcols)

# Every prediction file is read the same way: the header row is replaced by
# the names below, 'index' becomes the frame index and 'U' is discarded.
# Downstream cells pass 'p' as the predicted values and 't' as the true values.
pred_csv_options = dict(header=0, names=['U', 'index', 't', 'p'], index_col=['index'])

y_pred_rv_rf = pd.read_csv('y_rfrv.csv', **pred_csv_options).drop(columns='U')
y_pred_rv_19 = pd.read_csv('y_rf19.csv', **pred_csv_options).drop(columns='U')
y_pred_rv_20 = pd.read_csv('y_rf20.csv', **pred_csv_options).drop(columns='U')
y_pred_nn_rf = pd.read_csv('y_nnrv.csv', **pred_csv_options).drop(columns='U')
# Not loaded here: neural-network predictions for 2019 / 2020 voters
# (y_nn19.csv, y_nn20.csv).

In [5]:
# Bare expression so the notebook renders the neural-network prediction frame.
y_pred_nn_rf

Unnamed: 0.1,Unnamed: 0,key_0,vpu,0
0,0,2,1.0,0.925473
1,1,4,1.0,0.895530
2,2,6,1.0,0.929883
3,3,9,1.0,0.874035
4,4,15,1.0,1.316842
...,...,...,...,...
274374,274374,989588,4.0,2.665967
274375,274375,989590,1.0,2.164391
274376,274376,989591,2.0,2.684244
274377,274377,989592,3.0,2.333793


In [7]:
# Display label -> (predicted, true) Series for each set of loaded predictions.
# NOTE(review): the neural-network label ends with a dangling ' - ' and looks
# truncated — confirm the intended target name before changing it.
labelled_frames = [
    ('Random Forest - Registered Voters', y_pred_rv_rf),
    ('Random Forest - 2019 Voters', y_pred_rv_19),
    ('Random Forest - 2020 Voters', y_pred_rv_20),
    ('Neural Network Regression - ', y_pred_nn_rf),
]
preds = {label: (frame['p'], frame['t']) for label, frame in labelled_frames}

## Accuracy by Sample Size

In [8]:
# One collection of test groups per size; the second argument is 100 every time.
# NOTE(review): presumably 100 groups of `size` rows each — confirm against
# aggregate_samples_2 in modeling_functions.
group_sizes = (10, 100, 1000, 10000, 100000)
test_10, test_100, test_1k, test_10k, test_100k = (
    aggregate_samples_2(X_test, 100, size) for size in group_sizes
)

In [9]:
# For every model, report the RMSE of the per-group relative error at each group size.
sample_sets = [test_10, test_100, test_1k, test_10k, test_100k]
for label, (predicted, actual) in preds.items():
    print(label)
    rmse_rows = []
    for sample_set in sample_sets:
        errors = aggregate_errors_2(predicted, actual, sample_set)['Error']
        rmse = np.sqrt((errors * errors).mean())
        # The length of the first group is used as the reported sample size.
        rmse_rows.append((len(sample_set[0]), rmse))
    display(pd.DataFrame(rmse_rows, columns=['Sample Size', 'RMSE']))


Random Forest - Registered Voters


Unnamed: 0,Sample Size,RMSE
0,10,0.136016
1,100,0.041312
2,1000,0.020295
3,10000,0.005184
4,100000,0.001811


Random Forest - 2019 Voters


Unnamed: 0,Sample Size,RMSE
0,10,0.672222
1,100,0.13036
2,1000,0.040951
3,10000,0.011734
4,100000,0.00503


Random Forest - 2020 Voters


Unnamed: 0,Sample Size,RMSE
0,10,0.188001
1,100,0.058387
2,1000,0.021376
3,10000,0.005996
4,100000,0.001947


Neural Network Regression - 


Unnamed: 0,Sample Size,RMSE
0,10,0.159396
1,100,0.05512
2,1000,0.032112
3,10000,0.018286
4,100000,0.016712


## Accuracy by City

In [10]:
# Dummy columns whose name contains 'districtname_', one per city.
city_cols = [col for col in homes.columns if 'districtname_' in col]
# City column -> index labels of the homes flagged with that city.
city_index = {
    city: homes.loc[homes[city] == 1].index
    for city in city_cols
}

In [11]:
# Per-city accuracy for every model, best-predicted cities first.
for key, val in preds.items():
    print(key)
    # val is (predicted, true). Rows with a NaN error are dropped before display.
    # Each row is one aggregated group, so the 'RMSE' column is simply the
    # absolute value of that group's 'Error'.
    tr = (
        aggregate_errors_2(val[0], val[1], city_index)
        .dropna()
        .assign(RMSE=lambda frame: frame['Error'].abs())
    )
    display(tr[['Sample', 'Actual', 'Predicted', 'RMSE']].sort_values(by='RMSE'))

Random Forest - Registered Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
7,districtname_CARNATION,367.235348,366.942134,0.000798
30,districtname_RENTON,15214.029777,15189.021769,0.001644
18,districtname_KING COUNTY,43338.09321,43410.856537,0.001679
32,districtname_SEATTLE,117660.377956,117931.205773,0.002302
20,districtname_LAKE FOREST PARK,2818.081137,2828.401342,0.003662
31,districtname_SAMMAMISH,10413.88364,10366.97272,0.004505
23,districtname_MERCER ISLAND,4986.869272,5013.349436,0.00531
16,districtname_KENMORE,3830.028931,3807.905823,0.005776
3,districtname_BELLEVUE,22154.998328,22008.279007,0.006622
28,districtname_PACIFIC,818.021629,812.339662,0.006946


Random Forest - 2019 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
33,districtname_SHORELINE,4548.921839,4549.935825,0.000223
38,districtname_WOODINVILLE,976.536613,975.50693,0.001054
32,districtname_SEATTLE,56482.047883,56580.891547,0.00175
29,districtname_REDMOND,3232.732934,3240.216242,0.002315
26,districtname_NORMANDY PARK,453.60162,452.444555,0.002551
10,districtname_DES MOINES,1911.233648,1917.138271,0.003089
21,districtname_MAPLE VALLEY,1747.375839,1753.474225,0.00349
9,districtname_COVINGTON,1201.133809,1193.875291,0.006043
17,districtname_KENT,5741.093306,5697.725671,0.007554
36,districtname_SeaTac,1149.971809,1158.7974,0.007675


Random Forest - 2020 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
32,districtname_SEATTLE,94819.724612,94845.637403,0.000273
3,districtname_BELLEVUE,17387.04538,17392.525172,0.000315
18,districtname_KING COUNTY,34836.601254,34851.069475,0.000415
17,districtname_KENT,12796.264111,12781.155221,0.001181
39,districtname_YARROW POINT,202.5,202.023631,0.002352
23,districtname_MERCER ISLAND,4159.191538,4147.328104,0.002852
35,districtname_SNOQUALMIE,1822.376111,1829.494625,0.003906
9,districtname_COVINGTON,2559.755308,2571.538437,0.004603
14,districtname_HUNTS POINT,97.0,97.566806,0.005843
27,districtname_NORTH BEND,1070.973686,1079.096418,0.007584


## Accuracy by Housing Type

In [5]:
# Housing-type columns to report on (compared against 1 when building the index).
htypes = [
    'apartment', 'single_family',
    'duplex', 'triplex', 'fourplex',
    'townhouse',
    'senior_housing', 'student_housing',
    'mobile_home',
]

In [6]:
# Housing-type columns actually present in `homes`, in column order.
housing_cols = [col for col in homes.columns if col in htypes]
# Housing type -> index labels of the test rows flagged with that type.
housing_index = {
    house: X_test.loc[X_test[house] == 1].index
    for house in housing_cols
}

In [7]:
def aggregate_errors_2(y_pred, y_true, tests):
    """Sum predictions and true values per group and compute the relative error.

    Parameters
    ----------
    y_pred, y_true : pandas.Series
        Predicted and true values; rows are selected by index label.
    tests : dict
        Maps a group label to a collection of index labels for that group.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns 'Sample', 'Predicted', 'Actual' and
        'Error', where Error = (Actual - Predicted) / Actual.
    """
    rows = [
        (
            label,
            y_pred[y_pred.index.isin(members)].sum(),
            y_true[y_true.index.isin(members)].sum(),
        )
        for label, members in tests.items()
    ]
    summary = pd.DataFrame(rows, columns=['Sample', 'Predicted', 'Actual'])
    summary['Error'] = (summary['Actual'] - summary['Predicted']) / summary['Actual']
    return summary

In [8]:
# Per-housing-type accuracy for every model, best-predicted types first.
for key, val in preds.items():
    print(key)
    # val is (predicted, true). Rows with a NaN error are dropped before display.
    # Each row is one aggregated group, so the 'RMSE' column is simply the
    # absolute value of that group's 'Error'.
    tr = (
        aggregate_errors_2(val[0], val[1], housing_index)
        .dropna()
        .assign(RMSE=lambda frame: frame['Error'].abs())
    )
    display(tr[['Sample', 'Actual', 'Predicted', 'RMSE']].sort_values(by='RMSE'))

Random Forest - Registered Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
6,senior_housing,2652.526647,2655.069881,0.000959
5,apartment,59794.476257,59644.425805,0.002509
0,single_family,249880.849645,250824.78404,0.003778
4,townhouse,7039.35246,7098.74903,0.008438
3,fourplex,2235.358974,2255.890707,0.009185
1,duplex,3961.55753,4010.007816,0.01223
2,triplex,1622.583333,1647.079199,0.015097
8,mobile_home,313.885408,305.809191,0.02573


Random Forest - 2019 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
5,apartment,16719.15818,16713.400683,0.000344
3,fourplex,610.369872,610.632775,0.000431
6,senior_housing,1504.545516,1505.64693,0.000732
1,duplex,1476.804021,1478.197362,0.000943
4,townhouse,3030.445188,3017.846543,0.004157
0,single_family,114965.017507,115596.661884,0.005494
2,triplex,577.466667,608.748121,0.05417
8,mobile_home,159.578582,150.770274,0.055197


Random Forest - 2020 Voters


Unnamed: 0,Sample,Actual,Predicted,RMSE
6,senior_housing,2032.964824,2033.171594,0.000102
3,fourplex,1567.817949,1569.035525,0.000777
5,apartment,41987.903225,41882.981701,0.002499
0,single_family,202247.673392,202818.624523,0.002823
1,duplex,2966.606277,2991.836718,0.008505
2,triplex,1210.083333,1230.755636,0.017083
4,townhouse,5618.548988,5489.541046,0.022961
8,mobile_home,259.421984,251.634855,0.030017
