In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split as tts
from sklearn import preprocessing
from sklearn.model_selection import KFold
from cats import dummify
#from texter import send
import pickle
%config InlineBackend.figure_formats = set(['retina'])
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
#from modeling_functions import feature_importance, aggregate_samples, aggregate_errors, plot_agg_error
from IPython.display import display

In [13]:
pd.options.display.max_columns, pd.options.display.max_rows = 200,1000

In [10]:
alt.themes.enable('dark')

ThemeRegistry.enable('dark')

In [12]:
homes = pd.read_pickle('data/home_votes')

In [15]:
homes = pd.get_dummies(homes,columns=dummify)

In [16]:
# house_types = homes['house_types']
extra_cats = [x for x in homes.columns if '_0' in x]
# homes.drop(columns=['house_types'])
homes.drop(columns=extra_cats, inplace=True)

-----
## Train and Test Splits

In [17]:
validation = homes.sample(125000, random_state=42)

#### Splits on a 10% sample - model validation

In [18]:
# Create splits that don't mix same-building units between train and test
train_parcel = validation['pin'].sample(frac=.6, random_state=42)
#test_indicesf = open('split_indeces', 'rb')
#split_indices = pickle.load(test_indicesf)
#test_indicesf.close()
#train_parcel = split_indices[0]
X_train_parcel = validation[validation.index.isin(train_parcel.index)].drop(columns=['vpu', 'pin','v19pu','v20pu'])
X_test_parcel = validation[~validation.index.isin(train_parcel.index)].drop(columns=['vpu', 'pin','v19pu','v20pu'])

# Targets for three models
y_train_parcel_rv = homes[homes.index.isin(train_parcel.index)]['vpu']
y_train_parcel_19 = homes[homes.index.isin(train_parcel.index)]['v19pu']
y_train_parcel_20 = homes[homes.index.isin(train_parcel.index)]['v20pu']
y_test_parcel_rv = homes[~homes.index.isin(train_parcel.index)]['vpu']
y_test_parcel_19 = homes[~homes.index.isin(train_parcel.index)]['v19pu']
y_test_parcel_20 = homes[~homes.index.isin(train_parcel.index)]['v20pu']

#### Splits on the full data - model training

In [20]:
# Create splits that don't mix same-building units between train and test
train_parcel_a = homes[~homes.index.isin(train_parcel.index)]['pin'].sample(frac=.7, random_state=42).index
train_parcel_full = train_parcel_a.union(train_parcel.index)
X_train_parcel_full = homes[homes.index.isin(train_parcel_full)].drop(columns=['vpu', 'pin','v19pu','v20pu'])
X_test_parcel_full = homes[~homes.index.isin(train_parcel_full)].drop(columns=['vpu', 'pin','v19pu','v20pu'])

# Targets for three models
y_train_parcel_rv_full = homes[homes.index.isin(train_parcel_full)]['vpu']
y_train_parcel_19_full = homes[homes.index.isin(train_parcel_full)]['v19pu']
y_train_parcel_20_full = homes[homes.index.isin(train_parcel_full)]['v20pu']
y_test_parcel_rv_full = homes[~homes.index.isin(train_parcel_full)]['vpu']
y_test_parcel_19_full = homes[~homes.index.isin(train_parcel_full)]['v19pu']
y_test_parcel_20_full = homes[~homes.index.isin(train_parcel_full)]['v20pu']

# create 100 samples of 1,000 households each to test aggregate accuracy
tests = aggregate_samples(X_test_parcel_full, 100, 1000)

NameError: name 'aggregate_samples' is not defined

In [10]:
splits = [validation['pin'].index,train_parcel.index,train_parcel_full]
with open('split_indeces', 'wb') as file:
    pickle.dump(splits, file)

-----
## Random Forest Regression

In [11]:
random_grid = {'bootstrap': [True, False],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
               'max_features': ['auto', 'sqrt'],
               'min_samples_leaf': [1, 2, 4],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [100,200,400]}
rfr = RandomForestRegressor(verbose=0,n_jobs=-1)
rf_random = RandomizedSearchCV(estimator=rfr, param_distributions= random_grid, scoring="neg_mean_squared_error", n_iter = 15, cv = 3, verbose=0, random_state=42, n_jobs = -1)

-----
### Registered Voters Per Household

In [13]:
rf_random.fit(X_train_parcel,y_train_parcel_rv)
rfrv = rf_random.best_estimator_
print('Accuracy on subset: ' + str(rfrv.score(X_test_parcel, y_test_parcel_rv)))
rfrv.fit(X_train_parcel_full, y_train_parcel_rv_full)
print('Accuracy on full: ' + str(rfrv.score(X_test_parcel_full, y_test_parcel_rv_full)))
with open('rfrv', 'wb') as file:
    pickle.dump(rfrv, file)
send('Done with RV')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.5min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.5s finished
Accuracy on subset: 0.42997231090026167
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 21.8min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    

#### Feature Importances

In [14]:
feature_importance(homes, rfrv)

#### Aggregate Accuracy

In [24]:
rfrv_errors = aggregate_errors(rfrv, y_test_parcel_rv_full, tests);
plot_agg_error(rfrv_errors)
print('RMSE = ' + str(np.sqrt((rfrv_errors['Error'] * rfrv_errors['Error']).mean())))

sed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend Thre

RMSE = 0.014285135198096533


In [24]:
f = open('data/rfrv', 'rb')
rfrv = pickle.load(f)     
f.close()

In [None]:
erv = pd.merge(y_test_parcel_rv_full, pd.DataFrame(rfrv.predict(X_test_parcel_full)), how='left', left_index=True, right_on=X_test_parcel_full.index)
erv.to_csv('y_rfrv.csv')

In [29]:
y_rv = rfrv.predict(pd.concat([X_test_parcel_full, X_train_parcel_full]))
pd.DataFrame(y_rv).to_csv('y_rv.csv')

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    9.5s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:   10.2s finished


-----
### 2019 Voted Model

In [27]:
rfr = RandomForestRegressor(verbose=0,n_jobs=-1)
rf_random = RandomizedSearchCV(estimator=rfr, param_distributions= random_grid, scoring="neg_mean_squared_error", n_iter = 15, cv = 3, verbose=0, random_state=42, n_jobs = -1)
rf_random.fit(X_train_parcel,y_train_parcel_19)
rf19 = rf_random.best_estimator_
print('Accuracy on subset: ' + str(rf19.score(X_test_parcel, y_test_parcel_19)))
rf19.fit(X_train_parcel_full, y_train_parcel_19_full)
print('Accuracy on subset: ' + str(rf19.score(X_test_parcel_full, y_test_parcel_19_full)))
with open('rf19', 'wb') as file:
    pickle.dump(rf19, file)
send('Done with RF19')

Accuracy on subset: 0.38331848511986677
Accuracy on subset: 0.451652834873971


#### Feature Importances

In [38]:
feature_importance(homes, rf19)

#### Aggregate Errors

In [40]:
rf19_errors = aggregate_errors(rf19, y_test_parcel_19_full, tests)
plot_agg_error(rf19_errors)
print('RMSE = ' + str(np.sqrt((rf19_errors['Error'] * rf19_errors['Error']).mean())))

RMSE = 0.03338086027054189


In [None]:
f = open('data/rf19', 'rb')
rf19 = pickle.load(f)     
f.close()

In [None]:
e19 = pd.merge(y_test_parcel_19_full, pd.DataFrame(rf19.predict(X_test_parcel_full)), how='left', left_index=True, right_on=X_test_parcel_full.index)
e19.to_csv('y_rf19.csv')

In [None]:
y_19 = rf19.predict(pd.concat([X_test_parcel_full, X_train_parcel_full]))
pd.DataFrame(y_19).to_csv('y_19.csv')

-----
### 2020 Voters

In [49]:
rfr = RandomForestRegressor(verbose=0,n_jobs=-1)
rf_random = RandomizedSearchCV(estimator=rfr, param_distributions= random_grid, scoring="neg_mean_squared_error", n_iter = 15, cv = 3, verbose=1, random_state=42, n_jobs = -1)
rf_random.fit(X_train_parcel,y_train_parcel_20)
rf20 = rf_random.best_estimator_
print(str(rf20.score(X_test_parcel, y_test_parcel_20)))
rf20.fit(X_train_parcel_full, y_train_parcel_20_full)
print(str(rf20.score(X_test_parcel_full, y_test_parcel_20_full)))
with open('rf20', 'wb') as file:
    pickle.dump(rf20, file)
send('Done with RF20')

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 28.7min finished
0.4304487237550123
0.6754868192542156


#### Feature Importances

In [50]:
feature_importance(homes, rf20)

#### Aggregate Errors

In [51]:
rf20_errors = aggregate_errors(rf20, y_test_parcel_20_full, tests)
plot_agg_error(rf20_errors)
print('RMSE = ' + str(np.sqrt((rf20_errors['Error'] * rf20_errors['Error']).mean())))

RMSE = 0.017784277175286363


In [32]:
f = open('data/rf20', 'rb')
rf20 = pickle.load(f)     
f.close()

In [None]:
e20 = pd.merge(y_test_parcel_20_full, pd.DataFrame(rf20.predict(X_test_parcel_full)), how='left', left_index=True, right_on=X_test_parcel_full.index)
e20.to_csv('y_rf20.csv')

In [38]:
y_20 = rf20.predict(pd.concat([X_test_parcel_full, X_train_parcel_full]))
pd.DataFrame(y_20).to_csv('y_20.csv')

In [46]:
y_full = pd.DataFrame(y_rv).merge(pd.DataFrame(y_20), how='left', left_index=True, right_index=True).merge(pd.DataFrame(y_19), how='left', left_index=True, right_index=True)
y_full.columns=['rv_pred','v20','v19']

In [47]:
y_full

Unnamed: 0,rv_pred,v20,v19
0,1.003859,1.007266,0.714036
1,0.999728,1.000096,0.664502
2,1.002056,1.007266,0.714197
3,1.004025,1.000766,0.662539
4,1.089214,1.019894,0.674050
...,...,...,...
989593,2.965298,2.656308,1.567566
989594,2.190160,1.316052,0.801750
989595,1.228244,1.183820,1.221424
989596,1.849885,1.975462,1.599413


In [49]:
y_full = y_full.merge(homes['pin'], how='left', left_index=True, right_index=True)

In [51]:
y_full = y_full.groupby('pin').agg('sum')

In [52]:
y_full['delta'] = (y_full['v20'] - y_full['v19']) / y_full['rv_pred']

In [56]:
y_full.dropna(subset = ['rv_pred']).to_csv('delta_predict.csv')