## Running random forest model using k-fold cross validation

In [52]:
from os import chdir
chdir('..')
import config as cfg
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
import math

In [53]:
df_cand = pd.read_csv(cfg.training_candidate_file, index_col = 0)
print(df_cand['ELECTION_DATE'].value_counts())

2016-06-07    316
2018-06-05    313
Name: ELECTION_DATE, dtype: int64


In [54]:
race_key = ['CONTEST_NAME' , 'ELECTION_DATE' ]
cand_key = [*race_key, 'CANDIDATE_NAME']

In [55]:
df = df_cand.copy()

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 629 entries, 0 to 628
Data columns (total 14 columns):
CANDIDATE_NAME       629 non-null object
CONTEST_NAME         629 non-null object
ELECTION_DATE        629 non-null object
PARTY_NAME           629 non-null object
INCUMBENT_FLAG       629 non-null int64
PARTY_LEAN           629 non-null int64
WRITE_IN_FLAG        629 non-null int64
VOTE_TOTAL           629 non-null int64
CAND_TOTAL_RAISED    629 non-null float64
LEGISLATURE          629 non-null bool
CANDIDATE_COUNT      629 non-null int64
RACE_TOTAL_RAISED    629 non-null float64
RACE_VOTE_TOTAL      629 non-null int64
VOTE_SHARE           629 non-null float64
dtypes: bool(1), float64(3), int64(6), object(4)
memory usage: 69.4+ KB


In [57]:
df.columns


Index(['CANDIDATE_NAME', 'CONTEST_NAME', 'ELECTION_DATE', 'PARTY_NAME',
       'INCUMBENT_FLAG', 'PARTY_LEAN', 'WRITE_IN_FLAG', 'VOTE_TOTAL',
       'CAND_TOTAL_RAISED', 'LEGISLATURE', 'CANDIDATE_COUNT',
       'RACE_TOTAL_RAISED', 'RACE_VOTE_TOTAL', 'VOTE_SHARE'],
      dtype='object')

Feature columns: 'ELECTION_DATE', "PARTY_LEAN", 'WRITE_IN_FLAG', 'CAND_TOTAL_RAISED', 'CANDIDATE_COUNT','RACE_TOTAL_RAISED', , 'RACE_VOTE_TOTAL'
Y column: 'VOTE_SHARE'

In [58]:
features = [ 'ELECTION_DATE', "PARTY_LEAN", 'WRITE_IN_FLAG', 'CAND_TOTAL_RAISED', 'CANDIDATE_COUNT','RACE_TOTAL_RAISED', 'RACE_VOTE_TOTAL']
target = ['VOTE_SHARE']

# Convert election dates into a binary feature
datemap = {'2016-06-07': 0, '2018-06-05': 1}
df['ELECTION_DATE'] = df['ELECTION_DATE'].replace(datemap)

In [60]:
R =[df for _, df in df.groupby(race_key)]
random.shuffle(R)
print(len(R))
R
# R is list of dataframes, each dataframe has all candidates in just one race

200


[     CANDIDATE_NAME                       CONTEST_NAME  ELECTION_DATE  \
 153    DEVON MATHIS  State Assembly Member District 26              0   
 154  RUBEN MACARENO  State Assembly Member District 26              0   
 155    RUDY MENDOZA  State Assembly Member District 26              0   
 
      PARTY_NAME  INCUMBENT_FLAG  PARTY_LEAN  WRITE_IN_FLAG  VOTE_TOTAL  \
 153  Republican               1           1              0       28563   
 154  Democratic               0          -1              0       20536   
 155  Republican               0           1              0       18216   
 
      CAND_TOTAL_RAISED  LEGISLATURE  CANDIDATE_COUNT  RACE_TOTAL_RAISED  \
 153          315551.01         True                3          399302.14   
 154            1001.13         True                3          399302.14   
 155           82750.00         True                3          399302.14   
 
      RACE_VOTE_TOTAL  VOTE_SHARE  
 153            67315    0.424319  
 154            67315 

# Split folds
There are 200 races in this data file
Makes sense to withhold 20% as test, and do 5-fold cross-validation.

In [50]:
# Yield 5 successive  
# chunks from l. 
def divide_chunks(R, k): 
    l = len(R) // k
      
    for i in range(0, k):
        if i == k-1:
            yield R[i*l:]
        else:
            yield R[i*l: (i+1)*l] 
  
k = 5
folds = list(divide_chunks(R, k))
folds # Should be list of list of dataframes

[[    CANDIDATE_NAME                       CONTEST_NAME  ELECTION_DATE  \
  528  RICHARD BLOOM  State Assembly Member District 50              1   
  
       PARTY_NAME  INCUMBENT_FLAG  PARTY_LEAN  WRITE_IN_FLAG  VOTE_TOTAL  \
  528  Democratic               1          -1              0       79458   
  
       CAND_TOTAL_RAISED  LEGISLATURE  CANDIDATE_COUNT  RACE_TOTAL_RAISED  \
  528          574046.54         True                1          574046.54   
  
       RACE_VOTE_TOTAL  VOTE_SHARE  
  528            79458         1.0  ,
          CANDIDATE_NAME                       CONTEST_NAME  ELECTION_DATE  \
  545  GREGG D. FRITCHLE  State Assembly Member District 55              1   
  546       PHILLIP CHEN  State Assembly Member District 55              1   
  547    JAMES G. GERBUS  State Assembly Member District 55              1   
  548      MELISSA FAZLI  State Assembly Member District 55              1   
  549        SCOTT LEBDA  State Assembly Member District 55             

# apply regression

In [101]:
def divide_chunks(R, k): 
    # Yield 5 successive  
    # chunks from l. 
    l = len(R) // k
      
    for i in range(0, k):
        if i == k-1:
            yield R[i*l:]
        else:
            yield R[i*l: (i+1)*l] 
  
def apply_k_fold_regression(R, k=5, **hyperparams):
    """
    Perform k-fold splitting of races, then group and apply regresion, return regressor with rmse values
    
    :param R: list of dataframes that has candidate features and vote results, each dataframe from just one race
    :param k: number of folds. Whatever is left over after the k'th fold will be added to the last fold.
    
    :param hyperparams: keyword hyperparameters to be fed to the regressor
    :returns regressor, mean_rmse, std_rmse
    """

    
    folds = list(divide_chunks(R, k))
    folds # Should be list of list of dataframes
    
    rmse_list = []
    for i in range(0, k):
        # i denotes the test fold, all others are train.
        test = [race for race in folds[i]]
        # Flatten all train races together
        train = [race for j, fold in enumerate(folds) for race in fold if j!=i]

        # Concat all races in all train folds together
        train = pd.concat(train)

        # Concat all test races together, into one big dataframe
        test = pd.concat(test)

        # Splitting columns.
        key_train = train[cand_key]
        X_train = train[features]
        y_train = train[target].values.ravel()
        X_test = test[features]
        y_test = test[target].values.ravel()

        reg = RandomForestRegressor(**hyperparams)

        reg.fit(X_train, y_train)

        prediction = reg.predict(X_test)
        prediction = pd.DataFrame(prediction, index=test.index)
        test_results = test[cand_key].copy()
        test_results['y_test'] = y_test
        test_results['y_pred'] = prediction

    #     _normalize(test_all)
        denom = test_results.groupby(race_key).agg(sum_pct=('y_pred', 'sum'))
        test_results = test_results.set_index(cand_key)
        test_results = pd.merge(left=test_results, right = denom, left_index=True, right_index=True)
        test_results['y_pred'] = test_results['y_pred']/test_results['sum_pct']
    #     print(test_results)

        rmse_list.append(
            math.sqrt(
                mse(
                    100 * test_results['y_pred'], 100 * test_results['y_test']
                )
            )
        )

    mean_rmse = np.mean(rmse_list)
    std_rmse = np.std(rmse_list)
    return reg, mean_rmse, std_rmse

In [102]:
apply_k_fold_regression(R, k=5, n_estimators=20, max_features=5, random_state=14)    

(RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features=5, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=14, verbose=0,
                       warm_start=False),
 11.499881197873806,
 0.4973382776859854)

Below should sum to  629, the length of the entire cand dataframe

In [54]:
def run_model(race)

503
126


In [6]:
df2.ELECTION_DATE.value_counts()

2016-06-07    316
2018-06-05    313
Name: ELECTION_DATE, dtype: int64