The goal of this random forest is to predict the political party preferences for a subset of taxpayers by using a random forest classification

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [31]:
# Replace the path with the correct path for your data.
df = pd.read_csv(
    'TaxInfo.csv',
    skipinitialspace=True,
)


In [32]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,HHI,HHDL,Married,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015,PoliticalParty
0,1,49685,227187,0,0,105,0,1,1,1,Democrat
1,2,64756,-507342,2,3,68,3,1,0,0,Independent
2,3,115435,521290,1,3,81,2,0,1,0,Republican
3,4,99454,251829,2,1,52,4,1,0,0,Republican
4,5,157274,-472337,0,1,28,1,1,0,1,Independent


In [33]:
#check out data
df.describe()

Unnamed: 0.1,Unnamed: 0,HHI,HHDL,Married,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015
count,1004.0,1004.0,1004.0,1004.0,1004.0,1004.0,1004.0,1004.0,1004.0,1004.0
mean,502.5,153524.522908,-6666.766932,0.97012,2.595618,60.577689,2.447211,0.507968,0.512948,0.50996
std,289.974137,86167.778113,593751.687936,0.806939,1.687388,24.847991,1.701621,0.500186,0.500081,0.50015
min,1.0,185.0,-999994.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0
25%,251.75,82788.5,-514295.5,0.0,1.0,39.0,1.0,0.0,0.0,0.0
50%,502.5,153788.5,-11331.5,1.0,3.0,60.0,2.0,1.0,1.0,1.0
75%,753.25,228034.0,526141.25,2.0,4.0,81.0,4.0,1.0,1.0,1.0
max,1004.0,299700.0,999628.0,2.0,5.0,105.0,5.0,1.0,1.0,1.0


In [34]:
#drop NA's
df = df.dropna()
df.PoliticalParty.head(20)

0        Democrat
1     Independent
2      Republican
3      Republican
4     Independent
5        Democrat
6      Republican
7        Democrat
8      Republican
9        Democrat
10       Democrat
11    Independent
12     Republican
13    Independent
14     Republican
15     Republican
16     Republican
17     Republican
18       Democrat
19    Independent
Name: PoliticalParty, dtype: object

In [35]:
#check data types
df.dtypes

Unnamed: 0         int64
HHI                int64
HHDL               int64
Married            int64
CollegGrads        int64
AHHAge             int64
Cars               int64
Filed in 2017      int64
Filed in 2016      int64
Filed in 2015      int64
PoliticalParty    object
dtype: object

In [36]:
#Create get dummies for Political Party column
pd.get_dummies(df.PoliticalParty)

Unnamed: 0,Democrat,Independent,Republican
0,1,0,0
1,0,1,0
2,0,0,1
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,0,1
9,1,0,0


In [37]:
df.head(20)

Unnamed: 0.1,Unnamed: 0,HHI,HHDL,Married,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015,PoliticalParty
0,1,49685,227187,0,0,105,0,1,1,1,Democrat
1,2,64756,-507342,2,3,68,3,1,0,0,Independent
2,3,115435,521290,1,3,81,2,0,1,0,Republican
3,4,99454,251829,2,1,52,4,1,0,0,Republican
4,5,157274,-472337,0,1,28,1,1,0,1,Independent
5,6,235312,807659,2,2,45,0,1,0,0,Democrat
6,7,158461,590203,2,0,74,0,0,1,0,Republican
7,8,205734,416788,2,0,84,5,0,1,1,Democrat
8,9,238581,812542,2,3,18,2,1,0,0,Republican
9,10,214597,-193805,2,1,66,5,1,0,0,Democrat


In [51]:
#Test the model
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
Y = pd.get_dummies(df.PoliticalParty)
X = df.drop(['Unnamed: 0','PoliticalParty'], 1)

cross_val_score(rfc, X, Y, cv=9)



array([0.1875    , 0.16964286, 0.15178571, 0.11607143, 0.15178571,
       0.14414414, 0.0990991 , 0.10810811, 0.14414414])

In [43]:
# Import and run PCA model 
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)
X_pca = pca.transform(X)

In [47]:
rfc = ensemble.RandomForestClassifier()
cross_val_score(rfc, X_pca, Y, cv=2)



array([0.12948207, 0.1314741 ])

In [50]:
X.dtypes

Unnamed: 0       int64
HHI              int64
HHDL             int64
Married          int64
CollegGrads      int64
AHHAge           int64
Cars             int64
Filed in 2017    int64
Filed in 2016    int64
Filed in 2015    int64
dtype: object

In [60]:
#PCA analysis shows that changing the number of varaibles doesn't increase accuracy, so let's change the predicted varaible
#Let's try to alter the predicted variable to whether or not the person is a Democrat, and retest the model
df['Democrat_YesNo'] = df['PoliticalParty'].apply(lambda x: True if x == 'Democrat' else False)

In [68]:
#retest the model
rfc = ensemble.RandomForestClassifier()
Y = df.Democrat_YesNo
X = df.drop(['PoliticalParty', 'Democrat_YesNo', 'Unnamed: 0'], 1)

cross_val_score(rfc, X, Y, cv=9)



array([0.62831858, 0.61061947, 0.61607143, 0.6036036 , 0.63063063,
       0.66666667, 0.64864865, 0.63963964, 0.66666667])

In [66]:
#That gave us much better accuracy, ranging between ~63 and ~67 percent

['Unnamed: 0',
 'HHI',
 'HHDL',
 'Married',
 'CollegGrads',
 'AHHAge',
 'Cars',
 'Filed in 2017',
 'Filed in 2016',
 'Filed in 2015',
 'PoliticalParty',
 'Democrat_YesNo']