In [47]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [13]:
# Load the historical election data and replace every missing value with 0.
# NOTE(review): the blanket fill also applies to the 'party' and 'results'
# columns if they contain gaps -- confirm that 0 is a sensible default there.
df_2019 = pd.read_csv('../election_resources/historical_elections.csv').fillna(0)

# Preview the first rows of the cleaned frame.
df_2019.head()

Unnamed: 0,samplesize,party,results,favorability
0,1204.0,d,1.0,-22.0006
1,1553.0,d,1.0,-2.0001
2,838.0,d,1.0,-17.6672
3,1198.0,d,1.0,-20.3339
4,1859.0,d,1.0,-6.667


In [14]:
# Keep an independent (deep) copy of the loaded frame under the name X.
# NOTE(review): X is reassigned in the feature/target cell further down before
# it is ever read, so this copy appears to be unused -- confirm and consider
# removing it.
X = df_2019.copy(deep=True)


In [15]:
# One-hot encode the non-numeric columns (e.g. 'party'); the generated
# indicator columns are stored as floats.
df_2019 = pd.get_dummies(
    data=df_2019,
    dtype=float,
)

In [16]:
# Target: the 'results' column, shaped as a single-column 2-D array.
y = df_2019['results'].to_numpy().reshape(-1, 1)

# Features: every remaining column, as a plain NumPy array.
X = df_2019.drop(columns=['results']).to_numpy()

# Peek at the first few targets.
y[:5]

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [17]:
# Partition features and targets into train and test subsets.
# The fixed random_state makes the shuffle reproducible; the split proportions
# are left at the library defaults.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [18]:
# Build the feature scaler; the arguments spell out StandardScaler's defaults
# (centre to zero mean, scale to unit variance, work on a copy).
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [19]:
# Learn the scaling statistics from the training features only, so nothing
# from the test split influences the transformation.
X_scaler = scaler.fit(X=X_train)

In [20]:
# Apply the fitted scaler to both the training and the testing features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [21]:
# Instantiate the (still untrained) random forest: 500 trees, with a fixed
# seed so repeated runs build the same forest.
rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=500,
)

In [49]:
# Fit the model first. y_train is a single-column 2-D array, so .ravel()
# flattens it to the 1-D label array passed to the classifier.
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

# Persist the model only AFTER fitting. Previously the dump came before the
# fit in notebook order, so a top-to-bottom run saved an untrained estimator.
# NOTE(review): the model is trained on StandardScaler output and the scaler
# is not saved here -- confirm whether it should be persisted alongside.
joblib.dump(rf_model, "./random_forest.joblib")

['./random_forest.joblib']

In [23]:
# Predict a class label for every row of the scaled test features.
predictions = rf_model.predict(X=X_test_scaled)

array([1., 0., 0., ..., 1., 0., 1.])

In [50]:
# Show the training labels followed by the predicted labels.
# NOTE(review): predictions were made on the test features, so y_test is
# presumably the meaningful array to set beside them -- confirm.
display(y_train, predictions)

array([[1.],
       [0.],
       [1.],
       ...,
       [0.],
       [0.],
       [1.]])

array([1., 0., 0., ..., 1., 0., 1.])

In [26]:
# Show the shape of the test-set predictions followed by the shape of the
# full target array.
# NOTE(review): y covers all rows, not just the test split; y_test.shape is
# presumably the like-for-like comparison -- confirm.
display(predictions.shape, y.shape)

(1611,)

(6443, 1)

In [27]:
# Confusion matrix of true test labels (rows) against predictions (columns),
# wrapped in a labelled DataFrame for display.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Fraction of test rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [28]:
# Report the evaluation: confusion matrix, accuracy, then per-class metrics.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,787,13
Actual 1,12,799


Accuracy Score : 0.9844816883923029
Classification Report
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       800
         1.0       0.98      0.99      0.98       811

    accuracy                           0.98      1611
   macro avg       0.98      0.98      0.98      1611
weighted avg       0.98      0.98      0.98      1611

