In [6]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [7]:
# Load the historical election table and replace missing entries in every
# column with 0 (chained, so the frame is built in a single expression).
# NOTE(review): the 0 fill presumably is what later surfaces as the
# 'party_0' indicator column after one-hot encoding - confirm.
biden_v_trump = pd.read_csv('../election_resources/historical_elections.csv').fillna(0)
biden_v_trump.head(n=5)

Unnamed: 0,samplesize,party,results,favorability
0,1204.0,d,1.0,-22.0006
1,1553.0,d,1.0,-2.0001
2,838.0,d,1.0,-17.6672
3,1198.0,d,1.0,-20.3339
4,1859.0,d,1.0,-6.667


In [8]:
# Collapse infrequent sample sizes into the single placeholder label 'samplesize'.
# Count rows per distinct sample size once instead of recomputing it for the filter.
samplesize_counts = biden_v_trump['samplesize'].value_counts()
# A sample size is "rare" when it occurs in fewer than 310 rows.
sample_size_to_replace = list(samplesize_counts[samplesize_counts < 310].index)
# One vectorised pass: every rare value is swapped for the placeholder, rather than
# issuing a separate full-column .replace() call for each rare value.
biden_v_trump['samplesize'] = biden_v_trump['samplesize'].mask(
    biden_v_trump['samplesize'].isin(sample_size_to_replace), 'samplesize'
)
# NOTE(review): the recorded output of this cell shows all rows ending up as
# 'samplesize', i.e. with this threshold the column appears to become constant
# and carries no information for the model - confirm this is intended.
biden_v_trump['samplesize'].value_counts()

samplesize    6443
Name: samplesize, dtype: int64

In [23]:
# Independent (deep) copy of the current frame bound to X.
# Note: X is rebound later when the feature matrix is built from the encoded frame.
X = biden_v_trump.copy(deep=True)


In [9]:
# One-hot encode the non-numeric columns, emitting float indicator columns;
# numeric columns pass through unchanged.
biden_v_trump = pd.get_dummies(data=biden_v_trump, dtype=float)
biden_v_trump.head(n=5)

Unnamed: 0,results,favorability,samplesize_samplesize,party_0,party_d,party_r
0,1.0,-22.0006,1.0,0.0,1.0,0.0
1,1.0,-2.0001,1.0,0.0,1.0,0.0
2,1.0,-17.6672,1.0,0.0,1.0,0.0
3,1.0,-20.3339,1.0,0.0,1.0,0.0
4,1.0,-6.667,1.0,0.0,1.0,0.0


In [11]:
# Target: the 'results' column as an (n_rows, 1) column vector.
y = biden_v_trump['results'].to_numpy().reshape(-1, 1)
# Features: every other column except the 'party_0' indicator, as a plain array.
X = biden_v_trump.drop(columns=['results', 'party_0']).to_numpy()
y[:5]

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [12]:
# Partition features and target into train and test subsets.
# The fixed random_state makes the shuffle repeatable; split sizes are the library defaults.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [13]:
# Instantiate the feature scaler (centering and unit-variance scaling, i.e. the defaults, spelled out)
scaler = StandardScaler(with_mean=True, with_std=True)

In [14]:
# Learn the scaling statistics from the training partition only.
# fit() returns the estimator itself, so X_scaler is simply another name for the fitted scaler.
scaler.fit(X_train)
X_scaler = scaler

In [15]:
# Apply the fitted scaler to both partitions (train first, then test)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [16]:
# Build the (unfitted) random forest: 500 trees, seeded for repeatable results
rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=500,
)

In [17]:
# Train the forest on the scaled training features; y_train is a column vector,
# so it is flattened to 1-D before being handed to fit().
rf_model = rf_model.fit(X=X_train_scaled, y=np.ravel(y_train))

In [18]:
# Predict class labels for the scaled held-out features
predictions = rf_model.predict(X=X_test_scaled)

In [19]:
# Persist the fitted scaler next to the model: the forest was trained on
# StandardScaler-transformed features, so any code that reloads the model needs
# the same fitted scaler to prepare its inputs before calling predict().
joblib.dump(X_scaler, "./standard_scaler.joblib")
# Persist the trained random forest. Kept as the last expression so the cell
# still displays the path(s) written for the model file.
joblib.dump(rf_model, "./random_forest.joblib")

['./random_forest.joblib']

In [20]:
# Sanity check: element types of the training target and of the predictions
display(y_train.dtype, predictions.dtype)

dtype('float64')

dtype('float64')

In [21]:
# Sanity check: shape of the predictions and of the full target array.
# NOTE(review): this shows y (all rows), not y_test - confirm that is the intended comparison.
display(predictions.shape, y.shape)

(1611,)

(6443, 1)

In [22]:
# Confusion matrix of held-out labels vs. predictions, wrapped in a labelled
# frame (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Share of held-out rows whose predicted label matches the actual label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [23]:
# Show the evaluation summary: confusion matrix, accuracy, per-class report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,783,17
Actual 1,15,796


Accuracy Score : 0.9801365611421478
Classification Report
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       800
         1.0       0.98      0.98      0.98       811

    accuracy                           0.98      1611
   macro avg       0.98      0.98      0.98      1611
weighted avg       0.98      0.98      0.98      1611

