In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Read vote_history_tensors.csv into the `voter_data` DataFrame that the
# model cells below filter and train on.
voter_data = pd.read_csv(filepath_or_buffer='vote_history_tensors.csv')

# Predict g2024 from past general elections (g1994 to g2022) and age



In [11]:
# Fit a random forest that predicts the g2024 column from the earlier
# general-election columns plus age, then report hold-out metrics.
general_elections = [f"g{year}" for year in range(1994, 2024, 2)]  # g1994 to g2022
target = "g2024"

# Drop rows where the target or age is missing. The filtered frame is bound to
# a new name so `voter_data` itself is never overwritten: re-running this cell,
# or running the model cells in a different order, yields the same rows.
model_data = voter_data.dropna(subset=[target, "age"])

# Features and target
X = model_data[general_elections + ["age"]]
y = model_data[target].astype(int)  # integer labels (0 or 1)

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)

# Random Forest Classifier (small ensemble, seeded)
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)

# Predict and evaluate on the held-out 30%.
# NOTE(review): the recorded run of this cell reports accuracy 1.0, which is
# suspicious for this kind of task -- confirm the feature columns do not leak
# the target before trusting the number.
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 matches the other model cells and avoids undefined-metric
# warnings if a class is never predicted.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     79679
           1       1.00      1.00      1.00    972021

    accuracy                           1.00   1051700
   macro avg       1.00      1.00      1.00   1051700
weighted avg       1.00      1.00      1.00   1051700



# Predict p2024 from past primary elections (p1994 to p2022) and age

In [9]:


# Fit a random forest that predicts the p2024 column from the earlier
# primary-election columns plus age, then report hold-out metrics.
primary_elections = [f"p{year}" for year in range(1994, 2024, 2)]  # p1994 to p2022
target = "p2024"

# Drop rows where the target or age is missing. Use a new name instead of
# reassigning `voter_data`, so this cell does not shrink the shared frame and
# gives the same result no matter which other cells ran before it.
model_data = voter_data.dropna(subset=[target, "age"])

# Features and target
y = model_data[target].astype(int)
X = model_data[primary_elections + ["age"]]

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)

rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out 30%.
# NOTE(review): the recorded run of this cell reports accuracy 1.0 -- confirm
# the feature columns do not leak the target before trusting the number.
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))



Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    890719
           1       1.00      1.00      1.00    160981

    accuracy                           1.00   1051700
   macro avg       1.00      1.00      1.00   1051700
weighted avg       1.00      1.00      1.00   1051700



# Predict g2024 from past primary elections (p1994 to p2022) and age

In [8]:

# Fit a random forest that predicts the g2024 column from the earlier
# primary-election columns plus age, then report hold-out metrics.
primary_elections = [f"p{year}" for year in range(1994, 2024, 2)]  # p1994 to p2022
target = "g2024"

# Drop rows where the target or age is missing. Use a new name instead of
# reassigning `voter_data`, so this cell does not shrink the shared frame and
# gives the same result no matter which other cells ran before it.
model_data = voter_data.dropna(subset=[target, "age"])

# Features and target
y = model_data[target].astype(int)
X = model_data[primary_elections + ["age"]]

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)

rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out 30%.
# NOTE(review): in the recorded run class 0 has recall 0.00 while class 1
# dominates the support, so the headline accuracy mostly reflects the class
# balance -- read the per-class rows, not just the accuracy.
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.924233146334506

Classification Report:
               precision    recall  f1-score   support

           0       0.14      0.00      0.00     79679
           1       0.92      1.00      0.96    972021

    accuracy                           0.92   1051700
   macro avg       0.53      0.50      0.48   1051700
weighted avg       0.87      0.92      0.89   1051700



# Predict p2024 from past general elections (g1994 to g2022) and age

In [6]:


# Fit a random forest that predicts the p2024 column from the earlier
# general-election columns plus age, then report hold-out metrics and the
# per-feature importances.
general_elections = [f"g{year}" for year in range(1994, 2024, 2)]  # g1994 to g2022
target = "p2024"

# Drop rows where the target or age is missing. Use a new name instead of
# reassigning `voter_data`, so this cell does not shrink the shared frame and
# gives the same result no matter which other cells ran before it.
model_data = voter_data.dropna(subset=[target, "age"])

# Features and target
X = model_data[general_elections + ["age"]]
y = model_data[target].astype(int)

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)

rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out 30%.
# NOTE(review): in the recorded run class 1 has recall 0.00, so the headline
# accuracy mostly reflects the class balance -- read the per-class rows.
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 matches the other model cells and avoids undefined-metric
# warnings if a class is never predicted.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# The saved output of this cell includes a "Feature Importances" listing that
# no statement here produced; restore it so the code matches its output.
# Importances are labelled by feature column and sorted from most to least
# important.
feature_importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nFeature Importances:\n", feature_importances)




Accuracy: 0.846974422363792

Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92    890719
           1       0.71      0.00      0.00    160981

    accuracy                           0.85   1051700
   macro avg       0.78      0.50      0.46   1051700
weighted avg       0.83      0.85      0.78   1051700


Feature Importances:
 age      0.384790
g1998    0.106673
g2010    0.106605
g2014    0.106374
g2018    0.106342
g2002    0.053393
g1994    0.053067
g2004    0.024269
g2000    0.016474
g2012    0.011468
g2016    0.008732
g2008    0.008576
g2020    0.008165
g1996    0.005072
g2006    0.000000
g2022    0.000000
dtype: float64
