In [2]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# 1. Create a DataFrame for the Presidential Election data. 
file_to_open = "prim_gen_joined.csv"
pres_elec_df = pd.read_csv(file_to_open, encoding="ISO-8859-1") 

In [4]:
pres_elec_df.head

<bound method NDFrame.head of         Voter_ID           City  Zip_Code  Party  Birth_Year  \
0         704844    SEVEN HILLS     44131  NOPTY        1959   
1        1658758          PARMA     44130    REP        1968   
2        2005536  BROADVIEW HTS     44147    REP        1976   
3        2465762          SOLON     44139  NOPTY        1991   
4         710633   INDEPENDENCE     44131    REP        1968   
...          ...            ...       ...    ...         ...   
873946    689381      PARMA HTS     44130    DEM        1958   
873947    823935   STRONGSVILLE     44149    REP        1946   
873948    749942         EUCLID     44132    DEM        1927   
873949    276903         EUCLID     44132  NOPTY        1944   
873950   3017222    SEVEN HILLS     44131  NOPTY        1975   

       2020_Primary_Election 2020_General_Election 2016_Primary_Election  \
0                          0                     t                     D   
1                          0                     

In [5]:
pres_elec_df.shape

(873951, 13)

In [6]:
#Column Datatypes
pres_elec_df.dtypes

Voter_ID                  int64
City                     object
Zip_Code                  int64
Party                    object
Birth_Year                int64
2020_Primary_Election    object
2020_General_Election    object
2016_Primary_Election    object
2016_General_Election    object
2012_Primary_Election    object
2012_General_Election    object
2008_Primary_Election    object
2008_General_Election    object
dtype: object

In [15]:
#Replace True and False Values with 0 and 1
pres_elec_df.replace(('f', 't'), (0, 1), inplace=True)

In [16]:
pres_elec_df.head()

Unnamed: 0_level_0,City,Zip_Code,Party,Birth_Year,2020_Primary_Election,2020_General_Election,2016_Primary_Election,2016_General_Election,2012_Primary_Election,2012_General_Election,2008_Primary_Election,2008_General_Election,Generational_Groups
Voter_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
704844,SEVEN HILLS,44131,NOPTY,1959,0,1,D,1,0,1,D,1,Boomers
1658758,PARMA,44130,REP,1968,0,1,R,1,R,1,D,1,Generation X
2005536,BROADVIEW HTS,44147,REP,1976,0,1,R,1,R,1,D,1,Generation X
2465762,SOLON,44139,NOPTY,1991,0,0,0,0,0,1,0,0,Millenials
710633,INDEPENDENCE,44131,REP,1968,0,1,R,1,R,1,0,1,Generation X


In [18]:
# Set voter_id_org column as index
pres_elec_df = pres_elec_df.set_index('Voter_ID')


In [19]:
# Establish the generation bins and group names.

generation_bins = [0, 1945, 1964, 1980, 1996, 2014]
group_names = ["Silent","Boomers", "Generation X", "Millenials", "Generation Z"]

# Categorize spending based on the bins.
pres_elec_df["Generational_Groups"] = pd.cut(pres_elec_df["Birth_Year"], generation_bins, labels=group_names)

pres_elec_df.head()

Unnamed: 0_level_0,City,Zip_Code,Party,Birth_Year,2020_Primary_Election,2020_General_Election,2016_Primary_Election,2016_General_Election,2012_Primary_Election,2012_General_Election,2008_Primary_Election,2008_General_Election,Generational_Groups
Voter_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
704844,SEVEN HILLS,44131,NOPTY,1959,0,1,D,1,0,1,D,1,Boomers
1658758,PARMA,44130,REP,1968,0,1,R,1,R,1,D,1,Generation X
2005536,BROADVIEW HTS,44147,REP,1976,0,1,R,1,R,1,D,1,Generation X
2465762,SOLON,44139,NOPTY,1991,0,0,0,0,0,1,0,0,Millenials
710633,INDEPENDENCE,44131,REP,1968,0,1,R,1,R,1,0,1,Generation X


In [40]:
# Assign the data to X and y for 2020 election
y20=pres_elec_df["2020_General_Election"]
X20=pres_elec_df.drop(columns=["2020_Primary_Election","2020_General_Election", "2016_Primary_Election", "2016_General_Election","2012_Primary_Election", "2012_General_Election", "2008_Primary_Election", "2008_General_Election", "Generational_Groups", "City", "Party"])
print("Shape: ", X20.shape, y20.shape)

Shape:  (873951, 2) (873951,)


In [41]:
# Split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X20, y20, random_state=1)

In [42]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [43]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [44]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6965107107495008
Testing Data Score: 0.6956079967778551


In [35]:
# Assign the data to X and y for 2016 election
y16=pres_elec_df["2016_General_Election"]
X16=pres_elec_df.drop(columns=["2020_Primary_Election","2020_General_Election", "2016_Primary_Election","2016_General_Election" ,"2012_Primary_Election", "2012_General_Election", "2008_Primary_Election", "2008_General_Election", "Generational_Groups", "City", "Party"])
print("Shape: ", X16.shape, y16.shape)

Shape:  (873951, 2) (873951,)


In [36]:
# Split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X16, y16, random_state=1)

In [37]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [38]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [39]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.7053823022809831
Testing Data Score: 0.7049082787155359


In [45]:
# Assign the data to X and y for 2012 election
y12=pres_elec_df["2012_General_Election"]
X12=pres_elec_df.drop(columns=["2020_Primary_Election","2020_General_Election", "2016_Primary_Election","2016_General_Election" ,"2012_Primary_Election", "2012_General_Election", "2008_Primary_Election", "2008_General_Election", "Generational_Groups", "City", "Party"])
print("Shape: ", X12.shape, y12.shape)

Shape:  (873951, 2) (873951,)


In [46]:
# Split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X12, y12, random_state=1)

In [47]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [48]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [49]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.720747013942816
Testing Data Score: 0.7200029292226575


In [50]:
# Assign the data to X and y for 2008 election
y08=pres_elec_df["2008_General_Election"]
X08=pres_elec_df.drop(columns=["2020_Primary_Election","2020_General_Election", "2016_Primary_Election","2016_General_Election" ,"2012_Primary_Election", "2012_General_Election", "2008_Primary_Election", "2008_General_Election", "Generational_Groups", "City", "Party"])
print("Shape: ", X08.shape, y08.shape)

Shape:  (873951, 2) (873951,)


In [51]:
# Split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X08, y08, random_state=1)

In [52]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [53]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [54]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.7343541893287645
Testing Data Score: 0.734571235033503


## Modify the Machine Learning

In [59]:
# Hot Code Generation Data
gen_encoded_data = pd.get_dummies(pres_elec_df, columns = ['Generational_Groups'])
gen_encoded_data.head()

Unnamed: 0_level_0,City,Zip_Code,Party,Birth_Year,2020_Primary_Election,2020_General_Election,2016_Primary_Election,2016_General_Election,2012_Primary_Election,2012_General_Election,2008_Primary_Election,2008_General_Election,Generational_Groups_Silent,Generational_Groups_Boomers,Generational_Groups_Generation X,Generational_Groups_Millenials,Generational_Groups_Generation Z
Voter_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
704844,SEVEN HILLS,44131,NOPTY,1959,0,1,D,1,0,1,D,1,0,1,0,0,0
1658758,PARMA,44130,REP,1968,0,1,R,1,R,1,D,1,0,0,1,0,0
2005536,BROADVIEW HTS,44147,REP,1976,0,1,R,1,R,1,D,1,0,0,1,0,0
2465762,SOLON,44139,NOPTY,1991,0,0,0,0,0,1,0,0,0,0,0,1,0
710633,INDEPENDENCE,44131,REP,1968,0,1,R,1,R,1,0,1,0,0,1,0,0


In [65]:
# Assign the data to X and y for 2020 election - Add Generational Bucket
y20=gen_encoded_data["2020_General_Election"]
X20=gen_encoded_data.drop(columns=["2020_Primary_Election","2020_General_Election", "2016_Primary_Election", "2016_General_Election","2012_Primary_Election", "2012_General_Election", "2008_Primary_Election", "2008_General_Election", "City", "Party"])
print("Shape: ", X20.shape, y20.shape)

Shape:  (873951, 7) (873951,)


In [62]:
# Split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X20, y20, random_state=1)

In [63]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [58]:
classifier.fit(X_train, y_train)

ValueError: could not convert string to float: 'Boomers'