In [1]:
# Import dependencies
import datetime
import json
import os

import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sqlalchemy import create_engine
from sqlalchemy import inspect
%matplotlib inline

In [2]:
# Configure settings for RDS
# Connection settings for the Postgres (RDS) instance. Every value can be
# overridden through a TEAM5K_DB_* environment variable so credentials do not
# have to be edited into (or committed with) the notebook.
# NOTE(review): the fallbacks below still embed the original credentials so
# existing runs keep working; rotate the password and drop the literal default
# once the environment variable is set wherever this notebook runs.
config = {
    "user": os.environ.get("TEAM5K_DB_USER", "postgres"),
    "password": os.environ.get("TEAM5K_DB_PASSWORD", "team5kteam5k"),
    "driver": "org.postgresql.Driver",
    "location": os.environ.get("TEAM5K_DB_HOST", "34.67.52.115"),
    "db": os.environ.get("TEAM5K_DB_NAME", "team5k"),
    "port": os.environ.get("TEAM5K_DB_PORT", "5432"),
}

# JDBC-style URL built from the same settings so host/database cannot drift apart.
jdbc_url = "jdbc:postgresql://{}/{}".format(config["location"], config["db"])

In [3]:
# SQLAlchemy URL layout: postgresql://[user]:[password]@[location]:[port]/[database]
create_engine_str = "postgresql://{user}:{password}@{location}:{port}/{db}".format(**config)

In [4]:
# Build the SQLAlchemy engine for the Postgres instance from the connection
# string held in create_engine_str; later cells pass this engine to pandas.
engine = create_engine(create_engine_str)

In [5]:
# Print the table names visible through the engine (quick connection check).
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API is available on both older and current releases.
print(inspect(engine).get_table_names())

['education', 'committee_summary_2020', 'fec_donor_az', 'health_metrics', 'birth_death_rate', 'postal_codes', 'fec_donor_mi', 'fec_donor_wi', 'fec_committee', 'fec_donor_pa', 'pres_votes_6t', 'unemployment', 'fec_donor_nc', 'fec_donor_fl']


In [6]:
# Holding area for function to plot data
def plot_data(X, y, y_pred):
    """Print X, y and y_pred, then plot them.

    For each argument the length and the full contents are printed. A scatter
    plot of ``y`` against ``X`` is then drawn with ``y_pred`` overlaid as a red
    line, and the figure is shown.
    """
    for label, values in (("X=", X), ("y=", y), ("y_pred", y_pred)):
        print(label + str(len(values)))
        print(values)

    plt.scatter(X, y)
    plt.plot(X, y_pred, color='red')
    plt.show()

In [7]:
def get_votes_intervals(votes_df, state_po, starting_yr=2000, ending_yr=2020, interval=4):
    """Split one state's votes into one DataFrame per election year.

    Parameters
    ----------
    votes_df : pandas.DataFrame
        Votes table; must contain ``state_po`` and ``year`` columns.
    state_po : str
        Value matched against the ``state_po`` column.
    starting_yr, ending_yr : int, optional
        First and last (inclusive) election years. Defaults: 2000 and 2020.
    interval : int, optional
        Number of years between elections. Default: 4.

    Returns
    -------
    list of pandas.DataFrame
        One frame per election year, in chronological order. A year with no
        rows for the state yields an empty frame, so the list length depends
        only on the year range.
    """
    votes_states_df = votes_df[votes_df['state_po'] == state_po]
    return [
        votes_states_df[votes_states_df['year'] == election_yr]
        for election_yr in range(starting_yr, ending_yr + 1, interval)
    ]

In [8]:
def vote_distribution(county, election_df, state):
    """Summarise one county's votes as democrat (blue), republican (red) and other.

    Parameters
    ----------
    county : str
        Value matched against the ``county`` column.
    election_df : pandas.DataFrame
        Rows for a single election; must contain ``county``, ``party`` and
        ``candidatevotes`` columns.
    state : str
        Copied unchanged into the result under ``"state"``.

    Returns
    -------
    dict
        ``blue_votes``, ``red_votes``, ``other_votes``, ``total_votes``,
        ``percent_blue``, ``percent_red``, ``percent_other`` (fractions of the
        total), plus ``county`` and ``state``. The three fractions are NaN when
        the county has no votes.
    """
    major_parties = ["democrat", "republican"]

    county_df = election_df[election_df['county'] == county]

    # Coerce each row to a number *before* summing: summing a text column
    # first would concatenate the strings ("10" + "20" -> "1020").
    votes = pd.to_numeric(county_df["candidatevotes"], errors='coerce')
    party = county_df['party']

    blue_votes = votes[party == major_parties[0]].sum()
    red_votes = votes[party == major_parties[1]].sum()
    # Other = not democrat AND not republican. Using the negated membership
    # test also counts rows whose party is missing, which an equality match
    # per unique party value can never select.
    other_votes = votes[~party.isin(major_parties)].sum()

    # Total votes is the sum of blue + red + other
    total_votes = blue_votes + red_votes + other_votes

    if total_votes == 0:
        # No votes recorded: the shares are undefined, so report NaN explicitly
        # instead of dividing by zero.
        percent_blue = percent_red = percent_other = float("nan")
    else:
        percent_blue = blue_votes / total_votes
        percent_red = red_votes / total_votes
        percent_other = other_votes / total_votes

    return {
        "blue_votes": blue_votes,
        "red_votes": red_votes,
        "other_votes": other_votes,
        "total_votes": total_votes,
        "percent_blue": percent_blue,
        "percent_red": percent_red,
        "percent_other": percent_other,
        "county": county,
        "state": state,
    }

In [9]:
def donor_distribution(county, election_df):
    """Summarise one county's donations by major party.

    Parameters
    ----------
    county : str
        County name; surrounding whitespace is stripped before it is matched
        against the ``county`` column.
    election_df : pandas.DataFrame
        Donation rows; must contain ``county``, ``party`` and
        ``transaction_amt`` columns.

    Returns
    -------
    dict
        ``blue_amt``, ``red_amt``, ``other_amt``, ``total_amt`` and the
        fractions ``percent_blue``, ``percent_red``, ``percent_other``. The
        fractions are NaN when the total is zero (for example a county with
        no democrat or republican donations).
    """
    county = county.strip()
    major_parties = ["democrat", "republican"]

    county_df = election_df[election_df['county'] == county]

    # Coerce each row to a number *before* summing: summing a text column
    # first would concatenate the strings instead of adding the amounts.
    amounts = pd.to_numeric(county_df["transaction_amt"], errors='coerce')
    party = county_df['party']

    # Total sum of donations per party for the county
    blue_amt = amounts[party == major_parties[0]].sum()
    red_amt = amounts[party == major_parties[1]].sum()

    #TODO enable other amount (donations to non-major parties are not counted yet)
    other_amt = 0

    # Total transaction amount is the sum of blue + red + other
    total_amt = blue_amt + red_amt + other_amt

    if total_amt == 0:
        # Shares are undefined without any counted donations; report NaN
        # explicitly instead of dividing by zero.
        percent_blue = percent_red = percent_other = float("nan")
    else:
        percent_blue = blue_amt / total_amt
        percent_red = red_amt / total_amt
        percent_other = other_amt / total_amt

    return {
        "blue_amt": blue_amt,
        "red_amt": red_amt,
        "other_amt": other_amt,
        "total_amt": total_amt,
        "percent_blue": percent_blue,
        "percent_red": percent_red,
        "percent_other": percent_other,
    }

In [10]:
def county_vote_distribution(four_yr_dfs, state):
    """Compute the per-county vote distribution for every election frame.

    ``four_yr_dfs`` is a list of election DataFrames for a single state. The
    result is a list of the same length; each entry is a dict keyed by the
    unique values of that frame's ``county`` column, whose values are the
    dicts returned by ``vote_distribution``.
    """
    per_election = []
    for election_df in four_yr_dfs:
        # One {county: distribution} mapping per election frame
        per_election.append({
            county: vote_distribution(county, election_df, state)
            for county in election_df["county"].unique()
        })
    return per_election

In [11]:
"""
ACE	Ace Party	
AKI	Alaskan Independence Party	
AIC	American Independent Conservative	
AIP	American Independent Party	
AMP	American Party	
APF	American People's Freedom Party	
AE	Americans Elect	
CIT	Citizens' Party	
CMD	Commandments Party	
CMP	Commonwealth Party of the U.S.	
COM	Communist Party	
CNC	Concerned Citizens Party Of Connecticut	
CRV	Conservative Party	
CON	Constitution Party	
CST	Constitutional	
COU	Country	
DCG	D.C. Statehood Green Party	
DNL	Democratic -Nonpartisan League	
DEM	Democratic Party	
D/C	Democratic/Conservative	
DFL	Democratic-Farmer-Labor	
DGR	Desert Green Party	
FED	Federalist	
FLP	Freedom Labor Party	
FRE	Freedom Party	
GWP	George Wallace Party	
GRT	Grassroots	
GRE	Green Party	
GR	Green-Rainbow	
HRP	Human Rights Party	
IDP	Independence Party	
IND	Independent	
IAP	Independent American Party	
ICD	Independent Conservative Democratic	
IGR	Independent Green	
IP	Independent Party	
IDE	Independent Party of Delaware	
IGD	Industrial Government Party	
JCN	Jewish/Christian National	
JUS	Justice Party	
LRU	La Raza Unida	Also see RUP
LBR	Labor Party	Also see LAB
LFT	Less Federal Taxes	
LBL	Liberal Party	
LIB	Libertarian Party	
LBU	Liberty Union Party	
MTP	Mountain Party	
NDP	National Democratic Party	
NLP	Natural Law Party	
NA	New Alliance	
NJC	New Jersey Conservative Party	
NPP	New Progressive Party	
NPA	No Party Affiliation	
NOP	No Party Preference	Commonly used in CA & WA
NNE	None	
N	Nonpartisan	
NON	Non-Party	
OE	One Earth Party	
OTH	Other	
PG	Pacific Green	
PSL	Party for Socialism and Liberation	
PAF	Peace And Freedom	Also see PFP
PFP	Peace And Freedom Party	Also see PAF
PFD	Peace Freedom Party	
POP	People Over Politics	
PPY	People's Party	
PCH	Personal Choice Party	
PPD	Popular Democratic Party	
PRO	Progressive Party	
NAP	Prohibition Party	
PRI	Puerto Rican Independence Party	
RUP	Raza Unida Party	Also see LRU
REF	Reform Party	
REP	Republican Party	
RES	Resource Party	
RTL	Right To Life	
SEP	Socialist Equality Party	
SLP	Socialist Labor Party	
SUS	Socialist Party	
SOC	Socialist Party U.S.A.	
SWP	Socialist Workers Party	
TX	Taxpayers	
TWR	Taxpayers Without Representation	
TEA	Tea Party	
THD	Theo-Democratic	
LAB	U.S. Labor Party	Also see LBR
USP	U.S. People's Party	
UST	U.S. Taxpayers Party	
UN	Unaffiliated	
UC	United Citizen	
UNI	United Party	
UNK	Unknown	
VET	Veterans Party	
WTP	We the People	
W	Write-In
"""

"\nACE\tAce Party\t\nAKI\tAlaskan Independence Party\t\nAIC\tAmerican Independent Conservative\t\nAIP\tAmerican Independent Party\t\nAMP\tAmerican Party\t\nAPF\tAmerican People's Freedom Party\t\nAE\tAmericans Elect\t\nCIT\tCitizens' Party\t\nCMD\tCommandments Party\t\nCMP\tCommonwealth Party of the U.S.\t\nCOM\tCommunist Party\t\nCNC\tConcerned Citizens Party Of Connecticut\t\nCRV\tConservative Party\t\nCON\tConstitution Party\t\nCST\tConstitutional\t\nCOU\tCountry\t\nDCG\tD.C. Statehood Green Party\t\nDNL\tDemocratic -Nonpartisan League\t\nDEM\tDemocratic Party\t\nD/C\tDemocratic/Conservative\t\nDFL\tDemocratic-Farmer-Labor\t\nDGR\tDesert Green Party\t\nFED\tFederalist\t\nFLP\tFreedom Labor Party\t\nFRE\tFreedom Party\t\nGWP\tGeorge Wallace Party\t\nGRT\tGrassroots\t\nGRE\tGreen Party\t\nGR\tGreen-Rainbow\t\nHRP\tHuman Rights Party\t\nIDP\tIndependence Party\t\nIND\tIndependent\t\nIAP\tIndependent American Party\t\nICD\tIndependent Conservative Democratic\t\nIGR\tIndependent Green\t\

In [12]:
def map_zip_county(unique_zips, state_zips):
    """Map ZIP codes to county names using a state's postal-code table.

    Parameters
    ----------
    unique_zips : iterable
        ZIP codes to look up.
    state_zips : pandas.DataFrame
        Postal-code rows; must contain ``zip`` and ``county`` columns.

    Returns
    -------
    tuple
        ``(county_dict, counties)`` where ``county_dict`` maps each ZIP that
        was found to its stripped county name, and ``counties`` is a keys view
        of the distinct county names in first-seen order.

    Notes
    -----
    ZIP codes without a row in ``state_zips`` (or whose county is missing) are
    left out rather than being mapped to the text rendering of an empty
    Series. When a ZIP appears on several rows the first row is used.
    """
    # Build the lookup once instead of re-filtering the frame for every ZIP.
    first_per_zip = state_zips.drop_duplicates(subset="zip", keep="first")
    zip_lookup = dict(zip(first_per_zip["zip"], first_per_zip["county"]))

    county_dict = {}
    unique_counties = {}
    for zipcode in unique_zips:
        if zipcode not in zip_lookup:
            continue
        county_value = zip_lookup[zipcode]
        if pd.isna(county_value):
            continue
        county_name = str(county_value).strip()
        county_dict[zipcode] = county_name
        # A dict is used as an insertion-ordered set of county names
        unique_counties.setdefault(county_name, True)

    return (county_dict, unique_counties.keys())

In [13]:
def select_columns(df, column_names):
    """Return all rows of ``df`` restricted to ``column_names`` (label-based)."""
    return df.loc[:, column_names]

In [14]:
def one_hot_encode(df):
    """One-hot encode every object-dtype column of ``df``.

    Returns a new DataFrame (with a fresh default index) holding only the
    encoded columns, named by the encoder after the source columns.
    """
    # Generate our categorical variable list
    cat_vars = df.dtypes[df.dtypes == "object"].index.tolist()

    # Create a dense-output OneHotEncoder. The keyword was renamed from
    # ``sparse`` to ``sparse_output`` in scikit-learn 1.2 and the old name was
    # later removed, so try the current spelling first.
    try:
        enc = OneHotEncoder(sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(sparse=False)

    # Fit and transform the OneHotEncoder using the categorical variable list
    encode_df = pd.DataFrame(enc.fit_transform(df[cat_vars]))

    # Add the encoded variable names to the DataFrame.
    # ``get_feature_names`` was replaced by ``get_feature_names_out``.
    if hasattr(enc, "get_feature_names_out"):
        encode_df.columns = enc.get_feature_names_out(cat_vars)
    else:
        encode_df.columns = enc.get_feature_names(cat_vars)

    return encode_df

In [15]:
def label_enc(df):
    """Label-encode every column of ``df``.

    Each column is cast to ``str`` and replaced by the integer codes produced
    by a ``LabelEncoder`` that is re-fitted for that column.
    """
    encoder = LabelEncoder()

    def _encode_column(col):
        # fit_transform re-fits the shared encoder on this column's values
        return encoder.fit_transform(col.astype(str))

    return df.apply(_encode_column, axis=0, result_type='expand')

In [16]:
def run_linear_regression(X, y, x_cols):
    """Fit a linear regression on a 75/25 split and print evaluation output.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values.
    x_cols : list
        Names of the feature columns. Accepted for interface compatibility;
        not used by the function.

    The features are min-max scaled with a scaler fitted on the training
    split only, and the same scaling is applied to both splits, so the model
    is trained and evaluated on features in the same scale.

    NOTE(review): the predictions and targets are truncated to ``int`` and
    scored with a confusion matrix / classification report, which are
    classification metrics; confirm this is the intended evaluation for a
    regression model.
    """
    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=0.25)

    # Fit the scaler on the training data only, then apply it to both splits
    scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model on the scaled training features
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    print("Test Vals!!")
    print(X)
    print(y)

    # Predict the values based on the scaled X test values
    y_pred = model.predict(X_test_scaled)
    print(y_pred)

    print("Prediction!!")
    print("Confusion Matrix!!")
    y_test = [int(i) for i in y_test]
    y_pred = [int(i) for i in y_pred]

    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)
    print("Classificaiton Report!!")
    report = classification_report(y_test, y_pred)
    print(report)

In [17]:
def run_logistic_regression(X, y):
    """Fit a logistic regression on a 75/25 split and print its test accuracy.

    The split is stratified on ``y`` when every class has at least two
    members. Otherwise a plain random split is used, because a stratified
    split raises ``ValueError`` when any class has a single member.
    """
    _, class_counts = np.unique(y, return_counts=True)
    can_stratify = class_counts.size > 0 and class_counts.min() >= 2
    stratify = y if can_stratify else None

    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=78, test_size=0.25, stratify=stratify
    )

    classifier = LogisticRegression(solver='lbfgs', random_state=1)
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)

    score = accuracy_score(y_test, predictions)
    print(score)

In [18]:
def run_linear_regression_params(df, sml_params, sml_cols):
    """Run ``run_sml_params`` on ``df`` with the "linear" model type."""
    run_sml_params(df, sml_params, sml_cols, model_type="linear")

In [19]:
def run_logistic_regression_params(df, sml_params, sml_cols):
    """Run ``run_sml_params`` on ``df`` with the "logistic" model type."""
    run_sml_params(df, sml_params, sml_cols, model_type="logistic")

In [20]:
def run_sml_params(df, sml_params, sml_cols, model_type):
    """Train one model per target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    sml_params : list of str
        Target columns; one model is run for each.
    sml_cols : list of str
        Columns taking part in the modelling. For each target, the features
        are every column in this list except the target itself. The list is
        not modified.
    model_type : str
        ``'linear'`` or ``'logistic'``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values.
    """
    if model_type not in ('linear', 'logistic'):
        raise ValueError("Unsupported model_type: {!r}".format(model_type))

    print(df.head())
    for sml_param in sml_params:
        # Target: the single column being modelled, missing values filled with
        # 0 and then label-encoded
        y_df = select_columns(df, [sml_param]).fillna(0)
        y = label_enc(y_df)[sml_param].values

        print("Y head")
        print(y)

        # Features: everything but the target. Build a new list every
        # iteration; removing from sml_cols itself would mutate the caller's
        # list and drop one more feature for each successive target.
        x_cols = [col for col in sml_cols if col != sml_param]
        X_df = select_columns(df, x_cols).fillna(0)
        X = label_enc(X_df)[x_cols].values

        print("X head")
        print(X)

        if model_type == 'linear':
            # Run Linear Regression Model on X, y
            run_linear_regression(X, y, x_cols)
        else:
            # Run Logistic Regression Model on X, y
            run_logistic_regression(X, y)

In [21]:
def merge_cmtid_party(donor_df):
    """Add a ``party`` column derived from ``cmte_pty_affiliation``.

    Affiliation codes listed below are mapped to ``"republican"`` or
    ``"democrat"``; any other code maps to a missing value. ``donor_df`` is
    modified in place and also returned.
    """
    republican_codes = ("REP", "TEA")
    democrat_codes = ("DNL", "DEM", "D/C", "DFL", "THD", "PPD")

    # Affiliation code -> major party
    code_to_party = dict.fromkeys(republican_codes, "republican")
    code_to_party.update(dict.fromkeys(democrat_codes, "democrat"))

    donor_df["party"] = donor_df["cmte_pty_affiliation"].map(code_to_party)
    return donor_df

In [22]:
def donation_county_cycle_distribution(four_yr_dfs, state_zips, committee_df):
    """Compute per-county donation distributions for each election-cycle frame.

    Parameters
    ----------
    four_yr_dfs : list of pandas.DataFrame
        One donation frame per election cycle; each must contain a ``zip``
        column plus the columns ``donor_distribution`` reads.
    state_zips : pandas.DataFrame
        Postal-code rows for the state, passed to ``map_zip_county``.
    committee_df : pandas.DataFrame
        Accepted for interface compatibility; not used.

    Returns
    -------
    list of dict
        One dict per input frame mapping county name to the dict returned by
        ``donor_distribution``, e.g.
        ``[{'Maricopa': {'blue_amt': ..., 'red_amt': ..., 'percent_blue': ...}}]``.

    The input frames are not modified: rows without a ZIP are dropped and the
    ``county`` column is added on a private copy.
    """
    county_dicts = []
    for interval_df in four_yr_dfs:
        # Work on a copy so the caller's frame (typically a slice of a larger
        # frame) is neither mutated nor a source of chained-assignment warnings.
        election_df = interval_df.dropna(subset=["zip"]).copy()

        unique_zips = election_df["zip"].unique()
        (zip_county_map, unique_counties) = map_zip_county(unique_zips, state_zips)

        election_df["county"] = election_df["zip"].map(zip_county_map)

        county_dicts.append({
            county: donor_distribution(county, election_df)
            for county in unique_counties
        })
    return county_dicts

In [23]:
def str_dt(donor_date_str):
    """Parse a ``YYYY-MM-DD`` date string into a ``datetime.datetime``.

    Raises ``ValueError`` (from ``strptime``) when the string does not match
    the format.
    """
    # The file imports the ``datetime`` module, so the class has to be reached
    # as ``datetime.datetime``; the module itself has no ``strptime``.
    return datetime.datetime.strptime(donor_date_str, '%Y-%m-%d')

In [24]:
def get_year_from_date_str(donor_date_str):
    """Return the year of a date string, as parsed by ``str_dt``."""
    return str_dt(donor_date_str).year

In [25]:
def get_donors_intervals(donor_df, state):
    """Split one state's donations into one DataFrame per election cycle.

    Rows are first restricted to those whose ``state`` column equals the
    lower-cased ``state``. For each cycle year 2000, 2004, ..., 2020 the frame
    keeps rows whose ``transaction_dt`` lies strictly after 1 January four
    years before the cycle year and strictly before 1 March of the cycle
    year. Returns the six frames as a list in chronological order.
    """
    first_cycle, last_cycle, cycle_len = 2000, 2020, 4
    state_rows = donor_df[donor_df['state'] == state.lower()]

    cycle_frames = []
    for cycle_yr in range(first_cycle, last_cycle + 1, cycle_len):
        window_start = datetime.date(cycle_yr - cycle_len, 1, 1)
        window_end = datetime.date(cycle_yr, 3, 1)
        in_window = (state_rows['transaction_dt'] > window_start) & (state_rows['transaction_dt'] < window_end)
        cycle_frames.append(state_rows[in_window])

    return cycle_frames

In [26]:
def donor_state_query(state, engine):
    """Load every row of a state's donor table into a DataFrame.

    The table name is ``fec_donor_`` followed by the lower-cased ``state``;
    the query is executed through ``engine``.
    """
    query = 'select * from "fec_donor_{}"'.format(state.lower())
    return pd.read_sql_query(query, con=engine)

In [27]:
def votes_linear_regression(votes_df):
    """Run the linear-regression loop on the voter data.

    Each vote and donation amount column is used in turn as the target; the
    remaining listed columns (including ``state`` and ``county``) are the
    features.
    """
    targets = ["blue_votes", "red_votes", "other_votes", "blue_amt", "red_amt", "other_amt"]
    columns = targets + ["state", "county"]

    run_linear_regression_params(votes_df, targets, columns)

In [28]:
def donation_logistic_regression(donor_df):
    """Run logistic regression to test whether ``party`` can be classified
    from the other listed donation columns."""
    targets = ["party"]
    columns = ["cmt_id", "city", "state", "zip", "employer", "occupation", "transaction_amt", "party"]

    run_logistic_regression_params(donor_df, targets, columns)

In [29]:
def donation_linear_regression(donor_df):
    """Run the linear-regression loop on a state's donation data.

    ``transaction_amt``, ``employer`` and ``occupation`` are each used in turn
    as the target. The ``party`` column is not part of the column list.
    """
    targets = ["transaction_amt", "employer", "occupation"]
    columns = ["cmt_id", "city", "state", "zip", "employer", "occupation", "transaction_amt"]

    run_linear_regression_params(donor_df, targets, columns)

In [30]:
def predict_votes_linear_regression(state_model_dict):
    """Load the demographic tables and walk the per-state models.

    Reads the ``unemployment``, ``education`` and ``birth_death_rate`` tables
    through the module-level ``engine``. The per-state prediction call is
    still disabled, so the loop only advances the election year by four for
    each state; nothing is returned.
    """
    unemployment_df, education_df, birth_death_df = (
        pd.read_sql_query('select * from "{}"'.format(table_name), con=engine)
        for table_name in ("unemployment", "education", "birth_death_rate")
    )

    election_yr = 2000
    for state, model in state_model_dict.items():
        #TODO enable prediction for voting
        #state_sml(model, state, election_yr, unemployment_df, education_df, birth_death_df)
        election_yr += 4

In [31]:
#Main loop of the program
def main():
    """Run the per-state donation/vote pipeline for every supported state.

    Reads the health, committee, votes and postal-code tables through the
    module-level ``engine``. For each state it then:

    1. splits the votes into election-year frames and computes the per-county
       vote distribution,
    2. loads the state's donor records, joins them to the committee table and
       adds the ``party`` column,
    3. runs the donation linear regression,
    4. splits the donations into election-cycle frames and computes the
       per-county donation distribution.

    The ``(vote dicts, donor dicts)`` pair for each state is stored in a local
    dict keyed by state; nothing is returned.
    """
    #Read the various tables into DFs
    # NOTE(review): health_df is loaded but not referenced again in this function.
    health_df = pd.read_sql_query('select * from "health_metrics"',con=engine)
    committee_df = pd.read_sql_query('select * from "fec_committee"',con=engine)
    votes_df = pd.read_sql_query('select * from "pres_votes_6t"',con=engine)
    zips_df = pd.read_sql_query('select * from "postal_codes"',con=engine)
    
    #Lowercase the committee id column (it is the join key against the donors' cmt_id below)
    committee_df['cmte_id'] = committee_df['cmte_id'].str.lower()
    
    #List of swing states to run the analysis on
    supported_states = ["AZ", "MI", "FL", "NC", "PA", "WI"]
    
    #Loop through each state, collecting its results under the state code
    state_model_dict = {}
    for state in supported_states:
        #Get the votes related to that state, one DF per election year
        votes_intervals_df = get_votes_intervals(votes_df, state)

        #Get the distribution of Red, Blue, and Other votes in a list of dict per election yr e.g. 2000 + 4n
        counties_votes_dicts = county_vote_distribution(votes_intervals_df, state)
        #print(counties_votes_dicts)
        
        #DF that has all donations for a state
        donor_df_orig = donor_state_query(state, engine)
        #Join each donation to its committee row (committee cmte_id == donor cmt_id)
        donor_df = committee_df.merge(donor_df_orig, left_on='cmte_id', right_on='cmt_id')
        
        #Add the party column, derived from the committee's party affiliation code
        donor_df = merge_cmtid_party(donor_df)
        
        #Run the machine learning models on the donation set
        donation_linear_regression(donor_df)
        
        #TODO Once we have full dataset, then enable logistic regression
        #donation_logistic_regression(donor_df)
        
        #Get a list of DFs, one per election cycle, for that state
        donors_intervals_df = get_donors_intervals(donor_df, state)
        #Filter the zips DF down to the state
        state_zips = zips_df[zips_df["state"] == state]
        #Get list of per-county donation distribution dictionaries, one per election cycle
        donor_dicts = donation_county_cycle_distribution(donors_intervals_df, state_zips, committee_df)
        #Set a tuple to pass to the functions to run machine learning
        state_tuple = (counties_votes_dicts, donor_dicts)

        state_model_dict[state] = state_tuple
        
        #TODO enable the neural networking code
        #state_nn(state_tuple)
    
    #TODO: Now with all states donations and voting results aggregated, predict the number of votes
    #predict_votes_linear_regression(state_model_dict)
    
    #TODO: Run Linear regression on the votes
    #votes_linear_regression(votes_df)

In [32]:
def set_votes_dict(votes_dict, county_votes, donor_dict):
    """Copy a county's vote counts and, when available, its donation totals
    into ``votes_dict``.

    Parameters
    ----------
    votes_dict : dict
        Dict to update; modified in place and returned.
    county_votes : dict
        Must contain ``blue_votes``, ``red_votes``, ``other_votes``, ``state``
        and ``county``.
    donor_dict : dict
        Maps county name to a dict with ``blue_amt``, ``red_amt`` and
        ``other_amt``. The county is matched by exact key equality with
        ``county_votes["county"]``; when it is absent no donation keys are set.

    Returns
    -------
    dict
        The same ``votes_dict`` object.
    """
    for key in ("blue_votes", "red_votes", "other_votes", "state", "county"):
        votes_dict[key] = county_votes[key]

    # Look the county up directly by its name from the votes record.
    county_donors = donor_dict.get(county_votes["county"])
    if county_donors is not None:
        for key in ("blue_amt", "red_amt", "other_amt"):
            votes_dict[key] = county_donors[key]

    #TODO set the unemployment data, e.g.:
    # unemployment = unemployment_df[(unemployment_df["County"] == c) & (unemployment_df["Stabr"] == state)]
    # unemployment_col = "Unemployment_rate_" + str(election_yr)
    # votes_dict["POPPCT_URBAN"] = pd.to_numeric(unemployment["POPPCT_URBAN"].values[0])
    # votes_dict[unemployment_col] = unemployment[unemployment_col].values[0]
    # votes_dict["POPDEN_URBAN"] = unemployment["POPDEN_URBAN"].values[0]
    # votes_dict["POPPCT_RURAL"] = unemployment["POPPCT_RURAL"].values[0]
    # votes_dict["POPDEN_RURAL"] = unemployment["POPDEN_RURAL"].values[0]
    return votes_dict

In [33]:
#Define the Linear Regression Structured Machine Learning
def state_sml(state_tuple, state, election_yr, unemployment_df, education_df, birth_death_df):
    """Combine per-county vote and donor dicts for every election year.

    ``state_tuple`` is ``(counties_votes_dicts, donor_dicts)``; entries at the
    same position belong to the same election year. For each year a dict
    keyed by county is built via ``set_votes_dict``.

    NOTE(review): the combined dicts are neither returned nor stored, and the
    remaining parameters are not used yet (see the TODO below).
    """
    #TODO merge together relevant info for county from unemployment_df, education_df, birth_death_df
    counties_votes_dicts, donor_dicts = state_tuple[0], state_tuple[1]

    #Walk the election years; votes and donors are paired by position
    for idx, counties_votes_dict in enumerate(counties_votes_dicts):
        donor_dict = donor_dicts[idx]

        county_dict = {}
        for c in counties_votes_dict:
            #Start from the county's existing entry (empty on first sight) and
            #store the updated votes dict back under the county
            county_dict[c] = set_votes_dict(
                county_dict.setdefault(c, {}), counties_votes_dict[c], donor_dict
            )

In [34]:
#Debug stand-in for run_nn: only prints its inputs
def run_nn_print(counties_votes_dict, donor_dict):
    """Print a marker line followed by the two arguments, one per line."""
    for item in ("run_nn_print", counties_votes_dict, donor_dict):
        print(item)

In [35]:
#Run the neural network model on the counties of votes and donors
def run_nn(counties_votes_dict, donor_dict):
    """Build (but do not train) a small dense neural network.

    One-hot encodes the object-dtype columns of a DataFrame built from
    ``counties_votes_dict``, scales ``X_train``/``X_test`` and defines a
    Sequential model with two hidden layers and a sigmoid output, then prints
    its summary. Nothing is returned.

    NOTE(review): ``X_train`` and ``X_test`` are not defined in this function
    and ``tf`` is not imported by the visible import cell, so the function
    cannot run as written; the encoded frame is also never used to build the
    training data. ``donor_dict`` is not used.
    """
    #Neural Networking Code
    nn_df = pd.DataFrame(counties_votes_dict)
    
    # Generate our categorical variable list
    votes_mi_cat = nn_df.dtypes[nn_df.dtypes == "object"].index.tolist()
    
    # Check the number of unique values in each column (result is discarded here)
    nn_df[votes_mi_cat].nunique()
    
    # Create a OneHotEncoder instance
    # NOTE(review): newer scikit-learn releases dropped ``sparse=`` and
    # ``get_feature_names`` - confirm the installed version.
    enc = OneHotEncoder(sparse=False)

    # Fit and transform the OneHotEncoder using the categorical variable list
    encode_df = pd.DataFrame(enc.fit_transform(nn_df[votes_mi_cat]))

    # Add the encoded variable names to the DataFrame
    encode_df.columns = enc.get_feature_names(votes_mi_cat)
    encode_df.head()
    
    # Create a StandardScaler instance
    scaler = StandardScaler()

    # Fit the StandardScaler
    # NOTE(review): X_train / X_test are undefined at this point - TODO derive them.
    X_scaler = scaler.fit(X_train)

    # Scale the data
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    
    # Define the model - deep neural net
    number_input_features = len(X_train[0])
    hidden_nodes_layer1 =  8
    hidden_nodes_layer2 = 5

    nn = tf.keras.models.Sequential()

    # First hidden layer
    nn.add(
        tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
    )

    # Second hidden layer
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

    # Output layer
    nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Check the structure of the model
    nn.summary()

In [36]:
#Given each county vote distribution and donor distribution, run neural networks.
def state_nn(state_tuple):
    """Call ``run_nn`` once per election year.

    ``state_tuple`` is ``(counties_votes_dicts, donor_dicts)``; entries at the
    same position are passed to ``run_nn`` together.
    """
    counties_votes_dicts, donor_dicts = state_tuple[0], state_tuple[1]

    for idx, counties_votes_dict in enumerate(counties_votes_dicts):
        #TODO enable the nn function, requires a DF
        run_nn(counties_votes_dict, donor_dicts[idx])

In [37]:
#Run the main loop: executes the full per-state pipeline defined in main()
main()

     cmte_id                    cmte_nm cmte_tp cmte_city cmte_st cmte_zip  \
0  c00280453  JOHN SHADEGG FOR CONGRESS       H  PHOENIX,      AZ    85064   
1  c00280453  JOHN SHADEGG FOR CONGRESS       H  PHOENIX,      AZ    85064   
2  c00280453  JOHN SHADEGG FOR CONGRESS       H  PHOENIX,      AZ    85064   
3  c00280453  JOHN SHADEGG FOR CONGRESS       H  PHOENIX,      AZ    85064   
4  c00280453  JOHN SHADEGG FOR CONGRESS       H  PHOENIX,      AZ    85064   

  cmte_dsgn cmte_pty_affiliation org_tp connected_org_nm  ... occupation  \
0         P                  REP   None             None  ...       None   
1         P                  REP   None             None  ...       None   
2         P                  REP   None             None  ...       None   
3         P                  REP   None             None  ...       None   
4         P                  REP   None             None  ...       None   

   transaction_dt transaction_amt other_id tran_id file_num  memo_cd  \
0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


     cmte_id              cmte_nm cmte_tp cmte_city cmte_st cmte_zip  \
0  c00343863  ROGERS FOR CONGRESS       H  BRIGHTON      MI    48116   
1  c00343863  ROGERS FOR CONGRESS       H  BRIGHTON      MI    48116   
2  c00343863  ROGERS FOR CONGRESS       H  BRIGHTON      MI    48116   
3  c00343863  ROGERS FOR CONGRESS       H  BRIGHTON      MI    48116   
4  c00343863  ROGERS FOR CONGRESS       H  BRIGHTON      MI    48116   

  cmte_dsgn cmte_pty_affiliation org_tp connected_org_nm  ... occupation  \
0         P                  REP   None             None  ...       None   
1         P                  REP   None             None  ...       None   
2         P                  REP   None             None  ...       None   
3         P                  REP   None             None  ...       None   
4         P                  REP   None             None  ...       None   

   transaction_dt transaction_amt other_id tran_id file_num  memo_cd  \
0      1999-06-11             667     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


     cmte_id             cmte_nm cmte_tp cmte_city cmte_st cmte_zip cmte_dsgn  \
0  c00331256  HAYES FOR CONGRESS       H   CONCORD      NC    28026         P   
1  c00331256  HAYES FOR CONGRESS       H   CONCORD      NC    28026         P   
2  c00331256  HAYES FOR CONGRESS       H   CONCORD      NC    28026         P   
3  c00331256  HAYES FOR CONGRESS       H   Concord      NC    28026         P   
4  c00331256  HAYES FOR CONGRESS       H   Concord      NC    28026         P   

  cmte_pty_affiliation org_tp connected_org_nm  ... occupation  \
0                  REP   None             None  ...       None   
1                  REP   None             None  ...       None   
2                  REP   None             None  ...       None   
3                  REP   None             None  ...       None   
4                  REP   None             None  ...       None   

   transaction_dt transaction_amt other_id tran_id file_num  memo_cd  \
0      1999-06-14            1000     None  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.