In [1]:
# Import dependencies
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
import pandas as pd
from pandas.io import sql
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
import numpy
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from consts import *
import time
%matplotlib inline

In [2]:
# Create the SQLAlchemy engine for the Postgres instance. The connection
# string is not defined in this notebook; CREATE_ENGINE_STR is presumably
# supplied by `from consts import *` -- TODO confirm.
engine = create_engine(CREATE_ENGINE_STR)

In [3]:
# Print the names of the tables available in the database.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list and exists in old releases as well.
from sqlalchemy import inspect as sa_inspect

print(sa_inspect(engine).get_table_names())

['committees', 'candidates', 'education', 'res_log', 'six_state_donations', 'health_results', 'res_rf', 'donations', 'classifying_results', 'health_metrics', 'res_counties', 'birth_death_rate', 'postal_codes', 'res_stats_donations', 'res_stats_voters', 'res_votes_AZ', 'res_votes_FL', 'res_votes_MI', 'pres_votes_6t', 'unemployment', 'res_votes_NC', 'agg_county_votes']


In [4]:
def calculate_election_yr(i):
    """Return the election year for the zero-based cycle index ``i``.

    Index 0 corresponds to the year 2000 and every following index is four
    years later (1 -> 2004, 2 -> 2008, ...).
    """
    first_cycle_year, years_per_cycle = 2000, 4
    return first_cycle_year + years_per_cycle * i

In [5]:
def get_votes_intervals(votes_df, state_po):
    """Split one state's vote rows into a list of per-election-year frames.

    Args:
        votes_df: Vote records with at least 'state_po' and 'year' columns.
        state_po: State code to keep (compared against 'state_po').

    Returns:
        A list of DataFrames, one per election year from ELECTION_STARTING_YR
        stepping by ELECTION_INTERVAL. When the step sequence lands exactly on
        ELECTION_ENDING_YR, that final year is not filtered from the data;
        instead the previous cycle's frame is appended a second time (the
        original author's note: there are no votes for the final year, so the
        prior election is duplicated).
    """
    state_rows = votes_df[votes_df['state_po'] == state_po]

    cycle_years = list(range(ELECTION_STARTING_YR, ELECTION_ENDING_YR + 1, ELECTION_INTERVAL))

    # The ending year only gets a placeholder when it is reached from an
    # earlier cycle; a lone starting year is filtered normally.
    reuse_previous_cycle = len(cycle_years) > 1 and cycle_years[-1] == ELECTION_ENDING_YR
    if reuse_previous_cycle:
        cycle_years.pop()

    frames = [state_rows[state_rows['year'] == cycle_year] for cycle_year in cycle_years]
    if reuse_previous_cycle:
        # Same frame object again, standing in for the ending year
        frames.append(frames[-1])

    return frames

In [6]:
def vote_distribution(county, election_df, state, i):
    """Summarise one county's vote split and demographics for one election cycle.

    Args:
        county: County name, matched against election_df['county'] and the
            "County" column of the unemployment table.
        election_df: Vote rows for a single election; needs the columns
            'county', 'party' and 'candidatevotes'.
        state: State code, matched against the "Stabr" column of the
            unemployment table.
        i: Zero-based election-cycle index, converted to a year with
            calculate_election_yr.

    Returns:
        Tuple of (blue_votes, red_votes, other_votes, total_votes, county,
        state, election_year, urban_pct, unemployment_rate, urban_den,
        rural_pct, rural_den, winning_party). The five demographic values are
        0 when the unemployment table has no row for the county.

    Notes:
        * 'candidatevotes' is coerced to numeric *before* summing, so a text
          column is added up instead of being string-concatenated.
        * "Other" is every row whose party is not listed in MAJOR_PARTIES,
          including rows with a missing party. (Comparing a column with NaN
          never matches, so such rows were previously left out of the tally.)
        * The unemployment lookup uses bound parameters, so county names that
          contain an apostrophe no longer break the SQL statement.
        * Reads from the module-level ``engine``.
    """
    # Imported here so the function does not rely on the notebook's import cell
    from sqlalchemy import text

    county_df = election_df[election_df['county'] == county]
    votes = pd.to_numeric(county_df["candidatevotes"], errors='coerce')
    party = county_df['party']

    blue_votes = votes[party == MAJOR_PARTIES[0]].sum()
    red_votes = votes[party == MAJOR_PARTIES[1]].sum()
    #Other = any party not listed in MAJOR_PARTIES (missing party included)
    other_votes = votes[~party.isin(MAJOR_PARTIES)].sum()

    if other_votes > blue_votes and other_votes > red_votes:
        print(f"Other votes was the max {other_votes} blue: {blue_votes}, red: {red_votes}")

    #Total votes is the sum of blue + red + other
    total_votes = blue_votes + red_votes + other_votes

    election_year = calculate_election_yr(i)

    #A tie between the two major parties is reported as MAJOR_PARTIES[2]
    if blue_votes > red_votes:
        winning_party = MAJOR_PARTIES[0]
    elif red_votes > blue_votes:
        winning_party = MAJOR_PARTIES[1]
    else:
        winning_party = MAJOR_PARTIES[2]

    #Fetch the county's row from the unemployment table
    unemployment_sql = text(
        'SELECT * FROM unemployment WHERE "County" = :county AND "Stabr" = :state'
    )
    unemployment_df = pd.read_sql_query(
        unemployment_sql,
        con=engine,
        params={"county": county, "state": state},
    )

    #Defaults used when the county has no unemployment row
    urban_pct = 0
    unemployment_rate = 0
    urban_den = 0
    rural_pct = 0
    rural_den = 0

    if not unemployment_df.empty:
        col_election_year = election_year
        #Since we have no unemployment data for the ending year, use the year before
        if election_year == ELECTION_ENDING_YR:
            col_election_year = col_election_year - 1
        unemployment_col = "Unemployment_rate_" + str(col_election_year)

        # The query already filtered on county and state, so read the first
        # returned row directly instead of filtering the frame again.
        urban_pct = unemployment_df['POPPCT_URBAN'].iloc[0]
        unemployment_rate = unemployment_df[unemployment_col].iloc[0]
        urban_den = unemployment_df['POPDEN_URBAN'].iloc[0]
        rural_pct = unemployment_df['POPPCT_RURAL'].iloc[0]
        rural_den = unemployment_df['POPDEN_RURAL'].iloc[0]

    county_tuple = (
        blue_votes,
        red_votes,
        other_votes,
        total_votes,
        county,
        state,
        election_year,
        urban_pct,
        unemployment_rate,
        urban_den,
        rural_pct,
        rural_den,
        winning_party
    )
    return county_tuple

In [7]:
def donor_distribution(election_df, county, state, i):
    """Summarise donation amounts and counts per party for one county and cycle.

    Args:
        election_df: Donation rows for one election cycle; needs the columns
            'county', 'party' and 'TRANSACTION_AMT'. It is not modified.
        county: County name (surrounding whitespace is stripped before matching).
        state: State code, passed through to the result unchanged.
        i: Zero-based election-cycle index, converted to a year with
            calculate_election_yr.

    Returns:
        Tuple of (blue_amt, red_amt, other_amt, total_amt, blue_num, red_num,
        other_num, total_num, county, state, election_year). Amounts are sums
        of TRANSACTION_AMT; the *_num values count rows whose amount is numeric.

    Notes:
        * The amount column is coerced on a separate Series instead of being
          assigned into a filtered slice, which avoids pandas'
          SettingWithCopyWarning.
        * "Other" is every row whose party is not listed in MAJOR_PARTIES,
          including rows with a missing party. (Comparing a column with NaN
          never matches, so such rows were previously left out of the totals.)
    """
    county = county.strip()
    in_county = election_df['county'] == county

    #Numeric transaction amounts for this county; unparseable values become NaN
    amounts = pd.to_numeric(election_df.loc[in_county, "TRANSACTION_AMT"], errors='coerce')
    party = election_df.loc[in_county, 'party']

    is_blue = party == MAJOR_PARTIES[0]
    is_red = party == MAJOR_PARTIES[1]
    is_other = ~party.isin(MAJOR_PARTIES)

    #Total donation amount per party
    blue_amt = amounts[is_blue].sum()
    red_amt = amounts[is_red].sum()
    other_amt = amounts[is_other].sum()

    #Number of donations per party (NaN amounts are not counted)
    blue_num = amounts[is_blue].count()
    red_num = amounts[is_red].count()
    other_num = amounts[is_other].count()

    #Totals are the sum of blue + red + other
    total_amt = blue_amt + red_amt + other_amt
    total_num = blue_num + red_num + other_num

    election_year = calculate_election_yr(i)
    donor_tuple = (
        blue_amt,
        red_amt,
        other_amt,
        total_amt,
        blue_num,
        red_num,
        other_num,
        total_num,
        county,
        state,
        election_year
    )
    return donor_tuple

In [8]:
def county_vote_distribution(four_yr_dfs, state):
    """Aggregate votes per county for every election cycle and store them.

    For each frame in ``four_yr_dfs`` (one per election cycle, in order) the
    vote distribution of every distinct county is computed with
    vote_distribution and the resulting rows are appended to the
    TABLE_AGG_VOTES table through the module-level ``engine``.

    Args:
        four_yr_dfs: List of per-cycle vote DataFrames for one state.
        state: State code forwarded to vote_distribution.

    Returns:
        None; the results are written to the database.
    """
    for cycle_idx, cycle_df in enumerate(four_yr_dfs):
        counties = cycle_df["county"].unique()
        print(f"County election year: {calculate_election_yr(cycle_idx)} num countines: {len(counties)}")

        #Collect one summary tuple per county in this cycle
        cycle_rows = []
        for county in counties:
            print(f"Votes for county: {county}")
            cycle_rows.append(vote_distribution(county, cycle_df, state, cycle_idx))

        #Write the vote tallies for this cycle to the DB in one call
        pd.DataFrame(cycle_rows, columns=VOTES_COLS).to_sql(
            TABLE_AGG_VOTES, con=engine, if_exists="append"
        )

In [9]:
def map_zip_county(unique_zips, state_zips):
    """Map ZIP codes to normalised county names.

    Args:
        unique_zips: Iterable of ZIP code values. Each one is converted with
            int() for the lookup; values that cannot be converted are
            reported with print() and skipped.
        state_zips: DataFrame with a "zip" column (compared against the
            integer form of each ZIP) and a "county" column.

    Returns:
        (county_dict, counties) where county_dict maps each original ZIP
        value to its county name and counties is a keys view of the distinct
        county names in first-seen order. County names are lower-cased, have
        " county" removed and are then capitalised (first letter only).

    ZIP codes that have no row in ``state_zips`` are left out of both results.
    Previously the text form of an empty Series ("Series([], )") was stored as
    their county name. When several rows share a ZIP, the first one is used.
    """
    county_dict = {}
    unique_counties = {}
    for zipcode_str in unique_zips:
        #If there is a problem casting the zip to an int, just skip it
        try:
            zipcode = int(zipcode_str)
        except (TypeError, ValueError) as ve:
            print(ve)
            continue

        #Filter out on the zip code from the state_zips DF
        county_zip = state_zips[state_zips["zip"] == zipcode]
        if county_zip.empty:
            #Unknown zip for this state: no county to assign
            continue

        #Take the first matching county name and normalise it
        county_name = str(county_zip["county"].iloc[0]).strip().lower()
        county_name = county_name.replace(" county", "").capitalize()

        county_dict[zipcode_str] = county_name
        #dict used as an insertion-ordered set of county names
        unique_counties[county_name] = True

    return (county_dict, unique_counties.keys())

In [10]:
def one_hot_encode(df):
    """One-hot encode every object-dtype column of ``df``.

    Args:
        df: Input DataFrame; only columns whose dtype is ``object`` are encoded.

    Returns:
        A new DataFrame of dense one-hot columns named by the encoder's
        feature names. It has a fresh default index, not ``df``'s index.

    The encoder is built so that it works across scikit-learn releases:
    ``sparse`` was renamed to ``sparse_output`` and ``get_feature_names`` was
    replaced by ``get_feature_names_out`` in newer versions.
    """
    # Generate our categorical variable list
    cat_vars = df.dtypes[df.dtypes == "object"].index.tolist()

    # Create a dense-output OneHotEncoder, whichever keyword this version accepts
    try:
        enc = OneHotEncoder(sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(sparse=False)

    # Fit and transform the OneHotEncoder using the categorical variable list
    encoded = enc.fit_transform(df[cat_vars])

    # Resolve the encoded variable names with whichever accessor exists
    if hasattr(enc, "get_feature_names_out"):
        feature_names = enc.get_feature_names_out(cat_vars)
    else:
        feature_names = enc.get_feature_names(cat_vars)

    encode_df = pd.DataFrame(encoded)
    encode_df.columns = feature_names

    return encode_df

In [11]:
def donation_county_cycle_distribution(four_yr_dfs, state_zips, committee_df, state):
    """Aggregate donations per county for every election cycle and store them.

    Args:
        four_yr_dfs: List of per-cycle donation DataFrames (needs a "ZIP"
            column plus whatever donor_distribution reads). The frames are
            no longer modified; each cycle works on its own copy.
        state_zips: ZIP-to-county lookup table for the state, passed to
            map_zip_county.
        committee_df: Not used by this function; kept so existing calls work.
        state: State code forwarded to donor_distribution.

    Returns:
        None; rows are appended to TABLE_AGG_DONORS via the module-level
        ``engine``.

    All counties of a cycle are written with a single to_sql call (the same
    way the vote tallies are stored) instead of one call per county. As a
    consequence the DataFrame index stored with each row now runs 0..n-1
    within a cycle rather than being 0 for every row.
    """
    #Loop through each election year DF
    for i, cycle_df in enumerate(four_yr_dfs):
        #Without zipcode can't do a county lookup, so drop all null values.
        #Copy so the added column does not touch the caller's frame.
        election_df = cycle_df.dropna(subset=["ZIP"]).copy()
        #Get the unique values of zip code in the election DF
        unique_zips = election_df["ZIP"].unique()
        #Create a map of zip to county, and a list of all unique counties in that state
        (zip_county_map, unique_counties) = map_zip_county(unique_zips, state_zips)
        #Map the zipcode to the county name per the map function
        election_df["county"] = election_df["ZIP"].map(zip_county_map)

        print(f"Donor distribution election yr: {calculate_election_yr(i)} num counties: {len(unique_counties)}")

        #Collect one donor summary tuple per county
        donor_tuples = []
        for county in unique_counties:
            print(f"Donations for county: {county}")
            donor_tuples.append(donor_distribution(election_df, county, state, i))

        #Write the cycle's donation amounts to the DB; skip cycles with no counties
        if donor_tuples:
            donor_df = pd.DataFrame(donor_tuples, columns=DONOR_COLS)
            donor_df.to_sql(TABLE_AGG_DONORS, con=engine, if_exists="append")

In [12]:
def str_dt(donor_date_str):
    """Parse a date string in ``MMDDYYYY`` form (no separators).

    Args:
        donor_date_str: Date text such as "01011996" or "12311999".

    Returns:
        The corresponding ``datetime.datetime`` (time part is midnight).

    Raises:
        ValueError: If the text does not match the ``%m%d%Y`` format.
    """
    # The notebook's import cell does `import datetime`, which binds the
    # module; the module has no strptime, so import the class explicitly
    # instead of depending on what the global name currently refers to.
    from datetime import datetime as _datetime

    return _datetime.strptime(donor_date_str, '%m%d%Y')

In [13]:
def get_year_from_date_str(donor_date_str):
    """Return the calendar year of a date string, as parsed by str_dt."""
    return str_dt(donor_date_str).year

In [14]:
def get_donors_intervals(donors_states_df):
    """Split donation rows into one DataFrame per election cycle.

    A 'trans_date' column is added to ``donors_states_df`` (the input frame
    itself is modified) by parsing 'TRANSACTION_DT' as ``%m%d%Y``; values
    that cannot be parsed become NaT and therefore match no cycle.

    Returns:
        A list with one DataFrame per election year from
        ELECTION_STARTING_YR to ELECTION_ENDING_YR in steps of
        ELECTION_INTERVAL. Each holds the rows whose transaction year is
        greater than the previous election year and at most the cycle's year.
    """
    donors_states_df['trans_date'] = pd.to_datetime(
        donors_states_df['TRANSACTION_DT'], format='%m%d%Y', errors="coerce"
    )

    #Year of each transaction, computed once and reused for every cycle
    txn_year = donors_states_df['trans_date'].dt.year

    cycle_years = range(ELECTION_STARTING_YR, ELECTION_ENDING_YR + 1, ELECTION_INTERVAL)
    return [
        donors_states_df[(txn_year > cycle_year - ELECTION_INTERVAL) & (txn_year <= cycle_year)]
        for cycle_year in cycle_years
    ]

In [15]:
def donor_six_state_query(state):
    """Load every row of six_state_donations whose "STATE" equals ``state``.

    The query runs on the module-level ``engine`` and the result is returned
    as a DataFrame.
    """
    return pd.read_sql_query(
        f'select * from six_state_donations where "STATE" = \'{state}\'',
        con=engine,
    )

In [16]:
def drop_agg_tables():
    """Drop the aggregate output tables so a run starts fresh.

    Does nothing unless DROP_AGG_TABLE is truthy. Otherwise the donor and
    vote aggregate tables are dropped (if they exist), in that order.
    """
    if not DROP_AGG_TABLE:
        return
    for table_name in (TABLE_AGG_DONORS, TABLE_AGG_VOTES):
        sql.execute('DROP TABLE IF EXISTS %s' % table_name, engine)

In [17]:
def main(health_df, committee_df, votes_df, zips_df):
    """Aggregate vote and donation records for every state in SWING_STATES.

    For each state the per-county vote tallies are written first, then the
    state's donation records are queried, joined to the committees on
    'CMTE_ID', split per election cycle and written per county.

    Args:
        health_df: Accepted but not used in this function.
        committee_df: Committee records; joined to donations on 'CMTE_ID'.
        votes_df: Vote records for all states.
        zips_df: ZIP lookup table with a "state" column.

    Progress and timings are printed; nothing is returned.
    NOTE(review): merge_cmtid_party is not defined in the visible part of the
    notebook -- confirm where it comes from.
    """
    print("Main")
    run_started = time.time()

    for state in SWING_STATES:
        print(f"Aggregating Vote and Donation records... for State: {state}")

        #Per-cycle vote frames for the state, aggregated per county and stored
        state_vote_cycles = get_votes_intervals(votes_df, state)
        county_vote_distribution(state_vote_cycles, state)

        print("Doing Donor Query")
        #All donation records for the state, with the query timed
        query_started = time.time()
        state_donations = donor_six_state_query(state)
        query_seconds = time.time() - query_started
        print(f"Done Donor Query time: {query_seconds}")

        #Attach committee data, then the party column
        donations_with_cmte = committee_df.merge(state_donations, left_on='CMTE_ID', right_on='CMTE_ID')
        donations_with_party = merge_cmtid_party(donations_with_cmte)

        #One donation frame per election cycle
        state_donation_cycles = get_donors_intervals(donations_with_party)
        #ZIP rows belonging to this state only
        zips_in_state = zips_df[zips_df["state"] == state]

        donation_county_cycle_distribution(state_donation_cycles, zips_in_state, committee_df, state)

        print("\n")

    print("Swing States Aggregation Done!")
    print(time.time() - run_started)

In [18]:
#Read the source tables into DataFrames (one full-table SELECT per table)
health_df = pd.read_sql_query('select * from "health_metrics"',con=engine)
committee_df = pd.read_sql_query('select * from "committees"',con=engine)
votes_df = pd.read_sql_query('select * from "pres_votes_6t"',con=engine)
zips_df = pd.read_sql_query('select * from "postal_codes"',con=engine)
    
#Uppercase the committee IDs (the code calls .str.upper()); presumably so they
#match the donation records joined on CMTE_ID in main -- TODO confirm
committee_df['CMTE_ID'] = committee_df['CMTE_ID'].str.upper()
    
#Drop the aggregate tables to do fresh data analysis
#(drop_agg_tables does nothing unless DROP_AGG_TABLE is truthy)
drop_agg_tables()

In [None]:
#Run the main loop: aggregates votes and donations for every state in
#SWING_STATES, writing to the aggregate tables and printing progress per county
main(health_df, committee_df, votes_df, zips_df)

Main
Aggregating Vote and Donation records... for State: PA
County election year: 2000 num countines: 67
Votes for county: Adams
Votes for county: Allegheny
Votes for county: Armstrong
Votes for county: Beaver
Votes for county: Bedford
Votes for county: Berks
Votes for county: Blair
Votes for county: Bradford
Votes for county: Bucks
Votes for county: Butler
Votes for county: Cambria
Votes for county: Cameron
Votes for county: Carbon
Votes for county: Centre
Votes for county: Chester
Votes for county: Clarion
Votes for county: Clearfield
Votes for county: Clinton
Votes for county: Columbia
Votes for county: Crawford
Votes for county: Cumberland
Votes for county: Dauphin
Votes for county: Delaware
Votes for county: Elk
Votes for county: Erie
Votes for county: Fayette
Votes for county: Forest
Votes for county: Franklin
Votes for county: Fulton
Votes for county: Greene
Votes for county: Huntingdon
Votes for county: Indiana
Votes for county: Jefferson
Votes for county: Juniata
Votes for cou