In [1]:
# Import dependencies
# Standard library
import datetime
import json
import os

# Third-party
import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from pandas.io import sql
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.svm import SVC
from sqlalchemy import create_engine
from sqlalchemy import inspect as sa_inspect
from sqlalchemy.orm import sessionmaker

# Project-local (star import: supplies the UPPER_CASE constants used below)
from consts import *
#from gcloud import storage
%matplotlib inline

In [2]:
# Connect to the Postgres instance. CREATE_ENGINE_STR is not defined in this
# notebook; presumably it is the connection URL provided by `from consts import *`
# (TODO confirm). Every query and table write below goes through this `engine`.
engine = create_engine(CREATE_ENGINE_STR)

In [None]:
# Print the table names visible through the engine. The Inspector API is used
# because Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0.
print(sa_inspect(engine).get_table_names())

In [None]:
#Define the Linear Regression Structured Machine Learning
def state_sml(state_tuple, state, election_yr, unemployment_df, education_df, birth_death_df):
    """Build, per election, a county -> votes dict via ``set_votes_dict``.

    Work in progress: the dict built for each election is discarded at the end
    of the iteration and nothing is returned. ``state``, ``election_yr`` and the
    three data frames are accepted but not used yet (see the TODO below).

    Args:
        state_tuple: Indexable pair. Item 0 is a sequence holding one
            county-keyed mapping per election; item 1 holds the donor entry
            for the election at the same position.
        state: Unused.
        election_yr: Unused.
        unemployment_df: Unused.
        education_df: Unused.
        birth_death_df: Unused.

    Returns:
        None.
    """
    #TODO merge together relevant info for county from unemployment_df, education_df, birth_death_df
    per_election_votes = state_tuple[0]
    per_election_donors = state_tuple[1]

    #Walk the elections; votes and donors are paired by position
    for election_idx, votes_by_county in enumerate(per_election_votes):
        donors_for_election = per_election_donors[election_idx]

        #Fresh accumulator for every election
        votes_per_county = {}
        for county in votes_by_county:
            #First sighting of a county starts from an empty dict
            current = votes_per_county.setdefault(county, {})
            #Store whatever set_votes_dict returns for this county
            votes_per_county[county] = set_votes_dict(
                current, votes_by_county[county], donors_for_election
            )

In [None]:
def predict_votes_linear_regression(state_model_dict):
    """Load the demographic tables and walk the per-state models.

    Prediction itself is still disabled: the ``state_sml`` call is commented
    out, so the function currently only reads the three tables and advances a
    local election year by 4 for every state. Nothing is returned.

    Args:
        state_model_dict: Mapping of state -> model entry.

    Returns:
        None.
    """
    #Read the three supporting tables, in this order, through the shared engine
    unemployment_df, education_df, birth_death_df = (
        pd.read_sql_query(table_query, con=engine)
        for table_query in (
            'select * from "unemployment"',
            'select * from "education"',
            'select * from "birth_death_rate"',
        )
    )

    election_yr = 2000
    for state, model in state_model_dict.items():
        #TODO enable prediction for voting
        #state_sml(model, state, election_yr, unemployment_df, education_df, birth_death_df)
        election_yr += 4

In [None]:
#Vote model driven by the aggregated donation columns
def donation_votes_linear_regression(df, y_param, state):
    """Run ``run_lr_xy`` for ``y_param`` using the donor columns as features.

    Args:
        df: Frame containing ``y_param`` and the donor columns listed below.
        y_param: Name of the target column.
        state: State label, forwarded to ``run_lr_xy``.

    Returns:
        Whatever ``run_lr_xy`` returns (results are filed under "lr_votes").
    """
    print("donation_votes_linear_regression")

    #Vote/demographic columns of interest that are NOT model inputs yet:
    #"PopPct_Urban", "Unemployment", "PopDen_Urban", "PopPct_Rural", "PopDen_Rural"

    #Donor columns are the only features for now
    key_cols = ["county", "election_year"]
    amount_cols = ["blue_amt", "red_amt", "total_amt"]
    count_cols = ["blue_num", "red_num"]
    feature_cols = key_cols + amount_cols + count_cols

    return run_lr_xy(df, y_param, feature_cols, state, "lr_votes")

In [None]:
#Machine Learning models run on the raw donation records
def donation_linear_regression(donor_df, state):
    """Run ``run_lr_xy`` with the donation amount as target.

    Args:
        donor_df: Frame of donation records containing "TRANSACTION_AMT" and
            the feature columns listed below.
        state: State label, forwarded to ``run_lr_xy``.

    Returns:
        Whatever ``run_lr_xy`` returns (results are filed under "lr_trans_amt").
    """
    y_param = "TRANSACTION_AMT"
    X_cols = ["CITY", "STATE", "ZIP", "EMPLOYER", "OCCUPATION"]

    #Pass the frame we were given; the previous code referenced an undefined
    #name `df`, which raised NameError (or silently used a stale global).
    return run_lr_xy(donor_df, y_param, X_cols, state, "lr_trans_amt")

In [None]:
def run_lr_xy(df, y_param, x_cols, state, dir_name):
    """Extract the target and feature arrays from ``df`` and fit a model on them.

    Args:
        df: Source frame.
        y_param: Name of the target column.
        x_cols: List of feature column names.
        state: State label, forwarded to ``run_linear_regression``.
        dir_name: Results sub-directory, forwarded to ``run_linear_regression``.

    Returns:
        Whatever ``run_linear_regression`` returns.
    """
    #Target values: the single y_param column as an array
    targets = select_columns(df, [y_param])[y_param].values

    #Feature values: only the requested x_cols, as a 2-D array
    features = select_columns(df, x_cols)[x_cols].values

    print(f"y len {len(targets)} x len {len(features)}")
    print(f"Running a Linear Regression Model with y={y_param} and x_cols={x_cols} state={state}")
    return run_linear_regression(features, targets, y_param, state, dir_name)

In [None]:
def run_linear_regression(X, y, y_param, state, dir_name):
    """Fit a model on min-max scaled features, plot residuals and log its score.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Target values aligned with ``X``.
        y_param: Name of the target; used in the file name, plot title and
            result row.
        state: State label; used in the file name and result row.
        dir_name: Results sub-directory handed to ``plot_data``.

    Returns:
        The fitted model.

    Side effects:
        Saves a residual plot through ``plot_data`` and appends one row
        (state, sml_param, r2_score, file_name) to the ``TABLE_RES_LR`` table.
    """
    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=SML_TEST_SIZE)

    # NOTE(review): the label says "Linear Regression" but the estimator below is
    # LogisticRegression, and r2_score is computed on its predicted labels.
    # Kept as-is because titles/file names may be relied on elsewhere - confirm.
    model_type = "Linear Regression"
    #Test out different models
    #model = LinearRegression()
    model = LogisticRegression()
    file_name = create_file_name("lr", y_param, state)

    #Scale the values. The scaler is fitted on the training split ONLY and then
    #reused for the test split, so both are mapped with the same min/max and no
    #information from the test set leaks into the preprocessing.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    #Train the model
    model.fit(X_train_scaled, y_train)

    #Predict the values for both splits (both are needed for the residual plot)
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    #Score the model on the held-out split
    r2_score_val = r2_score(y_test, y_test_pred)
    score_str = f"r2_score:{r2_score_val}"

    #Plot the data
    title = create_title(model_type, y_param, score_str)
    plot_data(y_test, y_train, y_test_pred, y_train_pred, title, file_name, dir_name)

    #Record this run as a single row in the results table
    result_row = pd.DataFrame({
        'state': [state],
        'sml_param': [y_param],
        'r2_score': [r2_score_val],
        'file_name': [file_name],
    })
    result_row.to_sql(TABLE_RES_LR, con=engine, if_exists="append")

    return model

In [None]:
# Residual plot helper
def plot_data(y_test, y_trained, y_pred_test, y_pred_train, title, file_name, dir_name):
    """Save a residual plot (prediction vs. prediction minus actual) to disk.

    Training points are drawn in blue and testing points in orange, with a
    horizontal reference line at zero residual.

    Args:
        y_test: Actual test targets (array-like supporting ``-``, ``.min()``
            and ``.max()``).
        y_trained: Actual training targets.
        y_pred_test: Predictions for the test split.
        y_pred_train: Predictions for the training split.
        title: Plot title.
        file_name: Name of the image file.
        dir_name: Sub-directory of ``results/`` the image is written to.

    Returns:
        None. The figure is written to ``results/<dir_name>/<file_name>`` and
        the current figure is cleared afterwards.
    """
    plt.scatter(y_pred_train, y_pred_train - y_trained, c="blue", label="Training Data")
    plt.scatter(y_pred_test, y_pred_test - y_test, c="orange", label="Testing Data")
    plt.legend()
    #Zero-residual reference line, spanning the range of the actual test values
    plt.hlines(y=0, xmin=y_test.min(), xmax=y_test.max())
    plt.title(title)
    #plt.show()

    file_dir = f"results/{dir_name}/{file_name}"
    #savefig does not create missing directories, so make sure the target
    #folder exists instead of failing after the model has been trained.
    os.makedirs(os.path.dirname(file_dir), exist_ok=True)
    plt.savefig(file_dir)

    #Clear the plot so the next call starts from an empty figure
    plt.clf()

    #Save the plot data to gcloud
    #save_image_to_gcloud_lr(plt, file_name)

In [None]:
#Rows of a table for one state, every election except 2020
def query_all(table_name, state):
    """Return all rows of ``table_name`` for ``state`` whose election year is not 2020.

    Args:
        table_name: Table to read; interpolated into the SQL text as-is.
        state: State value; interpolated into the SQL text as a quoted literal.

    Returns:
        DataFrame produced by ``pd.read_sql_query`` on the shared engine.
    """
    past_elections_sql = f"select * from {table_name} where election_year != 2020 and state='{state}';"
    return pd.read_sql_query(past_elections_sql, con=engine)

In [None]:
#2016 rows of the county vote aggregates for one state
def query_total_votes(state):
    """Return the 2016 rows of ``agg_county_votes`` for ``state``.

    Args:
        state: State value; interpolated into the SQL text as a quoted literal.

    Returns:
        DataFrame produced by ``pd.read_sql_query`` on the shared engine.
    """
    votes_table = "agg_county_votes"
    votes_2016_sql = f"select * from {votes_table} where election_year = 2016 and state='{state}';"
    return pd.read_sql_query(votes_2016_sql, con=engine)

In [None]:
#Rows of a table for one state, 2020 election only
def query_this_election(table_name, state):
    """Return the 2020 rows of ``table_name`` for ``state``.

    Args:
        table_name: Table to read; interpolated into the SQL text as-is.
        state: State value; interpolated into the SQL text as a quoted literal.

    Returns:
        DataFrame produced by ``pd.read_sql_query`` on the shared engine.
    """
    current_election_sql = f"select * from {table_name} where election_year = 2020 and state='{state}';"
    return pd.read_sql_query(current_election_sql, con=engine)

In [None]:
#Get all donation records for a single state and return them in a dataframe
def donor_state_query(state):
    """Return every row of ``TABLE_SIX_STATE_DONATIONS`` whose "STATE" matches ``state``.

    Args:
        state: State value; upper-cased before it is interpolated into the
            SQL text as a quoted literal.

    Returns:
        DataFrame produced by ``pd.read_sql_query`` on the shared engine.
    """
    state_code = state.upper()
    donations_sql = f"select * from {TABLE_SIX_STATE_DONATIONS} where \"STATE\"='{state_code}'"
    return pd.read_sql_query(donations_sql, con=engine)

In [None]:
def agg_ml(state):
    """Train blue/red vote models for ``state`` and store its predicted county votes.

    Steps:
      1. Reduce the county names found in ``postal_codes`` to their bare form.
      2. Train one model per party (``blue_votes``, ``red_votes``) on the donor
         and vote aggregates of every election except 2020, merged on
         county/state/election_year.
      3. Build the prediction frame: 2016 vote totals left-joined with the 2020
         donor aggregates; rows with any missing value are dropped.
      4. Predict both parties and rescale each prediction by its share of the
         predicted total so the two add up to the row's ``total_votes``.
      5. Write predict_blue_votes_net / predict_red_votes_net / state / county
         to the table ``res_votes_<state>``, replacing any previous content.

    Args:
        state: State value used in every query and in the result table name.

    Returns:
        None.
    """
    print(f"agg_ml {state}")

    #Reduce postal-code county names to the bare name, e.g. "ADAMS County" -> "Adams".
    #NOTE(review): capitalize() lower-cases everything after the first letter, so a
    #multi-word name becomes e.g. "San diego" - confirm the aggregate tables match.
    zips_df = pd.read_sql_query(f'select * from postal_codes where state=\'{state}\'',con=engine)
    unique_counties = [
        county.split(' County', 1)[0].capitalize()
        for county in zips_df["county"].unique()
    ]

    #2016 total votes
    df_total_votes = query_total_votes(state)

    #Get donors for this election year
    agg_donors_this_df = query_this_election(TABLE_AGG_DONORS, state)

    #Get the donor aggregates of every election except 2020
    agg_donors_df = query_all(TABLE_AGG_DONORS, state)
    #Filter by actual unique counties from the state
    agg_donors_df = agg_donors_df[agg_donors_df["county"].isin(unique_counties)]

    agg_votes_df = query_all(TABLE_AGG_VOTES, state)

    #NOTE(review): label_enc is applied to the donor and vote frames separately and
    #the results are then merged on "county". If label_enc encodes county names per
    #frame, the two encodings may not line up - verify against label_enc.
    agg_donors_df = agg_donors_df.fillna(0)
    agg_donors_df = label_enc(agg_donors_df)

    agg_votes_df = agg_votes_df.fillna(0)
    print(agg_votes_df.head(1))
    agg_votes_df = label_enc(agg_votes_df)
    print(agg_votes_df.head(1))

    #Merge on the three fields that make it unique: county, state, and election_year
    merged_df = agg_donors_df.merge(agg_votes_df, left_on=['county', 'state', 'election_year'], right_on=['county', 'state', 'election_year'], suffixes=("_donors", "_votes"))

    #Train one model per party on the merged dataset; each call returns the fitted model
    model_blue = donation_votes_linear_regression(merged_df, "blue_votes", state)
    model_red = donation_votes_linear_regression(merged_df, "red_votes", state)

    #Filter by actual unique counties from the state
    agg_donors_this_df = agg_donors_this_df[agg_donors_this_df["county"].isin(unique_counties)]
    agg_donors_this_df = agg_donors_this_df.fillna(0)
    #NOTE(review): the LEFT frame here is the vote totals, yet it receives the
    #"_donors" suffix, so "election_year_donors" below is the vote table's year.
    #Left unchanged because the column names are relied on below - confirm intent.
    this_election_df = df_total_votes.merge(agg_donors_this_df, left_on=['county', 'state'], right_on=['county', 'state'], suffixes=("_donors", "_votes"), how='left')
    #Counties without 2020 donor data come out of the left join with NaNs and are dropped
    this_election_df = this_election_df.dropna()

    #Keep an un-encoded copy so the stored results carry the original values
    this_election_df_orig = this_election_df.copy()
    this_election_df = label_enc(this_election_df)
    print(this_election_df.head())

    #Donor feature columns for the prediction frame.
    #TODO enable the vote columns as well: "PopPct_Urban", "Unemployment",
    #"PopDen_Urban", "PopPct_Rural", "PopDen_Rural"
    donors_cols = ["county", "election_year_donors", "blue_amt", "red_amt", "total_amt", "blue_num", "red_num"]
    agg_donors_this_df_test = select_columns(this_election_df, donors_cols)

    #NOTE(review): a fresh scaler is fitted on the prediction data, so it is not the
    #scaling the models were trained with - consider reusing the training scaler.
    X_test_scaled = MinMaxScaler().fit_transform(agg_donors_this_df_test)
    y_test_blue_pred = model_blue.predict(X_test_scaled)
    y_test_red_pred = model_red.predict(X_test_scaled)

    print("len of y_test")
    print(len(y_test_blue_pred))
    print("this election count")
    #Call count(); previously the bound method object itself was printed
    print(this_election_df.count())

    this_election_df_orig["predict_blue_votes"] = y_test_blue_pred
    this_election_df_orig["predict_red_votes"] = y_test_red_pred
    this_election_df_orig["predict_total_votes"] = this_election_df_orig["predict_blue_votes"] + this_election_df_orig["predict_red_votes"]

    #Rescale each party's prediction by its share of the predicted total so that
    #blue + red equals the row's total_votes
    this_election_df_orig["predict_blue_votes_net"] = (this_election_df_orig["predict_blue_votes"] / this_election_df_orig["predict_total_votes"]) * this_election_df_orig["total_votes"]
    this_election_df_orig["predict_red_votes_net"] = (this_election_df_orig["predict_red_votes"] / this_election_df_orig["predict_total_votes"]) * this_election_df_orig["total_votes"]

    #State, County, blue, red votes
    results_df = select_columns(this_election_df_orig, ["predict_blue_votes_net", "predict_red_votes_net", "state", "county"])

    table_name = f"res_votes_{state}"
    results_df.to_sql(table_name, con=engine, if_exists="replace")

    print(f"Prediction for Votes! States = {state}")

    #TODO Significance of each county
    #TODO Electoral Votes per State

In [None]:
def state_ml(committee_df, state):
    """Run the donation-amount model on the raw donation records of one state.

    Args:
        committee_df: Committee frame with at least "CMTE_ID" and
            "CMTE_PTY_AFFILIATION" columns.
        state: State whose donation records are queried.

    Returns:
        None.
    """
    print(f"state_ml {state}")
    state_donations = donor_state_query(state)

    #Attach the committee columns (including party affiliation) to every donation
    donations_with_party = committee_df.merge(state_donations, left_on='CMTE_ID', right_on='CMTE_ID')

    #Show which party affiliations occur in this state's donations
    print(donations_with_party["CMTE_PTY_AFFILIATION"].unique())

    donations_with_party = merge_cmtid_party(donations_with_party)

    #Run the machine learning models on the donation set
    print(f"Linear Regression on state: {state}")
    donation_linear_regression(donations_with_party, state)

In [None]:
def main(committee_df):
    """Reset the result tables and run the aggregate model for every swing state.

    Args:
        committee_df: Committee frame. Currently unused because the
            ``state_ml`` call below is commented out.

    Returns:
        None.
    """
    print("Main")

    #Clear out the result tables before the per-state runs write to them
    drop_res_lr_tables(engine)

    for swing_state in SWING_STATES:
        agg_ml(swing_state)
        #state_ml(committee_df, swing_state)

    print("End of Main")

In [None]:
# Load the whole "committees" table once; it is handed to main(), where it is
# only needed by the (currently commented-out) state_ml call.
committee_df = pd.read_sql_query('select * from "committees"', con=engine)

In [None]:
# Entry point: runs agg_ml for every state in SWING_STATES. Note that main()
# calls drop_res_lr_tables(engine) first, so re-running this cell resets those tables.
main(committee_df)