In [1]:
# Import dependencies
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
import pandas as pd
from pandas.io import sql
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
import numpy
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.metrics import r2_score
from consts import *
%matplotlib inline

In [2]:
# Connecting to Postgres instance
engine = create_engine(CREATE_ENGINE_STR)

In [3]:
# Printing info for table names 
print (engine.table_names())

['committees', 'candidates', 'education', 'committee_summary_2020', 'six_state_donations', 'donations', 'health_metrics', 'agg_county_votes', 'birth_death_rate', 'postal_codes', 'fec_committee', 'agg_county_donors', 'pres_votes_6t', 'unemployment']


In [4]:
#Define the Linear Regression Structured Machine Learning
def state_sml(state_tuple, state, election_yr, unemployment_df, education_df, birth_death_df):
    #TODO merge together relevant info for county from unemployment_df, education_df, birth_death_df
    counties_tuples = state_tuple[0]    
    donor_tuples = state_tuple[1]
    
    #Loop through each election year county dict
    for i in range(0, len(counties_tuples)):
        #Select the corresponding counties/votes and donor info for that election yr
        counties_votes_dict = counties_tuples[i]
        donor_tuple = donor_tuples[i]
        
        county_dict = {}
        #Loop through all the votes organized by county
        for c in counties_votes_dict:
            #Kepp unique dict of counties
            if c not in county_dict:
                county_dict[c] = {}
            #Get the number of votes by county
            county_votes = counties_votes_dict[c]
            #Update the vote dict
            votes_dict = set_votes_dict(county_dict[c], county_votes, donor_tuple)
            #Update the county dict with the updated votes dict
            county_dict[c] = votes_dict

In [5]:
def predict_votes_linear_regression(state_model_dict):
    election_yr = 2000
    unemployment_df = pd.read_sql_query('select * from "unemployment"',con=engine)
    education_df = pd.read_sql_query('select * from "education"',con=engine)
    birth_death_df = pd.read_sql_query('select * from "birth_death_rate"',con=engine)
    for state in state_model_dict.keys():
        model = state_model_dict[state]
        #TODO enable prediction for voting
        #state_sml(model, state, election_yr, unemployment_df, education_df, birth_death_df)
        election_yr += 4

In [6]:
#Machine Learning models run on the donation data
def donation_votes_linear_regression(df):
    #Will run a separate LR model on each of the SML_params
    sml_params = ["blue_votes", "red_votes", "blue_amt", "red_amt", "blue_num", "red_num"]
    #Set all the votes cols of interest
    votes_cols = ["blue_votes", "red_votes", "other_votes", "total_votes", "county", "state", "election_year", "PopPct_Urban", "Unemployment", "PopDen_Urban", "PopPct_Rural", "PopDen_Rural"]
    #Set all the donors cols of interest
    donors_cols = ["blue_amt", "red_amt", "other_amt", "total_amt", "blue_num", "red_num"]
    
    sml_cols = votes_cols + donors_cols
    #Run some machine learning models on the donation of the state
    run_linear_regression_params(df, sml_params, sml_cols)

In [7]:
#Machine Learning models run on the donation data
def donation_linear_regression(donor_df):
    sml_params = ["TRANSACTION_AMT"]
    sml_cols = ["CMTE_ID", "CITY", "STATE", "ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT", "ENTITY_TP"]
    
    #Run some machine learning models on the donation of the state
    run_linear_regression_params(donor_df, sml_params, sml_cols)

In [8]:
def donation_logistic_regression(donor_df):
    sml_params = ["party"]
    sml_cols = ["CMTE_ID", "CITY", "STATE", "ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT", "ENTITY_TP", "party"]
    
    #Run logistic regression to test if we can classify the party
    run_logistic_regression_params(donor_df, sml_params, sml_cols)

In [9]:
def run_sml_params(df, sml_params, sml_cols, model_type):
    #Reduce columns to start with
    #print(df.head())
    for sml_param in sml_params:
        df_all = select_columns(df, sml_cols)     
        
        #Run Linear Regression Model on X,y
        #Set Y column to just the ML model
        y_df = select_columns(df, [sml_param])   
        y_df = y_df.fillna(0)
        y_df = label_enc(y_df)
        y = y_df[sml_param].values
    
        #Set X Cols to the everything but the parameter to run the ML model
        x_cols = sml_cols.copy()        
        x_cols.remove(sml_param)
        X_df = select_columns(df, x_cols)
        X_df = X_df.fillna(0)
        X_df = label_enc(X_df)
        X = X_df[x_cols].values
                
        if model_type == 'linear':
            print(f"Running a Linear Regression Model with y={sml_param} and x_cols={x_cols}")
            #Run Linear Regresion Model on X,y
            run_linear_regression(X, y, sml_param)
        elif model_type == 'logistic':
            print(f"Running a Logistics Regression Model with y={sml_param} and x_cols={x_cols}")
            #Run Logistic Regresion Model on X,y
            run_logistic_regression(X, y, sml_param)

In [10]:
def run_logistic_regression_params(df, sml_params, sml_cols):
    run_sml_params(df, sml_params, sml_cols, "logistic")

In [11]:
def run_linear_regression_params(df, sml_params, sml_cols):
    run_sml_params(df, sml_params, sml_cols, "linear")

In [12]:
def run_logistic_regression(X, y, sml_param):
    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=SML_TEST_SIZE, stratify=y)
    
    model = LogisticRegression(solver='lbfgs', random_state=1)
    model_type = "Logistic Regression"
    #Scale the values
    #X_train_scaled = MinMaxScaler().fit_transform(X_train)
    #Train the model 
    model.fit(X_train, y_train)
    
    #Scale the values
    #X_test_scaled = MinMaxScaler().fit_transform(X_test)
    #Predict the values based on the X test values
    predictions = model.predict(X_test)
    
    score = accuracy_score(y_test, predictions)

In [13]:
def run_linear_regression(X, y, sml_param):    
    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=SML_TEST_SIZE)
    
    #Test out different models
    model = LinearRegression()
    model_type = "Linear Regression"
    
    #Scale the values
    X_train_scaled = MinMaxScaler().fit_transform(X_train)
    X_test_scaled = MinMaxScaler().fit_transform(X_test)
    
    #Train the model 
    model.fit(X_train_scaled, y_train)
    
    #Predict the values based on the X test values
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)
    
    #Save X_test_scaled, y_test, y_pred to table
    #1. Create plot_data and save the image to a S3 bucket?
    #2. Save the output of plot_data(X_test_scaled, y_test, y_pred) county, state, election_year
    #
    
    #Get the score of this model
    score_test = model.score(X_test, y_test)
    score_test_scale = model.score(X_test_scaled, y_test)
    r2_score_val = r2_score(y_test, y_test_pred)
    
    score_str = f"r2_score:{r2_score_val} score_test:{score_test} score_test_scale:{score_test_scale}"
    #Plot the data
    title = f"{model_type}-{sml_param}:{score_str}"
    plot_data(y_test, y_train, y_test_pred, y_train_pred, title)

In [14]:
# Holding area for function to plot data
def plot_data(y_test, y_trained, y_pred_test, y_pred_train, title):
    plt.scatter(y_pred_train, y_pred_train - y_trained, c="blue", label="Training Data")
    plt.scatter(y_pred_test, y_pred_test - y_test, c="orange", label="Testing Data")
    plt.legend()
    plt.hlines(y=0, xmin=y_test.min(), xmax=y_test.max())
    plt.title(title)
    plt.show()    

In [15]:
#Get all donation records for a single state and return it in a dataframe
def query_all(table_name):
    #Run queries to get all donation records from the states into dfs
    select_sql = 'select * from {}'.format(table_name)
    df = pd.read_sql_query(select_sql,con=engine)
    return df

In [16]:
#Get all donation records for a single state and return it in a dataframe
def donor_state_query(state):
    #Run queries to get all donation records from the states into dfs
    select_sql = f'select * from {TABLE_SIX_STATE_DONATIONS} where "STATE"=\'{state.upper()}\''
    donor_df = pd.read_sql_query(select_sql, con=engine)
    return donor_df

In [17]:
def agg_ml():
    print("agg_ml")
    agg_donors_df = query_all(TABLE_AGG_DONORS)
    agg_votes_df = query_all(TABLE_AGG_VOTES)
    #Merge on the three fields that make it unique: county, state, and election_year
    merged_df = agg_donors_df.merge(agg_votes_df, left_on=['county', 'state', 'election_year'], right_on=['county', 'state', 'election_year'], suffixes=("_donors", "_votes"))
        
    #Run a linear regression analysis on the merged dataset
    #donation_votes_linear_regression(merged_df)
    
    #TODO: Now with all states donations and voting results aggregated, predict the number of votes
    #predict_votes_linear_regression(state_model_dict)

In [18]:
def state_ml(committee_df):
    print("state_ml")
    print("Getting FEC committee")
    #Loop through each state and run separate LR models on each state.
    for state in SWING_STATES:
        donor_df = donor_state_query(state)
        
        #Add party column to donor data frame
        donor_df = committee_df.merge(donor_df, left_on='cmte_id', right_on='CMTE_ID')
        donor_df = merge_cmtid_party(donor_df)
        
        #Run the machine learning models on the donation set
        print(f"Linear Regression on state: {state}")
        #donation_linear_regression(donor_df)
        #Run the machine learning models on the donation set
        print(f"Logistic Regression on state: {state}")
        donation_logistic_regression(donor_df)

In [19]:
def main():
    print("Main")
    #agg_ml()
    
    committee_df = pd.read_sql_query('select * from "fec_committee"', con=engine)
    state_ml(committee_df)
    print("End of Main")

In [None]:
main()

Main
state_ml
Getting FEC committee
Linear Regression on state: AZ
Logistic Regression on state: AZ
Running a Logistics Regression Model with y=party and x_cols=['CMTE_ID', 'CITY', 'STATE', 'ZIP', 'EMPLOYER', 'OCCUPATION', 'TRANSACTION_AMT', 'ENTITY_TP']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
