In [1]:
# Import dependencies
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
import pandas as pd
from pandas.io import sql
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score   
from sklearn.linear_model import LogisticRegression
import numpy
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.metrics import r2_score
from consts import *
from sklearn.svm import SVC
from gcloud import storage
%matplotlib inline

In [2]:
# Connecting to Postgres instance
engine = create_engine(CREATE_ENGINE_STR)

In [3]:
# Printing info for table names 
print (engine.table_names())

['committees', 'candidates', 'education', 'res_lr', 'res_log', 'six_state_donations', 'donations', 'health_metrics', 'agg_county_votes', 'birth_death_rate', 'postal_codes', 'agg_county_donors', 'pres_votes_6t', 'unemployment']


In [4]:
#Define the Linear Regression Structured Machine Learning
def state_sml(state_tuple, state, election_yr, unemployment_df, education_df, birth_death_df):
    #TODO merge together relevant info for county from unemployment_df, education_df, birth_death_df
    counties_tuples = state_tuple[0]    
    donor_tuples = state_tuple[1]
    
    #Loop through each election year county dict
    for i in range(0, len(counties_tuples)):
        #Select the corresponding counties/votes and donor info for that election yr
        counties_votes_dict = counties_tuples[i]
        donor_tuple = donor_tuples[i]
        
        county_dict = {}
        #Loop through all the votes organized by county
        for c in counties_votes_dict:
            #Kepp unique dict of counties
            if c not in county_dict:
                county_dict[c] = {}
            #Get the number of votes by county
            county_votes = counties_votes_dict[c]
            #Update the vote dict
            votes_dict = set_votes_dict(county_dict[c], county_votes, donor_tuple)
            #Update the county dict with the updated votes dict
            county_dict[c] = votes_dict

In [5]:
def predict_votes_linear_regression(state_model_dict):
    election_yr = 2000
    unemployment_df = pd.read_sql_query('select * from "unemployment"',con=engine)
    education_df = pd.read_sql_query('select * from "education"',con=engine)
    birth_death_df = pd.read_sql_query('select * from "birth_death_rate"',con=engine)
    for state in state_model_dict.keys():
        model = state_model_dict[state]
        #TODO enable prediction for voting
        #state_sml(model, state, election_yr, unemployment_df, education_df, birth_death_df)
        election_yr += 4

In [6]:
#Machine Learning models run on the donation data
def donation_votes_linear_regression(df):
    #Will run a separate LR model on each of the SML_params
    sml_params = ["blue_votes", "red_votes", "blue_amt", "red_amt", "blue_num", "red_num"]
    #Set all the votes cols of interest
    votes_cols = ["blue_votes", "red_votes", "total_votes", "county", "state", "election_year", "PopPct_Urban", "Unemployment", "PopDen_Urban", "PopPct_Rural", "PopDen_Rural", "winning_party"]
    #Set all the donors cols of interest
    donors_cols = ["blue_amt", "red_amt", "total_amt", "blue_num", "red_num"]
    
    sml_cols = votes_cols + donors_cols
    #Run some machine learning models on the donation of the state
    run_linear_regression_params(df, sml_params, sml_cols, "all")

In [7]:
#Machine Learning models run on the donation data
def donation_linear_regression(donor_df, state):
    sml_params = ["TRANSACTION_AMT"]
    sml_cols = ["CITY", "STATE", "ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT"]
    
    #Run some machine learning models on the donation of the state
    run_linear_regression_params(donor_df, sml_params, sml_cols, state)

In [8]:
def donation_logistic_regression(donor_df, state):
    sml_params = ["party"]
    sml_cols = ["CITY", "STATE", "ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT", "party"]
    
    #Run logistic regression to test if we can classify the party
    run_logistic_regression_params(donor_df, sml_params, sml_cols, state)

In [9]:
def donation_svc_linear_regression(donor_df, state):
    sml_params = ["party"]
    sml_cols = ["CITY", "STATE", "ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT", "party"]
    
    #Run logistic regression to test if we can classify the party
    run_svc_linear_params(donor_df, sml_params, sml_cols, state)

In [10]:
def run_sml_params(df, sml_params, sml_cols, model_type, state):
    #Reduce columns to start with
    for sml_param in sml_params:
        df_all = select_columns(df, sml_cols)     
                
        y_df = select_columns(df, [sml_param])
        y_df = y_df.fillna(0)
        y_df = label_enc(y_df)
        y = y_df[sml_param].values
        
        x_cols = sml_cols.copy()
        x_cols.remove(sml_param)
        X_df = select_columns(df, x_cols)
        X_df = X_df.fillna(0)
        #Set X Cols to the everything but the parameter to run the ML model
        X_df = label_enc(X_df)
        X = X_df[x_cols].values
            
        if model_type == 'linear':
            #Run Linear Regression Model on X,y
            #Set Y column to just the ML model

            print(f"Running a Linear Regression Model with y={sml_param} and x_cols={x_cols}")
            #Run Linear Regresion Model on X,y
            run_linear_regression(X, y, sml_param, state)
        elif model_type == 'logistic':
            print(f"Running a Logistics Regression Model with y={sml_param} and x_cols={x_cols}")
            
            data_scaler = StandardScaler()
            y_df_scaled = data_scaler.fit_transform(y_df)
            X_df_scaled = data_scaler.fit_transform(X_df)
            
            X = X_df[x_cols].values
            y = y_df[sml_param].values
            #Run Logistic Regresion Model on X,y
            run_logistic_regression(X, y, state, sml_param)
        #elif model_type == 'svc linear':
            #print(f"Running a SVC Linear Model with y={sml_param} and x_cols={x_cols}")
            #Run Logistic Regresion Model on X,y
            #run_svc_regression(X, y, state)

In [11]:
def run_svc_linear_params(df, sml_params, sml_cols, state):
    run_sml_params(df, sml_params, sml_cols, "svc linear", state)

In [12]:
def run_logistic_regression_params(df, sml_params, sml_cols, state):
    run_sml_params(df, sml_params, sml_cols, "logistic", state)

In [13]:
def run_linear_regression_params(df, sml_params, sml_cols, state):
    run_sml_params(df, sml_params, sml_cols, "linear", state)

In [14]:
def run_svc_regression(X, y, state):
    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=SML_TEST_SIZE, stratify=y)
    
    model = SVC(kernel='linear')
    model_type = "SVC Linear"
    
    X_train_scaled = MinMaxScaler().fit_transform(X_train)
    #Train the model 
    model.fit(X_train_scaled, y_train)
    
    #Predict the values based on the X test values
    y_pred = model.predict(X_test)
    
    score = accuracy_score(y_test, y_pred)
    
    matrix = confusion_matrix(y_test, y_pred, output_dict=True)

In [15]:
def run_logistic_regression(X, y, state, sml_param):
    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=SML_TEST_SIZE, stratify=y)
    
    #Which solver to fit the model
    model_type = "Logistic Regression"
        
    #Set the different types of solvers to compare
    solvers = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']
    solver = solvers[0]
    model = LogisticRegression(solver=solver, max_iter=400, random_state=1)
    
    #Scale the values
    X_train_scaled = MinMaxScaler().fit_transform(X_train)
    X_test_scaled = MinMaxScaler().fit_transform(X_test)
    
    #Train the model 
    model.fit(X_train_scaled, y_train)
    
    #Predict the values based on the X test values
    y_pred = model.predict(X_test_scaled)
    
    # Calculate Scores 
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    #df = pd.DataFrame(report).transpose()
    df = pd.DataFrame()
    df["accuracy"] = accuracy
    df["recall"] = recall
    df["precision"] = precision
    df["f1"] = f1
    df["sml_param"] = sml_param
    df["state"] = state
    df.to_sql(TABLE_RES_LOG, con=engine, if_exists="append")
    
    print(df.head())

In [16]:
def run_linear_regression(X, y, sml_param, state):    
    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=SML_TEST_SIZE)
    
    #Test out different models
    model = LinearRegression()
    #model = RandomForestClassifier()
    model_type = "lr"
    file_name = f"{model_type}_{sml_param}_{state}.png"
    #Scale the values
    X_train_scaled = MinMaxScaler().fit_transform(X_train)
    X_test_scaled = MinMaxScaler().fit_transform(X_test)
        
    #Train the model 
    model.fit(X_train_scaled, y_train)
    
    #Predict the values based on the X test values
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)
    
    #Get the score of this model
    r2_score_val = r2_score(y_test, y_test_pred)
    score_str = f"r2_score:{r2_score_val}"
    #Plot the data
    title = f"{model_type}-{sml_param}:{score_str}"
    plot_data(y_test, y_train, y_test_pred, y_train_pred, title, file_name)
    
    df = pd.DataFrame()
    df['state'] = [state]
    df['sml_param'] = [sml_param]
    df['r2_score'] = [r2_score_val]
    df['file_name'] = [file_name]
    df.to_sql(TABLE_RES_LR, con=engine, if_exists="append")

In [17]:
# Holding area for function to plot data
def plot_data(y_test, y_trained, y_pred_test, y_pred_train, title, file_name):
    plt.scatter(y_pred_train, y_pred_train - y_trained, c="blue", label="Training Data")
    plt.scatter(y_pred_test, y_pred_test - y_test, c="orange", label="Testing Data")
    plt.legend()
    plt.hlines(y=0, xmin=y_test.min(), xmax=y_test.max())
    plt.title(title)
    #plt.show()
    
    file_dir = f"results/{file_name}"
    plt.savefig(file_dir)
    
    #Clear the plot
    plt.clf()
    
    #Save the plot data to gcloud
    #save_image_to_gcloud_lr(plt, file_name)

In [18]:
#Aggregate tables are the output of this script, drop them to start fresh
def drop_res_tables():
    if DROP_AGG_TABLE:
        sql.execute('DROP TABLE IF EXISTS %s'%TABLE_RES_LR, engine)
        sql.execute('DROP TABLE IF EXISTS %s'%TABLE_RES_LOG, engine)
        sql.execute('DROP TABLE IF EXISTS %s'%TABLE_RES_SVC, engine)

In [19]:
#Get all donation records for a single state and return it in a dataframe
def query_all(table_name):
    #Run queries to get all donation records from the states into dfs
    select_sql = f'select * from {table_name}'
    df = pd.read_sql_query(select_sql,con=engine)
    return df

In [20]:
#Get all donation records for a single state and return it in a dataframe
def donor_state_query(state):
    #Run queries to get all donation records from the states into dfs
    #select_sql = f'select * from {TABLE_SIX_STATE_DONATIONS} where "STATE"=\'{state.upper()}\''
    select_sql = f'select * from {TABLE_SIX_STATE_DONATIONS} where "STATE"=\'{state.upper()}\' LIMIT 10000'
    donor_df = pd.read_sql_query(select_sql, con=engine)
    return donor_df

In [21]:
def agg_ml():
    print("agg_ml")
    agg_donors_df = query_all(TABLE_AGG_DONORS)
    agg_votes_df = query_all(TABLE_AGG_VOTES)
    #Merge on the three fields that make it unique: county, state, and election_year
    merged_df = agg_donors_df.merge(agg_votes_df, left_on=['county', 'state', 'election_year'], right_on=['county', 'state', 'election_year'], suffixes=("_donors", "_votes"))
        
    #Run a linear regression analysis on the merged dataset
    donation_votes_linear_regression(merged_df)
    
    #TODO: Now with all states donations and voting results aggregated, predict the number of votes
    #predict_votes_linear_regression(state_model_dict)

In [22]:
def state_ml(committee_df):
    #Loop through each state and run separate LR models on each state.
    for state in SWING_STATES:
        print(f"state_ml {state}")
        donor_df = donor_state_query(state)
        
        #Add party column to donor data frame
        donor_df = committee_df.merge(donor_df, left_on='CMTE_ID', right_on='CMTE_ID')
        
        #Print unique ids
        unique_aff = donor_df["CMTE_PTY_AFFILIATION"].unique()
        print(unique_aff)
        
        donor_df = merge_cmtid_party(donor_df)

        #Run the machine learning models on the donation set
        print(f"Linear Regression on state: {state}")
        donation_linear_regression(donor_df, state)
        #Run the machine learning models on the donation set
        
        print(f"Logistic Regression on state: {state}")
        donation_logistic_regression(donor_df, state)
        
        #print(f"SVC Linear Model on state: {state}")
        #donation_svc_linear_regression(donor_df, state)

In [23]:
def main(committee_df):
    drop_res_tables()
    
    print("Main")
    #agg_ml()
    
    state_ml(committee_df)
    print("End of Main")

In [24]:
committee_df = pd.read_sql_query('select * from "committees"', con=engine)

In [25]:
main(committee_df)

Main
state_ml AZ
['UNK' 'NNE' None 'REP' 'CIT' 'DEM' 'LIB' 'NAT' 'IND' 'DFL' 'OTH']
Linear Regression on state: AZ
Running a Linear Regression Model with y=TRANSACTION_AMT and x_cols=['CITY', 'STATE', 'ZIP', 'EMPLOYER', 'OCCUPATION']
Logistic Regression on state: AZ
Running a Logistics Regression Model with y=party and x_cols=['CITY', 'STATE', 'ZIP', 'EMPLOYER', 'OCCUPATION', 'TRANSACTION_AMT']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Empty DataFrame
Columns: [accuracy, recall, precision, f1, sml_param, state]
Index: []
state_ml MI
[None 'UNK' 'DEM' 'NNE' 'REP' 'NAT' 'LIB' 'GRE' 'Dem' 'SUB']
Linear Regression on state: MI
Running a Linear Regression Model with y=TRANSACTION_AMT and x_cols=['CITY', 'STATE', 'ZIP', 'EMPLOYER', 'OCCUPATION']
Logistic Regression on state: MI
Running a Logistics Regression Model with y=party and x_cols=['CITY', 'STATE', 'ZIP', 'EMPLOYER', 'OCCUPATION', 'TRANSACTION_AMT']
Empty DataFrame
Columns: [accuracy, recall, precision, f1, sml_param, state]
Index: []
state_ml FL
[None 'NNE' 'UNK' 'REP' 'DEM' 'Rep' 'OTH' 'DFL']
Linear Regression on state: FL
Running a Linear Regression Model with y=TRANSACTION_AMT and x_cols=['CITY', 'STATE', 'ZIP', 'EMPLOYER', 'OCCUPATION']
Logistic Regression on state: FL
Running a Logistics Regression Model with y=party and x_cols=['CITY', 'STATE', 'ZIP', 'EMPLOYER', 'OCCUPATION', 'TRANSACTION_AMT']
Empty DataFrame
Columns: [accuracy, recall, precision, f1, sml_p

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Empty DataFrame
Columns: [accuracy, recall, precision, f1, sml_param, state]
Index: []
state_ml WI
[None 'UNK' 'NNE' 'REP' 'DEM' 'PAC' 'DFL' 'NAT']
Linear Regression on state: WI
Running a Linear Regression Model with y=TRANSACTION_AMT and x_cols=['CITY', 'STATE', 'ZIP', 'EMPLOYER', 'OCCUPATION']
Logistic Regression on state: WI
Running a Logistics Regression Model with y=party and x_cols=['CITY', 'STATE', 'ZIP', 'EMPLOYER', 'OCCUPATION', 'TRANSACTION_AMT']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Empty DataFrame
Columns: [accuracy, recall, precision, f1, sml_param, state]
Index: []
End of Main


<Figure size 432x288 with 0 Axes>