In [1]:
# Import dependencies
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
import pandas as pd
from pandas.io import sql
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
import numpy
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from consts import *
%matplotlib inline

In [2]:
# Connecting to Postgres instance
engine = create_engine(CREATE_ENGINE_STR)

In [3]:
# Printing info for table names 
print (engine.table_names())

['education', 'committee_summary_2020', 'donations', 'agg_county_votes', 'agg_county_donors', 'fec_donor_az', 'health_metrics', 'birth_death_rate', 'postal_codes', 'fec_donor_mi', 'fec_donor_wi', 'fec_committee', 'fec_donor_pa', 'pres_votes_6t', 'unemployment', 'fec_donor_nc', 'fec_donor_fl']


In [4]:
#Define the Linear Regression Structured Machine Learning
def state_sml(state_tuple, state, election_yr, unemployment_df, education_df, birth_death_df):
    #TODO merge together relevant info for county from unemployment_df, education_df, birth_death_df
    counties_tuples = state_tuple[0]    
    donor_tuples = state_tuple[1]
    
    #Loop through each election year county dict
    for i in range(0, len(counties_tuples)):
        #Select the corresponding counties/votes and donor info for that election yr
        counties_votes_dict = counties_tuples[i]
        donor_tuple = donor_tuples[i]
        
        county_dict = {}
        #Loop through all the votes organized by county
        for c in counties_votes_dict:
            #Kepp unique dict of counties
            if c not in county_dict:
                county_dict[c] = {}
            #Get the number of votes by county
            county_votes = counties_votes_dict[c]
            #Update the vote dict
            votes_dict = set_votes_dict(county_dict[c], county_votes, donor_tuple)
            #Update the county dict with the updated votes dict
            county_dict[c] = votes_dict

In [5]:
def predict_votes_linear_regression(state_model_dict):
    election_yr = 2000
    unemployment_df = pd.read_sql_query('select * from "unemployment"',con=engine)
    education_df = pd.read_sql_query('select * from "education"',con=engine)
    birth_death_df = pd.read_sql_query('select * from "birth_death_rate"',con=engine)
    for state in state_model_dict.keys():
        model = state_model_dict[state]
        #TODO enable prediction for voting
        #state_sml(model, state, election_yr, unemployment_df, education_df, birth_death_df)
        election_yr += 4

In [6]:
#Machine Learning models run on the donation data
def donation_votes_linear_regression(df):
    print(df.head())
    #Will run a separate LR model on each of the SML_params
    sml_params = ["blue_votes", "red_votes", "blue_amt", "red_amt", "county", "state_donors", "election_year_votes", "PopPct_Urban", "Unemployment", "PopDen_Urban", "PopPct_Rural", "PopDen_Rural"]
    #Set all the votes cols of interest
    votes_cols = ["blue_votes", "red_votes", "other_votes", "total_votes", "percent_blue_votes", "percent_red_votes", "percent_other_votes", "county", "state_donors", "election_year_votes", "PopPct_Urban", "Unemployment", "PopDen_Urban", "PopPct_Rural", "PopDen_Rural"]
    #Set all the donors cols of interest
    donors_cols = ["blue_amt", "red_amt", "other_amt", "total_amt", "percent_blue_votes", "percent_red_votes", "percent_other_votes"]
    
    sml_cols = votes_cols + donors_cols
    #Run some machine learning models on the donation of the state
    run_linear_regression_params(df, sml_params, sml_cols)

In [7]:
#Machine Learning models run on the donation data
def donation_linear_regression(donor_df):
    sml_params = ["transaction_amt", "employer", "occupation", "entity_tp"]
    sml_cols = ["cmt_id", "city", "state", "zip", "employer", "occupation", "transaction_amt", "entity_tp"]
    
    #Run some machine learning models on the donation of the state
    run_linear_regression_params(donor_df, sml_params, sml_cols)

In [8]:
def donation_logistic_regression(donor_df):
    sml_params = ["party"]
    sml_cols = ["cmt_id", "city", "state", "zip", "employer", "occupation", "transaction_amt", "entity_tp"]
    
    #Run logistic regression to test if we can classify the party
    run_logistic_regression_params(donor_df, sml_params, sml_cols)

In [9]:
def run_sml_params(df, sml_params, sml_cols, model_type):
    #Reduce columns to start with
    #print(df.head())
    for sml_param in sml_params:
        df_all = select_columns(df, sml_cols)     
        
        #Run Linear Regression Model on X,y
        #Set Y column to just the ML model
        y_df = select_columns(df, [sml_param])   
        y_df = y_df.fillna(0)
        y_df = label_enc(y_df)
        y = y_df[sml_param].values
    
        #Set X Cols to the everything but the parameter to run the ML model
        x_cols = sml_cols.copy()
        print(f"sml param{sml_param}")
        
        x_cols.remove(sml_param)
        X_df = select_columns(df, x_cols)
        X_df = X_df.fillna(0)
        X_df = label_enc(X_df)
        X = X_df[x_cols].values
        
        if model_type == 'linear':
            print(f"Running a Linear Regression Model with y={sml_param} and x_cols={x_cols}")
            #Run Linear Regresion Model on X,y
            run_linear_regression(X, y, x_cols)
        elif model_type == 'logistic':
            print(f"Running a Logistics Regression Model with y={sml_param} and x_cols={x_cols}")
            #Run Logistic Regresion Model on X,y
            run_logistic_regression(X, y)

In [10]:
def run_logistic_regression_params(df, sml_params, sml_cols):
    run_sml_params(df, sml_params, sml_cols, "logistic")

In [11]:
def run_linear_regression_params(df, sml_params, sml_cols):
    run_sml_params(df, sml_params, sml_cols, "linear")

In [12]:
def run_logistic_regression(X, y):
    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=SML_TEST_SIZE, stratify=y)
    
    classifier = LogisticRegression(solver='lbfgs', random_state=1)
    classifier.fit(X_train, y_train)
    
    #Scale the values
    X_test_scaled = MinMaxScaler().fit_transform(X_test)
    #Predict the values based on the X test values
    predictions = classifier.predict(X_test_scaled)
    
    score = accuracy_score(y_test, predictions)

In [13]:
def run_linear_regression(X, y, x_cols):
    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=SML_TEST_SIZE)
    model = LinearRegression()
    #Train the model 
    model.fit(X_train, y_train)
    
    #Scale the values
    X_test_scaled = MinMaxScaler().fit_transform(X_test)
    #Predict the values based on the X test values
    y_pred = model.predict(X_test_scaled)
        
    print("Prediction!!")
    print("Confusion Matrix!!")    
    #Cast to int for the confusion matrix
    y_test = [int(i) for i in y_test]
    y_pred = [int(i) for i in y_pred]
    
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)
    print("Classificaiton Report!!")
    report = classification_report(y_test, y_pred)
    print(report)

In [14]:
# Holding area for function to plot data
def plot_data(X, y, y_pred):
    print("X=" + str(len(X)))
    print(X)
    print("y=" + str(len(y)))
    print(y)
    print("y_pred" + str(len(y_pred)))
    print(y_pred)
    
    plt.scatter(X, y)
    plt.plot(X, y_pred, color='red')
    plt.show()

In [15]:
#Get all donation records for a single state and return it in a dataframe
def query_all(table_name):
    #Run queries to get all donation records from the states into dfs
    select_sql = 'select * from {}'.format(table_name)
    df = pd.read_sql_query(select_sql,con=engine)
    return df

In [16]:
#Get all donation records for a single state and return it in a dataframe
def donor_state_query(state, engine):
    #Run queries to get all donation records from the states into dfs
    donor_table_name = '"fec_donor_{}"'.format(state.lower())    
    donor_select_sql = 'select * from {}'.format(donor_table_name)
    donor_df = pd.read_sql_query(donor_select_sql,con=engine)
    return donor_df

In [17]:
def main():
    agg_donors_df = query_all(TABLE_AGG_DONORS)
    agg_votes_df = query_all(TABLE_AGG_VOTES)
    merged_df = agg_donors_df.merge(agg_votes_df, left_on='county', right_on='county', suffixes=("_donors", "_votes"))
    #Run a linear regression analysis on the merged dataset
    donation_votes_linear_regression(merged_df)
    #TODO: Now with all states donations and voting results aggregated, predict the number of votes
    #predict_votes_linear_regression(state_model_dict)
    
    #Loop through each state and run separate LR models on each state.
    for state in SWING_STATES:
        donor_df = donor_state_query(state, engine)
        #Run the machine learning models on the donation set
        donation_linear_regression(donor_df)
        
        #TODO Once we have full dataset, then enable logistic regression
        #donation_logistic_regression(donor_df)

In [18]:
main()

   index_donors  blue_amt  red_amt  other_amt  total_amt  percent_blue_donors  \
0             0         0   111500          0     111500                  0.0   
1             0         0   111500          0     111500                  0.0   
2             0         0   111500          0     111500                  0.0   
3             0         0   111500          0     111500                  0.0   
4             0         0   111500          0     111500                  0.0   

   percent_red_donors  percent_other_donors    county state_donors  ...  \
0                 1.0                   0.0  Maricopa           AZ  ...   
1                 1.0                   0.0  Maricopa           AZ  ...   
2                 1.0                   0.0  Maricopa           AZ  ...   
3                 1.0                   0.0  Maricopa           AZ  ...   
4                 1.0                   0.0  Maricopa           AZ  ...   

   percent_blue_votes  percent_red_votes  percent_other_votes 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Running a Linear Regression Model with y=PopPct_Rural and x_cols=['blue_votes', 'red_votes', 'other_votes', 'total_votes', 'percent_blue_votes', 'percent_red_votes', 'percent_other_votes', 'county', 'state_donors', 'election_year_votes', 'PopPct_Urban', 'Unemployment', 'PopDen_Urban', 'PopDen_Rural', 'blue_amt', 'red_amt', 'other_amt', 'total_amt', 'percent_blue_votes', 'percent_red_votes', 'percent_other_votes']
Prediction!!
Confusion Matrix!!
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0]
 [

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


sml paramtransaction_amt
Running a Linear Regression Model with y=transaction_amt and x_cols=['cmt_id', 'city', 'state', 'zip', 'employer', 'occupation', 'entity_tp']
Prediction!!
Confusion Matrix!!
[[ 0 13  1  0  0]
 [ 0  1  1  0  0]
 [ 0  1  3  0  0]
 [ 0  3  1  0  0]
 [ 0  1  0  0  0]]
Classificaiton Report!!
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        14
           1       0.05      0.50      0.10         2
           2       0.50      0.75      0.60         4
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         1

    accuracy                           0.16        25
   macro avg       0.11      0.25      0.14        25
weighted avg       0.08      0.16      0.10        25

sml paramemployer
Running a Linear Regression Model with y=employer and x_cols=['cmt_id', 'city', 'state', 'zip', 'occupation', 'transaction_amt', 'entity_tp']
Prediction!!
Confusion Matrix!!
[[0 0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


sml paramtransaction_amt
Running a Linear Regression Model with y=transaction_amt and x_cols=['cmt_id', 'city', 'state', 'zip', 'employer', 'occupation', 'entity_tp']
Prediction!!
Confusion Matrix!!
[[ 0 11  0  0  0]
 [ 0  1  0  0  0]
 [ 0  5  0  0  0]
 [ 0  4  0  0  0]
 [ 0  4  0  0  0]]
Classificaiton Report!!
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.04      1.00      0.08         1
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         4

    accuracy                           0.04        25
   macro avg       0.01      0.20      0.02        25
weighted avg       0.00      0.04      0.00        25

sml paramemployer
Running a Linear Regression Model with y=employer and x_cols=['cmt_id', 'city', 'state', 'zip', 'occupation', 'transaction_amt', 'entity_tp']
Prediction!!
Confusion Matrix!!
[[0 0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


sml paramtransaction_amt
Running a Linear Regression Model with y=transaction_amt and x_cols=['cmt_id', 'city', 'state', 'zip', 'employer', 'occupation', 'entity_tp']
Prediction!!
Confusion Matrix!!
[[ 0  0  0 16  0  0]
 [ 0  0  0  1  0  0]
 [ 0  0  0  4  0  0]
 [ 0  0  0  0  0  0]
 [ 0  0  0  1  0  0]
 [ 0  0  0  3  0  0]]
Classificaiton Report!!
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      16.0
           1       0.00      0.00      0.00       1.0
           3       0.00      0.00      0.00       4.0
           4       0.00      0.00      0.00       0.0
           5       0.00      0.00      0.00       1.0
           6       0.00      0.00      0.00       3.0

    accuracy                           0.00      25.0
   macro avg       0.00      0.00      0.00      25.0
weighted avg       0.00      0.00      0.00      25.0

sml paramemployer
Running a Linear Regression Model with y=employer and x_cols=['cmt_id', 'city', 'state', 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


sml paramtransaction_amt
Running a Linear Regression Model with y=transaction_amt and x_cols=['cmt_id', 'city', 'state', 'zip', 'employer', 'occupation', 'entity_tp']
Prediction!!
Confusion Matrix!!
[[ 0  1  0]
 [ 0 18  0]
 [ 0  6  0]]
Classificaiton Report!!
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.72      1.00      0.84        18
           2       0.00      0.00      0.00         6

    accuracy                           0.72        25
   macro avg       0.24      0.33      0.28        25
weighted avg       0.52      0.72      0.60        25

sml paramemployer
Running a Linear Regression Model with y=employer and x_cols=['cmt_id', 'city', 'state', 'zip', 'occupation', 'transaction_amt', 'entity_tp']
Prediction!!
Confusion Matrix!!
[[0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


sml paramtransaction_amt
Running a Linear Regression Model with y=transaction_amt and x_cols=['cmt_id', 'city', 'state', 'zip', 'employer', 'occupation', 'entity_tp']
Prediction!!
Confusion Matrix!!
[[15  0  0  0  0  0]
 [ 2  0  0  0  0  0]
 [ 2  0  0  0  0  0]
 [ 1  0  0  0  0  0]
 [ 4  0  0  0  0  0]
 [ 1  0  0  0  0  0]]
Classificaiton Report!!
              precision    recall  f1-score   support

           0       0.60      1.00      0.75        15
           2       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         1

    accuracy                           0.60        25
   macro avg       0.10      0.17      0.12        25
weighted avg       0.36      0.60      0.45        25

sml paramemployer
Running a Linear Regression Model with y=employer and x_cols=['cmt_id', 'city', 'state', 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


sml paramtransaction_amt
Running a Linear Regression Model with y=transaction_amt and x_cols=['cmt_id', 'city', 'state', 'zip', 'employer', 'occupation', 'entity_tp']
Prediction!!
Confusion Matrix!!
[[0 9 0 0 0 0 0]
 [0 4 0 0 0 0 0]
 [0 1 0 0 0 0 0]
 [0 6 0 0 0 0 0]
 [0 2 0 0 0 0 0]
 [0 1 0 0 0 0 0]
 [0 2 0 0 0 0 0]]
Classificaiton Report!!
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         9
           2       0.16      1.00      0.28         4
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         2

    accuracy                           0.16        25
   macro avg       0.02      0.14      0.04        25
weighted avg       0.03      0.16      0.04        25

sml paramemployer
Running a Linear Regression Model with y=e

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
