In [None]:
# Import dependencies
# Standard library
import datetime
import json

# Third-party
import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from pandas.io import sql
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sqlalchemy import create_engine
from sqlalchemy import inspect

# Project-local (kept last so its names still win on any clash)
from consts import *
%matplotlib inline

In [None]:
# Connect to the Postgres instance.
# NOTE(review): CREATE_ENGINE_STR is not defined in this notebook; presumably it
# is provided by `from consts import *` — confirm.
engine = create_engine(CREATE_ENGINE_STR)

In [None]:
# Print the names of the tables available through the engine.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported way to list tables.
print(inspect(engine).get_table_names())

In [None]:
#Define the Linear Regression Structured Machine Learning
def state_sml(state_tuple, state, election_yr, unemployment_df, education_df, birth_death_df):
    """Walk one state's per-election county votes and build per-county vote dicts.

    ``state_tuple[0]`` is indexed per election and each entry is iterated by
    county and indexed by county to get that county's votes.
    ``state_tuple[1]`` is indexed the same way to get the matching donor info.
    For every county, ``set_votes_dict`` receives the county's current entry,
    its votes and the donor info; the result is stored back under the county.

    The per-election dict is rebuilt on every outer iteration and nothing is
    returned. ``state``, ``election_yr`` and the three DataFrame arguments are
    not used yet.
    """
    # TODO merge together relevant info for county from unemployment_df, education_df, birth_death_df
    votes_by_election, donors_by_election = state_tuple[0], state_tuple[1]

    for idx, votes_by_county in enumerate(votes_by_election):
        # Donor info belonging to the same election as these county votes
        election_donors = donors_by_election[idx]

        # Fresh per-county accumulator for this election
        per_county = {}
        for county in votes_by_county:
            # Make sure the county has an entry before it is handed on
            current = per_county.setdefault(county, {})
            # Replace the entry with the updated votes dict
            per_county[county] = set_votes_dict(current, votes_by_county[county], election_donors)

In [None]:
def predict_votes_linear_regression(state_model_dict):
    """Load the supporting tables and walk every state's model.

    Reads the ``unemployment``, ``education`` and ``birth_death_rate`` tables
    through the notebook-level ``engine``, then iterates over
    ``state_model_dict``. The per-state prediction call is still disabled
    (see TODO), so apart from the three reads nothing is produced or returned.
    """
    unemployment_df = pd.read_sql_query('select * from "unemployment"', con=engine)
    education_df = pd.read_sql_query('select * from "education"', con=engine)
    birth_death_df = pd.read_sql_query('select * from "birth_death_rate"', con=engine)

    election_yr = 2000
    for state, model in state_model_dict.items():
        # TODO enable prediction for voting
        # state_sml(model, state, election_yr, unemployment_df, education_df, birth_death_df)

        # NOTE(review): the year advances once per *state* — confirm this is
        # intended before enabling the call above.
        election_yr += 4

In [None]:
#Machine Learning models run on the donation data
def donation_linear_regression(donor_df):
    """Run the linear-regression pass over the donation data.

    Hands ``donor_df`` to ``run_linear_regression_params`` with
    ``transaction_amt``, ``employer`` and ``occupation`` as the parameters to
    model and the donation columns listed below as the working column set.
    ``party`` is not part of the column set for this pass.
    """
    donation_columns = [
        "cmt_id",
        "city",
        "state",
        "zip",
        "employer",
        "occupation",
        "transaction_amt",
    ]
    modelled_params = ["transaction_amt", "employer", "occupation"]

    # Run some machine learning models on the donation of the state
    run_linear_regression_params(donor_df, modelled_params, donation_columns)

In [None]:
def donation_logistic_regression(donor_df):
    """Run the logistic-regression pass that tries to classify ``party``.

    Hands ``donor_df`` to ``run_logistic_regression_params`` with ``party`` as
    the only parameter to model and the donation columns (including ``party``)
    as the working column set.
    """
    donation_columns = [
        "cmt_id",
        "city",
        "state",
        "zip",
        "employer",
        "occupation",
        "transaction_amt",
        "party",
    ]
    modelled_params = ["party"]

    # Run logistic regression to test if we can classify the party
    run_logistic_regression_params(donor_df, modelled_params, donation_columns)

In [None]:
#Machine Learning models run on the Voter data
def votes_linear_regression(votes_df):
    """Run the linear-regression pass over the voter data.

    Every vote-count and donation-amount column is modelled in turn; the
    working column set is those same columns plus ``state`` and ``county``.
    """
    modelled_params = [
        "blue_votes",
        "red_votes",
        "other_votes",
        "blue_amt",
        "red_amt",
        "other_amt",
    ]
    # Separate list object: the modelled columns followed by the location columns
    vote_columns = [*modelled_params, "state", "county"]

    run_linear_regression_params(votes_df, modelled_params, vote_columns)

In [None]:
def run_sml_params(df, sml_params, sml_cols, model_type):
    """Fit one model per target column listed in ``sml_params``.

    For each target, ``y`` is that column and ``X`` is every column of
    ``sml_cols`` except the target. Both are NaN-filled with 0 and passed
    through ``label_enc`` before being handed to the selected model runner.

    Args:
        df: Source DataFrame (passed to ``select_columns``).
        sml_params: Column names to use as targets; one model is run per entry.
        sml_cols: Candidate feature columns. The list is not modified.
        model_type: ``'linear'`` or ``'logistic'``.

    Raises:
        ValueError: If ``model_type`` is not ``'linear'`` or ``'logistic'``.
    """
    # Fail fast on a typo instead of silently running no model at all
    if model_type not in ('linear', 'logistic'):
        raise ValueError(
            "model_type must be 'linear' or 'logistic', got {!r}".format(model_type)
        )

    for sml_param in sml_params:
        # Target: just the column being modelled
        y_df = select_columns(df, [sml_param])
        y_df = y_df.fillna(0)
        y_df = label_enc(y_df)
        y = y_df[sml_param].values

        # Features: everything but the target. Build a *new* list each time —
        # removing from sml_cols in place would mutate the caller's list and
        # make every later target train on a progressively smaller feature set.
        x_cols = [col for col in sml_cols if col != sml_param]
        X_df = select_columns(df, x_cols)
        X_df = X_df.fillna(0)
        X_df = label_enc(X_df)
        X = X_df[x_cols].values

        if model_type == 'linear':
            # Run Linear Regression Model on X,y
            run_linear_regression(X, y, x_cols)
        else:
            # Run Logistic Regression Model on X,y
            run_logistic_regression(X, y)

In [None]:
def run_logistic_regression_params(df, sml_params, sml_cols):
    """Run ``run_sml_params`` over ``df`` with the model type fixed to logistic."""
    run_sml_params(
        df,
        sml_params,
        sml_cols,
        model_type="logistic",
    )

In [None]:
def run_linear_regression_params(df, sml_params, sml_cols):
    """Run ``run_sml_params`` over ``df`` with the model type fixed to linear."""
    run_sml_params(
        df,
        sml_params,
        sml_cols,
        model_type="linear",
    )

In [None]:
def run_logistic_regression(X, y):
    """Train a logistic-regression classifier and report its hold-out accuracy.

    The data is split 75/25 (stratified on ``y``, fixed seed), a
    ``LogisticRegression`` with the ``lbfgs`` solver is fit on the training
    part, and accuracy is measured on the test part.

    Args:
        X: Feature matrix.
        y: Class labels aligned with ``X``.

    Returns:
        The accuracy score on the test split. Previously the score was computed
        and then dropped, so the function had no observable result.
    """
    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=78, test_size=0.25, stratify=y
    )

    classifier = LogisticRegression(solver='lbfgs', random_state=1)
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)

    score = accuracy_score(y_test, predictions)
    print("Logistic regression accuracy: {:.3f}".format(score))
    return score

In [None]:
def run_linear_regression(X, y, x_cols):
    """Train a linear regression and print a bucketed evaluation of its predictions.

    The data is split 75/25 with a fixed seed. Features are min-max scaled with
    a scaler fit on the training split only, and that same scaler is applied to
    the test split, so the model is trained and evaluated in one feature space.
    (Previously the model was fit on unscaled data but asked to predict on test
    data scaled by a scaler fit on the test set itself.)

    Targets and predictions are truncated to ints and compared with a confusion
    matrix and classification report; both are printed. Nothing is returned.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        x_cols: Names of the feature columns in ``X`` (currently unused).
    """
    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=0.25)

    # Scale the values: learn the ranges from the training data only, then
    # reuse them for the test data to avoid leakage and a train/test mismatch
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Predict the values based on the X test values
    y_pred = model.predict(X_test_scaled)

    print("Prediction!!")
    print("Confusion Matrix!!")

    # Cast to int for the confusion matrix (int() truncates toward zero)
    y_test = [int(i) for i in y_test]
    y_pred = [int(i) for i in y_pred]

    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)
    print("Classificaiton Report!!")
    report = classification_report(y_test, y_pred)
    print(report)

In [None]:
# Holding area for function to plot data
def plot_data(X, y, y_pred):
    """Print the three series with their lengths, then plot them.

    ``y`` is drawn against ``X`` as a scatter plot and ``y_pred`` against ``X``
    as a red line; the figure is shown immediately.
    """
    # Label, length and contents for each series, in the same order as before
    for label, series in (("X=", X), ("y=", y), ("y_pred", y_pred)):
        print(label + str(len(series)))
        print(series)

    plt.scatter(X, y)
    plt.plot(X, y_pred, color='red')
    plt.show()

In [None]:
def main():
    """Entry point: run the donation linear-regression pass.

    Only the donation linear regression is enabled; the other stages are
    listed below as TODOs.

    NOTE(review): ``donor_df`` is read from the notebook's global namespace but
    is not assigned in any visible cell — confirm where it is loaded, otherwise
    a fresh-kernel "Run All" will fail here.
    """
    # Stage 1 (enabled): machine learning models on the donation set
    donation_linear_regression(donor_df)

    # Stage 2 (disabled) — TODO Once we have full dataset, then enable logistic regression
    # donation_logistic_regression(donor_df)

    # Stage 3 (disabled) — TODO: Now with all states donations and voting results
    # aggregated, predict the number of votes
    # predict_votes_linear_regression(state_model_dict)

    # Stage 4 (disabled) — TODO: Run Linear regression on the votes
    # votes_linear_regression(votes_df)

In [None]:
# Run the analysis defined in main().
main()