In [46]:
# Import dependencies
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
import pandas as pd
from pandas.io import sql
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score   
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.metrics import r2_score
from consts import *
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from collections import Counter
%matplotlib inline

In [47]:
# Connecting to Postgres instance.
# This single SQLAlchemy engine is shared by every query and every results write below.
# CREATE_ENGINE_STR is not defined in this notebook; presumably it is provided by
# `from consts import *` -- verify.
engine = create_engine(CREATE_ENGINE_STR)

In [48]:
def donor_state_query(state, limit=10000):
    """Return donation records for a single state as a DataFrame.

    Args:
        state: State abbreviation; it is upper-cased before being compared
            with the "STATE" column.
        limit: Maximum number of rows to fetch.  Defaults to 10000, the cap
            that used to be hard-coded in the query.  Pass ``None`` to fetch
            every matching row.

    Returns:
        pandas.DataFrame holding the rows selected from
        TABLE_SIX_STATE_DONATIONS.
    """
    # Imported locally so this cell works without editing the notebook's import cell.
    from sqlalchemy import text

    # The state value travels as a bound parameter instead of being spliced into the SQL text.
    select_sql = f'select * from {TABLE_SIX_STATE_DONATIONS} where "STATE" = :state'
    if limit is not None:
        # int() guarantees that only a number is ever interpolated here.
        # Without an ORDER BY the database is free to return any `limit` matching rows.
        select_sql += f' LIMIT {int(limit)}'

    return pd.read_sql_query(text(select_sql), con=engine, params={"state": state.upper()})

In [49]:
# Load the whole "committees" table once at notebook level; model_state() joins each
# state's donation rows against this frame on CMTE_ID.
committee_df = pd.read_sql_query('select * from "committees"', con=engine)

In [50]:
def _encode_as_labels(frame, columns):
    """Label-encode the given columns of a DataFrame.

    Each column in ``columns`` is cast to ``str``, fitted with its own
    ``LabelEncoder`` and stored as ``<column>_STRING``; the original column is
    then dropped.  The work is done on a copy, so ``frame`` is never modified
    (this is what avoids pandas' SettingWithCopyWarning).

    Args:
        frame: DataFrame containing every column named in ``columns``.
        columns: Column names to encode, in the order the encoded columns
            should be appended.

    Returns:
        A new DataFrame: the untouched columns followed by the ``*_STRING``
        integer-label columns.
    """
    encoded = frame.copy()
    for column in columns:
        # astype(str) gives the encoder a uniform string column (missing values become text).
        encoded[f"{column}_STRING"] = LabelEncoder().fit_transform(encoded[column].astype(str))
    return encoded.drop(columns=list(columns))


def _weighted_scores(y_true, y_pred):
    """Return ``(accuracy, recall, precision, f1)`` for a set of predictions.

    Recall, precision and F1 use ``average='weighted'``.
    """
    return (
        accuracy_score(y_true, y_pred),
        recall_score(y_true, y_pred, average='weighted'),
        precision_score(y_true, y_pred, average='weighted'),
        f1_score(y_true, y_pred, average='weighted'),
    )


def model_state(state):
    """Train, evaluate and record party classifiers for one state's donations.

    Pulls the state's donation rows, joins them to ``committee_df`` on
    ``CMTE_ID``, label-encodes the categorical columns, and fits a logistic
    regression and a random forest that predict the encoded party label.
    Accuracy plus weighted recall/precision/F1 on the held-out split are
    printed and stored through ``save_to_res_log`` / ``save_to_res_rf``.

    Args:
        state: State abbreviation passed on to ``donor_state_query``.

    Returns:
        None.
    """
    # Get data
    donor_df = donor_state_query(state)
    donor_df = committee_df.merge(donor_df, left_on='CMTE_ID', right_on='CMTE_ID')
    # merge_cmtid_party is not defined in this part of the notebook; the code below
    # relies on it leaving a "party" column on the frame.
    donor_df = merge_cmtid_party(donor_df)

    # Set parameters
    params = ["party", "CMTE_CITY", "CMTE_ST", "CMTE_ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT"]
    # Encoding order is kept from the original cell so the feature columns come out in the same order.
    categorical_cols = ["EMPLOYER", "OCCUPATION", "party", "CMTE_CITY", "CMTE_ST", "CMTE_ZIP"]

    # Encode for model (the unencoded columns are dropped by the helper).
    # NOTE(review): the encoders are fitted on the full frame before the train/test split.
    # NOTE(review): the CMTE_* columns describe the receiving committee; if "party" is derived
    # from the committee they may leak the label, which would be consistent with the ~1.0
    # random-forest scores recorded in the output -- confirm before trusting these numbers.
    model_df = _encode_as_labels(donor_df[params], categorical_cols)

    # Which ml parameter is the Y.  It must be the encoded column: the raw "party"
    # column no longer exists at this point, so using "party" raised a KeyError.
    ml_param = "party_STRING"
    X = model_df.drop(columns=[ml_param])
    y = model_df[ml_param]

    # Split training / testing (fixed seed for repeatable splits)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

    # Fit the scaler on the training split only, then apply it to both splits
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Logistic regression: fit, evaluate, record
    log_classifier = LogisticRegression(solver="lbfgs", max_iter=200)
    log_classifier.fit(X_train_scaled, y_train)
    acc_score, recall, precision, f1 = _weighted_scores(y_test, log_classifier.predict(X_test_scaled))
    print(f" Logistic regression model accuracy: {acc_score:.3f} recall:{recall} precision:{precision} f1:{f1}")
    save_to_res_log(acc_score, recall, precision, f1, ml_param, state)

    # Random forest: fit, evaluate, record
    rf_model = RandomForestClassifier(n_estimators=128, random_state=78)
    rf_model = rf_model.fit(X_train_scaled, y_train)
    acc_score, recall, precision, f1 = _weighted_scores(y_test, rf_model.predict(X_test_scaled))
    print(f" Random forest predictive accuracy: {acc_score:.3f}")
    # This line used to be labelled "Logistic regression", which mislabelled the forest's metrics.
    print(f" Random forest model accuracy: {acc_score:.3f} recall:{recall} precision:{precision} f1:{f1}")
    save_to_res_rf(acc_score, recall, precision, f1, ml_param, state)

In [51]:
def save_to_res_log(accuracy, recall, precision, f1, sml_param, state):
    """Record logistic-regression metrics for one state.

    Forwards every argument to ``save_res_db`` together with the
    logistic-regression results table (TABLE_RES_LOG) and model type
    (MODEL_TYPE_LOG).  The table is not dropped first; ``save_res_db``
    appends to it.
    """
    save_res_db(
        accuracy,
        recall,
        precision,
        f1,
        sml_param,
        state,
        table_name=TABLE_RES_LOG,
        model_type=MODEL_TYPE_LOG,
    )

In [52]:
def save_to_res_rf(accuracy, recall, precision, f1, sml_param, state):
    """Record random-forest metrics for one state.

    Forwards every argument to ``save_res_db`` together with the
    random-forest results table (TABLE_RES_RF) and model type
    (MODEL_TYPE_RF).  The table is not dropped first; ``save_res_db``
    appends to it.
    """
    save_res_db(
        accuracy,
        recall,
        precision,
        f1,
        sml_param,
        state,
        table_name=TABLE_RES_RF,
        model_type=MODEL_TYPE_RF,
    )

In [53]:
def save_res_db(accuracy, recall, precision, f1, sml_param, state, table_name, model_type):
    """Append one row of evaluation metrics to a results table.

    Args:
        accuracy, recall, precision, f1: Metric values to store.
        sml_param: Name of the target column the model predicted.
        state: State the model was trained on.
        table_name: Destination table; rows are appended to it.
        model_type: Passed to ``create_file_name`` along with ``sml_param``
            and ``state`` to build the stored ``file_name`` value.

    Returns:
        None.  The record is printed twice (as a dict and as a DataFrame)
        before being written through the shared ``engine``.
    """
    metrics_record = dict(
        accuracy=accuracy,
        recall=recall,
        precision=precision,
        f1=f1,
        sml_param=sml_param,
        state=state,
        file_name=create_file_name(model_type, sml_param, state),
    )
    print(metrics_record)

    # All values are scalars, so an explicit one-element index is needed to build the frame.
    metrics_frame = pd.DataFrame(metrics_record, index=[0])
    print(metrics_frame.head())

    # to_sql keeps its default index handling here, so the frame's index is written as well.
    metrics_frame.to_sql(name=table_name, con=engine, if_exists="append")

In [54]:
# Train, score and persist both models for every state in SWING_STATES.
# SWING_STATES is not defined in this notebook; presumably it comes from
# `from consts import *` -- verify.
for state in SWING_STATES:
    model_state(state)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

 Logistic regression model accuracy: 0.889 recall:0.8894864805127828 precision0.8768180982008756 f10.8825024345088213
{'accuracy': 0.8894864805127828, 'recall': 0.8894864805127828, 'precision': 0.8768180982008756, 'f1': 0.8825024345088213, 'sml_param': 'party_STRING', 'state': 'AZ', 'file_name': 'log_party_STRING_AZ.png'}
   accuracy    recall  precision        f1     sml_param state  \
0  0.889486  0.889486   0.876818  0.882502  party_STRING    AZ   

                 file_name  
0  log_party_STRING_AZ.png  
 Random forest predictive accuracy: 1.000
 Logistic regression model accuracy: 1.000 recall:0.9997052972813674 precision0.9997053549241169 f10.999692943825565
{'accuracy': 0.9997052972813674, 'recall': 0.9997052972813674, 'precision': 0.9997053549241169, 'f1': 0.999692943825565, 'sml_param': 'party_STRING', 'state': 'AZ', 'file_name': 'rf_party_STRING_AZ.png'}
   accuracy    recall  precision        f1     sml_param state  \
0  0.999705  0.999705   0.999705  0.999693  party_STRING

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

 Logistic regression model accuracy: 0.844 recall:0.8437394532568343 precision0.8226323198904977 f10.8262561886252079
{'accuracy': 0.8437394532568343, 'recall': 0.8437394532568343, 'precision': 0.8226323198904977, 'f1': 0.8262561886252079, 'sml_param': 'party_STRING', 'state': 'MI', 'file_name': 'log_party_STRING_MI.png'}
   accuracy    recall  precision        f1     sml_param state  \
0  0.843739  0.843739   0.822632  0.826256  party_STRING    MI   

                 file_name  
0  log_party_STRING_MI.png  
 Random forest predictive accuracy: 0.998
 Logistic regression model accuracy: 0.998 recall:0.9982562717966026 precision0.9983026808223203 f10.9982676642404748
{'accuracy': 0.9982562717966026, 'recall': 0.9982562717966026, 'precision': 0.9983026808223203, 'f1': 0.9982676642404748, 'sml_param': 'party_STRING', 'state': 'MI', 'file_name': 'rf_party_STRING_MI.png'}
   accuracy    recall  precision        f1     sml_param state  \
0  0.998256  0.998256   0.998303  0.998268  party_STRI

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

 Logistic regression model accuracy: 0.792 recall:0.7923986780309619 precision0.7888616702392982 f10.777115854257736
{'accuracy': 0.7923986780309619, 'recall': 0.7923986780309619, 'precision': 0.7888616702392982, 'f1': 0.777115854257736, 'sml_param': 'party_STRING', 'state': 'FL', 'file_name': 'log_party_STRING_FL.png'}
   accuracy    recall  precision        f1     sml_param state  \
0  0.792399  0.792399   0.788862  0.777116  party_STRING    FL   

                 file_name  
0  log_party_STRING_FL.png  
 Random forest predictive accuracy: 0.999
 Logistic regression model accuracy: 0.999 recall:0.999391198469299 precision0.9993909441250038 f10.9993909866107068
{'accuracy': 0.999391198469299, 'recall': 0.999391198469299, 'precision': 0.9993909441250038, 'f1': 0.9993909866107068, 'sml_param': 'party_STRING', 'state': 'FL', 'file_name': 'rf_party_STRING_FL.png'}
   accuracy    recall  precision        f1     sml_param state  \
0  0.999391  0.999391   0.999391  0.999391  party_STRING   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

 Logistic regression model accuracy: 0.779 recall:0.7787289234760052 precision0.7529975894555504 f10.7467018715913899
{'accuracy': 0.7787289234760052, 'recall': 0.7787289234760052, 'precision': 0.7529975894555504, 'f1': 0.7467018715913899, 'sml_param': 'party_STRING', 'state': 'NC', 'file_name': 'log_party_STRING_NC.png'}
   accuracy    recall  precision        f1     sml_param state  \
0  0.778729  0.778729   0.752998  0.746702  party_STRING    NC   

                 file_name  
0  log_party_STRING_NC.png  
 Random forest predictive accuracy: 0.994
 Logistic regression model accuracy: 0.994 recall:0.9940985732814527 precision0.9940984822752669 f10.9940919438621764
{'accuracy': 0.9940985732814527, 'recall': 0.9940985732814527, 'precision': 0.9940984822752669, 'f1': 0.9940919438621764, 'sml_param': 'party_STRING', 'state': 'NC', 'file_name': 'rf_party_STRING_NC.png'}
   accuracy    recall  precision        f1     sml_param state  \
0  0.994099  0.994099   0.994098  0.994092  party_STRI

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

 Logistic regression model accuracy: 0.826 recall:0.8257403189066059 precision0.7882953354169847 f10.7910362265082596
{'accuracy': 0.8257403189066059, 'recall': 0.8257403189066059, 'precision': 0.7882953354169847, 'f1': 0.7910362265082596, 'sml_param': 'party_STRING', 'state': 'PA', 'file_name': 'log_party_STRING_PA.png'}
   accuracy   recall  precision        f1     sml_param state  \
0   0.82574  0.82574   0.788295  0.791036  party_STRING    PA   

                 file_name  
0  log_party_STRING_PA.png  
 Random forest predictive accuracy: 1.000
 Logistic regression model accuracy: 1.000 recall:0.9996836243988864 precision0.9996836705626414 f10.9996833855119929
{'accuracy': 0.9996836243988864, 'recall': 0.9996836243988864, 'precision': 0.9996836705626414, 'f1': 0.9996833855119929, 'sml_param': 'party_STRING', 'state': 'PA', 'file_name': 'rf_party_STRING_PA.png'}
   accuracy    recall  precision        f1     sml_param state  \
0  0.999684  0.999684   0.999684  0.999683  party_STRING

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

 Logistic regression model accuracy: 0.790 recall:0.7901929812512877 precision0.7756287904541457 f10.766474752665086
{'accuracy': 0.7901929812512877, 'recall': 0.7901929812512877, 'precision': 0.7756287904541457, 'f1': 0.766474752665086, 'sml_param': 'party_STRING', 'state': 'WI', 'file_name': 'log_party_STRING_WI.png'}
   accuracy    recall  precision        f1     sml_param state  \
0  0.790193  0.790193   0.775629  0.766475  party_STRING    WI   

                 file_name  
0  log_party_STRING_WI.png  
 Random forest predictive accuracy: 0.980
 Logistic regression model accuracy: 0.980 recall:0.9796717258430053 precision0.9800371198592697 f10.9795887428850553
{'accuracy': 0.9796717258430053, 'recall': 0.9796717258430053, 'precision': 0.9800371198592697, 'f1': 0.9795887428850553, 'sml_param': 'party_STRING', 'state': 'WI', 'file_name': 'rf_party_STRING_WI.png'}
   accuracy    recall  precision        f1     sml_param state  \
0  0.979672  0.979672   0.980037  0.979589  party_STRING