In [1]:
# Import dependencies
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
import pandas as pd
from pandas.io import sql
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score   
from sklearn.linear_model import LogisticRegression
import numpy
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.metrics import r2_score
from consts import *
from sklearn.svm import SVC
%matplotlib inline

In [2]:
# Connecting to Postgres instance
engine = create_engine(CREATE_ENGINE_STR)

In [3]:
# Printing info for table names 
print (engine.table_names())

['committees', 'candidates', 'education', 'six_state_donations', 'health_results', 'donations', 'classifying_results', 'health_metrics', 'agg_county_votes', 'birth_death_rate', 'postal_codes', 'res_lr', 'agg_county_donors', 'pres_votes_6t', 'unemployment', 'res_log']


In [4]:
#Get all donation records for a single state and return it in a dataframe
def donor_state_query(state):
    #Run queries to get all donation records from the states into dfs
    #select_sql = f'select * from {TABLE_SIX_STATE_DONATIONS} where "STATE"=\'{state.upper()}\''
    select_sql = f'select * from {TABLE_SIX_STATE_DONATIONS} where "STATE"=\'{state.upper()}\' LIMIT 10000'
    donor_df = pd.read_sql_query(select_sql, con=engine)
    return donor_df

In [5]:
committee_df = pd.read_sql_query('select * from "committees"', con=engine)

In [6]:
state = "FL"

In [7]:
donor_df = donor_state_query(state)

In [8]:
donor_df = committee_df.merge(donor_df, left_on='CMTE_ID', right_on='CMTE_ID')

In [9]:
donor_df = merge_cmtid_party(donor_df)

In [10]:
y_params = ["party"]
X_params = ["CMTE_CITY", "CMTE_ST", "CMTE_ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT"]
params = ["party","CMTE_CITY", "CMTE_ST", "CMTE_ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT"]

In [11]:
donor_df.head()

Unnamed: 0,CMTE_ID,CMTE_NM,TRES_NM,CMTE_ST1,CMTE_ST2,CMTE_CITY,CMTE_ST,CMTE_ZIP,CMTE_DSGN,CMTE_TP,...,OCCUPATION,TRANSACTION_DT,TRANSACTION_AMT,OTHER_ID,TRAN_ID,FILE_NUM,MEMO_CD,MEMO_TEXT,SUB_ID,party
0,C00000422,AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...,KEVIN WALKER,1101 VERMONT AVENUE N W,,WASHINGTON,DC,20005,U,Q,...,PHYSICIAN,9232016,500,,73709205,1111119,,,4111420161347166825,
1,C00000422,AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...,KEVIN WALKER,1101 VERMONT AVENUE N W,,WASHINGTON,DC,20005,U,Q,...,PHYSICIAN,9212016,41,,73693395,1111119,,,4111420161347166595,
2,C00000422,AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...,KEVIN WALKER,1101 VERMONT AVENUE N W,,WASHINGTON,DC,20005,U,Q,...,PHYSICIAN,9212016,41,,73693406,1111119,,,4111420161347166606,
3,C00000422,AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...,KEVIN WALKER,1101 VERMONT AVENUE N W,,WASHINGTON,DC,20005,U,Q,...,PHYSICIAN,9212016,41,,73693407,1111119,,,4111420161347166607,
4,C00000422,AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...,KEVIN WALKER,1101 VERMONT AVENUE N W,,WASHINGTON,DC,20005,U,Q,...,PHYSICIAN,9212016,41,,73693370,1111119,,,4111420161347166576,


In [12]:
X_params

['CMTE_CITY',
 'CMTE_ST',
 'CMTE_ZIP',
 'EMPLOYER',
 'OCCUPATION',
 'TRANSACTION_AMT']

In [13]:
donor_df2=donor_df[params]

In [14]:
donor_df2.head()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,EMPLOYER,OCCUPATION,TRANSACTION_AMT
0,,WASHINGTON,DC,20005,RADIOLOGY & IMAGING SPECIALISTS OF LAK,PHYSICIAN,500
1,,WASHINGTON,DC,20005,VASCULAR CLINIC,PHYSICIAN,41
2,,WASHINGTON,DC,20005,SELF-EMPLOYED,PHYSICIAN,41
3,,WASHINGTON,DC,20005,SOUTH PALM BEACH NEPHROLOGY PA,PHYSICIAN,41
4,,WASHINGTON,DC,20005,US NAVY,PHYSICIAN,41


In [15]:
donor_df2.dtypes

party              object
CMTE_CITY          object
CMTE_ST            object
CMTE_ZIP           object
EMPLOYER           object
OCCUPATION         object
TRANSACTION_AMT     int64
dtype: object

In [16]:
donor_df2[["EMPLOYER_STRING"]]=donor_df2[["EMPLOYER"]].astype(str)
donor_df2[["OCCUPATION_STRING"]]=donor_df2[["OCCUPATION"]].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [17]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()


donor_df2["party"] = le.fit_transform(donor_df2["party"]) 
donor_df2["CMTE_CITY"] = le.fit_transform(donor_df2["CMTE_CITY"]) 
donor_df2["CMTE_ST"] = le.fit_transform(donor_df2["CMTE_ST"]) 
donor_df2["CMTE_ZIP"] = le.fit_transform(donor_df2["CMTE_ZIP"]) 
donor_df2["EMPLOYER_STRING"] = le.fit_transform(donor_df2["EMPLOYER_STRING"]) 
donor_df2["OCCUPATION_STRING"] = le.fit_transform(donor_df2["OCCUPATION_STRING"]) 
donor_df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/p

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,EMPLOYER,OCCUPATION,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
0,0,315,8,107,RADIOLOGY & IMAGING SPECIALISTS OF LAK,PHYSICIAN,500,1030,1302
1,0,315,8,107,VASCULAR CLINIC,PHYSICIAN,41,1359,1302
2,0,315,8,107,SELF-EMPLOYED,PHYSICIAN,41,1118,1302
3,0,315,8,107,SOUTH PALM BEACH NEPHROLOGY PA,PHYSICIAN,41,1158,1302
4,0,315,8,107,US NAVY,PHYSICIAN,41,1340,1302


In [18]:
donor_df2=donor_df2.drop("EMPLOYER", axis=1)
donor_df2=donor_df2.drop("OCCUPATION", axis=1)
donor_df2.head()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
0,0,315,8,107,500,1030,1302
1,0,315,8,107,41,1359,1302
2,0,315,8,107,41,1118,1302
3,0,315,8,107,41,1158,1302
4,0,315,8,107,41,1340,1302


In [19]:
donor_df2.tail()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
84827,0,299,9,242,1000,918,1226
84828,0,299,9,241,1000,918,1226
84829,0,299,9,241,1000,918,1226
84830,0,3,38,157,2500,449,874
84831,0,3,38,157,2500,449,874


In [20]:
#Which ml parameter is the Y
ml_param = "party"

X=donor_df2.copy()
X=X.drop(ml_param, axis=1)
y=donor_df2[ml_param]

In [21]:
X.describe()

Unnamed: 0,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
count,84832.0,84832.0,84832.0,84832.0,84832.0,84832.0
mean,221.32971,14.59131,159.473996,420.712302,756.266916,1080.855008
std,109.663248,9.74453,110.284142,11067.260492,358.020371,603.891951
min,0.0,0.0,0.0,-2700.0,0.0,0.0
25%,131.0,8.0,97.0,22.0,486.0,578.0
50%,279.0,9.0,107.0,46.0,865.0,1216.0
75%,315.0,17.0,231.0,100.0,1056.0,1530.0
max,333.0,42.0,447.0,1000000.0,1436.0,2178.0


In [22]:
X.to_csv("test2.csv")

In [23]:
from collections import Counter

# Create X_train, X_test, y_train, y_test
# YOUR CODE HERE
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
Counter(y)

Counter({0: 71307, 2: 7099, 3: 17, 1: 6409})

In [24]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [25]:
def save_to_res_log(accuracy, recall, precision, f1, sml_param, state):
    file_name = create_file_name(MODEL_TYPE_LOG, sml_param, state)
    
    print(file_name)
    print(sml_param)
    print(accuracy)
    print(recall)
    print(precision)
    print(f1)
    
    print("DF")
    log_df = {
        "accuracy": accuracy,
        "recall": recall,
        "precision": precision,
        "f1": f1,
        "sml_param": sml_param,
        "state": state,
        "file_name": file_name
    }
    
    print(log_df)
    
    drop_res_log_tables(engine)
    
    #df = pd.DataFrame(log_df)
    #df = pd.DataFrame(list(log_df.items()),columns = ['accuracy','recall', 'precision', 'f1', 'sml_param', 'state', 'file_name']) 
    #df = pd.DataFrame.from_dict(log_df)
    df = pd.DataFrame(log_df, index=[0])
    
    print(df.head())
    df.to_sql(TABLE_RES_LOG, con=engine, if_exists="append")

In [26]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)

In [27]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train_scaled,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test_scaled)

# Calculate Scores 
acc_score = accuracy_score(y_test,y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f" Logistic regression model accuracy: {acc_score:.3f} recall:{recall} precision{precision} f1{f1}")
save_to_res_log(acc_score, recall, precision, f1, ml_param, state)

  _warn_prf(average, modifier, msg_start, len(result))


 Logistic regression model accuracy: 0.842 recall:0.8418992832893247 precision0.7339952693958638 f10.7697677994712779
log_party_FL.png
party
0.8418992832893247
0.8418992832893247
0.7339952693958638
0.7697677994712779
DF
{'accuracy': 0.8418992832893247, 'recall': 0.8418992832893247, 'precision': 0.7339952693958638, 'f1': 0.7697677994712779, 'sml_param': 'party', 'state': 'FL', 'file_name': 'log_party_FL.png'}
   accuracy    recall  precision        f1 sml_param state         file_name
0  0.841899  0.841899   0.733995  0.769768     party    FL  log_party_FL.png


In [28]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

report = classification_report(y_test, y_pred)
print(report)

[[17854     2     0     0]
 [ 1596     1     0     0]
 [ 1751     0     0     0]
 [    4     0     0     0]]
              precision    recall  f1-score   support

           0       0.84      1.00      0.91     17856
           1       0.33      0.00      0.00      1597
           2       0.00      0.00      0.00      1751
           3       0.00      0.00      0.00         4

    accuracy                           0.84     21208
   macro avg       0.29      0.25      0.23     21208
weighted avg       0.73      0.84      0.77     21208



  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
title = create_title(MODEL_TYPE_LOG, ml_param, score_str)
#plot_data(y_test, y_train, y_test_pred, y_train_pred, title, file_name)

NameError: name 'sml_param' is not defined

In [None]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")