In [25]:
# Import dependencies
import datetime
import json

import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from pandas.io import sql
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.svm import SVC
from sqlalchemy import create_engine
from sqlalchemy import inspect as sa_inspect, text
from sqlalchemy.orm import sessionmaker

from consts import *
%matplotlib inline

In [26]:
# Connecting to Postgres instance
# NOTE(review): CREATE_ENGINE_STR is not defined in the visible cells; presumably
# it is provided by `from consts import *` -- confirm.
engine = create_engine(CREATE_ENGINE_STR)

In [27]:
# Printing info for table names
# Engine.table_names() is deprecated (and removed in SQLAlchemy 2.0); the
# inspector API returns the same list of table names and works on all versions.
print(sa_inspect(engine).get_table_names())

['committees', 'candidates', 'education', 'six_state_donations', 'donations', 'classifying_results', 'health_metrics', 'agg_county_votes', 'birth_death_rate', 'postal_codes', 'res_lr', 'agg_county_donors', 'pres_votes_6t', 'unemployment', 'res_log']


In [28]:
#Get all donation records for a single state and return it in a dataframe
def donor_state_query(state, limit=10000):
    """Return donation rows for one state as a DataFrame.

    Parameters
    ----------
    state : str
        State code; it is upper-cased before being compared with the
        "STATE" column.
    limit : int or None, default 10000
        Maximum number of rows to fetch. Pass ``None`` to fetch every
        matching row.

    Returns
    -------
    pandas.DataFrame
        The rows of TABLE_SIX_STATE_DONATIONS whose "STATE" equals ``state``.
    """
    # The table name is a project constant and cannot be a bind parameter, so
    # it is still interpolated. The caller-supplied values (state, limit) are
    # bound instead of being spliced into the SQL text, which avoids quoting
    # bugs and SQL injection.
    select_sql = f'select * from {TABLE_SIX_STATE_DONATIONS} where "STATE" = :state'
    query_params = {"state": state.upper()}
    if limit is not None:
        select_sql += " LIMIT :limit"
        query_params["limit"] = int(limit)
    return pd.read_sql_query(text(select_sql), con=engine, params=query_params)

In [29]:
# Load every row and column of the "committees" table into a DataFrame.
committee_df = pd.read_sql_query('select * from "committees"', con=engine)

In [30]:
# Fetch the donation rows for state code "FL" via donor_state_query.
donor_df = donor_state_query("FL")

In [31]:
# Inner-join committee rows onto the donation rows using the shared CMTE_ID key
# (committee columns come first in the result).
donor_df = pd.merge(committee_df, donor_df, on="CMTE_ID")

In [32]:
# NOTE(review): merge_cmtid_party is not defined in the visible cells (presumably
# it comes from `from consts import *` or an earlier cell -- confirm). The frame
# displayed below has a `party` column after this call.
donor_df = merge_cmtid_party(donor_df)

In [33]:
# Column selections used below: the target, the candidate features, and the
# two combined (target first) for slicing the merged frame.
y_params = ["party"]
X_params = [
    "CMTE_CITY",
    "CMTE_ST",
    "CMTE_ZIP",
    "EMPLOYER",
    "OCCUPATION",
    "TRANSACTION_AMT",
]
params = y_params + X_params

In [34]:
# Preview the first rows of the merged committee/donation frame.
donor_df.head()

Unnamed: 0,CMTE_ID,CMTE_NM,TRES_NM,CMTE_ST1,CMTE_ST2,CMTE_CITY,CMTE_ST,CMTE_ZIP,CMTE_DSGN,CMTE_TP,...,OCCUPATION,TRANSACTION_DT,TRANSACTION_AMT,OTHER_ID,TRAN_ID,FILE_NUM,MEMO_CD,MEMO_TEXT,SUB_ID,party
0,C00001313,REPUBLICAN PARTY OF MINNESOTA,RON CAREY,480 CEDAR STREET SUITE 560,.,ST PAUL,MN,55101,U,Y,...,"THIRD LAKE CAPITAL, LLC",10272016,7804,,620459-997299-1-P,1133652,X,PARTNERSHIP SUBITEMIZATION OF TRUMP VICTORY C/...,4121320161357322916,republican
1,C00001313,REPUBLICAN PARTY OF MINNESOTA,RON CAREY,480 CEDAR STREET SUITE 560,.,ST PAUL,MN,55101,U,Y,...,VITAQUEST INTERNATIONAL,11072016,3042,,620930-997300-1-P,1133652,X,PARTNERSHIP SUBITEMIZATION OF TRUMP VICTORY C/...,4121320161357322885,republican
2,C00001313,REPUBLICAN PARTY OF MINNESOTA,RON CAREY,480 CEDAR STREET SUITE 560,.,ST PAUL,MN,55101,U,Y,...,WELLOSOPHY CORPORATION,11072016,1138,,620920-997300-1-P,1133652,X,PARTNERSHIP SUBITEMIZATION OF TRUMP VICTORY C/...,4121320161357322861,republican
3,C00001313,REPUBLICAN PARTY OF MINNESOTA,RON CAREY,480 CEDAR STREET SUITE 560,.,ST PAUL,MN,55101,U,Y,...,DEZER PROPERTIES,11072016,7804,,620923-997300-1-P,1133652,X,PARTNERSHIP SUBITEMIZATION OF TRUMP VICTORY C/...,4121320161357322869,republican
4,C00001313,REPUBLICAN PARTY OF MINNESOTA,RON CAREY,480 CEDAR STREET SUITE 560,.,ST PAUL,MN,55101,U,Y,...,DEZER PROPERTIES,11072016,7804,,620924-997300-1-P,1133652,X,PARTNERSHIP SUBITEMIZATION OF TRUMP VICTORY C/...,4121320161357322870,republican


In [35]:
# Display the list of feature column names.
X_params

['CMTE_CITY',
 'CMTE_ST',
 'CMTE_ZIP',
 'EMPLOYER',
 'OCCUPATION',
 'TRANSACTION_AMT']

In [36]:
# Keep only the target and feature columns. The explicit .copy() makes
# donor_df2 an independent frame, so the column assignments in later cells
# neither raise SettingWithCopyWarning nor risk touching donor_df.
donor_df2 = donor_df[params].copy()

In [37]:
# Preview the selected target/feature columns.
donor_df2.head()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,EMPLOYER,OCCUPATION,TRANSACTION_AMT
0,republican,ST PAUL,MN,55101,CEO,"THIRD LAKE CAPITAL, LLC",7804
1,republican,ST PAUL,MN,55101,PRESIDENT & CEO,VITAQUEST INTERNATIONAL,3042
2,republican,ST PAUL,MN,55101,PHYSICIAN EXECUTIVE,WELLOSOPHY CORPORATION,1138
3,republican,ST PAUL,MN,55101,FOUNDER,DEZER PROPERTIES,7804
4,republican,ST PAUL,MN,55101,ACCOUNTANT,DEZER PROPERTIES,7804


In [38]:
# Inspect column dtypes before encoding.
donor_df2.dtypes

party              object
CMTE_CITY          object
CMTE_ST            object
CMTE_ZIP           object
EMPLOYER           object
OCCUPATION         object
TRANSACTION_AMT     int64
dtype: object

In [39]:
# Add string-typed versions of EMPLOYER and OCCUPATION for label encoding.
# DataFrame.assign returns a new frame, so nothing is written onto a slice of
# donor_df and no SettingWithCopyWarning is raised.
# NOTE(review): astype(str) also turns missing values into literal strings, so
# they become their own category when encoded -- confirm that is intended.
donor_df2 = donor_df2.assign(
    EMPLOYER_STRING=donor_df2["EMPLOYER"].astype(str),
    OCCUPATION_STRING=donor_df2["OCCUPATION"].astype(str),
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [40]:
# Label-encode the target and every categorical feature column.
# A separate fitted encoder is kept per column in `label_encoders`, so integer
# codes (in particular the `party` classes) can be mapped back with
# label_encoders[column].inverse_transform(...). Refitting a single shared
# encoder would discard every mapping except the last one.
# LabelEncoder is already imported in the imports cell.
CATEGORICAL_COLUMNS = [
    "party",
    "CMTE_CITY",
    "CMTE_ST",
    "CMTE_ZIP",
    "EMPLOYER_STRING",
    "OCCUPATION_STRING",
]

label_encoders = {}
encoded_values = {}
for column in CATEGORICAL_COLUMNS:
    # `le` ends the loop bound to the encoder for the last column, which is
    # the same state the name had when one shared encoder was refit in turn.
    le = LabelEncoder()
    encoded_values[column] = le.fit_transform(donor_df2[column])
    label_encoders[column] = le

# assign() builds a new frame (columns keep their positions), so nothing is
# written onto a slice and no SettingWithCopyWarning is raised.
donor_df2 = donor_df2.assign(**encoded_values)
donor_df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/p

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,EMPLOYER,OCCUPATION,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
0,2,74,15,75,CEO,"THIRD LAKE CAPITAL, LLC",7804,338,1348
1,2,74,15,75,PRESIDENT & CEO,VITAQUEST INTERNATIONAL,3042,1495,1408
2,2,74,15,75,PHYSICIAN EXECUTIVE,WELLOSOPHY CORPORATION,1138,1465,1443
3,2,74,15,75,FOUNDER,DEZER PROPERTIES,7804,761,359
4,2,74,15,75,ACCOUNTANT,DEZER PROPERTIES,7804,27,359


In [41]:
# The raw text columns are no longer needed once their encoded *_STRING
# counterparts exist; remove both in a single call.
donor_df2 = donor_df2.drop(columns=["EMPLOYER", "OCCUPATION"])
donor_df2.head()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
0,2,74,15,75,7804,338,1348
1,2,74,15,75,3042,1495,1408
2,2,74,15,75,1138,1465,1443
3,2,74,15,75,7804,761,359
4,2,74,15,75,7804,27,359


In [42]:
# Preview the last rows of the encoded frame.
donor_df2.tail()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
45223,1,54,21,13,50000,1565,1299
45224,2,45,11,89,270,1375,880
45225,2,45,11,89,250,1375,880
45226,2,45,11,89,270,1375,880
45227,2,45,11,89,250,1375,880


In [43]:
# Separate the encoded target from the feature matrix. drop() returns a new
# frame, so X is independent of donor_df2.
y = donor_df2["party"]
X = donor_df2.drop(columns=["party"])

In [44]:
# Summary statistics for every feature column.
X.describe()

Unnamed: 0,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
count,45228.0,45228.0,45228.0,45228.0,45228.0,45228.0
mean,46.041832,15.720195,19.996728,190.542562,1248.214778,844.858185
std,24.027417,6.16781,23.662774,4716.825678,494.747083,355.34479
min,0.0,0.0,0.0,-2.0,0.0,0.0
25%,15.0,12.0,4.0,10.0,1185.0,654.0
50%,54.0,12.0,13.0,25.0,1347.0,868.0
75%,56.0,21.0,13.0,50.0,1565.0,1130.0
max,88.0,30.0,108.0,500000.0,2137.0,1456.0


In [49]:
# Write the feature matrix to "test2.csv" (path is relative to the working
# directory; the index is written too, since index is not disabled).
X.to_csv("test2.csv")

In [50]:
from collections import Counter

# Split features and target into train and test partitions. The split
# proportions are scikit-learn's defaults; the fixed random_state makes the
# partition reproducible. train_test_split comes from the imports cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

# Class frequencies of the full (encoded) target.
Counter(y)

Counter({2: 1678, 1: 23732, 0: 19818})

In [51]:
# Standardize the features. The scaler is fitted on the training split only,
# and that same fitted scaler then transforms both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [52]:
# NOTE(review): `model` is not referenced by any later cell visible here; the
# classifier that is actually trained below is `log_classifier`. The import
# repeats one already present in the imports cell.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)

In [58]:
# Build the logistic regression classifier and fit it on the scaled training
# data (fit() returns the estimator itself, so the chained form is equivalent).
log_classifier = LogisticRegression(solver="lbfgs", max_iter=200).fit(
    X_train_scaled, y_train
)

# Score the held-out split.
y_pred = log_classifier.predict(X_test_scaled)
log_accuracy = accuracy_score(y_test, y_pred)
print(f" Logistic regression model accuracy: {log_accuracy:.3f}")

 Logistic regression model accuracy: 0.918


In [61]:
# Create a random forest classifier.
# RandomForestClassifier is imported from sklearn.ensemble in the imports cell;
# without that import this cell raises NameError on a fresh kernel.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model (fit() returns the estimator itself, so no reassignment
# is needed).
rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
# NOTE(review): `y_pred` overwrites the logistic-regression predictions.
# NOTE(review): the features include committee-level columns (CMTE_CITY,
# CMTE_ST, CMTE_ZIP); if `party` is derived from the committee, these may leak
# the target and inflate accuracy -- confirm before relying on this score.
y_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred)
print(f" Random forest predictive accuracy: {rf_accuracy:.3f}")

 Random forest predictive accuracy: 0.999
