In [1]:
# Import dependencies
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
import pandas as pd
from pandas.io import sql
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score   
from sklearn.linear_model import LogisticRegression
import numpy
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.metrics import r2_score
from consts import *
from sklearn.svm import SVC
%matplotlib inline

In [2]:
# Connecting to Postgres instance
engine = create_engine(CREATE_ENGINE_STR)

In [3]:
# Printing info for table names 
print (engine.table_names())

['committees', 'candidates', 'education', 'six_state_donations', 'donations', 'classifying_results', 'health_metrics', 'agg_county_votes', 'birth_death_rate', 'postal_codes', 'res_lr', 'agg_county_donors', 'pres_votes_6t', 'unemployment', 'res_log']


In [4]:
#Get all donation records for a single state and return it in a dataframe
def donor_state_query(state):
    #Run queries to get all donation records from the states into dfs
    #select_sql = f'select * from {TABLE_SIX_STATE_DONATIONS} where "STATE"=\'{state.upper()}\''
    select_sql = f'select * from {TABLE_SIX_STATE_DONATIONS} where "STATE"=\'{state.upper()}\' LIMIT 10000'
    donor_df = pd.read_sql_query(select_sql, con=engine)
    return donor_df

In [5]:
committee_df = pd.read_sql_query('select * from "committees"', con=engine)

In [6]:
donor_df = donor_state_query("FL")

In [7]:
donor_df = committee_df.merge(donor_df, left_on='CMTE_ID', right_on='CMTE_ID')

In [8]:
donor_df = merge_cmtid_party(donor_df)

In [9]:
y_params = ["party"]
X_params = ["CMTE_CITY", "CMTE_ST", "CMTE_ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT"]
params = ["party","CMTE_CITY", "CMTE_ST", "CMTE_ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT"]

In [10]:
donor_df.head()

Unnamed: 0,CMTE_ID,CMTE_NM,TRES_NM,CMTE_ST1,CMTE_ST2,CMTE_CITY,CMTE_ST,CMTE_ZIP,CMTE_DSGN,CMTE_TP,...,OCCUPATION,TRANSACTION_DT,TRANSACTION_AMT,OTHER_ID,TRAN_ID,FILE_NUM,MEMO_CD,MEMO_TEXT,SUB_ID,party
0,C00000885,INTERNATIONAL UNION OF PAINTERS & ALLIED TRADE...,JAMES A WILLIAMS,"1750 NEW YORK AVE, NW",.,WASHINGTON,DC,20006,U,Q,...,GLAZING / GLASS WORKER,12222016,9,,40437820,1145830,,,4020820171369953268,other
1,C00000885,INTERNATIONAL UNION OF PAINTERS & ALLIED TRADE...,JAMES A WILLIAMS,"1750 NEW YORK AVE, NW",.,WASHINGTON,DC,20006,U,Q,...,GLAZING / GLASS WORKER,12222016,9,,40439681,1145830,,,4020820171369953292,other
2,C00000885,INTERNATIONAL UNION OF PAINTERS & ALLIED TRADE...,JAMES A WILLIAMS,"1750 NEW YORK AVE, NW",.,WASHINGTON,DC,20006,U,Q,...,PAINTER,12222016,10,,40439461,1145830,,,4020820171369953289,other
3,C00000885,INTERNATIONAL UNION OF PAINTERS & ALLIED TRADE...,JAMES A WILLIAMS,"1750 NEW YORK AVE, NW",.,WASHINGTON,DC,20006,U,Q,...,TRADE SHOW,12222016,10,,40432902,1145830,,,4020820171369953121,other
4,C00000885,INTERNATIONAL UNION OF PAINTERS & ALLIED TRADE...,JAMES A WILLIAMS,"1750 NEW YORK AVE, NW",.,WASHINGTON,DC,20006,U,Q,...,TRADE SHOW,12222016,10,,40432220,1145830,,,4020820171369953094,other


In [11]:
X_params

['CMTE_CITY',
 'CMTE_ST',
 'CMTE_ZIP',
 'EMPLOYER',
 'OCCUPATION',
 'TRANSACTION_AMT']

In [12]:
donor_df2=donor_df[params]

In [13]:
donor_df2.head()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,EMPLOYER,OCCUPATION,TRANSACTION_AMT
0,other,WASHINGTON,DC,20006,IUPAT,GLAZING / GLASS WORKER,9
1,other,WASHINGTON,DC,20006,IUPAT,GLAZING / GLASS WORKER,9
2,other,WASHINGTON,DC,20006,IUPAT,PAINTER,10
3,other,WASHINGTON,DC,20006,IUPAT DISTRICT COUNCIL 78,TRADE SHOW,10
4,other,WASHINGTON,DC,20006,IUPAT DISTRICT COUNCIL 78,TRADE SHOW,10


In [14]:
donor_df2.dtypes

party              object
CMTE_CITY          object
CMTE_ST            object
CMTE_ZIP           object
EMPLOYER           object
OCCUPATION         object
TRANSACTION_AMT     int64
dtype: object

In [15]:
donor_df2[["EMPLOYER_STRING"]]=donor_df2[["EMPLOYER"]].astype(str)
donor_df2[["OCCUPATION_STRING"]]=donor_df2[["OCCUPATION"]].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [16]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()


donor_df2["party"] = le.fit_transform(donor_df2["party"]) 
donor_df2["CMTE_CITY"] = le.fit_transform(donor_df2["CMTE_CITY"]) 
donor_df2["CMTE_ST"] = le.fit_transform(donor_df2["CMTE_ST"]) 
donor_df2["CMTE_ZIP"] = le.fit_transform(donor_df2["CMTE_ZIP"]) 
donor_df2["EMPLOYER_STRING"] = le.fit_transform(donor_df2["EMPLOYER_STRING"]) 
donor_df2["OCCUPATION_STRING"] = le.fit_transform(donor_df2["OCCUPATION_STRING"]) 
donor_df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/p

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,EMPLOYER,OCCUPATION,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
0,1,160,6,51,IUPAT,GLAZING / GLASS WORKER,9,947,599
1,1,160,6,51,IUPAT,GLAZING / GLASS WORKER,9,947,599
2,1,160,6,51,IUPAT,PAINTER,10,947,965
3,1,160,6,51,IUPAT DISTRICT COUNCIL 78,TRADE SHOW,10,948,1430
4,1,160,6,51,IUPAT DISTRICT COUNCIL 78,TRADE SHOW,10,948,1430


In [17]:
donor_df2=donor_df2.drop("EMPLOYER", axis=1)
donor_df2=donor_df2.drop("OCCUPATION", axis=1)
donor_df2.head()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
0,1,160,6,51,9,947,599
1,1,160,6,51,9,947,599
2,1,160,6,51,10,947,965
3,1,160,6,51,10,948,1430
4,1,160,6,51,10,948,1430


In [18]:
donor_df2.tail()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
59196,1,160,6,45,1000,1306,1192
59197,1,160,6,45,1000,1306,1192
59198,1,112,33,138,250,169,753
59199,1,112,33,138,250,169,753
59200,1,112,33,138,250,169,753


In [19]:
X=donor_df2.copy()
X=X.drop("party", axis=1)
y=donor_df2["party"]

In [20]:
X.describe()

Unnamed: 0,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
count,59201.0,59201.0,59201.0,59201.0,59201.0,59201.0
mean,108.029864,13.952129,43.008902,162.774193,1227.29611,870.790949
std,50.823305,6.927287,44.621256,1447.463095,487.186353,412.808325
min,0.0,0.0,0.0,-5000.0,0.0,0.0
25%,91.0,6.0,20.0,11.0,983.0,487.0
50%,103.0,14.0,22.0,25.0,1306.0,925.0
75%,160.0,21.0,47.0,100.0,1521.0,1192.0
max,168.0,33.0,213.0,100000.0,2079.0,1601.0


In [21]:
from collections import Counter

# Create X_train, X_test, y_train, y_test
# YOUR CODE HERE
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
Counter(y)

Counter({1: 33284, 0: 20020, 2: 5897})

In [22]:
df_scaled = StandardScaler().fit_transform(X)
print(df_scaled[0:5])

[[ 1.02257369 -1.14795252  0.17908876 -0.10623794 -0.5753414  -0.65840059]
 [ 1.02257369 -1.14795252  0.17908876 -0.10623794 -0.5753414  -0.65840059]
 [ 1.02257369 -1.14795252  0.17908876 -0.10554707 -0.5753414   0.22821692]
 [ 1.02257369 -1.14795252  0.17908876 -0.10554707 -0.57328878  1.35465721]
 [ 1.02257369 -1.14795252  0.17908876 -0.10554707 -0.57328878  1.35465721]]


In [32]:
X_scaled=scaler.fit(X_train)
print(X_scaled[0:5])

TypeError: 'StandardScaler' object is not subscriptable

In [23]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)

In [25]:
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)
# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=78)

In [30]:
# Train the data
rf_model=rf_model.fit(X_test_scaled, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [14801, 44400]