In [1]:
# Import dependencies
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import json
import pandas as pd
from pandas.io import sql
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score   
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.metrics import r2_score
from consts import *
from sklearn.svm import SVC
%matplotlib inline

In [2]:
# Connecting to Postgres instance
engine = create_engine(CREATE_ENGINE_STR)

In [3]:
# Printing info for table names 
print (engine.table_names())

['committees', 'candidates', 'education', 'six_state_donations', 'health_results', 'donations', 'classifying_results', 'health_metrics', 'agg_county_votes', 'birth_death_rate', 'postal_codes', 'res_lr', 'agg_county_donors', 'pres_votes_6t', 'unemployment', 'res_log']


In [4]:
#Get all donation records for a single state and return it in a dataframe
def donor_state_query(state):
    #Run queries to get all donation records from the states into dfs
    #select_sql = f'select * from {TABLE_SIX_STATE_DONATIONS} where "STATE"=\'{state.upper()}\''
    select_sql = f'select * from {TABLE_SIX_STATE_DONATIONS} where "STATE"=\'{state.upper()}\' LIMIT 10000'
    donor_df = pd.read_sql_query(select_sql, con=engine)
    return donor_df

In [5]:
committee_df = pd.read_sql_query('select * from "committees"', con=engine)

In [6]:
donor_df = donor_state_query("FL")

In [7]:
donor_df = committee_df.merge(donor_df, left_on='CMTE_ID', right_on='CMTE_ID')

In [8]:
donor_df = merge_cmtid_party(donor_df)

In [9]:
y_params = ["party"]
X_params = ["CMTE_CITY", "CMTE_ST", "CMTE_ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT"]
params = ["party","CMTE_CITY", "CMTE_ST", "CMTE_ZIP", "EMPLOYER", "OCCUPATION", "TRANSACTION_AMT"]

In [10]:
donor_df.head()

Unnamed: 0,CMTE_ID,CMTE_NM,TRES_NM,CMTE_ST1,CMTE_ST2,CMTE_CITY,CMTE_ST,CMTE_ZIP,CMTE_DSGN,CMTE_TP,...,OCCUPATION,TRANSACTION_DT,TRANSACTION_AMT,OTHER_ID,TRAN_ID,FILE_NUM,MEMO_CD,MEMO_TEXT,SUB_ID,party
0,C00002089,CWA-COPE POLITICAL CONTRIBUTIONS COMMITTEE,BARBARA J EASTERLING,501 THIRD STREET NW,,WASHINGTON,DC,20001,U,Q,...,SERVICE TECH,7072016,40,,C25371856,1125051,,* PAYROLL DEDUCTION: $40 MONTHLY,4120120161356398690,
1,C00002089,CWA-COPE POLITICAL CONTRIBUTIONS COMMITTEE,BARBARA J EASTERLING,501 THIRD STREET NW,,WASHINGTON,DC,20001,U,Q,...,,7072016,30,,C25371759,1125051,,* PAYROLL DEDUCTION: $30 MONTHLY,4120120161356400345,
2,C00002089,CWA-COPE POLITICAL CONTRIBUTIONS COMMITTEE,BARBARA J EASTERLING,501 THIRD STREET NW,,WASHINGTON,DC,20001,U,Q,...,ORGANIZING COORDINATOR,7212016,48,,C25372084,1125051,,* PAYROLL DEDUCTION: $24 BI-WEEKLY,4120120161356399864,
3,C00002089,CWA-COPE POLITICAL CONTRIBUTIONS COMMITTEE,BARBARA J EASTERLING,501 THIRD STREET NW,,WASHINGTON,DC,20001,U,Q,...,CSA,7072016,40,,C25373324,1125051,,* PAYROLL DEDUCTION: $40 MONTHLY,4120120161356399618,
4,C00002089,CWA-COPE POLITICAL CONTRIBUTIONS COMMITTEE,BARBARA J EASTERLING,501 THIRD STREET NW,,WASHINGTON,DC,20001,U,Q,...,FACILITY TECH,7072016,40,,C25371773,1125051,,* PAYROLL DEDUCTION: $40 MONTHLY,4120120161356400283,


In [11]:
X_params

['CMTE_CITY',
 'CMTE_ST',
 'CMTE_ZIP',
 'EMPLOYER',
 'OCCUPATION',
 'TRANSACTION_AMT']

In [12]:
donor_df2=donor_df[params]

In [13]:
donor_df2.head()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,EMPLOYER,OCCUPATION,TRANSACTION_AMT
0,,WASHINGTON,DC,20001,BELLSOUTH TELECOMMS,SERVICE TECH,40
1,,WASHINGTON,DC,20001,BELLSOUTH TELECOMMS,,30
2,,WASHINGTON,DC,20001,COMM. WORKERS OF AMER.,ORGANIZING COORDINATOR,48
3,,WASHINGTON,DC,20001,BELLSOUTH TELECOMMS,CSA,40
4,,WASHINGTON,DC,20001,BELLSOUTH TELECOMMS,FACILITY TECH,40


In [14]:
donor_df2.dtypes

party              object
CMTE_CITY          object
CMTE_ST            object
CMTE_ZIP           object
EMPLOYER           object
OCCUPATION         object
TRANSACTION_AMT     int64
dtype: object

In [15]:
donor_df2[["EMPLOYER_STRING"]]=donor_df2[["EMPLOYER"]].astype(str)
donor_df2[["OCCUPATION_STRING"]]=donor_df2[["OCCUPATION"]].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [16]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()


donor_df2["party"] = le.fit_transform(donor_df2["party"]) 
donor_df2["CMTE_CITY"] = le.fit_transform(donor_df2["CMTE_CITY"]) 
donor_df2["CMTE_ST"] = le.fit_transform(donor_df2["CMTE_ST"]) 
donor_df2["CMTE_ZIP"] = le.fit_transform(donor_df2["CMTE_ZIP"]) 
donor_df2["EMPLOYER_STRING"] = le.fit_transform(donor_df2["EMPLOYER_STRING"]) 
donor_df2["OCCUPATION_STRING"] = le.fit_transform(donor_df2["OCCUPATION_STRING"]) 
donor_df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/p

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,EMPLOYER,OCCUPATION,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
0,0,222,7,49,BELLSOUTH TELECOMMS,SERVICE TECH,40,201,1636
1,0,222,7,49,BELLSOUTH TELECOMMS,,30,201,1217
2,0,222,7,49,COMM. WORKERS OF AMER.,ORGANIZING COORDINATOR,48,425,1238
3,0,222,7,49,BELLSOUTH TELECOMMS,CSA,40,201,378
4,0,222,7,49,BELLSOUTH TELECOMMS,FACILITY TECH,40,201,729


In [17]:
donor_df2=donor_df2.drop("EMPLOYER", axis=1)
donor_df2=donor_df2.drop("OCCUPATION", axis=1)
donor_df2.head()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
0,0,222,7,49,40,201,1636
1,0,222,7,49,30,201,1217
2,0,222,7,49,48,425,1238
3,0,222,7,49,40,201,378
4,0,222,7,49,40,201,729


In [18]:
donor_df2.tail()

Unnamed: 0,party,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
57018,0,2,38,100,1000,1111,1000
57019,0,2,38,100,1000,1575,1513
57020,0,89,9,132,50,1575,1513
57021,0,89,9,132,50,1575,1513
57022,0,89,9,132,50,1575,1513


In [19]:
X=donor_df2.copy()
X=X.drop("party", axis=1)
y=donor_df2["party"]

In [20]:
X.describe()

Unnamed: 0,CMTE_CITY,CMTE_ST,CMTE_ZIP,TRANSACTION_AMT,EMPLOYER_STRING,OCCUPATION_STRING
count,57023.0,57023.0,57023.0,57023.0,57023.0,57023.0
mean,163.992021,17.433948,88.991161,265.351262,1218.419778,1104.320695
std,70.045896,12.341171,76.174715,2542.49235,532.49684,539.699557
min,0.0,0.0,0.0,-2700.0,0.0,0.0
25%,150.0,7.0,32.0,20.0,795.0,686.0
50%,182.0,9.0,53.0,50.0,1344.0,1287.0
75%,222.0,29.0,118.0,100.0,1575.0,1513.0
max,238.0,42.0,301.0,125000.0,2143.0,1964.0


In [21]:
X.to_csv("test2.csv")

In [22]:
from collections import Counter

# Create X_train, X_test, y_train, y_test
# YOUR CODE HERE
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
Counter(y)

Counter({0: 37900, 2: 3151, 1: 15967, 3: 5})

In [23]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [24]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)

In [25]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train_scaled,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.841


In [26]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.995
