<a href="https://colab.research.google.com/github/zwcrowley/module_21_deep_learning_challenge/blob/main/deep_learning_charity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Project 4: Predicting Congressional Bill Passage**

# Senate Model: Machine Learning Optimization and Model Output

## Team 7


## Import dependencies and read in data:

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#  Import and read the cleaned house data from AWS S3:
# house_df = pd.read_csv("https://project-4-team7.s3.ca-central-1.amazonaws.com/cleaned_house.csv")
# house_df.head()

In [3]:
# Import from google drive folder:
# Mount google drive to get data:
from google.colab import drive
drive.mount('/content/gdrive')
senate_filepath = "/content/gdrive/MyDrive/DataClassNotebooks/Project-4/Resources/senate_cleaned.csv"

# Read in senate data using pandas:
senate_df = pd.read_csv(senate_filepath)
# Glimpse senate data:
senate_df.head()

Mounted at /content/gdrive


Unnamed: 0,Bill Type,Congress,Number of Cosponsors,Cosponsor Dems,Cosponsor Reps,Cosponsor Ind,Cosponsor States,Committees,Latest Action,Subject,...,International Narcotics Control,Ethics,Indian Affairs,Intelligence,Printing,Taxation,Library,Economic,bill_passed,bill_referred_committee
0,S.,113,15,15,0,0,12,Senate - Judiciary,Read twice and referred to the Committee on th...,Border security and unlawful immigration,...,0,0,0,0,0,0,0,0,0,1
1,S.,113,16,16,0,0,12,Senate - Judiciary,Read twice and referred to the Committee on th...,Child health,...,0,0,0,0,0,0,0,0,0,1
2,S.,113,16,15,0,1,11,"Senate - Health, Education, Labor, and Pensions",Read twice and referred to the Committee on He...,Child safety and welfare,...,0,0,0,0,0,0,0,0,0,1
3,S.,113,14,14,0,0,12,"Senate - Commerce, Science, and Transportation",Read twice and referred to the Committee on Co...,Academic performance and assessments,...,0,0,0,0,0,0,0,0,0,1
4,S.,113,31,30,0,1,24,Senate - Judiciary,Read twice and referred to the Committee on th...,Crime and Law Enforcement,...,0,0,0,0,0,0,0,0,0,1


# Senate Model

## Preprocessing:

In [4]:
# Check for NAs, duplicates and get the shape of the data:
print(f'The shape of the senate_df data is: {senate_df.shape}')
print(f'The number of NAs in the senate_df data: {senate_df.isnull().sum()}')
print(f'The duplicate rows of NAs in the senate_df data: {senate_df.duplicated().sum()}')
# There are 40 columns, 15593 rows, and no NAs or duplicates in the senate_df dataset.
# target is bill_passed

The shape of the senate_df data is: (15593, 40)
The number of NAs in the senate_df data: Bill Type                                       0
Congress                                        0
Number of Cosponsors                            0
Cosponsor Dems                                  0
Cosponsor Reps                                  0
Cosponsor Ind                                   0
Cosponsor States                                0
Committees                                    312
Latest Action                                   0
Subject                                       369
Sponsor Title                                   0
Sponsor Party                                   0
Sponsor State                                   0
Month Introduced                                0
Agriculture, Nutrition, and Forestry            0
Appropriations                                  0
Armed Services                                  0
Banking, Housing, and Urban Affairs             0
Budget     

In [5]:
# Numeric variable stats
senate_df.describe()

Unnamed: 0,Congress,Number of Cosponsors,Cosponsor Dems,Cosponsor Reps,Cosponsor Ind,Cosponsor States,Month Introduced,"Agriculture, Nutrition, and Forestry",Appropriations,Armed Services,...,International Narcotics Control,Ethics,Indian Affairs,Intelligence,Printing,Taxation,Library,Economic,bill_passed,bill_referred_committee
count,15593.0,15593.0,15593.0,15593.0,15593.0,15593.0,15593.0,15593.0,15593.0,15593.0,...,15593.0,15593.0,15593.0,15593.0,15593.0,15593.0,15593.0,15593.0,15593.0,15593.0
mean,114.9494,5.413006,2.804912,2.498044,0.110049,4.4275,5.422241,0.047393,0.027961,0.048419,...,0.020009,0.020009,0.035465,0.022382,0.020009,0.020009,0.020009,0.020009,0.02918,0.786827
std,1.4721,9.351292,6.024581,5.603372,0.355918,6.585518,3.225571,0.212485,0.164867,0.214657,...,0.140035,0.140035,0.184957,0.147927,0.140035,0.140035,0.140035,0.140035,0.168316,0.409562
min,113.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,114.0,1.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,115.0,2.0,1.0,1.0,0.0,2.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,116.0,6.0,3.0,2.0,0.0,5.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,118.0,99.0,55.0,53.0,3.0,50.0,12.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Get the column names:
list(senate_df.columns)


['Bill Type',
 'Congress',
 'Number of Cosponsors',
 'Cosponsor Dems',
 'Cosponsor Reps',
 'Cosponsor Ind',
 'Cosponsor States',
 'Committees',
 'Latest Action',
 'Subject',
 'Sponsor Title',
 'Sponsor Party',
 'Sponsor State',
 'Month Introduced',
 'Agriculture, Nutrition, and Forestry',
 'Appropriations',
 'Armed Services',
 'Banking, Housing, and Urban Affairs',
 'Budget',
 'Commerce, Science, and Transportation',
 'Energy and Natural Resources',
 'Environment and Public Works',
 'Finance',
 'Foreign Relations',
 'Health, Education, Labor, and Pensions',
 'Homeland Security and Governmental Affairs',
 'Judiciary',
 'Rules and Administration',
 'Small Business and Entrepreneurship',
 'Veterans Affairs',
 'International Narcotics Control',
 'Ethics',
 'Indian Affairs',
 'Intelligence',
 'Printing',
 'Taxation',
 'Library',
 'Economic',
 'bill_passed',
 'bill_referred_committee']

In [7]:
# Check out Subject column:
print(f'Count of values for Subject column: \n{senate_df["Subject"].value_counts()}')

Count of values for Subject column: 
Armed Forces and National Security              857
Administrative law and regulatory procedures    830
Health                                          763
Taxation                                        534
Crime and Law Enforcement                       411
                                               ... 
Germany                                           1
Israel                                            1
Nebraska                                          1
Federal Emergency Management Agency (FEMA)        1
Congressional districts and representation        1
Name: Subject, Length: 689, dtype: int64


In [8]:
# Model Target:
senate_df["bill_passed"].value_counts()
# 455 bills  in the dataset which originated from the House became law in the 113th-118th Congresses

0    15138
1      455
Name: bill_passed, dtype: int64

In [9]:
# Drop the non-beneficial columns: 'Unnamed' column.
senate_df = senate_df.drop(["Bill Type", "Committees", "Latest Action" ], axis='columns')
senate_df.head()

Unnamed: 0,Congress,Number of Cosponsors,Cosponsor Dems,Cosponsor Reps,Cosponsor Ind,Cosponsor States,Subject,Sponsor Title,Sponsor Party,Sponsor State,...,International Narcotics Control,Ethics,Indian Affairs,Intelligence,Printing,Taxation,Library,Economic,bill_passed,bill_referred_committee
0,113,15,15,0,0,12,Border security and unlawful immigration,Sen.,D,NV,...,0,0,0,0,0,0,0,0,0,1
1,113,16,16,0,0,12,Child health,Sen.,D,NV,...,0,0,0,0,0,0,0,0,0,1
2,113,16,15,0,1,11,Child safety and welfare,Sen.,D,NV,...,0,0,0,0,0,0,0,0,0,1
3,113,14,14,0,0,12,Academic performance and assessments,Sen.,D,NV,...,0,0,0,0,0,0,0,0,0,1
4,113,31,30,0,1,24,Crime and Law Enforcement,Sen.,D,NV,...,0,0,0,0,0,0,0,0,0,1


In [10]:
senate_df.dtypes
# Need to create dummies for: Subject, Sponsor Title,  Sponsor Party, Sponsor State columns
# Need to bin Subject column before creating dummy vars.

Congress                                       int64
Number of Cosponsors                           int64
Cosponsor Dems                                 int64
Cosponsor Reps                                 int64
Cosponsor Ind                                  int64
Cosponsor States                               int64
Subject                                       object
Sponsor Title                                 object
Sponsor Party                                 object
Sponsor State                                 object
Month Introduced                               int64
Agriculture, Nutrition, and Forestry           int64
Appropriations                                 int64
Armed Services                                 int64
Banking, Housing, and Urban Affairs            int64
Budget                                         int64
Commerce, Science, and Transportation          int64
Energy and Natural Resources                   int64
Environment and Public Works                  

In [11]:
# Determine the number of unique values in each column.
senate_df.nunique()

Congress                                        6
Number of Cosponsors                           86
Cosponsor Dems                                 53
Cosponsor Reps                                 53
Cosponsor Ind                                   4
Cosponsor States                               51
Subject                                       689
Sponsor Title                                   1
Sponsor Party                                   3
Sponsor State                                  50
Month Introduced                               12
Agriculture, Nutrition, and Forestry            2
Appropriations                                  2
Armed Services                                  2
Banking, Housing, and Urban Affairs             2
Budget                                          2
Commerce, Science, and Transportation           2
Energy and Natural Resources                    2
Environment and Public Works                    2
Finance                                         2


In [12]:
# Look at Subject value counts for binning
Subject_counts_sen = senate_df["Subject"].value_counts()
print(f'Count of values for Subject column: \n{Subject_counts_sen}')

Count of values for Subject column: 
Armed Forces and National Security              857
Administrative law and regulatory procedures    830
Health                                          763
Taxation                                        534
Crime and Law Enforcement                       411
                                               ... 
Germany                                           1
Israel                                            1
Nebraska                                          1
Federal Emergency Management Agency (FEMA)        1
Congressional districts and representation        1
Name: Subject, Length: 689, dtype: int64


In [13]:
# Now change cutoff value to get  bins for Subject
# Cutoff value of 100 to bin: 
# use the variable name `Subject_types_to_replace_sen`
Subject_types_to_replace_sen = list(Subject_counts_sen[Subject_counts_sen<100].index)

# Replace in dataframe
for sub in Subject_types_to_replace_sen:
    senate_df['Subject'] = senate_df['Subject'].replace(sub,"Other")

# Check to make sure binning was successful
senate_df['Subject'].value_counts()
# 27 Bins

Other                                           7401
Armed Forces and National Security               857
Administrative law and regulatory procedures     830
Health                                           763
Taxation                                         534
Crime and Law Enforcement                        411
Government Operations and Politics               394
Public Lands and Natural Resources               325
Congressional oversight                          313
International Affairs                            310
Finance and Financial Sector                     291
Education                                        280
Transportation and Public Works                  262
Commerce                                         253
Science, Technology, Communications              204
Agriculture and Food                             192
Energy                                           190
Advisory bodies                                  176
Appropriations                                

In [14]:
# Convert categorical data to numeric with `pd.get_dummies`
senate_df = pd.get_dummies(senate_df,dtype=float)
senate_df.head()
# Creates a df of 114 columns

Unnamed: 0,Congress,Number of Cosponsors,Cosponsor Dems,Cosponsor Reps,Cosponsor Ind,Cosponsor States,Month Introduced,"Agriculture, Nutrition, and Forestry",Appropriations,Armed Services,...,Sponsor State_SD,Sponsor State_TN,Sponsor State_TX,Sponsor State_UT,Sponsor State_VA,Sponsor State_VT,Sponsor State_WA,Sponsor State_WI,Sponsor State_WV,Sponsor State_WY
0,113,15,15,0,0,12,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,113,16,16,0,0,12,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,113,16,15,0,1,11,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,113,14,14,0,0,12,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,113,31,30,0,1,24,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Split our preprocessed data into our features and target arrays
y = senate_df["bill_passed"].values
X = senate_df.drop(["bill_passed"], axis='columns').values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [16]:
# Create a StandardScaler instances
senate_scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = senate_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

### Attempt #1


*   Total layers: 5 total
*   Activation function to for each layer is: relu, relu, tanh, tanh,  sigmoid
*   Number of neurons for each hidden layers: 9,7,7,5,1
*   Epochs: 100

In [17]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  9
hidden_nodes_layer2_3 = 7
hidden_nodes_layer4 = 5

nn_sen = tf.keras.models.Sequential()

# First hidden layer
nn_sen.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn_sen.add(tf.keras.layers.Dense(units=hidden_nodes_layer2_3, activation="relu"))

# Third hidden layer
nn_sen.add(tf.keras.layers.Dense(units=hidden_nodes_layer2_3, activation="tanh"))

# Fourth hidden layer
nn_sen.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="tanh"))

# Output layer
nn_sen.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_sen.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 9)                 1026      
                                                                 
 dense_1 (Dense)             (None, 7)                 70        
                                                                 
 dense_2 (Dense)             (None, 7)                 56        
                                                                 
 dense_3 (Dense)             (None, 5)                 40        
                                                                 
 dense_4 (Dense)             (None, 1)                 6         
                                                                 
Total params: 1,198
Trainable params: 1,198
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Compile the model
nn_sen.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [19]:
# Train the model
fit_model = nn_sen.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [20]:
# Evaluate the Senate model using the test data:
model_loss_1, model_accuracy_1 = nn_sen.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {round(model_loss_1,4)}, Accuracy: {round(model_accuracy_1,4)}")

122/122 - 0s - loss: 0.1704 - accuracy: 0.9574 - 364ms/epoch - 3ms/step
Loss: 0.1704, Accuracy: 0.9574


In [21]:
# Export the model from optimization attempt 2 to HDF5 file
from google.colab import files

output_filepath = "/content/gdrive/MyDrive/DataClassNotebooks/Project-4/output"

# Save model 2, nn_2, and download a copy to local machine:
nn_sen.save(f'{output_filepath}/senate_model.h5')
files.download(f'{output_filepath}/senate_model.h5')

# Save the StandardScaler() instance, senate_scaler, for use in the flask app later:
dump(senate_scaler, f'{output_filepath}/senate_scaler.bin', compress=True)
files.download(f'{output_filepath}/senate_scaler.bin')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>