In [74]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import requests

In [75]:
# Read in data from S3 Buckets
url = "https://group9-capsproj.s3.us-east-2.amazonaws.com/insurance_indexed.csv"
r = requests.get(url)
complete_data_df = pd.read_csv(url)
complete_data_df.head()

Unnamed: 0.1,Unnamed: 0,id,age,average_income,sex,bmi,children,smoker,region,charges
0,0,0,18,"$10,753.50",male,33.77,1,no,southeast,1725.5523
1,1,1,18,"$10,753.50",male,34.1,0,no,southeast,1137.011
2,2,2,18,"$10,753.50",female,26.315,0,no,northeast,2198.18985
3,3,3,18,"$10,753.50",female,38.665,2,no,northeast,3393.35635
4,4,4,18,"$10,753.50",female,35.625,0,no,northeast,2211.13075


In [76]:
# Configure settings for RDS
from getpass import getpass
password = getpass('Enter database password')
mode = "append"
jdbc_url="jdbc:postgresql://capsproj.c3oqll57cdog.us-east-2.rds.amazonaws.com:5432/AWSPROJ"
config = {"user":"postgres", 
          "password": "<password>", 
          "driver":"org.postgresql.Driver"}

In [91]:
url1 = "https://group9-capsproj.s3.us-east-2.amazonaws.com/average_age_by_state.csv"
average_age_by_state_df = pd.read_csv(url1)
average_age_by_state_df.head()

Unnamed: 0,id,region,average_age
0,0,northwest,79.88
1,1,northeast,79.24
2,2,southwest,76.96
3,3,southeast,78.43


In [78]:
complete_data_df['average_income'] = complete_data_df['average_income'].str.replace(",", "")
complete_data_df['average_income'] = complete_data_df['average_income'].str.replace("$", "")
complete_data_df['average_income'] = complete_data_df['average_income'].astype(float)

  


In [79]:
complete_data_df.drop('Unnamed: 0', axis = 1)

Unnamed: 0,id,age,average_income,sex,bmi,children,smoker,region,charges
0,0,18,10753.50,male,33.770,1,no,southeast,1725.55230
1,1,18,10753.50,male,34.100,0,no,southeast,1137.01100
2,2,18,10753.50,female,26.315,0,no,northeast,2198.18985
3,3,18,10753.50,female,38.665,2,no,northeast,3393.35635
4,4,18,10753.50,female,35.625,0,no,northeast,2211.13075
...,...,...,...,...,...,...,...,...,...
1333,1333,64,87985.91,female,31.825,2,no,northeast,16069.08475
1334,1334,64,87985.91,female,26.885,0,yes,northwest,29330.98315
1335,1335,64,87985.91,male,26.410,0,no,northeast,14394.55790
1336,1336,64,87985.91,male,36.960,2,yes,southeast,49577.66240


In [80]:
complete_data_df.nunique()

Unnamed: 0        1338
id                1338
age                 47
average_income      47
sex                  2
bmi                548
children             6
smoker               2
region               4
charges           1337
dtype: int64

In [81]:
complete_data_cat = complete_data_df.dtypes[complete_data_df.dtypes == "object"].index.tolist()
complete_data_df

Unnamed: 0.1,Unnamed: 0,id,age,average_income,sex,bmi,children,smoker,region,charges
0,0,0,18,10753.50,male,33.770,1,no,southeast,1725.55230
1,1,1,18,10753.50,male,34.100,0,no,southeast,1137.01100
2,2,2,18,10753.50,female,26.315,0,no,northeast,2198.18985
3,3,3,18,10753.50,female,38.665,2,no,northeast,3393.35635
4,4,4,18,10753.50,female,35.625,0,no,northeast,2211.13075
...,...,...,...,...,...,...,...,...,...,...
1333,1333,1333,64,87985.91,female,31.825,2,no,northeast,16069.08475
1334,1334,1334,64,87985.91,female,26.885,0,yes,northwest,29330.98315
1335,1335,1335,64,87985.91,male,26.410,0,no,northeast,14394.55790
1336,1336,1336,64,87985.91,male,36.960,2,yes,southeast,49577.66240


In [82]:
complete_data_df.head(5)

Unnamed: 0.1,Unnamed: 0,id,age,average_income,sex,bmi,children,smoker,region,charges
0,0,0,18,10753.5,male,33.77,1,no,southeast,1725.5523
1,1,1,18,10753.5,male,34.1,0,no,southeast,1137.011
2,2,2,18,10753.5,female,26.315,0,no,northeast,2198.18985
3,3,3,18,10753.5,female,38.665,2,no,northeast,3393.35635
4,4,4,18,10753.5,female,35.625,0,no,northeast,2211.13075


In [83]:
dummies = pd.get_dummies(complete_data_df[['sex', 'smoker', 'region']], drop_first=True)
text_features = pd.concat([complete_data_df.drop(['sex', 'smoker', 'region'],axis=1), dummies],axis=1)
text_features.head()

Unnamed: 0.1,Unnamed: 0,id,age,average_income,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,0,0,18,10753.5,33.77,1,1725.5523,1,0,0,1,0
1,1,1,18,10753.5,34.1,0,1137.011,1,0,0,1,0
2,2,2,18,10753.5,26.315,0,2198.18985,0,0,0,0,0
3,3,3,18,10753.5,38.665,2,3393.35635,0,0,0,0,0
4,4,4,18,10753.5,35.625,0,2211.13075,0,0,0,0,0


In [84]:
text_features.drop("Unnamed: 0", axis = 1)

Unnamed: 0,id,age,average_income,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,0,18,10753.50,33.770,1,1725.55230,1,0,0,1,0
1,1,18,10753.50,34.100,0,1137.01100,1,0,0,1,0
2,2,18,10753.50,26.315,0,2198.18985,0,0,0,0,0
3,3,18,10753.50,38.665,2,3393.35635,0,0,0,0,0
4,4,18,10753.50,35.625,0,2211.13075,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1333,1333,64,87985.91,31.825,2,16069.08475,0,0,0,0,0
1334,1334,64,87985.91,26.885,0,29330.98315,0,1,1,0,0
1335,1335,64,87985.91,26.410,0,14394.55790,1,0,0,0,0
1336,1336,64,87985.91,36.960,2,49577.66240,1,1,0,1,0


In [85]:
# Split our preprocessed data into our features and target arrays
y = text_features['charges'].values
X = text_features.drop(['charges'],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

  This is separate from the ipykernel package so we can avoid doing imports until


In [86]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [87]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 60


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="tanh"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="tanh"))

# Check the structure of the model
nn.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 80)                960       
                                                                 
 dense_10 (Dense)            (None, 60)                4860      
                                                                 
 dense_11 (Dense)            (None, 1)                 61        
                                                                 
Total params: 5,881
Trainable params: 5,881
Non-trainable params: 0
_________________________________________________________________


In [88]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [89]:
# Train the model
fit_model = nn.fit(X_train, y_train,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [90]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

11/11 - 0s - loss: 71220.4609 - accuracy: 0.0000e+00 - 85ms/epoch - 8ms/step
Loss: 71220.4609375, Accuracy: 0.0
