In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [2]:
import numpy as np
import json

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from google.cloud import bigquery
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder

In [4]:
# Here we're getting 4 features from the original dataset, along with baby weight (the thing our model will predict). 
#The dataset goes back many years but for this model we'll use only data from after 2000:

query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 10000
"""
df = bigquery.Client().query(query).to_dataframe()
df.head()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,6.68662,True,18,1,43.0
1,9.360828,True,32,1,41.0
2,8.437091,False,30,1,39.0
3,6.124442,False,24,1,40.0
4,7.12534,False,26,1,41.0


For larger datasets, use the BigQuery connector instead of the Pandas integration.(https://github.com/tensorflow/io/tree/master/tensorflow_io/bigquery)

In [5]:
df.describe()

Unnamed: 0,weight_pounds,mother_age,plurality,gestation_weeks
count,9989.0,10000.0,10000.0,9890.0
mean,7.297602,27.2989,1.0344,38.699798
std,1.291685,6.165838,0.192926,2.539957
min,0.612885,12.0,1.0,17.0
25%,6.624891,22.0,1.0,38.0
50%,7.374463,27.0,1.0,39.0
75%,8.124034,32.0,1.0,40.0
max,12.257702,50.0,3.0,47.0


In [6]:
#let's get some data on our boolean column indicating the baby's gender

df['is_male'].value_counts()

True     5150
False    4850
Name: is_male, dtype: int64

In [7]:
df = df.dropna()
df = shuffle(df, random_state=2)

#We are dropping the missing values and shuffling the data

In [8]:
#Next, extract the label column into a separate variable and create a DataFrame with only our features

labels = df['weight_pounds']
data = df.drop(columns=['weight_pounds'])

In [9]:
#Since is_male is a boolean, we'll convert it to an integer so that all inputs to our model are numeric

data['is_male'] = data['is_male'].astype(int)

In [10]:
data.head()

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
39,1,32,1,41.0
6132,0,28,1,30.0
5986,0,44,1,38.0
7682,0,34,1,38.0
4910,1,31,1,40.0


Splitting the Data into train and test split

In [11]:
x,y = data,labels
x_train,x_test,y_train,y_test = train_test_split(x,y)

Building and Training the TensorFlow model

In [12]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(len(x_train.iloc[0]),)),
    Dense(32, activation='relu'),
    Dense(1)]
)

2021-10-11 13:48:40.013464: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2199995000 Hz
2021-10-11 13:48:40.015500: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5610d55c8be0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-10-11 13:48:40.015571: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2021-10-11 13:48:40.015770: I tensorflow/core/common_runtime/process_util.cc:147] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Then we'll compile our model so we can train it. Here we'll choose the model's optimizer, loss function, and metrics we'd like the model to log during training. Since this is a regression model (predicting a numerical value), we're using mean squared error instead of accuracy as our metric:



In [13]:
model.compile(optimizer=tf.keras.optimizers.RMSprop(),
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['mae', 'mse'])

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                320       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 2,433
Trainable params: 2,433
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.fit(x_train, y_train, epochs=10, validation_split=0.1)

Train on 6670 samples, validate on 742 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f9ebc419950>

Generating predictions on test examples

In [16]:
num_examples = 10
predictions = model.predict(x_test[:num_examples])

And then we'll iterate over our model's predictions, comparing them to the actual value

In [17]:
for i in range(num_examples):
    print('Predicted val: ', predictions[i][0])
    print('Actual val: ',y_test.iloc[i])
    print()

Predicted val:  7.1617064
Actual val:  7.67429134022

Predicted val:  6.898228
Actual val:  7.62578964258

Predicted val:  7.0596375
Actual val:  8.1240343547

Predicted val:  6.9613814
Actual val:  8.1681268071

Predicted val:  7.326471
Actual val:  7.1429772888

Predicted val:  8.025053
Actual val:  8.1460805809

Predicted val:  7.2405424
Actual val:  7.3744626639

Predicted val:  6.8723507
Actual val:  8.24969784404

Predicted val:  7.1835804
Actual val:  7.68751907594

Predicted val:  7.5247025
Actual val:  6.3118345610599995



Prepare data for the What-If Tool

In [28]:
wit_data = pd.concat([x_test, y_test], axis=1)

In [30]:
def custom_predict(wit_data):
    preds = model.predict(wit_data)
    return preds

In [31]:
config_builder = (WitConfigBuilder(wit_data[:500].values.tolist(), data.columns.tolist() + ['weight_pounds'])
  .set_custom_predict_fn(custom_predict)
  .set_target_feature('weight_pounds')
  .set_model_type('regression'))
WitWidget(config_builder, height=800)

WitWidget(config={'model_type': 'regression', 'label_vocab': [], 'feature_names': ['is_male', 'mother_age', 'p…

AttributeError: 'Series' object has no attribute 'reshape'

In [68]:
config_builder = WitConfigBuilder(wit_data[:500]).set_estimator_and_feature_spec(
    regressor, weight_pounds).set_model_type('regression')
WitWidget(config_builder, height=800)

KeyError: 0