# Title

This demo will take you through the following steps:
- Query data from Big Query using datalab.bigquery
- Create Pandas dataframe and generate descriptive statistics
- Visualize your data using Matplotlib
- Create first heuristic as a benchmark

In [4]:
#BUCKET = 'erwinh-ml-demo'
#PROJECT = 'erwinh-mldemo'
#REGION = 'us-central1'

In [5]:
#import os
#os.environ['BUCKET'] = BUCKET
#os.environ['PROJECT'] = PROJECT
#os.environ['REGION'] = REGION

In [6]:
#gcs_data_dir = 'gs://{0}/data/financialtimeseries/'.format(BUCKET)
#gcs_model_dir = 'gs://{0}/ml-models/financialtimeseries/'.format(BUCKET)

# Libraries

The ones that we need for this demo are:
- BigQuery
- Seaborn
- Pandas
- Matplotlib
- Sys

In [35]:
#import the needed packages. 

#BigQuery package
import google.datalab.bigquery as bq

#Data Libraries 
#import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shutil
import tensorflow as tf

import sys

%matplotlib inline
tf.logging.set_verbosity(tf.logging.ERROR)

# Fetch Data from BigQuery

Now we need to get the data from Big Query. We do this using the BQ library that lets us run queries using the BQ API. 

In [9]:
# General query for the training set
query ="""
SELECT
  Survived,
  Pclass, 
  Age, 
  Fare 
FROM
  Titanic.Train
"""

In [10]:
# General query for the test set

query2 ="""
SELECT
  Pclass, 
  Age, 
  Fare 
FROM
  Titanic.Test
"""

In [11]:
#Here we execute the query and create a Pandas Dataframe
titanic_train = bq.Query(query).execute().result().to_dataframe()

#Lets fetch 5 records 
titanic_train.head(5)

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,1,,0.0
1,0,1,,0.0
2,0,1,40.0,0.0
3,0,1,38.0,0.0
4,0,1,39.0,0.0


In [12]:
# We can then use the Pandas dataframe to explore the data
titanic_train.describe()

Unnamed: 0,Survived,Pclass,Age,Fare
count,891.0,891.0,714.0,891.0
mean,0.383838,2.308642,29.699118,32.204208
std,0.486592,0.836071,14.526497,49.693429
min,0.0,1.0,0.42,0.0
25%,0.0,2.0,20.125,7.9104
50%,0.0,3.0,28.0,14.4542
75%,1.0,3.0,38.0,31.0
max,1.0,3.0,80.0,512.3292


In [13]:
# We do the same for the test set
titanic_test = bq.Query(query2).execute().result().to_dataframe()
titanic_test.head(5)

Unnamed: 0,Pclass,Age,Fare
0,1,59.0,51.4792
1,1,51.0,39.4
2,1,67.0,221.7792
3,1,76.0,78.85
4,1,58.0,512.3292


# Save to data to csv

Now we want to save the data to a csv file. This is a best practice for if we have 'Big Data'. This scales better and we can use the CSV files in our input pipeline that we are building later on. 

In [14]:
# We save the training set and test set as a csv
titanic_train.to_csv('data/train-data.csv', header=False, index_label=False, index=False)
titanic_test.to_csv('data/test-data.csv', header=False, index_label=False, index=False)

In [15]:
%%bash
ls data/.

test-data.csv
train-data.csv


In [16]:
# Now lets check if the csv are created correctly. 
df_train = pd.read_csv('data/train-data.csv', header=None)
df_test = pd.read_csv('data/test-data.csv', header=None)

print("training instance:{}".format(len(df_train)))
print("test instance:{}".format(len(df_test)))

training instance:891
test instance:418


# Upload datasets to Google Cloud Storage (GCS)

Now we are going to store our CSV files in a Google Cloud Storage Bucket. This way we can scale easily and re-use the data whenever we want to. 

In [17]:
# Migrate data to GCS
%%bash 
gsutil -m cp data/*-data.csv gs://erwinh-ml-demo/data/titanic

Copying file://data/test-data.csv [Content-Type=text/csv]...
Copying file://data/train-data.csv [Content-Type=text/csv]...
/ [0/2 files][    0.0 B/ 17.4 KiB]   0% Done                                    / [0/2 files][    0.0 B/ 17.4 KiB]   0% Done                                    / [1/2 files][ 17.4 KiB/ 17.4 KiB]  99% Done                                    / [2/2 files][ 17.4 KiB/ 17.4 KiB] 100% Done                                    
Operation completed over 2 objects/17.4 KiB.                                     


In [19]:
# Lets first check what Tensorflow version we are using. 
print(tf.__version__)

1.5.0


In [20]:
# We can store the path for the data in a variable
file_path = "gs://erwinh-ml-demo/data/titanic/train-data.csv"
file_test = "gs://erwinh-ml-demo/data/titanic/test-data.csv"

# Setup metadata

We need to define the metadata that we are going to use for training our ML model.

In [21]:
#Define header names
HEADER = ['Survived',
           'Pclass', 
           'Age', 
           'Fare']

#Setup default values for missing
DEFAULTS = [[0], [2.0], [29.0], [32.0]]

NUMERIC_FEATURE_NAMES = ['Pclass', 
                         'Age', 
                         'Fare']

CATEGORICAL_FEATURE_NAMES = []

#Variable with all the feature names
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

#Set Target
TARGET_NAME = 'Survived'

TARGET_VALUES = [1,0]

#Unused features
UNUSED_FEATURE_NAMES = set(HEADER) - set(FEATURE_NAMES) - set([TARGET_NAME])

#Print features + unused features
print("Header: {}".format(HEADER))
print("Numeric Features: {}".format(NUMERIC_FEATURE_NAMES))
print("Categorical Features: {}".format(CATEGORICAL_FEATURE_NAMES))
print("Target: {} - labels: {}".format(TARGET_NAME, TARGET_VALUES))
print("Unused Features: {}".format(UNUSED_FEATURE_NAMES))

Header: ['Survived', 'Pclass', 'Age', 'Fare']
Numeric Features: ['Pclass', 'Age', 'Fare']
Categorical Features: []
Target: Survived - labels: [1, 0]
Unused Features: set([])


# Setup input function

We use a inpute function that we can re-use for different models and different datasets. For this we use the Dataset API. 

In [22]:
def input_fn(file_path, perform_shuffle=False, repeat_count=1):
  def decode_csv(line):
    # Convert CSV records to tensors. Each column maps to one tensor.
    parsed_line = tf.decode_csv(line, DEFAULTS)
    label = parsed_line[0]
    del parsed_line[0]
    features = parsed_line
    d = dict(zip(FEATURE_NAMES, features)), label
    return d
  
  dataset = (tf.data.TextLineDataset(file_path)
            .skip(1)
            .map(decode_csv))
  
  if perform_shuffle:
        # Randomizes input using a window of 256 elements (read into memory)
        dataset = dataset.shuffle(buffer_size=256)
  dataset = dataset.repeat(repeat_count)  # Repeats dataset this # times
  dataset = dataset.batch(32)  # Batch size to use
  iterator = dataset.make_one_shot_iterator()
  batch_features, batch_labels = iterator.get_next()
  return batch_features, batch_labels
  

In [23]:
next_batch = input_fn(file_path, True) # Will return 32 random elements

# Now let's try it out, retrieving and printing one batch of data.
# Although this code looks strange, you don't need to understand
# the details.
with tf.Session() as sess:
    first_batch = sess.run(next_batch)
print(first_batch)

({'Fare': array([ 39.6   ,  53.1   ,  15.5   ,  10.5   ,  73.5   , 263.    ,
         7.75  ,  21.    ,  31.    ,   7.8542,  13.    ,   7.75  ,
        13.    ,  18.    ,  11.5   ,  30.5   ,  32.3208,  26.55  ,
        26.55  ,  15.5   ,  26.55  ,  13.    ,  10.5   ,   7.75  ,
         0.    ,  25.925 ,  29.125 ,  24.    ,  35.5   ,   7.25  ,
        10.5   ,   7.7292], dtype=float32), 'Age': array([29. , 37. , 40. , 66. , 21. , 19. , 31. , 27. , 29. , 14. , 30. ,
       45. , 25. , 31. , 21. , 55. , 61. , 29. , 60. , 29. , 56. , 39. ,
       36. , 70.5, 39. , 29. ,  2. , 30. , 45. , 29. , 28. , 29. ],
      dtype=float32), 'Pclass': array([1., 1., 3., 2., 2., 1., 3., 2., 1., 3., 2., 3., 2., 3., 2., 1., 1.,
       1., 1., 3., 1., 2., 2., 3., 1., 1., 3., 2., 1., 3., 2., 3.],
      dtype=float32)}, array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32))


In [24]:
def create_feature_columns():
  
  Pclass=tf.feature_column.numeric_column('Pclass')
  Age=tf.feature_column.numeric_column('Age')
  Fare=tf.feature_column.numeric_column('Fare')

  features_columns = [Pclass, Age, Fare]
  
  return features_columns

In [25]:
feature_columns = create_feature_columns()

# Build Machine Learning Model

Now its time to build a Machine Learning model. Here we will initiate our estimator and train it using the training dataset that we have created earlier. 

In [28]:
# Here we specify some of the metadata that we are using in our estimator model
num_hidden_units =[512, 256, 128] 
number_classes = 2
directory = "./Checkpoints/checkpoints_tutorial17-2/"

In [29]:
# Now we will initate our DNNClassifier
classifier = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=num_hidden_units,
    n_classes = number_classes, 
    model_dir=directory)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3707b0c990>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': './Checkpoints/checkpoints_tutorial17-2/', '_save_summary_steps': 100}


In [30]:
classifier.train(
    input_fn=lambda: input_fn(file_path, True, 100))

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from ./Checkpoints/checkpoints_tutorial17-2/model.ckpt-2782
INFO:tensorflow:Saving checkpoints for 2783 into ./Checkpoints/checkpoints_tutorial17-2/model.ckpt.
INFO:tensorflow:loss = 23.453457, step = 2783
INFO:tensorflow:global_step/sec: 162.66
INFO:tensorflow:loss = 25.080727, step = 2883 (0.616 sec)
INFO:tensorflow:global_step/sec: 175.399
INFO:tensorflow:loss = 15.017276, step = 2983 (0.570 sec)
INFO:tensorflow:global_step/sec: 171.328
INFO:tensorflow:loss = 23.791424, step = 3083 (0.584 sec)
INFO:tensorflow:global_step/sec: 159.774
INFO:tensorflow:loss = 10.750647, step = 3183 (0.626 sec)
INFO:tensorflow:global_step/sec: 162.342
INFO:tensorflow:loss = 24.38977, step = 3283 (0.616 sec)
INFO:tensorflow:global_step/sec: 155.851
INFO:tensorflow:loss = 19.440926, step = 3383 (0.642 sec)
INFO:tensorflow:global_step/sec: 155.546
INFO:tensorflow:loss = 18.085102, step = 3483 (0.644 sec)
INFO:tensorflow:global

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f3707dba5d0>

# Now we need to check how our model performs on our test dataset. 

For this we will create a new input function.

In [31]:
# Test input function 
def test_input_fn(file_path, perform_shuffle=False, repeat_count=1):
  def decode_csv(line):
    # Convert CSV records to tensors. Each column maps to one tensor.
    parsed_line = tf.decode_csv(line, DEFAULTS)
    label = parsed_line[0]
    del parsed_line[0]
    features = parsed_line
    d = dict(zip(FEATURE_NAMES, features)), label
    return d
  
  dataset = (tf.data.TextLineDataset(file_path)
            .skip(1)
            .map(decode_csv))
  
  if perform_shuffle:
        # Randomizes input using a window of 256 elements (read into memory)
        dataset = dataset.shuffle(buffer_size=256)
  dataset = dataset.repeat(repeat_count)  # Repeats dataset this # times
  dataset = dataset.batch(32)  # Batch size to use
  iterator = dataset.make_one_shot_iterator()
  batch_features, batch_labels = iterator.get_next()
  return batch_features, batch_labels

In [32]:
# Evaluate our model using the examples contained in FILE_TEST
# Return value will contain evaluation_metrics such as: loss & average_loss
evaluate_result = classifier.evaluate(
    input_fn=lambda: input_fn(file_path, False, 4))
print("Evaluation results")
for key in evaluate_result:
    print("   {}, was: {}".format(key, evaluate_result[key]))

INFO:tensorflow:Starting evaluation at 2018-03-11-18:09:29
INFO:tensorflow:Restoring parameters from ./Checkpoints/checkpoints_tutorial17-2/model.ckpt-5564
INFO:tensorflow:Finished evaluation at 2018-03-11-18:09:30
INFO:tensorflow:Saving dict for global step 5564: accuracy = 0.7089888, accuracy_baseline = 0.61573035, auc = 0.7325362, auc_precision_recall = 0.6228764, average_loss = 0.6157158, global_step = 5564, label/mean = 0.38426965, loss = 19.570967, prediction/mean = 0.49716985
Evaluation results
   loss, was: 19.5709667206
   accuracy_baseline, was: 0.615730345249
   global_step, was: 5564
   auc, was: 0.732536196709
   prediction/mean, was: 0.497169852257
   label/mean, was: 0.384269654751
   average_loss, was: 0.615715801716
   auc_precision_recall, was: 0.622876405716
   accuracy, was: 0.708988785744
