In [1]:
import numpy
import pandas as pd
import tensorflow as tf

# Source CSVs for the Titanic dataset: the training split and the evaluation split.
TRAIN_DATA_URL, TEST_DATA_URL = (
    "https://storage.googleapis.com/tf-datasets/titanic/train.csv",
    "https://storage.googleapis.com/tf-datasets/titanic/eval.csv",
)

In [3]:
# Download each split into the local Keras cache and keep the resulting path.
# The cache file names are dataset-specific on purpose: get_file reuses an
# existing cached file with the same name instead of downloading again (unless
# a file hash is supplied), so a generic "train.csv" left behind by another
# project would silently be loaded in place of the Titanic data.
train_file_path = tf.keras.utils.get_file("titanic_train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("titanic_eval.csv", TEST_DATA_URL)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv
Downloading data from https://storage.googleapis.com/tf-datasets/titanic/eval.csv


**Loading the downloaded data**

In [10]:
# Name of the CSV column that holds the value to predict.
LABEL_COLUMN = "survived"


def get_dataset(file_path, **kwargs):
    """Build a batched dataset from a CSV file.

    Thin wrapper around ``tf.data.experimental.make_csv_dataset`` that fills
    in the defaults used by this notebook.

    Args:
        file_path: Path of the CSV file to read.
        **kwargs: Extra keyword arguments forwarded to ``make_csv_dataset``
            (for example ``column_names`` or ``select_columns``). Values given
            here take precedence over the notebook defaults, so passing
            ``batch_size=32`` or ``num_epochs=1`` overrides the default
            instead of raising a duplicate-keyword ``TypeError``.

    Returns:
        The dataset produced by ``make_csv_dataset``. Elsewhere in this
        notebook it is iterated as ``(features, label)`` pairs, with
        ``label`` taken from ``LABEL_COLUMN``.
    """
    # Notebook-wide defaults; any of them can be overridden through kwargs.
    options = {
        "batch_size": 5,
        "label_name": LABEL_COLUMN,
        "na_value": "?",
        "num_epochs": 10,
        "ignore_errors": True,
    }
    options.update(kwargs)

    return tf.data.experimental.make_csv_dataset(file_path, **options)

In [11]:
# Build one batched dataset per downloaded CSV: training first, then evaluation.
raw_train_data, raw_test_data = (
    get_dataset(csv_path) for csv_path in (train_file_path, test_file_path)
)

In [16]:
def show_batch(dataset):
    """Print every feature column of the first batch in ``dataset``.

    Only the element(s) yielded by ``dataset.take(1)`` are inspected. Each
    element is unpacked as a ``(features, label)`` pair; the label is not
    printed. One line is written per feature, in the form ``name. values``.
    """
    for features, _label in dataset.take(1):
        for column_name, column_values in features.items():
            print(f"{column_name}. {column_values.numpy()}")
            

# Preview the feature columns of one batch from the evaluation dataset.
show_batch(raw_test_data)

sex. [b'male' b'male' b'male' b'female' b'male']
age. [38. 16. 20. 62. 58.]
n_siblings_spouses. [0 1 0 0 0]
parch. [0 1 0 0 2]
fare. [  8.6625  20.25     4.0125  80.     113.275 ]
class. [b'Third' b'Third' b'Third' b'First' b'First']
deck. [b'unknown' b'unknown' b'unknown' b'B' b'D']
embark_town. [b'Southampton' b'Southampton' b'Cherbourg' b'unknown' b'Cherbourg']
alone. [b'y' b'n' b'y' b'y' b'n']


In [17]:
# This CSV already carries its column names, so the explicit list below is not
# strictly required here; supplying `column_names` is the recommended approach
# for files that do not.
CSV_COLUMNS = [
    "survived",
    "sex",
    "age",
    "n_siblings_spouses",
    "parch",
    "fare",
    "class",
    "deck",
    "embark_town",
    "alone",
]

temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)

show_batch(temp_dataset)

sex. [b'male' b'male' b'male' b'female' b'male']
age. [19. 38. 38. 48. 28.]
n_siblings_spouses. [0 0 0 1 0]
parch. [0 1 0 0 0]
fare. [  7.8958 153.4625   7.05    39.6      7.75  ]
class. [b'Third' b'First' b'Third' b'First' b'Third']
deck. [b'unknown' b'C' b'unknown' b'A' b'unknown']
embark_town. [b'Southampton' b'Southampton' b'Southampton' b'Cherbourg' b'Queenstown']
alone. [b'y' b'n' b'y' b'n' b'y']


In [18]:
# Load only a subset of the CSV's columns via `select_columns`.
# The selection keeps "survived", which get_dataset uses as the label column.
CSV_COLUMNS = [
    "survived",
    "class",
    "deck",
    "embark_town",
    "alone",
]

temp_dataset = get_dataset(train_file_path, select_columns=CSV_COLUMNS)

show_batch(temp_dataset)

class. [b'Third' b'First' b'Second' b'Second' b'First']
deck. [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town. [b'Southampton' b'Cherbourg' b'Southampton' b'Southampton' b'Cherbourg']
alone. [b'n' b'n' b'n' b'n' b'y']
