In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions this notebook was run with.
print(tf.__version__)
print(sys.version_info)
print(*(f"{lib.__name__} {lib.__version__}"
        for lib in (mpl, np, pd, sklearn, tf, keras)),
      sep="\n")

In [None]:
# Local copies of the Titanic CSV files; the originals are published at
#   https://storage.googleapis.com/tf-datasets/titanic/train.csv
#   https://storage.googleapis.com/tf-datasets/titanic/eval.csv
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

# Load both splits, then preview the first rows of each.
train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)
print(train_df.head(), eval_df.head(), sep="\n")

In [None]:
# Separate the 'survived' label from the features. DataFrame.pop removes the
# column from the frame in place, so train_df / eval_df hold only features
# afterwards (and the frames must be reloaded before running this cell again).
y_train, y_eval = train_df.pop('survived'), eval_df.pop('survived')

# Preview the remaining features first, then the extracted labels.
print(train_df.head(), eval_df.head(), y_train.head(), y_eval.head(),
      sep="\n")

In [None]:
# Summary statistics of the training features; as the last expression in the
# cell, the resulting table is rendered as the cell output.
train_df.describe()

In [None]:
# Columns treated as categories (one-hot encoded) vs. plain numeric inputs.
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []

# Categorical features: the vocabulary is the set of distinct values observed
# in the training data; each column is wrapped in an indicator column.
for column_name in categorical_columns:
    vocab = train_df[column_name].unique()
    print(column_name, vocab)
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        column_name, vocab)
    feature_columns.append(tf.feature_column.indicator_column(vocab_column))

# Numeric features are passed through as float32 values, appended after the
# categorical ones.
feature_columns.extend(
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in numeric_columns)

In [None]:
def make_dataset(data_df, label_df, epochs = 10, shuffle = True,
                 batch_size = 32, shuffle_buffer_size = 10000, seed = None):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        data_df: Feature table (the callers in this notebook pass a pandas
            DataFrame). It is converted with ``dict(data_df)`` so that each
            element's features form a mapping from column name to value.
        label_df: Labels aligned row by row with ``data_df``.
        epochs: Number of passes over the data (forwarded to ``repeat``).
        shuffle: Whether to shuffle the examples.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the shuffle buffer; only used when
            ``shuffle`` is true. Defaults to the previously hard-coded 10000.
        seed: Optional random seed forwarded to ``Dataset.shuffle`` for
            reproducible ordering. ``None`` keeps the unseeded behavior.

    Returns:
        The dataset after the optional shuffle, ``repeat(epochs)`` and
        ``batch(batch_size)``, applied in that order.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(data_df), label_df))
    if shuffle:
        # Shuffle before repeat/batch so individual examples (not whole
        # batches) are shuffled on each pass over the data.
        dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)
    return dataset.repeat(epochs).batch(batch_size)

In [None]:
# How BaselineClassifier works: it measures the proportion of each class among
# the training labels to form a distribution, and guesses the label of new
# inputs from that distribution. It therefore only provides a baseline to
# compare the real models against.

output_dir = 'baseline_model'  # directory where the estimator saves the model
# makedirs(exist_ok=True) creates the directory only when it is missing,
# without a separate exists() check that could race with another process.
os.makedirs(output_dir, exist_ok=True)

# Create the baseline estimator.
# NOTE(review): estimators appear to resume from checkpoints already present
# in model_dir, so re-running this cell would continue training - confirm.
baseline_estimator = tf.estimator.BaselineClassifier(
    model_dir=output_dir,
    n_classes=2)

# Train; input_fn must be a zero-argument callable that returns the dataset.
baseline_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100))


In [None]:
# Evaluate the baseline on the held-out split: a single unshuffled pass.
def baseline_eval_input_fn():
    """Return the evaluation dataset for the baseline estimator."""
    return make_dataset(eval_df, y_eval, epochs=1, shuffle=False,
                        batch_size=20)

baseline_estimator.evaluate(input_fn=baseline_eval_input_fn)

In [None]:
# Apply a linear classifier.
linear_output_dir = 'linear_model'  # training artifacts are saved in this directory
# makedirs(exist_ok=True) creates the directory only when it is missing,
# without a separate exists() check that could race with another process.
os.makedirs(linear_output_dir, exist_ok=True)

# feature_columns is required here; the estimator applies these columns to the
# feature dict of the dataset returned by input_fn.
linear_estimator = tf.estimator.LinearClassifier(
    model_dir=linear_output_dir,
    n_classes=2,
    feature_columns=feature_columns)

# Train; input_fn must be a zero-argument callable that returns the dataset.
linear_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100))

In [None]:
# Evaluate the linear model on the held-out split: a single unshuffled pass.
def linear_eval_input_fn():
    """Return the evaluation dataset for the linear estimator."""
    return make_dataset(eval_df, y_eval, epochs=1, shuffle=False)

linear_estimator.evaluate(input_fn=linear_eval_input_fn)

In [None]:
# Apply a DNN classifier (deep neural network model).
dnn_output_dir = './dnn_model'
# makedirs(exist_ok=True) creates the directory only when it is missing,
# without a separate exists() check that could race with another process.
os.makedirs(dnn_output_dir, exist_ok=True)

# Two hidden layers of 128 ReLU units each, optimized with Adam.
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir=dnn_output_dir,
    n_classes=2,
    feature_columns=feature_columns,
    hidden_units=[128, 128],
    activation_fn=tf.nn.relu,
    optimizer='Adam')

# Train; input_fn must be a zero-argument callable that returns the dataset.
dnn_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100))

In [None]:
# Evaluate the DNN model on the held-out split: a single unshuffled pass.
def dnn_eval_input_fn():
    """Return the evaluation dataset for the DNN estimator."""
    return make_dataset(eval_df, y_eval, epochs=1, shuffle=False)

dnn_estimator.evaluate(input_fn=dnn_eval_input_fn)