# 结构化数据建模流程范例

## 准备数据

## 定义模型

使用Keras接口有以下3种方式构建模型：
1. 使用Sequential按层顺序构建模型；
2. 使用函数式API构建任意结构模型；
3. 继承Model基类构建自定义模型。

以下为Sequential按层顺序模型示例：

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import models,layers

# Reset the Keras backend session before building the model.
tf.keras.backend.clear_session()

# Three fully connected layers stacked in order:
# 15 input features -> 20 (relu) -> 10 (relu) -> 1 (sigmoid).
model = models.Sequential([
    layers.Dense(20, activation='relu', input_shape=(15,)),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                320       
_________________________________________________________________
dense_1 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 541
Trainable params: 541
Non-trainable params: 0
_________________________________________________________________


## 训练模型

训练模型通常有三种方法：
1. 内置fit方法
2. 内置train_on_batch方法
3. 自定义训练循环

In [None]:
# Binary classification problem, so binary cross-entropy is used as the loss.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['AUC'])

# NOTE(review): x_train / y_train are not defined in the visible part of this
# file; presumably they come from the data-preparation section. Confirm.
# NOTE(review): the result is bound to `histroy` (sic), whereas plot_metric
# names its parameter `history`.
histroy = model.fit(x_train,y_train,
                    batch_size=64,
                    epochs=30,
                    validation_split=0.2 # hold out part of the training set for validation
                   )

## 评估模型

In [None]:
def plot_metric(history,metric):
    """Plot the training and validation curves of one metric per epoch.

    Args:
        history: object exposing a ``history`` dict that maps metric names to
            per-epoch value lists (e.g. the value returned by ``model.fit``).
        metric: name of the training metric, e.g. ``'loss'``. The validation
            series is read from the ``'val_' + metric`` key.

    Raises:
        KeyError: if ``metric`` or ``'val_' + metric`` is missing from
            ``history.history``.
    """
    train_metrics = history.history[metric]
    # Keras records validation results under the "val_" prefix; looking up
    # 'val' + metric (without the underscore) raises KeyError.
    val_metrics = history.history['val_' + metric]
    epochs = range(1, len(train_metrics) + 1)
    plt.plot(epochs, train_metrics, 'bo--')
    plt.plot(epochs, val_metrics, 'ro-')
    plt.title('Training and validation ' + metric)
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.legend(['train_' + metric, 'val_' + metric])
    plt.show()

## 使用模型

In [None]:
# NOTE(review): both calls are written without arguments and look like
# placeholders; predict/evaluate presumably need input data (and labels for
# evaluate) to be passed in. Confirm the intended inputs.
model.predict()
model.evaluate()

## 保存模型

Keras方式保存仅适合使用Python环境恢复模型，TensorFlow原生方式可以跨平台进行部署。

# 文本数据建模流程范例

## 准备数据

TensorFlow中完成文本数据预处理的常用方案有两种：
1. 利用tf.keras.preprocessing中的Tokenizer词典构建工具和tf.keras.utils.Sequence构建文本数据生成器管道；
2. 使用tf.data.Dataset搭配keras.layers.experimental.preprocessing.TextVectorization预处理层。

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import models,layers,preprocessing,optimizers,losses,metrics
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import re,string

# Enable on-demand GPU memory growth to work around the cuDNN "cannot run
# convolution" error.
# - The setup is skipped when no GPU is visible, so the script also runs on
#   CPU-only machines instead of aborting on an assert.
# - Growth is enabled on every GPU because TensorFlow requires the setting to
#   be identical across devices.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)


train_data_path = './data/train_imdb'
test_data_path = './data/test_imdb'

MAX_WORDS = 10000 # only consider the 10000 most frequent words
MAX_LEN = 200     # keep a length of 200 words per sample
BATCH_SIZE = 20

# Build the input pipeline
def split_line(line):
    """Parse one tab-separated line into a ``(text, label)`` pair.

    The first field is converted to an int32 label and the second field is
    taken as the text. Both are expanded from scalars to 1-D vectors.
    """
    fields = tf.strings.split(line, '\t')
    label_scalar = tf.strings.to_number(fields[0], tf.int32)
    text_scalar = fields[1]
    # Scalar -> 1-D vector for both outputs.
    label = tf.expand_dims(label_scalar, axis=0)
    text = tf.expand_dims(text_scalar, axis=0)
    return text, label

# Training pipeline: read lines -> parse -> shuffle -> batch -> prefetch.
ds_train_raw = (
    tf.data.TextLineDataset(filenames=[train_data_path])
    .map(split_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(buffer_size=1000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Test pipeline: the same steps (shuffle included) applied to the test file.
ds_test_raw = (
    tf.data.TextLineDataset(filenames=[test_data_path])
    .map(split_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(buffer_size=1000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Build the vocabulary
def clean_text(text):
    """Standardize raw text for vectorization.

    Lowercases the input, replaces ``<br />`` with a space and removes every
    character listed in ``string.punctuation``.
    """
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

vectorize_layer = TextVectorization(
    standardize=clean_text,
    split='whitespace',
    max_tokens=MAX_WORDS - 1,  # one slot is left for the placeholder token
    output_mode='int',
    output_sequence_length=MAX_LEN,
)

# Fit the vocabulary on the training texts only (labels are dropped).
ds_text = ds_train_raw.map(lambda text, label: text)
vectorize_layer.adapt(ds_text)
# Show the first 100 vocabulary entries.
print(vectorize_layer.get_vocabulary()[:100])

# Encode the words: run each text batch through the vectorization layer and
# keep the labels unchanged.
ds_train = (
    ds_train_raw
    .map(lambda text, label: (vectorize_layer(text), label))
    .prefetch(tf.data.experimental.AUTOTUNE)
)
ds_test = (
    ds_test_raw
    .map(lambda text, label: (vectorize_layer(text), label))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

[b'the', b'and', b'a', b'of', b'to', b'is', b'in', b'it', b'i', b'this', b'that', b'was', b'as', b'for', b'with', b'movie', b'but', b'film', b'on', b'not', b'you', b'his', b'are', b'have', b'be', b'he', b'one', b'its', b'at', b'all', b'by', b'an', b'they', b'from', b'who', b'so', b'like', b'her', b'just', b'or', b'about', b'has', b'if', b'out', b'some', b'there', b'what', b'good', b'more', b'when', b'very', b'she', b'even', b'my', b'no', b'would', b'up', b'time', b'only', b'which', b'story', b'really', b'their', b'were', b'had', b'see', b'can', b'me', b'than', b'we', b'much', b'well', b'get', b'been', b'will', b'into', b'people', b'also', b'other', b'do', b'bad', b'because', b'great', b'first', b'how', b'him', b'most', b'dont', b'made', b'then', b'them', b'films', b'movies', b'way', b'make', b'could', b'too', b'any', b'after', b'characters']


## 定义模型

In [2]:
# Reset the Keras backend session before defining the model.
tf.keras.backend.clear_session()

class CnnModel(models.Model):
    """1-D convolutional text classifier ending in a single sigmoid unit.

    Layers are created in ``build`` and applied in ``call`` in this order:
    embedding -> conv_1 -> pool -> conv_2 -> pool -> flatten -> dense.
    """

    def __init__(self):
        super().__init__()

    def build(self,input_shape):
        """Create the layers, then defer to the base-class ``build``."""
        self.embedding = layers.Embedding(MAX_WORDS, 7, input_length=MAX_LEN)
        self.conv_1 = layers.Conv1D(
            16, kernel_size=5, name='conv_1', activation='relu')
        # A single pooling layer instance is applied after both convolutions.
        self.pool = layers.MaxPool1D()
        self.conv_2 = layers.Conv1D(
            128, kernel_size=2, name='conv_2', activation='relu')
        self.flatten = layers.Flatten()
        self.dense = layers.Dense(1, activation='sigmoid')
        super().build(input_shape)

    def call(self,x):
        """Run the forward pass and return the sigmoid output."""
        embedded = self.embedding(x)
        first_stage = self.pool(self.conv_1(embedded))
        second_stage = self.pool(self.conv_2(first_stage))
        flat = self.flatten(second_stage)
        return self.dense(flat)
    
model = CnnModel()
# Build the model for inputs of shape (None, MAX_LEN), then print its summary.
model.build(input_shape=(None,MAX_LEN))
model.summary()

Model: "cnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  70000     
_________________________________________________________________
conv_1 (Conv1D)              multiple                  576       
_________________________________________________________________
max_pooling1d (MaxPooling1D) multiple                  0         
_________________________________________________________________
conv_2 (Conv1D)              multiple                  4224      
_________________________________________________________________
flatten (Flatten)            multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  6145      
Total params: 80,945
Trainable params: 80,945
Non-trainable params: 0
_____________________________________________________

## 训练模型

In [8]:
# Print a separator bar followed by the current time
@tf.function
def printbar():
    """Print a bar of '=' characters followed by the time of day.

    The time is derived from ``tf.timestamp()``. The hour is shifted by +8 and
    wrapped at 24, and single-digit fields are left-padded with a zero.
    """
    seconds_today = tf.timestamp() % (24*60*60)

    hour = tf.cast(seconds_today//3600+8, tf.int32) % tf.constant(24)
    minute = tf.cast((seconds_today % 3600)//60, tf.int32)
    second = tf.cast(tf.floor(seconds_today % 60), tf.int32)

    def two_digits(value):
        # Left-pad single-character values with '0'.
        text = tf.strings.format('{}', value)
        if tf.strings.length(text) == 1:
            return tf.strings.format('0{}', value)
        else:
            return text

    parts = [two_digits(hour), two_digits(minute), two_digits(second)]
    timestring = tf.strings.join(parts, separator=':')
    tf.print('=========='*8, end='')
    tf.print(timestring)

In [9]:
# Optimizer and loss used by the training and validation steps.
optimizer = optimizers.Nadam()
loss_func = losses.BinaryCrossentropy()

# Running loss / accuracy for the training batches.
train_loss = metrics.Mean(name='train_loss')
train_metric = metrics.BinaryAccuracy(name='train_accuracy')

# Running loss / accuracy for the validation batches. The accuracy metric is
# named 'valid_accuracy' so that it matches 'valid_loss' and 'train_accuracy'
# (it was mislabeled 'loss_accuracy').
valid_loss = metrics.Mean(name='valid_loss')
valid_metric = metrics.BinaryAccuracy(name='valid_accuracy')

@tf.function
def train_step(model,features,labels):
    """Run one optimization step on a batch and update the training metrics.

    Computes the loss under a gradient tape, applies the gradients to the
    model's trainable variables, then feeds the batch loss and predictions
    into ``train_loss`` and ``train_metric``.
    """
    with tf.GradientTape() as tape:
        outputs = model(features, training=True)
        batch_loss = loss_func(labels, outputs)
    weights = model.trainable_variables
    grads = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    train_loss.update_state(batch_loss)
    train_metric.update_state(labels, outputs)
    
@tf.function
def valid_step(model,features,labels):
    """Score one batch with ``training=False`` and update the validation metrics."""
    outputs = model(features, training=False)
    valid_loss.update_state(loss_func(labels, outputs))
    valid_metric.update_state(labels, outputs)
    
def train_model(model,ds_train,ds_valid,epochs):
    """Train ``model`` for ``epochs`` epochs with a custom loop.

    Each epoch runs ``train_step`` over ``ds_train`` and ``valid_step`` over
    ``ds_valid``, prints the accumulated metrics, then resets all four metric
    objects so the next epoch starts from zero.
    """
    # The log template has to be adapted to the metrics actually tracked.
    logs = 'Epoch={},Loss:{},Accuracy:{},Valid Loss:{}, Valid Accuaracy:{}'
    tracked = (train_loss, valid_loss, train_metric, valid_metric)

    for epoch in tf.range(1, epochs+1):
        for features, labels in ds_train:
            train_step(model, features, labels)

        for features, labels in ds_valid:
            valid_step(model, features, labels)

        # Logging interval: a modulus of 1 reports every epoch.
        if epoch % 1 == 0:
            printbar()
            values = (epoch, train_loss.result(), train_metric.result(),
                      valid_loss.result(), valid_metric.result())
            tf.print(tf.strings.format(logs, values))
            tf.print('')

        for tracker in tracked:
            tracker.reset_states()
        
# Train for 6 epochs; the test dataset is passed as the validation set.
train_model(model,ds_train,ds_test,epochs=6)

Epoch=1,Loss:0.140176237,Accuracy:0.94895,Valid Loss:0.43160513, Valid Accuaracy:0.8572

Epoch=2,Loss:0.0940719694,Accuracy:0.9671,Valid Loss:0.528031051, Valid Accuaracy:0.86

Epoch=3,Loss:0.0548578352,Accuracy:0.9817,Valid Loss:0.677602291, Valid Accuaracy:0.8564

Epoch=4,Loss:0.0248573553,Accuracy:0.99235,Valid Loss:0.936459482, Valid Accuaracy:0.851

Epoch=5,Loss:0.01326383,Accuracy:0.9956,Valid Loss:1.22467899, Valid Accuaracy:0.8448

Epoch=6,Loss:0.0151005872,Accuracy:0.9948,Valid Loss:1.24381316, Valid Accuaracy:0.8454



## 评估模型
通过自定义训练循环的模型没有经过编译，无法直接使用model.evaluate(ds_valid)方法

In [10]:
def evaluate_model(model,ds_valid):
    """Evaluate ``model`` on ``ds_valid`` and print the loss and accuracy.

    Args:
        model: model passed through to ``valid_step``.
        ds_valid: iterable of ``(features, labels)`` batches.

    Only the validation metrics are touched: they are cleared before the pass
    and again afterwards.
    """
    # Clear the validation metrics first so that state left over from an
    # earlier, unfinished accumulation cannot leak into this result.
    valid_loss.reset_states()
    valid_metric.reset_states()

    for features, labels in ds_valid:
        valid_step(model, features, labels)
    logs = 'Valid Loss:{},Valid Accuracy:{}'
    tf.print(tf.strings.format(logs, (valid_loss.result(), valid_metric.result())))

    # Evaluation never updates train_metric, so it is no longer reset here;
    # resetting it would wipe training state this function does not own.
    valid_loss.reset_states()
    valid_metric.reset_states()

In [11]:
# Evaluate the trained model on the test dataset.
evaluate_model(model,ds_test)

Valid Loss:1.24381328,Valid Accuracy:0.8454


## 使用模型
可以使用以下方法：
* model.predict(ds_test)
* model(x_test)
* model.call(x_test)
* model.predict_on_batch(x_test)

推荐优先使用model.predict(ds_test)方法，既可以对Dataset使用，也可以对Tensor使用。

In [12]:
# Predict on the whole test dataset.
# NOTE(review): ds_test comes from a pipeline that includes shuffle(), so the
# row order of the predictions presumably differs between runs. Confirm.
model.predict(ds_test)

array([[2.0103064e-09],
       [5.9294913e-10],
       [4.9843590e-10],
       ...,
       [9.7289586e-01],
       [2.3512270e-10],
       [1.0000000e+00]], dtype=float32)

In [13]:
# Run the model on the first batch of ds_test in three different ways.
for x_test,_ in ds_test.take(1):
    print(model(x_test))
    # The following two calls are equivalent
    print(model.call(x_test))
    print(model.predict_on_batch(x_test))

tf.Tensor(
[[5.3290337e-01]
 [1.7145459e-06]
 [1.8435133e-08]
 [1.0000000e+00]
 [9.9999869e-01]
 [9.9999976e-01]
 [2.2574291e-01]
 [5.9972596e-01]
 [8.4904391e-01]
 [1.0000000e+00]
 [1.0000000e+00]
 [1.0307520e-15]
 [3.6671326e-11]
 [8.5513610e-01]
 [1.0000000e+00]
 [5.2997330e-14]
 [4.7708176e-02]
 [2.1040199e-13]
 [6.0539908e-20]
 [9.0309720e-07]], shape=(20, 1), dtype=float32)
tf.Tensor(
[[5.3290337e-01]
 [1.7145459e-06]
 [1.8435133e-08]
 [1.0000000e+00]
 [9.9999869e-01]
 [9.9999976e-01]
 [2.2574291e-01]
 [5.9972596e-01]
 [8.4904391e-01]
 [1.0000000e+00]
 [1.0000000e+00]
 [1.0307520e-15]
 [3.6671326e-11]
 [8.5513610e-01]
 [1.0000000e+00]
 [5.2997330e-14]
 [4.7708176e-02]
 [2.1040199e-13]
 [6.0539908e-20]
 [9.0309720e-07]], shape=(20, 1), dtype=float32)
[[5.3290337e-01]
 [1.7145459e-06]
 [1.8435133e-08]
 [1.0000000e+00]
 [9.9999869e-01]
 [9.9999976e-01]
 [2.2574291e-01]
 [5.9972596e-01]
 [8.4904391e-01]
 [1.0000000e+00]
 [1.0000000e+00]
 [1.0307520e-15]
 [3.6671326e-11]
 [8.5513610e-

## 保存模型

In [15]:
# Export the model with save_format='tf', then reload it and predict again.
model.save('./data/tf_model_savemodel_imbd',save_format='tf')
print('export saved model')

model_loaded = tf.keras.models.load_model('./data/tf_model_savemodel_imbd',compile=False)
# Passing compile=False suppresses the warning that the model has no training configuration
model_loaded.predict(ds_test)

INFO:tensorflow:Assets written to: ./data/tf_model_savemodel_imbd/assets
export saved model


array([[9.9904126e-01],
       [1.6086851e-13],
       [1.0053123e-12],
       ...,
       [7.4865986e-03],
       [9.9999082e-01],
       [1.0000000e+00]], dtype=float32)