In [2]:
# 时间序列数据预测——质量预测
import os
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from matplotlib import pyplot as plt

# Load the CSV export into memory.
# "utf-8-sig" strips a leading UTF-8 byte-order mark; without it the mark
# stays glued to the first column name (header printed as '\ufeffDate-Time').
# Files without a byte-order mark are read unchanged.
fname = os.path.join("self_testing_data_2022_20_good.csv")
with open(fname, encoding="utf-8-sig") as f:
    data = f.read()

# Drop blank lines (e.g. the empty entry produced by a trailing newline):
# they hold no fields and cannot be parsed as a data row further down.
lines = [row for row in data.splitlines() if row.strip()]
header = lines[0].split(",")
lines = lines[1:]
print(f"header: {header}")

# Pre-allocate the target vector and the feature matrix.
# np.zeros(shape, dtype, order) returns a zero-filled array:
#     shape - array dimensions;
#     dtype - element type, numpy.float64 by default;
#     order - "C" for row-major, "F" for column-major layout.
n_rows = len(lines)
pressure_drop = np.zeros((n_rows,))
print(f"shape of pressure_drop: {pressure_drop.shape}")
# One column per CSV field except the first one (dropped below).
raw_data = np.zeros((n_rows, len(header) - 1))
print(f"shape of raw_data: {raw_data.shape}")

# Parse every row: skip the first field, convert the rest to float.
for row_idx, row_text in enumerate(lines):
    fields = row_text.split(",")
    row_values = list(map(float, fields[1:]))
    # NOTE(review): index 1 is the SECOND numeric column. With the header
    # shown in the recorded output (Date-Time, Pressure_Drop, Roundness,
    # Circumference) that is Roundness, not Pressure_Drop - confirm which
    # column is the intended target.
    pressure_drop[row_idx] = row_values[1]
    raw_data[row_idx, :] = row_values

#plt.plot(range(len(pressure_drop)), pressure_drop)
#plt.show()
#plt.plot(range(1440), pressure_drop[:1440])
#plt.show()

# Sample counts for the chronological split: the first 50% of the rows are
# used for training, the next 25% for validation, and whatever remains
# (about 25%) for testing.
total_samples = len(raw_data)
num_train_samples = int(0.5 * total_samples)
num_val_samples = int(0.25 * total_samples)
num_test_samples = total_samples - (num_train_samples + num_val_samples)

# Standardize every feature column (zero mean, unit standard deviation).
# Both statistics are computed on the training rows only and then applied to
# the whole array in place.
mean = raw_data[:num_train_samples].mean(axis=0)
raw_data -= mean
std = raw_data[:num_train_samples].std(axis=0)
# A column that is constant over the training rows has std == 0; dividing by
# it would fill that column with NaN/inf. Use a scale of 1 for such columns
# so they simply keep their centered values.
std[std == 0] = 1.0
raw_data /= std

# Build three window datasets: training, validation and test.
sampling_rate = 6        # period between successive rows kept inside a window
sequence_length = 120    # number of sampled rows per window
batch_size = 256
# Keras pairs the window starting at row i with targets[i]. Because the
# targets are sliced by `delay`, each window is paired with the value `delay`
# rows after its first row, i.e. 24 * sampling_rate rows after its last row.
delay = sampling_rate * (sequence_length + 24 - 1)


def _window_dataset(start_index, end_index=None):
    """Return a shuffled, batched window dataset for the given index range.

    `start_index` and `end_index` are forwarded unchanged to
    `keras.utils.timeseries_dataset_from_array`; every other argument is the
    same for all three splits.
    """
    return keras.utils.timeseries_dataset_from_array(
        raw_data[:-delay],
        targets=pressure_drop[delay:],
        sampling_rate=sampling_rate,
        sequence_length=sequence_length,
        shuffle=True,
        batch_size=batch_size,
        start_index=start_index,
        end_index=end_index,
    )


_val_start = num_train_samples
_test_start = num_train_samples + num_val_samples

train_dataset = _window_dataset(0, num_train_samples)
val_dataset = _window_dataset(_val_start, _test_start)
# No end index: the test split runs to the end of the available rows.
test_dataset = _window_dataset(_test_start)

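# Mean absolute error (MAE): np.mean(np.abs(preds - targets))

# Method 1: MAE of a common-sense baseline
#     Previously noted output: Validation MAE: 93.42
#                              Test MAE: 93.79
#     (the output captured for this notebook run shows 0.04 for both)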

def evaluate_naive_method(dataset, target_index=1, target_mean=None,
                          target_std=None):
    """Return the MAE of a "last observed value" baseline over `dataset`.

    For every window the prediction is the value of feature column
    `target_index` at the window's final timestep, converted back from
    normalized units with `value * target_std + target_mean`.

    Args:
        dataset: Iterable of `(samples, targets)` batches, where `samples`
            is indexed as `[batch, timestep, feature]`.
        target_index: Feature column used as the prediction. Defaults to 1,
            the column the script also copies into `pressure_drop`.
        target_mean: Offset used to undo the normalization. Defaults to the
            module-level `mean[target_index]`.
        target_std: Scale used to undo the normalization. Defaults to the
            module-level `std[target_index]`.

    Returns:
        The summed absolute error divided by the number of windows seen.

    Raises:
        ValueError: If `dataset` yields no samples (this previously surfaced
            as an uninformative ZeroDivisionError).
    """
    # Fall back to the script-level normalization statistics when the caller
    # does not supply explicit ones.
    if target_mean is None:
        target_mean = mean[target_index]
    if target_std is None:
        target_std = std[target_index]

    total_abs_err = 0.
    samples_seen = 0
    for samples, targets in dataset:
        # Last timestep of each window, mapped back to the original scale.
        preds = samples[:, -1, target_index] * target_std + target_mean
        total_abs_err += np.sum(np.abs(preds - targets))
        samples_seen += samples.shape[0]

    if samples_seen == 0:
        raise ValueError("evaluate_naive_method: dataset yielded no samples")
    return total_abs_err / samples_seen

# Report the baseline error on the validation split, then on the test split.
val_mae = evaluate_naive_method(val_dataset)
print(f"Validation MAE: {val_mae:.2f}")
test_mae = evaluate_naive_method(test_dataset)
print(f"Test MAE: {test_mae:.2f}")


# Method 2: train and evaluate a densely connected model.
#     Recorded output: Test MAE: 1504.79
# The code below is disabled: it sits inside a bare triple-quoted string, so
# it is evaluated as a string expression and none of it runs.
"""
inputs = keras.Input(shape=(sequence_length, raw_data.shape[-1]))
x = layers.Flatten()(inputs)
x = layers.Dense(16, activation="relu")(x)
outputs = layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

callbacks = [
    keras.callbacks.ModelCheckpoint("self_test_data.keras",
                                   save_best_only=True)
]
model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
history = model.fit(train_dataset,
                    epochs=10,
                    validation_data=val_dataset,
                    callbacks=callbacks)
model = keras.models.load_model("self_test_data.keras")
print(f"Test MAE: {model.evaluate(test_dataset)[1]:.2f}")

# 绘制结果
"""

# Method 3: 1D convolutional model  ***** results noted as very good *****
#     Recorded output: Test MAE: 68.94
# The code below is disabled: it sits inside a bare triple-quoted string
# (which also holds a short description of the layers), so none of it runs.
""" 方法三：一维卷积神经网络(1D CNNs)模型
    layers.Conv1D(filters, kernel_size, activation="relu")(input_shape)
        -- filters: 卷积中滤波器的数量, 滤波器也成为特征检测器，代表输出空间的第三维度。
        -- kernel_size: 卷积核的大小。
    layers.MaxPooling1D()
    layers.GlobalAveragePooling1D()
    layers.Dense()()

inputs = keras.Input(shape=(sequence_length, raw_data.shape[-1]))
x = layers.Conv1D(8, 24, activation="relu")(inputs)
x = layers.MaxPooling1D(2)(x)
x = layers.Conv1D(8, 12, activation="relu")(x)
x = layers.MaxPooling1D(2)(x)
x = layers.Conv1D(8, 6, activation="relu")(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

callbacks = [
    keras.callbacks.ModelCheckpoint("self_test_conv.keras",
                                   save_best_only=True)
]
model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
history = model.fit(train_dataset,
                    epochs=10,
                    validation_data=val_dataset,
                    callbacks=callbacks)
model = keras.models.load_model("self_test_conv.keras")
print(f"Test MAE: {model.evaluate(test_dataset)[1]:.2f}")
"""
# Plot the results




# Method 4: a simple LSTM-based model.
#     Recorded output: Test MAE: 3795.17
# The code below is disabled: it sits inside a bare triple-quoted string, so
# it is evaluated as a string expression and none of it runs.
"""
inputs = keras.Input(shape=(sequence_length, raw_data.shape[-1]))
x = layers.LSTM(16)(inputs)    # 关键差异
outputs = layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

callbacks = [
    keras.callbacks.ModelCheckpoint("self_test_lstm.keras",
                                   save_best_only=True)
]
model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
history = model.fit(train_dataset,
                    epochs=10,
                    validation_data=val_dataset,
                    callbacks=callbacks)
model = keras.models.load_model("self_test_lstm.keras")
print(f"Test MAE: {model.evaluate(test_dataset)[1]:.2f}")

# 绘制结果
"""

# Method 5: LSTM model regularized with dropout.
#     Recorded output: Test MAE: 2.51
#     Run time: about 2.36 hours
# The code below is disabled: it sits inside a bare triple-quoted string, so
# none of it runs. Being the last expression of the cell, the string itself
# is what the notebook echoes as the cell result.
# The MAE-curve plotting code at the end is part of this disabled string too.
# NOTE(review): the checkpoint is saved as "jena_lstm_dropout.keras", unlike
# the "self_test_*.keras" names used by methods 2-4 - confirm this is intended.
"""
inputs = keras.Input(shape=(sequence_length, raw_data.shape[-1]))
x = layers.LSTM(32, recurrent_dropout=0.25)(inputs)    # 关键差异
x = layers.Dropout(0.5)(x)    # 关键差异
outputs = layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

callbacks = [
    keras.callbacks.ModelCheckpoint("jena_lstm_dropout.keras",
                                   save_best_only=True)
]
model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
history = model.fit(train_dataset,
                    epochs=50,
                    validation_data=val_dataset,
                    callbacks=callbacks)
model = keras.models.load_model("jena_lstm_dropout.keras")
print(f"Test MAE: {model.evaluate(test_dataset)[1]:.2f}")

# 绘制结果


# 绘制结果
loss = history.history["mae"]
val_loss = history.history["val_mae"]
epochs = range(1, len(loss) + 1)
plt.figure()
plt.plot(epochs, loss, "bo", label="Training MAE")
plt.plot(epochs, val_loss, "b", label="Validation MAE")
plt.title("Training and validation MAE")
plt.legend()
plt.show()
"""

header: ['\ufeffDate-Time', 'Pressure_Drop', 'Roundness', 'Circumference']
shape of pressure_drop: (872888,)
shape of raw_data: (872888, 3)
Validation MAE: 0.04
Test MAE: 0.04


'\ninputs = keras.Input(shape=(sequence_length, raw_data.shape[-1]))\nx = layers.LSTM(32, recurrent_dropout=0.25)(inputs)    # 关键差异\nx = layers.Dropout(0.5)(x)    # 关键差异\noutputs = layers.Dense(1)(x)\nmodel = keras.Model(inputs, outputs)\n\ncallbacks = [\n    keras.callbacks.ModelCheckpoint("jena_lstm_dropout.keras",\n                                   save_best_only=True)\n]\nmodel.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])\nhistory = model.fit(train_dataset,\n                    epochs=50,\n                    validation_data=val_dataset,\n                    callbacks=callbacks)\nmodel = keras.models.load_model("jena_lstm_dropout.keras")\nprint(f"Test MAE: {model.evaluate(test_dataset)[1]:.2f}")\n\n# 绘制结果\n\n\n# 绘制结果\nloss = history.history["mae"]\nval_loss = history.history["val_mae"]\nepochs = range(1, len(loss) + 1)\nplt.figure()\nplt.plot(epochs, loss, "bo", label="Training MAE")\nplt.plot(epochs, val_loss, "b", label="Validation MAE")\nplt.title("Training and val