In [106]:
# 中间数据保存

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

In [107]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

In [108]:
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [109]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

In [110]:
for file_idx, row_indices in enumerate(np.array_split(np.arange(20), 4)):
    print(file_idx, row_indices)

0 [0 1 2 3 4]
1 [5 6 7 8 9]
2 [10 11 12 13 14]
3 [15 16 17 18 19]


In [111]:
# 把特征工程后的数据存为csv文件
output_dir = 'generate_csv'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)


def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    # 生成文件名 格式generate_csv/{}_{:02d}.csv
    path_format = os.path.join(output_dir, '{}_{:02d}.csv')
    filenames = []
    # 把数据分为n_parts份写入文件中
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        # 生成子文件名
        part_csv = path_format.format(name_prefix, file_idx)
        # 文件名添加到列表
        filenames.append(part_csv)
        with open(part_csv, 'w', encoding='utf-8') as f:
            # 先写头部
            if header is not None:
                f.write((header + '\n'))
                for row_index in row_indices:
                    # 把字符串化后的每个字符串用逗号拼接起来
                    f.write(",".join([repr(col) for col in data[row_index]]))
                    f.write('\n')
    return filenames

# np.c_沿第二个轴将切片对象转换为串联
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]
# 头部=特征名+目标名
header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ','.join(header_cols)
print(header_str)
print('-' * 50)
train_filenames = save_to_csv(output_dir, train_data, 'train', header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, 'valid', header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, 'test', header_str, n_parts=10)

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MidianHouseValue
--------------------------------------------------


In [112]:
temp_array=np.array([[1,2,3],[4,5,6]])
# savetxt会自动将整型数或者浮点数转为字符串存储 以空格分开
np.savetxt('temp.csv',temp_array)

In [113]:
print(train_filenames)
import pprint
print('')
print("train filenames:")
pprint.pprint(train_filenames)
print("valid filenames:")
pprint.pprint(valid_filenames)
print("test filenames:")
pprint.pprint(test_filenames)

['generate_csv\\train_00.csv', 'generate_csv\\train_01.csv', 'generate_csv\\train_02.csv', 'generate_csv\\train_03.csv', 'generate_csv\\train_04.csv', 'generate_csv\\train_05.csv', 'generate_csv\\train_06.csv', 'generate_csv\\train_07.csv', 'generate_csv\\train_08.csv', 'generate_csv\\train_09.csv', 'generate_csv\\train_10.csv', 'generate_csv\\train_11.csv', 'generate_csv\\train_12.csv', 'generate_csv\\train_13.csv', 'generate_csv\\train_14.csv', 'generate_csv\\train_15.csv', 'generate_csv\\train_16.csv', 'generate_csv\\train_17.csv', 'generate_csv\\train_18.csv', 'generate_csv\\train_19.csv']

train filenames:
['generate_csv\\train_00.csv',
 'generate_csv\\train_01.csv',
 'generate_csv\\train_02.csv',
 'generate_csv\\train_03.csv',
 'generate_csv\\train_04.csv',
 'generate_csv\\train_05.csv',
 'generate_csv\\train_06.csv',
 'generate_csv\\train_07.csv',
 'generate_csv\\train_08.csv',
 'generate_csv\\train_09.csv',
 'generate_csv\\train_10.csv',
 'generate_csv\\train_11.csv',
 'generat

In [114]:
# # list_files默认行为是按不确定的随机混排顺序返回文件名
filenames_dataset=tf.data.Dataset.list_files(train_filenames)
for filename in filenames_dataset:
    print(filename)

tf.Tensor(b'generate_csv\\train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_00.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\

In [115]:
filenames_mydataset=tf.data.Dataset.from_tensor_slices(train_filenames)
filenames_mydataset=filenames_mydataset.repeat(1)
for i in filenames_mydataset:
    print(i)

tf.Tensor(b'generate_csv\\train_00.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\

In [116]:
# 从文件中读取保存的数据
n_readers=5
dataset=filenames_mydataset.interleave(
    lambda filename:tf.data.TextLineDataset(filename),
    cycle_length = n_readers,block_length=2) # cycle_length和block_length增加获取了数据的随机性
for line in dataset.take(15):
    print(line)

tf.Tensor(b'MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MidianHouseValue', shape=(), dtype=string)
tf.Tensor(b'0.801544314532886,0.27216142415910205,-0.11624392696666119,-0.2023115137272354,-0.5430515742518128,-0.021039615516440048,-0.5897620622908205,-0.08241845654707416,3.226', shape=(), dtype=string)
tf.Tensor(b'MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MidianHouseValue', shape=(), dtype=string)
tf.Tensor(b'0.8115083791797953,-0.04823952235146133,0.5187339067174729,-0.029386394873127775,-0.034064024638222286,-0.05081594842905086,-0.7157356834231196,0.9162751241885168,2.147', shape=(), dtype=string)
tf.Tensor(b'MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MidianHouseValue', shape=(), dtype=string)
tf.Tensor(b'0.401276648075221,-0.9293421252555106,-0.05333050451405854,-0.1865945262276826,0.6545661895448709,0.026434465728210874,0.9312527706398824,-1.4406417263474771,2.512', shape=(), dtype=string

In [117]:
# 每一行数据切分为对应类型
# 通过decode_csv解析(parse)csv   tf.io.decode_csv(str, record_defaults)
sample_str='1,2,3,4,5'
record_defaults=[tf.constant(0,dtype=tf.int32),
                 0,
                 np.nan,
                 'hello1',
                 tf.constant([])] # 没有固定类型默认是float32
#按照record_defaults对sample_str数据格式化
parsed_fields=tf.io.decode_csv(sample_str,record_defaults)
print(parsed_fields)

[<tf.Tensor: shape=(), dtype=int32, numpy=1>, <tf.Tensor: shape=(), dtype=int32, numpy=2>, <tf.Tensor: shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: shape=(), dtype=float32, numpy=5.0>]


In [118]:
# 传一个空字符串测试
try:
    parsed_fields=tf.io.decode_csv(',,,1,2',record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)
print(parsed_fields)

[<tf.Tensor: shape=(), dtype=int32, numpy=0>, <tf.Tensor: shape=(), dtype=int32, numpy=0>, <tf.Tensor: shape=(), dtype=float32, numpy=nan>, <tf.Tensor: shape=(), dtype=string, numpy=b'1'>, <tf.Tensor: shape=(), dtype=float32, numpy=2.0>]


In [119]:
#%%

# 给值过多
try:
    parsed_fields=tf.io.decode_csv(',,,1,2,3',record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Expect 5 fields but have 6 in record 0 [Op:DecodeCSV]


In [120]:
# 解析一行
def parse_csv_line(line,n_fields=9):
    # 初始置为np.nan,若读取到缺失数据就会记录成np.nan
    defs=[tf.constant(np.nan)]*n_fields
    # decode_csv解析
    parsed_fields=tf.io.decode_csv(line,record_defaults=defs)
    x=tf.stack(parsed_fields[0:-1])
    y=tf.stack(parsed_fields[-1:])
    return x,y
print(parse_csv_line(b'-0.9868720801669367,0.832863080552588,-0.18684708416901633,-0.14888949288707784,-0.4532302419670616,-0.11504995754593579,1.6730974284189664,-0.7465496877362412,1.138',
               n_fields=9))

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([-0.9868721 ,  0.8328631 , -0.18684709, -0.1488895 , -0.45323023,
       -0.11504996,  1.6730974 , -0.74654967], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.138], dtype=float32)>)


In [121]:
# filename -> dataset
# read file -> dataset -> datasets -> merge
# parse csv
# 整个流程
def csv_reader_dataset(filenames,n_readers=5,batch_size=32,
                       n_parse_threads=5,
                       shuffle_buffer_size=1000):
    # 文件名类别变为dataset tensor
    dataset=tf.data.Dataset.list_files(filenames)
    # RepeatDataset
    dataset=dataset.repeat()
    dataset=dataset.interleave(
        # skip(1)跳过保存的特征名和目标名
        lambda filename:tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers
    )
    # 对数据进行洗牌变混乱
    dataset.shuffle(shuffle_buffer_size)
    # map传入dataset中的tensor，通过parse_csv_line对数据集进行映射
    dataset=dataset.map(parse_csv_line,
                        num_parallel_calls=n_parse_threads)
    dataset=dataset.batch(batch_size)
    return dataset

In [122]:
# 测试
train_set=csv_reader_dataset(train_filenames,batch_size=4)
print(train_set)
print('-'*50)
i=0
for x_batch,y_batch in train_set.take(2):
    i+=1
    print('x:')
    pprint.pprint(x_batch)
    print('y:')
    pprint.pprint(y_batch)
print(i)

<BatchDataset element_spec=(TensorSpec(shape=(None, 8), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))>
--------------------------------------------------
x:
<tf.Tensor: shape=(4, 8), dtype=float32, numpy=
array([[ 0.8015443 ,  0.27216142, -0.11624393, -0.20231152, -0.5430516 ,
        -0.02103962, -0.5897621 , -0.08241846],
       [ 0.15782312,  0.4323619 ,  0.3379948 , -0.01588031, -0.37338907,
        -0.05305246,  0.80061346, -1.2359096 ],
       [-0.097193  , -1.2497431 ,  0.36232963,  0.02690608,  1.0338118 ,
         0.04588159,  1.3418335 , -1.635387  ],
       [ 0.09734604,  0.75276285, -0.20218964, -0.19547   , -0.40605137,
         0.00678553, -0.81371516,  0.6566148 ]], dtype=float32)>
y:
<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[3.226],
       [3.169],
       [1.832],
       [1.119]], dtype=float32)>
x:
<tf.Tensor: shape=(4, 8), dtype=float32, numpy=
array([[ 0.40127665, -0.92934215, -0.0533305 , -0.18659453,  0.654566

In [123]:
%%time
batch_size = 32
train_set=csv_reader_dataset(train_filenames,batch_size=batch_size)
valid_set=csv_reader_dataset(valid_filenames,batch_size=batch_size)
test_set=csv_reader_dataset(test_filenames,batch_size=batch_size)
print(train_set)
print(valid_set)
print(test_set)

<BatchDataset element_spec=(TensorSpec(shape=(None, 8), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))>
<BatchDataset element_spec=(TensorSpec(shape=(None, 8), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))>
<BatchDataset element_spec=(TensorSpec(shape=(None, 8), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))>
CPU times: total: 62.5 ms
Wall time: 63.9 ms


In [124]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu',
                       input_shape=[8]),
    keras.layers.Dense(1),
])
model.compile(loss="mean_squared_error", optimizer="sgd")
callbacks = [keras.callbacks.EarlyStopping(
    patience=5, min_delta=1e-2)]
#BatchDataset必须制定steps_per_epoch，validation_steps
history=model.fit(train_set,validation_data=valid_set,
                  steps_per_epoch=11160//batch_size,
                  validation_steps=3780//batch_size,
                  epochs=100,callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


In [125]:
print(model.evaluate(test_set,steps=5160//batch_size))

0.3929949104785919


In [126]:
dataset=tf.data.Dataset.range(8)
dataset=dataset.batch(4)
print(list(dataset))

[<tf.Tensor: shape=(4,), dtype=int64, numpy=array([0, 1, 2, 3], dtype=int64)>, <tf.Tensor: shape=(4,), dtype=int64, numpy=array([4, 5, 6, 7], dtype=int64)>]
