In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn
import os
import sys
print(tf.__version__)
print(sys.version_info)
for module in tf, np, pd, sklearn:
    print(module.__name__, module.__version__)

2.2.0-rc3
sys.version_info(major=3, minor=8, micro=2, releaselevel='final', serial=0)
tensorflow 2.2.0-rc3
numpy 1.18.3
pandas 1.0.3
sklearn 0.22.2.post1


# 从numpy数组构建数据集

In [2]:
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))
print(dataset)

<TensorSliceDataset shapes: (), types: tf.int64>


In [3]:
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)


In [4]:
# repeat 将数据重复几次，相当于epoch的作用
# batch 每批次获取的样本数量
dataset1 = dataset.repeat(4).batch(5)
for item in dataset1:
    print(item)

tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int64)
tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int64)
tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int64)
tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int64)


In [5]:
for item in dataset.repeat(3).batch(7):
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int64)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int64)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int64)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int64)
tf.Tensor([8 9], shape=(2,), dtype=int64)


In [9]:
# interleave 将dataset中元素进行处理，相当于map的功能
# cycle_length 多少个Tensor一个循环
# block_lenght 一次获取v的多少个元素
dataset2 = dataset1.interleave(lambda v: tf.data.Dataset.from_tensor_slices(v), cycle_length=4, block_length=3)
for item in dataset2:
    print(item)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype

# 从元组/字典构建数据集

In [10]:
x = np.array([[1,2,3], [2,3,4], [3,4,5]])
y = np.array(['Beijing', 'Shanghai', 'Chongqing'])
dataset3 = tf.data.Dataset.from_tensor_slices((x,y))
for item in dataset3:
    print(item)

(<tf.Tensor: shape=(3,), dtype=int64, numpy=array([1, 2, 3])>, <tf.Tensor: shape=(), dtype=string, numpy=b'Beijing'>)
(<tf.Tensor: shape=(3,), dtype=int64, numpy=array([2, 3, 4])>, <tf.Tensor: shape=(), dtype=string, numpy=b'Shanghai'>)
(<tf.Tensor: shape=(3,), dtype=int64, numpy=array([3, 4, 5])>, <tf.Tensor: shape=(), dtype=string, numpy=b'Chongqing'>)


In [11]:
for item_x, item_y in dataset3:
    print(item_x.numpy(), item_y.numpy().decode('utf-8'))

[1 2 3] Beijing
[2 3 4] Shanghai
[3 4 5] Chongqing


In [12]:
for item in dataset3:
    print(item[0], item[1])

tf.Tensor([1 2 3], shape=(3,), dtype=int64) tf.Tensor(b'Beijing', shape=(), dtype=string)
tf.Tensor([2 3 4], shape=(3,), dtype=int64) tf.Tensor(b'Shanghai', shape=(), dtype=string)
tf.Tensor([3 4 5], shape=(3,), dtype=int64) tf.Tensor(b'Chongqing', shape=(), dtype=string)


In [17]:
ds = tf.data.Dataset.from_tensor_slices({'coordinate': x, 'city': y})
for item in ds:
    print(item)

{'coordinate': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([1, 2, 3])>, 'city': <tf.Tensor: shape=(), dtype=string, numpy=b'Beijing'>}
{'coordinate': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([2, 3, 4])>, 'city': <tf.Tensor: shape=(), dtype=string, numpy=b'Shanghai'>}
{'coordinate': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([3, 4, 5])>, 'city': <tf.Tensor: shape=(), dtype=string, numpy=b'Chongqing'>}


In [14]:
for item in tf.data.Dataset.from_tensor_slices({'coordinate': x, 'city': y}):
    print(item['city'].numpy(), item['coordinate'].numpy())

b'Beijing' [1 2 3]
b'Shanghai' [2 3 4]
b'Chongqing' [3 4 5]


In [15]:
for item in ds:
    print(item.keys())

dict_keys(['coordinate', 'city'])
dict_keys(['coordinate', 'city'])
dict_keys(['coordinate', 'city'])


In [16]:
for item in ds:
    print(item['coordinate'])
    print(item['city'])
    print()

tf.Tensor([1 2 3], shape=(3,), dtype=int64)
tf.Tensor(b'Beijing', shape=(), dtype=string)

tf.Tensor([2 3 4], shape=(3,), dtype=int64)
tf.Tensor(b'Shanghai', shape=(), dtype=string)

tf.Tensor([3 4 5], shape=(3,), dtype=int64)
tf.Tensor(b'Chongqing', shape=(), dtype=string)



# 生成CSV文件

In [18]:
# 获取california_housing数据集
from sklearn.datasets import fetch_california_housing
house = fetch_california_housing()

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /home/kdd/scikit_learn_data


In [19]:
from sklearn.model_selection import train_test_split
x_train_all, x_test, y_train_all, y_test = train_test_split(house.data, house.target,random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, random_state=11)
print(x_train_all.shape, y_train_all.shape)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)

(15480, 8) (15480,)
(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

In [21]:
output_dir = "./data/csv/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# 将feature和label拼接起来   
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]
# 获取字段名称
house_cols = house.feature_names + ['MHV']
cols_str = ','.join(house_cols)


In [22]:
# 定义一个保存csv文件的方法， 也可以转化为pd.DataFrame然后写入csv
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    '''
    output_dir: 保存路径
    data: 要保存的数据
    name_prefix: 保存的文件名前缀
    header: 表头
    n_parts: 保存几部分
    '''
    path_format = os.path.join(output_dir, '{}_{:2d}.csv')
    filenames = []
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, 'wt', encoding='utf-8') as f:
            if header is not None:
                f.write(header+'\n')
            for row_index in row_indices:
                f.write(','.join([repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames

In [23]:
train_filenames = save_to_csv(output_dir, train_data, 'train', cols_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, 'valid', cols_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, 'test', cols_str, n_parts=10)

# 读取CSV文件

In [24]:
import pprint
print('train_filenames:')
pprint.pprint(train_filenames)

train_filenames:
['./data/csv/train_ 0.csv',
 './data/csv/train_ 1.csv',
 './data/csv/train_ 2.csv',
 './data/csv/train_ 3.csv',
 './data/csv/train_ 4.csv',
 './data/csv/train_ 5.csv',
 './data/csv/train_ 6.csv',
 './data/csv/train_ 7.csv',
 './data/csv/train_ 8.csv',
 './data/csv/train_ 9.csv',
 './data/csv/train_10.csv',
 './data/csv/train_11.csv',
 './data/csv/train_12.csv',
 './data/csv/train_13.csv',
 './data/csv/train_14.csv',
 './data/csv/train_15.csv',
 './data/csv/train_16.csv',
 './data/csv/train_17.csv',
 './data/csv/train_18.csv',
 './data/csv/train_19.csv']


In [25]:
print(train_filenames)

['./data/csv/train_ 0.csv', './data/csv/train_ 1.csv', './data/csv/train_ 2.csv', './data/csv/train_ 3.csv', './data/csv/train_ 4.csv', './data/csv/train_ 5.csv', './data/csv/train_ 6.csv', './data/csv/train_ 7.csv', './data/csv/train_ 8.csv', './data/csv/train_ 9.csv', './data/csv/train_10.csv', './data/csv/train_11.csv', './data/csv/train_12.csv', './data/csv/train_13.csv', './data/csv/train_14.csv', './data/csv/train_15.csv', './data/csv/train_16.csv', './data/csv/train_17.csv', './data/csv/train_18.csv', './data/csv/train_19.csv']


In [26]:
# 构建文件名数据集
filenames_dataset = tf.data.Dataset.list_files(train_filenames)
print(filenames_dataset)
for filename in filenames_dataset:
    print(filename)

<ShuffleDataset shapes: (), types: tf.string>
tf.Tensor(b'./data/csv/train_ 4.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_ 8.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_ 5.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_12.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_11.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_ 6.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_17.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_ 3.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_14.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_18.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_15.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_13.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_ 0.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_ 2.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/train_10.csv', shape=(), dtype=string)
tf.Tensor(b'./data/csv/t

In [27]:
n_readers = 5
dataset = filenames_dataset.interleave(lambda filename: tf.data.TextLineDataset(filename).skip(1), cycle_length=n_readers)
for line in dataset.take(5): # 取前5行
    print(line.numpy().decode('utf-8'))

0.6363646332204844,-1.0895425985107923,0.09260902815633619,-0.20538124656801682,1.2025670451003232,-0.03630122549633783,-0.6784101660505877,0.182235342347858,2.429
0.42408210084996534,0.9129633171802288,-0.04437481876046234,-0.15297213746739335,-0.24727627804141977,-0.10539166599677323,0.8612674255663844,-1.3357789003702432,3.955
2.51504373119231,1.0731637904355105,0.5574401201546321,-0.17273513019187772,-0.612912610473286,-0.01909156503651574,-0.5710993036045546,-0.027490309606616956,5.00001
0.04326300977263167,-1.0895425985107923,-0.38878716774583305,-0.10789864528874438,-0.6818663605100649,-0.0723871014747467,-0.8883662012710817,0.8213992340186296,1.426
-0.32652634129448693,0.43236189741438374,-0.09345459539684739,-0.08402991822890092,0.8460035745154013,-0.0266316482653991,-0.5617679242614233,0.1422875991184281,2.431


In [28]:
# 解析csv中的内容， 举个栗子
sample_str = '1,2,3,4,5'
records = [
    tf.constant(0, dtype=tf.int32),
    0,
    np.nan,
    'hell',
    tf.constant([])
]
parsed_fields = tf.io.decode_csv(sample_str, records)
for item in parsed_fields:
    print(item)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3.0, shape=(), dtype=float32)
tf.Tensor(b'4', shape=(), dtype=string)
tf.Tensor(5.0, shape=(), dtype=float32)


In [29]:
tf.io.decode_csv(',,,,', records)

InvalidArgumentError: Field 4 is required but missing in record 0! [Op:DecodeCSV]

In [30]:
try:
    tf.io.decode_csv(',,,,', records)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Field 4 is required but missing in record 0! [Op:DecodeCSV]


In [31]:
try:
    tf.io.decode_csv('11,2,5', records)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Expect 5 fields but have 3 in record 0 [Op:DecodeCSV]


In [32]:
# 定义一个解析csv每行内容的方法
def parse_csv_line(line, n_fields):
    records = [tf.constant(np.nan)]*n_fields
    parsed_fields = tf.io.decode_csv(line, records)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1])
    return x, y

In [33]:
parse_csv_line(b'0.6303435674178064,1.874166156711919,-0.06713214279531016,-0.12543366804152128,-0.19737553788322462,-0.022722631725889016,-0.692407235065288,0.7265233438487496,2.419',n_fields=9)

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 0.63034356,  1.8741661 , -0.06713215, -0.12543367, -0.19737554,
        -0.02272263, -0.69240725,  0.72652334], dtype=float32)>,
 <tf.Tensor: shape=(), dtype=float32, numpy=2.419>)

In [34]:
# 定义一个解析csv每行内容的方法
def parse_csv_line(line, n_fields=9):
    records = [tf.constant(np.nan)]*n_fields
    parsed_fields = tf.io.decode_csv(line, records)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1])
    return x, y

# 定义一个多线程读取csv文件并解析的方法
def csv_read_dataset(filenames, n_readers=5, batch_size=32, n_parse_threads=5, shuffle_buffer_size=10000):
    '''
    filenames: 文件名列表
    n_readers: 并行程度
    batch_size: 批大小
    n_parse_threads: 解析的并行程度
    shuffle_buffer_size: 
    '''
    filenames_dataset = tf.data.Dataset.list_files(filenames)
    filenames_dataset = filenames_dataset.repeat()
    dataset = filenames_dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers)
    dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

In [35]:
train_set = csv_read_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print('x:')
    pprint.pprint(x_batch)
    print('y:')
    pprint.pprint(y_batch)

x:
<tf.Tensor: shape=(3, 8), dtype=float32, numpy=
array([[ 0.63034356,  1.8741661 , -0.06713215, -0.12543367, -0.19737554,
        -0.02272263, -0.69240725,  0.72652334],
       [-0.097193  , -1.2497431 ,  0.36232963,  0.02690608,  1.0338118 ,
         0.04588159,  1.3418335 , -1.635387  ],
       [ 0.09734604,  0.75276285, -0.20218964, -0.19547   , -0.40605137,
         0.00678553, -0.81371516,  0.6566148 ]], dtype=float32)>
y:
<tf.Tensor: shape=(3,), dtype=float32, numpy=array([2.419, 1.832, 1.119], dtype=float32)>
x:
<tf.Tensor: shape=(3, 8), dtype=float32, numpy=
array([[ 0.81150836, -0.04823952,  0.5187339 , -0.0293864 , -0.03406402,
        -0.05081595, -0.7157357 ,  0.91627514],
       [ 0.8015443 ,  0.27216142, -0.11624393, -0.20231152, -0.5430516 ,
        -0.02103962, -0.5897621 , -0.08241846],
       [ 1.6312258 ,  0.35226166,  0.04080576, -0.14088951, -0.4632104 ,
        -0.06751624, -0.82771224,  0.59669316]], dtype=float32)>
y:
<tf.Tensor: shape=(3,), dtype=float32, num

In [36]:
train_set = csv_read_dataset(train_filenames, batch_size=32)
valid_set = csv_read_dataset(valid_filenames, batch_size=32)
test_set = csv_read_dataset(test_filenames, batch_size=32)