In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras


In [3]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset. On first use scikit-learn downloads it
# into its local data directory. The returned object exposes the feature
# matrix as `.data`, the regression target as `.target`, and the column names
# as `.feature_names`, all of which are used by the cells below.
housing = fetch_california_housing()

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /home/hui/scikit_learn_data


In [4]:
from sklearn.model_selection import train_test_split

# Two-stage split with fixed seeds for reproducibility: first hold out the
# test set, then divide the remainder into training and validation sets.
# No test_size is passed, so each call uses train_test_split's default ratio.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data,
    housing.target,
    random_state=7,
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all,
    y_train_all,
    random_state=11,
)

# Report the (features, targets) shapes of each split: train, valid, test.
for split_x, split_y in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test)):
    print(split_x.shape, split_y.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [5]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling statistics from the training split only, then
# apply that same transformation to every split so that no information from
# the validation or test data leaks into the scaler.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled, x_valid_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_valid, x_test)
)

In [6]:
# Directory that receives the generated CSV shards.
output_dir = "generate_csv"
# makedirs(exist_ok=True) is safe to re-run, has no check-then-create race,
# and also works if output_dir is later changed to a nested path.
os.makedirs(output_dir, exist_ok=True)

def _format_cell(value):
    """Return the text written to the CSV for a single cell value."""
    # NumPy >= 2.0 renders scalars as e.g. "np.float64(1.5)" under repr(),
    # which would corrupt the CSV. Unwrap NumPy scalars to the equivalent
    # Python scalar first so the output is the plain literal ("1.5").
    if isinstance(value, np.generic):
        value = value.item()
    return repr(value)


def save_to_csv(output_dir, data, name_prefix,
                header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files and write them.

    Files are named ``<name_prefix>_<NN>.csv`` (``NN`` is the zero-padded,
    two-digit part index) and placed in ``output_dir``.

    Args:
        output_dir: Directory that receives the files; created if missing.
        data: Indexable 2-D collection of rows (e.g. a NumPy array); each
            row is iterated to obtain its cell values.
        name_prefix: File-name prefix used to tell datasets apart, such as
            "train", "valid" or "test".
        header: Optional header line (without trailing newline) written as
            the first line of every file. Skipped when ``None``.
        n_parts: Number of files to split the rows into. Rows are divided
            as evenly as ``np.array_split`` allows.

    Returns:
        List of the written file paths, in part order.
    """
    # Guard against an empty string, which means "current directory" for
    # os.path.join but is rejected by os.makedirs.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # First placeholder: dataset prefix; second: two-digit part index.
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []

    # np.arange(len(data)) enumerates the row indices, np.array_split cuts
    # them into n_parts contiguous groups, and enumerate numbers the groups.
    for file_idx, row_indices in enumerate(
            np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                row_text = ",".join(
                    _format_cell(col) for col in data[row_index])
                f.write(row_text + "\n")
    return filenames
    
    
    
    
# Append each split's target as a final column, so that every CSV row holds
# the scaled features followed by the label.
train_data = np.column_stack((x_train_scaled, y_train))
valid_data = np.column_stack((x_valid_scaled, y_valid))
test_data = np.column_stack((x_test_scaled, y_test))

# Header row: the dataset's feature names plus one name for the target column.
# NOTE(review): "MidianHouseValue" looks like a typo for "MedianHouseValue";
# it is left unchanged because readers of these files may rely on the name.
header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ",".join(header_cols)  # list of names -> one comma-separated line

# Shard each split into several CSV files: 20 for train, 10 for valid and test.
train_filenames = save_to_csv(
    output_dir, train_data, "train", header=header_str, n_parts=20)
valid_filenames = save_to_csv(
    output_dir, valid_data, "valid", header=header_str, n_parts=10)
test_filenames = save_to_csv(
    output_dir, test_data, "test", header=header_str, n_parts=10)
    