In [3]:
# https://colab.research.google.com/notebooks/mlcc/first_steps_with_tensor_flow.ipynb?hl=zh-cn
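# Learning objectives:
# - Learn fundamental TensorFlow concepts.
# - Use the LinearRegressor class in TensorFlow to predict the median housing price of each city block from a single input feature.
# - Evaluate the accuracy of the model's predictions using Root Mean Squared Error (RMSE).
# - Improve the accuracy of the model by tuning its hyperparameters.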

# 加载必要的库
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset
# The Dataset API requires TensorFlow 1.3+.
# Restrict TensorFlow logging to ERROR-level messages.
tf.logging.set_verbosity(tf.logging.ERROR)
# pandas display options: cap the number of rows shown and print floats with one decimal place.
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format
# Load the data set.
california_housing_dataframe = pd.read_csv("california_housing_train.csv", sep=",")
# Randomize the row order so that no pathological ordering of the data can
# harm the performance of Stochastic Gradient Descent.
california_housing_dataframe = california_housing_dataframe.reindex(np.random.permutation(california_housing_dataframe.index))
# Scale median_house_value to units of thousands, so it can be learned a little
# more easily with learning rates in a commonly used range.
california_housing_dataframe["median_house_value"] /= 1000.0
# NOTE: a bare expression is only rendered when it is the last statement of a
# notebook cell; at this position it has no visible effect.
california_housing_dataframe
# Examine the data: count of examples, mean, standard deviation, max, min, and various quantiles.
print(california_housing_dataframe.describe())
# Step 1: Define features and configure feature columns.
# Define the input feature: total_rooms.
# Double brackets keep the selection a one-column DataFrame rather than a Series.
my_feature = california_housing_dataframe[["total_rooms"]]
# Configure a numeric feature column for total_rooms.
feature_columns = [tf.feature_column.numeric_column("total_rooms")]
# Step 2: Define the target.
# Define the label.
targets = california_housing_dataframe["median_house_value"]
# Step 3: Configure the LinearRegressor.
# Use gradient descent as the optimizer for training the model.
my_optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.0000001)
# Wrap the optimizer so that gradients are clipped by norm (5.0) before being applied.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
# Configure the linear regression model with our feature columns and optimizer.
# The learning rate of 0.0000001 for Gradient Descent is set on the optimizer above.
linear_regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_columns,
    optimizer=my_optimizer
)
# 第 4 步：定义输入函数
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build an input pipeline that feeds (features, labels) batches to an estimator.

    Args:
      features: pandas DataFrame of features
      targets: pandas DataFrame (or Series) of targets
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
    Returns:
      Tuple of (features, labels) for next data batch
    """

    # Convert pandas data into a dict of np arrays, keyed by column name.
    feature_arrays = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset of individual (features, target) examples.
    ds = Dataset.from_tensor_slices((feature_arrays, targets))  # warning: 2GB limit

    # Shuffle the data, if specified. This must happen BEFORE batching and
    # repeating: shuffling afterwards would only permute whole batches and
    # would mix examples from different epochs in the shuffle buffer.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    # Configure batching/repeating.
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data
    batch_features, batch_labels = ds.make_one_shot_iterator().get_next()
    return batch_features, batch_labels

# Step 5: Train the model
# Step 6: Evaluate the model
# Tweak the model hyperparameters
# Task 1: Achieve an RMSE of 180 or below
# Task 2: Try a different feature

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
count    17000.0   17000.0             17000.0      17000.0         17000.0   
mean      -119.6      35.6                28.6       2643.7           539.4   
std          2.0       2.1                12.6       2179.9           421.5   
min       -124.3      32.5                 1.0          2.0             1.0   
25%       -121.8      33.9                18.0       1462.0           297.0   
50%       -118.5      34.2                29.0       2127.0           434.0   
75%       -118.0      37.7                37.0       3151.2           648.2   
max       -114.3      42.0                52.0      37937.0          6445.0   

       population  households  median_income  median_house_value  
count     17000.0     17000.0        17000.0             17000.0  
mean       1429.6       501.2            3.9               207.3  
std        1147.9       384.5            1.9               116.0  
min           3.0   