#### Copyright 2017 Google LLC.

In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Set Fitur

**Tujuan Pembelajaran:** Membuat set fitur minimal yang berfungsi sama baiknya dengan set fitur yang lebih kompleks.

Sejauh ini, kita telah menggabungkan semua fitur ke dalam model. Model dengan fitur yang lebih sedikit menggunakan resource yang lebih sedikit dan lebih mudah dikelola. Mari lihat apakah kita dapat membuat model pada set fitur perumahan minimal yang akan berfungsi sama baiknya seperti model yang menggunakan semua fitur dalam kumpulan data.

## Penyiapan

Seperti sebelumnya, mari muat dan siapkan data perumahan di California.

In [None]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
%tensorflow_version 1.x
import tensorflow as tf
from tensorflow.python.data import Dataset

# Only surface TensorFlow log messages of ERROR severity.
tf.logging.set_verbosity(tf.logging.ERROR)

# Keep DataFrame output compact: show at most 10 rows and format floats with
# a single decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.1f}'.format)

# Load the California housing training data.
california_housing_dataframe = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv",
    sep=",")

# Put the rows into a random order.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index))

In [None]:
def preprocess_features(california_housing_dataframe):
  """Builds the model's input features from the California housing data.

  Args:
    california_housing_dataframe: A Pandas DataFrame expected to contain data
      from the California housing data set.
  Returns:
    A new DataFrame holding the selected raw feature columns plus the
    synthetic `rooms_per_person` column. The input DataFrame is not modified.
  """
  raw_feature_names = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
  ]
  # Work on a copy so that adding a column never touches the caller's data.
  features = california_housing_dataframe[raw_feature_names].copy()

  # Synthetic feature: rooms divided by population, computed row by row.
  rooms = california_housing_dataframe["total_rooms"]
  people = california_housing_dataframe["population"]
  features["rooms_per_person"] = rooms / people
  return features

def preprocess_targets(california_housing_dataframe):
  """Builds the label column from the California housing data.

  Args:
    california_housing_dataframe: A Pandas DataFrame expected to contain data
      from the California housing data set.
  Returns:
    A single-column DataFrame, `median_house_value`, holding the source
    column divided by 1000.
  """
  # Express the target in units of thousands of dollars.
  value_in_thousands = california_housing_dataframe["median_house_value"] / 1000.0
  return pd.DataFrame({"median_house_value": value_in_thousands})

In [None]:
# Training split: the first 12000 (out of 17000) examples.
training_examples = preprocess_features(california_housing_dataframe.head(12000))
training_targets = preprocess_targets(california_housing_dataframe.head(12000))

# Validation split: the last 5000 (out of 17000) examples.
validation_examples = preprocess_features(california_housing_dataframe.tail(5000))
validation_targets = preprocess_targets(california_housing_dataframe.tail(5000))

# Sanity check: show summary statistics for each of the four frames.
for summary_label, summary_frame in (
    ("Training examples summary:", training_examples),
    ("Validation examples summary:", validation_examples),
    ("Training targets summary:", training_targets),
    ("Validation targets summary:", validation_targets)):
  print(summary_label)
  display.display(summary_frame.describe())

## Tugas 1: Mengembangkan Set Fitur yang Baik

**Apa performa terbaik yang bisa Anda dapatkan hanya menggunakan 2 atau 3 fitur?**

**Matriks korelasi** menunjukkan korelasi berpasangan, baik untuk setiap fitur yang dibandingkan dengan target maupun untuk setiap fitur yang dibandingkan dengan fitur lainnya.

Di sini, korelasi ditentukan sebagai [koefisien korelasi Pearson](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient). Anda tidak harus memahami detail matematika untuk latihan ini.

Nilai korelasi memiliki makna sebagai berikut:

  * `-1.0`: korelasi negatif sempurna
  * `0.0`: tidak ada korelasi
  * `1.0`: korelasi positif sempurna

In [None]:
# Copy the training features and attach the label as a "target" column so
# that its correlations show up in the same matrix as the features.
correlation_dataframe = training_examples.assign(
    target=training_targets["median_house_value"])

# Pairwise correlation between all columns (displayed as the cell output).
correlation_dataframe.corr()

Fitur yang memiliki korelasi positif atau negatif yang tinggi dengan target akan menambahkan informasi ke model kita. Kita dapat menggunakan matriks korelasi untuk menemukan fitur yang memiliki korelasi tinggi semacam itu.

Sebaiknya kita juga memiliki fitur-fitur yang korelasinya tidak tinggi satu sama lain, sehingga fitur-fitur tersebut menambahkan informasi independen.

Gunakan informasi ini untuk mencoba menghapus fitur. Anda juga dapat mencoba mengembangkan fitur sintetis tambahan, seperti rasio dua fitur mentah.

Supaya mudah, kita telah menyertakan kode pelatihan dari latihan sebelumnya.

In [None]:
def construct_feature_columns(input_features):
  """Creates one numeric TensorFlow feature column per input feature.

  Args:
    input_features: The names of the numerical input features to use.
  Returns:
    A set of feature columns
  """
  return {
      tf.feature_column.numeric_column(feature_name)
      for feature_name in input_features
  }

In [None]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
  """Builds the input pipeline that feeds (features, labels) batches to a model.

  Args:
    features: pandas DataFrame of features
    targets: pandas DataFrame of targets
    batch_size: Size of batches to be passed to the model
    shuffle: True or False. Whether to shuffle the data.
    num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
  Returns:
    Tuple of (features, labels) for next data batch
  """

  # Convert pandas data into a dict of np arrays.
  features = {key: np.array(value) for key, value in dict(features).items()}

  # Construct a dataset from the in-memory data.
  ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit

  # Shuffle the data, if specified. This must happen before batching and
  # repeating: shuffling afterwards would only reorder whole batches (whose
  # contents never change) and would mix batches from different epochs.
  if shuffle:
    ds = ds.shuffle(10000)

  # Configure batching/repeating.
  ds = ds.batch(batch_size).repeat(num_epochs)

  # Return the next batch of data.
  features, labels = ds.make_one_shot_iterator().get_next()
  return features, labels

In [None]:
def train_model(
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
  """Trains a linear regression model.

  In addition to training, this function also prints training progress information,
  as well as a plot of the training and validation loss over time.

  Args:
    learning_rate: A `float`, the learning rate.
    steps: A non-zero `int`, the total number of training steps. A training step
      consists of a forward and backward pass using a single batch. Training
      runs in 10 periods of `steps // 10` steps each (at least one step per
      period).
    batch_size: A non-zero `int`, the batch size.
    training_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for training.
    training_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for training.
    validation_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for validation.
    validation_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for validation.

  Returns:
    A `LinearRegressor` object trained on the training data.
  """

  periods = 10
  # Use integer division so the estimator always receives a whole number of
  # steps (true division would hand it a float on Python 3). Never go below
  # one step per period, since training for zero steps is not meaningful.
  steps_per_period = max(1, steps // periods)

  # Create a linear regressor object. Gradients are clipped by norm (5.0)
  # before being applied by the gradient descent optimizer.
  my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
  linear_regressor = tf.estimator.LinearRegressor(
      feature_columns=construct_feature_columns(training_examples),
      optimizer=my_optimizer
  )

  # Create input functions. The prediction input functions make a single,
  # unshuffled pass so predictions line up row-for-row with the targets.
  training_input_fn = lambda: my_input_fn(training_examples,
                                          training_targets["median_house_value"],
                                          batch_size=batch_size)
  predict_training_input_fn = lambda: my_input_fn(training_examples,
                                                  training_targets["median_house_value"],
                                                  num_epochs=1,
                                                  shuffle=False)
  predict_validation_input_fn = lambda: my_input_fn(validation_examples,
                                                    validation_targets["median_house_value"],
                                                    num_epochs=1,
                                                    shuffle=False)

  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  training_rmse = []
  validation_rmse = []
  for period in range(periods):
    # Train the model, starting from the prior state.
    linear_regressor.train(
        input_fn=training_input_fn,
        steps=steps_per_period,
    )
    # Take a break and compute predictions.
    training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
    training_predictions = np.array([item['predictions'][0] for item in training_predictions])

    validation_predictions = linear_regressor.predict(input_fn=predict_validation_input_fn)
    validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])

    # Compute training and validation loss. Arguments follow scikit-learn's
    # (y_true, y_pred) order.
    training_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(training_targets, training_predictions))
    validation_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(validation_targets, validation_predictions))
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
    # Add the loss metrics from this period to our list.
    training_rmse.append(training_root_mean_squared_error)
    validation_rmse.append(validation_root_mean_squared_error)
  print("Model training finished.")


  # Output a graph of loss metrics over periods.
  plt.ylabel("RMSE")
  plt.xlabel("Periods")
  plt.title("Root Mean Squared Error vs. Periods")
  plt.tight_layout()
  plt.plot(training_rmse, label="training")
  plt.plot(validation_rmse, label="validation")
  plt.legend()

  return linear_regressor

Luangkan waktu 5 menit untuk mencari set fitur dan parameter pelatihan yang baik. Kemudian periksa solusi untuk melihat parameter atau fitur apa yang dipilih. Perlu diingat bahwa fitur yang berbeda mungkin memerlukan parameter pembelajaran yang berbeda.

In [None]:
#
# Your code here: add your features of choice as a list of quoted strings.
# Each entry must be the name of a column in `training_examples`, for example
# "median_income".
#
minimal_features = [
]

# Exercise guard: this fails until at least one feature has been added above.
assert minimal_features, "You must select at least one feature!"

# Restrict the training and validation examples to the chosen columns.
minimal_training_examples = training_examples[minimal_features]
minimal_validation_examples = validation_examples[minimal_features]

#
# Don't forget to adjust these parameters.
#
train_model(
    learning_rate=0.001,
    steps=500,
    batch_size=5,
    training_examples=minimal_training_examples,
    training_targets=training_targets,
    validation_examples=minimal_validation_examples,
    validation_targets=validation_targets)

### Solusi

Klik di bawah untuk mendapatkan solusi.

In [None]:
# Solution feature set: just two of the available columns.
minimal_features = ["median_income", "latitude"]

# Project both splits onto the chosen columns.
minimal_training_examples, minimal_validation_examples = (
    examples[minimal_features]
    for examples in (training_examples, validation_examples))

# Train on the reduced feature set; the returned model is not needed here.
_ = train_model(
    training_examples=minimal_training_examples,
    training_targets=training_targets,
    validation_examples=minimal_validation_examples,
    validation_targets=validation_targets,
    learning_rate=0.01,
    steps=500,
    batch_size=5)

## Tugas 2: Memanfaatkan Garis Lintang dengan Lebih Baik

Penggambaran `latitude` vs. `median_house_value` menunjukkan bahwa sebenarnya tidak ada hubungan linear di antara keduanya.

Sebagai gantinya, ada beberapa puncak, yang kira-kira bersesuaian dengan Los Angeles dan San Francisco.

In [None]:
# Plot the label against latitude to inspect the shape of the relationship.
plt.scatter(
    x=training_examples["latitude"],
    y=training_targets["median_house_value"])

**Cobalah membuat beberapa fitur sintetis yang berfungsi lebih baik dengan garis lintang.**

Misalnya, Anda dapat membuat fitur yang memetakan `latitude` ke nilai `|latitude - 38|`, dan menamainya `distance_from_san_francisco`.

Atau Anda dapat membagi ruang menjadi 10 bucket yang berbeda. `latitude_32_to_33`, `latitude_33_to_34`, dll, masing-masing menampilkan nilai `1.0` jika `latitude` berada dalam rentang bucket tersebut dan menampilkan nilai `0.0` jika tidak.

Gunakan matriks korelasi untuk membantu pengembangan, lalu tambahkan ke dalam model jika Anda menemukan sesuatu yang terlihat menguntungkan.

Apa performa validasi terbaik yang bisa Anda dapatkan?

In [None]:
#
# YOUR CODE HERE: Train on a new data set that includes synthetic features based on latitude.
#

### Solusi

Klik di bawah untuk mendapatkan solusi.

Selain `latitude`, kita juga akan mempertahankan `median_income`, untuk dibandingkan dengan hasil sebelumnya.

Kita memutuskan untuk mengelompokkan garis lintang ke dalam bucket. Tindakan ini cukup mudah dilakukan di Pandas menggunakan `Series.apply`.

In [None]:
def select_and_transform_features(source_df):
  """Keeps `median_income` and turns `latitude` into per-degree bucket features.

  Args:
    source_df: A `DataFrame` with `median_income` and `latitude` columns.
  Returns:
    A new `DataFrame` containing `median_income` plus one
    `latitude_<low>_to_<high>` column for each one-degree range from 32 up
    to 44. A bucket column is 1.0 where `low <= latitude < high` and 0.0
    everywhere else.
  """
  bucketized = pd.DataFrame()
  bucketized["median_income"] = source_df["median_income"]
  for lower in range(32, 44):
    upper = lower + 1
    # Bind the bounds as defaults so the function carries its own range.
    in_bucket = lambda lat, lo=lower, hi=upper: 1.0 if lo <= lat < hi else 0.0
    column_name = "latitude_%d_to_%d" % (lower, upper)
    bucketized[column_name] = source_df["latitude"].apply(in_bucket)
  return bucketized

# Apply the same feature selection/bucketing to both data splits.
selected_training_examples, selected_validation_examples = (
    select_and_transform_features(examples)
    for examples in (training_examples, validation_examples))

In [None]:
# Train on median income plus the bucketized latitude features; the returned
# model is not needed here.
_ = train_model(
    training_examples=selected_training_examples,
    training_targets=training_targets,
    validation_examples=selected_validation_examples,
    validation_targets=validation_targets,
    learning_rate=0.01,
    steps=500,
    batch_size=5)