<a href="https://colab.research.google.com/github/Marcellinus08/Deep-Learning/blob/main/Loading_and_Preprocessing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Library

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
import os


Data Prep

In [2]:
# Load the California housing dataset and carve out test / validation splits.
# The target is reshaped to a column vector so it can later be stacked next
# to the feature matrix.
housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data,
    housing.target.reshape(-1, 1),
    random_state=42,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42,
)

Menghitung statistik normalisasi (mean dan standar deviasi) dari data training

In [3]:
# Fit the scaler on the training features only, then keep the per-feature
# mean and scale so they can be reused by the Standardization layer.
scaler = StandardScaler().fit(X_train)
X_mean, X_std = scaler.mean_, scaler.scale_


In [4]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Write the rows of ``data`` into ``n_parts`` CSV files.

    Files are created under ``datasets/housing`` (relative to the current
    working directory) and named ``my_<name_prefix>_<NN>.csv``, where ``NN``
    is the zero-padded part index.

    Args:
        data: Indexable collection of rows; each row is an iterable of values
            that are written with ``str()`` and joined by commas.
        name_prefix: Text inserted into every file name.
        header: Optional header line written first in every file (without a
            trailing newline). Skipped when ``None``.
        n_parts: Number of files the row indices are split across.

    Returns:
        The list of file paths that were written, in part order.
    """
    target_dir = os.path.join("datasets", "housing")
    os.makedirs(target_dir, exist_ok=True)
    # The doubled braces survive the f-string and leave a "{:02d}" slot
    # that is filled with the part index below.
    name_template = os.path.join(target_dir, f"my_{name_prefix}_{{:02d}}.csv")

    written_paths = []
    index_chunks = np.array_split(np.arange(len(data)), n_parts)
    for part_no, chunk in enumerate(index_chunks):
        csv_path = name_template.format(part_no)
        written_paths.append(csv_path)
        with open(csv_path, "w") as out:
            if header is not None:
                out.write(header + "\n")
            out.writelines(
                ",".join(map(str, data[i])) + "\n" for i in chunk
            )
    return written_paths

In [5]:
# Append the target column to each feature matrix so every CSV row carries
# both the inputs and the label.
train_data, valid_data, test_data = (
    np.column_stack(pair)
    for pair in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)
# Column names for the CSV header: all feature names followed by the target.
header_cols = [*housing.feature_names, "MedianHouseValue"]
header = ",".join(header_cols)

Save to File

In [6]:
# Write each split to disk as several CSV shards; the training split is
# spread across more files than the validation and test splits.
train_filepaths = save_to_multiple_csv_files(
    train_data, "train", header=header, n_parts=20
)
valid_filepaths = save_to_multiple_csv_files(
    valid_data, "valid", header=header, n_parts=10
)
test_filepaths = save_to_multiple_csv_files(
    test_data, "test", header=header, n_parts=10
)

Pipelining

In [7]:
def parse_csv_line(line):
    """Decode one CSV text line into a ``(features, target)`` pair.

    The number of columns is taken from the notebook-level ``header_cols``
    list. All columns but the last are stacked into the feature tensor; the
    last column is returned as a one-element target tensor.
    """
    # One float default per column, so every field is decoded as a float.
    column_defaults = [0. for _ in header_cols]
    *feature_fields, target_field = tf.io.decode_csv(
        line, record_defaults=column_defaults
    )
    return tf.stack(feature_fields), tf.stack([target_field])

def csv_reader_dataset(filepaths, n_readers=5, n_parse_threads=5,
                       n_shuffle=10000, batch_size=32):
    """Build a shuffled, batched ``tf.data`` pipeline from CSV shards.

    Args:
        filepaths: File paths (or glob patterns) of the CSV files to read.
        n_readers: Number of files interleaved at a time (``cycle_length``).
        n_parse_threads: Parallelism used when parsing CSV lines.
        n_shuffle: Size of the shuffle buffer.
        batch_size: Number of examples per batch.

    Returns:
        A dataset yielding ``(features, target)`` batches as produced by
        ``parse_csv_line``, with one batch prefetched.
    """
    # Build a dataset of file paths.
    dataset = tf.data.Dataset.list_files(filepaths)
    # Read lines from several files at once, skipping each file's header row.
    # Without ``num_parallel_calls`` interleave pulls from the readers
    # sequentially; AUTOTUNE makes the reads actually run in parallel while
    # keeping the default deterministic element order.
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers,
        num_parallel_calls=tf.data.AUTOTUNE)
    # Parse the CSV lines in parallel.
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    # Shuffle the parsed examples.
    dataset = dataset.shuffle(n_shuffle)
    # Batch the examples and prefetch one batch.
    return dataset.batch(batch_size).prefetch(1)

Membuat dataset untuk training, validasi, dan testing

In [8]:
# Build one input pipeline per split, all with the same batch size.
train_set, valid_set, test_set = (
    csv_reader_dataset(split_paths, batch_size=32)
    for split_paths in (train_filepaths, valid_filepaths, test_filepaths)
)


Building Model

In [9]:
class Standardization(keras.layers.Layer):
    """Keras layer that standardizes its inputs with fixed statistics.

    The statistics are read from the notebook-level ``X_mean`` and ``X_std``
    arrays when the layer is constructed, so those must already be defined.
    """

    def __init__(self, **kwargs):
        """Store ``X_mean`` and ``X_std`` as float32 constants."""
        super().__init__(**kwargs)
        self.mean, self.std = (
            tf.constant(stat, dtype=tf.float32) for stat in (X_mean, X_std)
        )

    def call(self, inputs):
        """Return ``(inputs - mean) / std``."""
        centered = inputs - self.mean
        return centered / self.std


In [10]:
# Number of input features, taken from the raw feature matrix.
n_features = housing.data.shape[1]
# Declare the input shape with an explicit Input layer instead of passing
# `input_shape` to the first layer: Keras 3 warns about the latter
# ("Do not pass an `input_shape`/`input_dim` argument to a layer").
model = keras.models.Sequential([
    keras.layers.Input(shape=(n_features,)),
    Standardization(),
    keras.layers.Dense(30, activation="relu"),
    keras.layers.Dense(1)
])

  super().__init__(**kwargs)


Training Model

In [11]:
# Train with MSE loss and the Nadam optimizer, tracking RMSE as a metric.
model.compile(
    optimizer="nadam",
    loss="mse",
    metrics=["RootMeanSquaredError"],
)
history = model.fit(train_set, validation_data=valid_set, epochs=5)

Epoch 1/5
    361/Unknown [1m9s[0m 5ms/step - RootMeanSquaredError: 1.8650 - loss: 3.6099



[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - RootMeanSquaredError: 1.8613 - loss: 3.5965 - val_RootMeanSquaredError: 1.3839 - val_loss: 1.9152
Epoch 2/5
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - RootMeanSquaredError: 0.8194 - loss: 0.6717 - val_RootMeanSquaredError: 0.8221 - val_loss: 0.6759
Epoch 3/5
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - RootMeanSquaredError: 0.7325 - loss: 0.5370 - val_RootMeanSquaredError: 0.7505 - val_loss: 0.5632
Epoch 4/5
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - RootMeanSquaredError: 0.6755 - loss: 0.4565 - val_RootMeanSquaredError: 0.6258 - val_loss: 0.3916
Epoch 5/5
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - RootMeanSquaredError: 0.6529 - loss: 0.4264 - val_RootMeanSquaredError: 0.8872 - val_loss: 0.7872


Evaluasi

In [12]:
# Evaluate on the test pipeline; evaluate() returns the loss (MSE) followed
# by the compiled RMSE metric.
print("\nEvaluasi pada test set:")
(mse_test, rmse_test) = model.evaluate(x=test_set)



Evaluasi pada test set:
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - RootMeanSquaredError: 0.6185 - loss: 0.3830


In [13]:
# Use the first three raw test rows as "new" data; the model standardizes
# its inputs internally, so no manual scaling is applied here.
print("\nMembuat prediksi pada data baru:")
X_new = X_test[:3]
y_pred = model.predict(x=X_new)



Membuat prediksi pada data baru:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step


In [14]:
# Show the predictions next to the true labels for the same three rows.
print("Prediksi:", y_pred.ravel())
print("Label Sebenarnya:", y_test[:3].ravel())

Prediksi: [0.34544224 1.7420028  3.5641167 ]
Label Sebenarnya: [0.477   0.458   5.00001]
