<a href="https://colab.research.google.com/github/Keidhnn/tgs3preprocessing.jpynb/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Persiapan Dataset

In [None]:
# preprocessing.ipynb / preprocessing.py

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Load the Iris data set that ships with scikit-learn.
iris = load_iris()

# Assemble the working frame: the measurement columns come first, followed
# by the integer class label and the species name that label maps to.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
df['species'] = df['target'].map(lambda label: iris.target_names[label])

# Overview of the raw data before any cleaning: shape plus the first rows.
print("--- Informasi Dataset Awal ---")
print(f"Jumlah Baris dan Kolom: {df.shape}")
print("\n5 Baris Data Pertama:", df.head(), sep="\n")

Data Cleaning

In [None]:
# 3.1. Missing-value check
# Print the number of null entries per column. This cell only reports the
# counts; it performs no imputation.
print("\n--- Hasil Cek Nilai Hilang ---")
print(df.isna().sum())

# 3.2. Duplicate handling
# A row counts as a duplicate when it repeats an earlier row in every column.
print("\n--- Penanganan Duplikat ---")
duplicate_count = df.duplicated(keep='first').sum()
print(f"Jumlah Baris Duplikat: {duplicate_count}")

# Drop the repeated rows in place, keeping the first occurrence of each.
# The index is not renumbered here, so it may contain gaps afterwards.
df.drop_duplicates(keep='first', inplace=True)
print(f"Jumlah Baris Setelah Duplikasi Dihapus: {df.shape[0]}")

Data Transformation

In [None]:
# Numeric feature columns that will be rescaled.
features = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
# Renumber the rows from 0 so the scaled frames built below line up with X.
X = df.loc[:, features].reset_index(drop=True)

# NOTE(review): both scalers below are fitted on every row of X, i.e. before
# the train/test split done later in the file. If the scaled values are ever
# fed to a model, confirm whether fitting on the training rows only is wanted.

# 4.1. Standardization (StandardScaler / z-score)
# Rescales each feature to zero mean and unit standard deviation.
scaler_standard = StandardScaler()
X_standardized = scaler_standard.fit(X).transform(X)
df_standardized = pd.DataFrame(
    X_standardized,
    columns=[name + '_std' for name in features],
)

print("\n--- 4.1 Data Hasil Standarisasi (5 Baris) ---")
print(df_standardized.head())

# 4.2. Normalization (MinMaxScaler)
# Rescales each feature linearly into the range 0 to 1.
scaler_minmax = MinMaxScaler()
X_normalized = scaler_minmax.fit(X).transform(X)
df_normalized = pd.DataFrame(
    X_normalized,
    columns=[name + '_norm' for name in features],
)

print("\n--- 4.2 Data Hasil Normalisasi (5 Baris) ---")
print(df_normalized.head())

Data Reduction

In [None]:
# 5. Sample reduction (train/test split)
# The split is taken from the original, unscaled feature columns of df.
X_final = df.loc[:, features]  # feature matrix
y_final = df.loc[:, 'target']  # integer class labels

# Hold out 20% of the rows for testing and keep 80% for training; the fixed
# random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
)

print("\n--- 5. Hasil Reduksi Sampel (Split Data) ---")
print(f"Jumlah data training (80%): {X_train.shape[0]} baris")
print(f"Jumlah data testing (20%): {X_test.shape[0]} baris")

# (Optional) Put the cleaned rows and their standardized features side by
# side. Both indexes are reset first so the rows align by position.
df_final_preprocessed = pd.concat(
    [df.reset_index(drop=True), df_standardized.reset_index(drop=True)],
    axis='columns',
)
# To persist the result, uncomment:
# df_final_preprocessed.to_csv('iris_preprocessed.csv', index=False)