In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Toy advertising dataset: 12 rows, five columns. Each of 'Usia',
# 'Tayangan Iklan', 'Target Kota' and 'Beli Produk' contains exactly one
# missing value (np.nan) so the later cleaning steps have something to fix.
platform_cycle = ['Instagram', 'TikTok', 'Facebook']
ages = [28, 35, 45, np.nan, 22, 50, 40, 30, 55, 25, 38, 42]
ad_views = [15000, 22000, 18000, 30000, 12000, 25000,
            16000, 20000, 28000, np.nan, 21000, 19000]
target_cities = ['Jakarta', 'Surabaya', 'Bandung', 'Jakarta', np.nan, 'Surabaya',
                 'Bandung', 'Jakarta', 'Surabaya', 'Bandung', 'Jakarta', 'Bandung']
purchased = ['Ya', 'Ya', 'Tidak', 'Ya', 'Tidak', 'Ya',
             'Tidak', 'Ya', np.nan, 'Tidak', 'Ya', 'Tidak']

# Key order fixes the column order that the positional indexing below relies on.
data = {
    'Platform': platform_cycle * 4,  # the three platforms repeat four times
    'Usia': ages,
    'Tayangan Iklan': ad_views,
    'Target Kota': target_cities,
    'Beli Produk': purchased,
}
dataset = pd.DataFrame(data)


In [3]:
# Split the frame positionally: every column except the last is a feature,
# the last column ('Beli Produk') is the target.
features = dataset.iloc[:, :-1]
target = dataset.iloc[:, -1]
X, y = features.values, target.values


In [4]:
# Show the raw feature matrix (still containing NaN and categorical values),
# followed by a 50-character divider.
print("--- X Awal (dengan NaN dan Kategorikal) ---", X, "-" * 50, sep="\n")

--- X Awal (dengan NaN dan Kategorikal) ---
[['Instagram' 28.0 15000.0 'Jakarta']
 ['TikTok' 35.0 22000.0 'Surabaya']
 ['Facebook' 45.0 18000.0 'Bandung']
 ['Instagram' nan 30000.0 'Jakarta']
 ['TikTok' 22.0 12000.0 nan]
 ['Facebook' 50.0 25000.0 'Surabaya']
 ['Instagram' 40.0 16000.0 'Bandung']
 ['TikTok' 30.0 20000.0 'Jakarta']
 ['Facebook' 55.0 28000.0 'Surabaya']
 ['Instagram' 25.0 nan 'Bandung']
 ['TikTok' 38.0 21000.0 'Jakarta']
 ['Facebook' 42.0 19000.0 'Bandung']]
--------------------------------------------------


In [5]:
# Numeric columns, Usia (index 1) and Tayangan Iklan (index 2): replace NaN
# with the column mean.
# NOTE(review): the imputers are fitted on all rows, before the train/test
# split that happens later in the notebook — confirm this is acceptable.
numeric_idx = [1, 2]
imputer_num = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, numeric_idx] = imputer_num.fit_transform(X[:, numeric_idx])

# Categorical column, Target Kota (index 3): replace NaN with the most
# frequent value of the column.
categorical_idx = [3]
imputer_cat = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
X[:, categorical_idx] = imputer_cat.fit_transform(X[:, categorical_idx])


In [6]:
# a. Feature encoding (X) - one-hot encoding.
# Categorical columns: Platform (index 0) and Target Kota (index 3).
# The remaining (numeric) columns are passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('encoder_platform', OneHotEncoder(), [0]),
        ('encoder_city', OneHotEncoder(), [3])
    ],
    remainder='passthrough',
    # Always return a dense array. With the default threshold (0.3) the
    # transformer returns a SciPy sparse matrix whenever the stacked result is
    # sparse enough; np.array() would then wrap it in a 0-d object array and
    # the column slicing used later in the notebook would fail.
    sparse_threshold=0
)
X = np.array(ct.fit_transform(X))

# b. Target encoding (y) - label encoding.
# Every label is cast to str first, so the missing label (np.nan) becomes the
# string 'nan' and is encoded as a category of its own.
# NOTE(review): this adds a third class next to 'Ya'/'Tidak' — confirm that a
# missing target should be modelled as a class rather than dropped.
y_list = list(map(str, y))
le = LabelEncoder()
y = le.fit_transform(y_list)

# Show the encoded feature matrix and the encoded target, each followed by a
# 50-character divider.
print("--- X setelah Imputasi dan One-Hot Encoding ---", X, "-" * 50, sep="\n")
print("--- y setelah Label Encoding ---", y, "-" * 50, sep="\n")

--- X setelah Imputasi dan One-Hot Encoding ---
[[0.0 1.0 0.0 0.0 1.0 0.0 28.0 15000.0]
 [0.0 0.0 1.0 0.0 0.0 1.0 35.0 22000.0]
 [1.0 0.0 0.0 1.0 0.0 0.0 45.0 18000.0]
 [0.0 1.0 0.0 0.0 1.0 0.0 37.27272727272727 30000.0]
 [0.0 0.0 1.0 1.0 0.0 0.0 22.0 12000.0]
 [1.0 0.0 0.0 0.0 0.0 1.0 50.0 25000.0]
 [0.0 1.0 0.0 1.0 0.0 0.0 40.0 16000.0]
 [0.0 0.0 1.0 0.0 1.0 0.0 30.0 20000.0]
 [1.0 0.0 0.0 0.0 0.0 1.0 55.0 28000.0]
 [0.0 1.0 0.0 1.0 0.0 0.0 25.0 20545.454545454544]
 [0.0 0.0 1.0 0.0 1.0 0.0 38.0 21000.0]
 [1.0 0.0 0.0 1.0 0.0 0.0 42.0 19000.0]]
--------------------------------------------------
--- y setelah Label Encoding ---
[1 1 0 1 0 1 0 1 2 0 1 0]
--------------------------------------------------


In [7]:
# --- 4. SPLIT INTO TRAINING AND TEST SETS ---
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [8]:
# --- 5. FEATURE SCALING ---
# After one-hot encoding (3 platform + 3 city = 6 columns) the numeric columns
# (Usia, Tayangan Iklan) sit at indices 6 and 7, so scaling covers index 6
# onward and leaves the one-hot columns untouched.
numeric_cols = slice(6, None)
sc = StandardScaler()

# Training set: fit the scaler on the numeric columns and scale them in place.
X_train[:, numeric_cols] = sc.fit_transform(X_train[:, numeric_cols])

# Test set: transform only, reusing the scaler fitted on the training set.
X_test[:, numeric_cols] = sc.transform(X_test[:, numeric_cols])


# Show the scaled training matrix (with a trailing divider), then the scaled
# test matrix.
print(
    "--- X_train Final setelah Feature Scaling (Kolom 6 & 7 sudah diskalakan) ---",
    X_train,
    "-" * 50,
    sep="\n",
)
print(
    "--- X_test Final setelah Feature Scaling (Kolom 6 & 7 sudah diskalakan) ---",
    X_test,
    sep="\n",
)

--- X_train Final setelah Feature Scaling (Kolom 6 & 7 sudah diskalakan) ---
[[0.0 0.0 1.0 0.0 1.0 0.0 -0.011822941265748451 0.07133875802522895]
 [0.0 0.0 1.0 0.0 0.0 1.0 -0.33104235544094757 0.3329142041177322]
 [0.0 1.0 0.0 1.0 0.0 0.0 0.20099000151771762 -1.2365384724372874]
 [0.0 1.0 0.0 0.0 1.0 0.0 -1.0758876551830787 -1.4981139185297907]
 [0.0 0.0 1.0 0.0 1.0 0.0 -0.8630747123996128 -0.19023668806727434]
 [1.0 0.0 0.0 1.0 0.0 0.0 0.4138029443011837 -0.4518121341597776]
 [0.0 1.0 0.0 1.0 0.0 0.0 -1.3951070693582779 -0.04755917201681835]
 [1.0 0.0 0.0 0.0 0.0 1.0 1.7970870723937131 1.9023668806727518]
 [1.0 0.0 0.0 0.0 0.0 1.0 1.265054715435048 1.117640542395242]]
--------------------------------------------------
--- X_test Final setelah Feature Scaling (Kolom 6 & 7 sudah diskalakan) ---
[[1.0 0.0 0.0 1.0 0.0 0.0 0.7330223584763828 -0.7133875802522809]
 [0.0 1.0 0.0 0.0 1.0 0.0 -0.0892094659142815 2.4255177728577584]
 [0.0 0.0 1.0 1.0 0.0 0.0 -1.714326483533477 -2.282840256807300