In [87]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

In [88]:
# --- 1. Import the dataset (per PDF p. 22) ---
# Read the churn CSV (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [89]:
# --- 2. Dataset-specific cleaning ---
# Issue 1: 'TotalCharges' is loaded as object (text); per the original note this
# is caused by blank-string entries in the CSV.
# Convert the column to numeric; any value that cannot be parsed becomes NaN.
df['TotalCharges'] = df['TotalCharges'].pipe(pd.to_numeric, errors='coerce')

In [90]:
# Issue 2: 'customerID' is a row identifier and carries no predictive signal.
# Remove it before encoding and scaling so it never enters the feature matrix.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns='customerID', errors='ignore')

In [91]:
# --- 3. Handle missing values (per PDF p. 23) ---
# Fill the NaN entries of 'TotalCharges' with the column mean.
# NOTE(review): the mean is fitted on the whole frame, i.e. before the
# train/test split done later in this notebook, so rows that end up in the
# test set also contribute to the imputed value.
imputer = SimpleImputer(strategy='mean')  # missing_values defaults to np.nan
imputer.fit(df[['TotalCharges']])         # double brackets: the imputer needs 2-D input
df[['TotalCharges']] = imputer.transform(df[['TotalCharges']])

In [92]:
# --- 4. Encode categorical data (per PDF p. 24-25) ---
# The dataset has many text columns (gender, Partner, ...). Each one is mapped
# to integer codes with LabelEncoder so that it can be used numerically.

# Names of the columns that are still stored as object (text).
categorical_cols = df.select_dtypes(include=['object']).columns

# Fit a separate LabelEncoder per column and keep every fitted encoder in
# `label_encoders` (column name -> encoder). A single shared instance would be
# refit on each iteration and retain only the last column's classes, making it
# impossible to inverse-transform or consistently encode new data for the
# other columns.
label_encoders = {}
le = LabelEncoder()  # stays defined even when there is nothing to encode
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# After the loop `le` is the encoder of the last categorical column, exactly
# as with the previous shared-instance version.

In [93]:
# --- 5. Separate features (X) and label (y) ---
# Select the target by name instead of by position, so a change in column
# order cannot silently turn a different column into the label. A missing
# 'Churn' column now fails loudly with KeyError.
X = df.drop(columns='Churn').values  # every column except 'Churn'
y = df['Churn'].values               # target vector

In [94]:
# --- 6. Split the dataset (per PDF p. 26) ---
# Hold out 20% of the rows as the test set; random_state=1 pins the shuffle
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [95]:
# --- 7. Feature scaling (per PDF p. 27) ---
# Learn mean/variance from the training split only, then apply that same
# fitted scaler to both splits so no test-set statistics are used.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [96]:
# --- Inspect the result ---
# Print the shape of the scaled training matrix and its first row as a sanity check.
for label, value in (
    ("Ukuran X_train:", X_train.shape),
    ("Data hasil scaling (baris pertama):\n", X_train[0]),
):
    print(label, value)

Ukuran X_train: (5634, 19)
Data hasil scaling (baris pertama):
 [ 0.99257284 -0.43947526  1.04169196  1.54329281 -0.82588395  0.32791614
 -0.99562895  1.52742994  0.24553792  0.11772865  0.10750961  0.23680586
  0.01584551  0.00260717  1.57775379 -1.19824775  1.32775925 -1.49752994
 -0.89301894]
