In [2]:
# 1. Import Library
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# 2. Load Data
# Read the loan dataset into a DataFrame and print a quick overview of it.
df = pd.read_csv("/content/loan_data.csv")

print("===== HEAD =====")
print(df.head())

print("\n===== INFO =====")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
df.info()

print("\n===== DESCRIBE =====")
print(df.describe())

# 3. Data Preprocessing
# Cleans `df`, encodes categorical columns, splits into train/test sets and
# standardizes the features. Produces X_train, X_test, y_train, y_test.

# 3.1 Handling Missing Values
# Numeric columns: fill with the column median.
df = df.fillna(df.median(numeric_only=True))

# Categorical (object) columns: fill with the most frequent value (mode).
# The median fill above does not touch these columns, so without this step
# missing categories would be passed to LabelEncoder unfilled.
cat_cols = df.select_dtypes(include=['object']).columns

for col in cat_cols:
    modes = df[col].mode(dropna=True)
    if not modes.empty:  # an all-NaN column has no mode; leave it unchanged
        df[col] = df[col].fillna(modes.iloc[0])

# 3.2 Encode Categorical Columns
# Each object column gets its own LabelEncoder (string -> integer code).
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# 3.3 Separate features & label
X = df.drop("loan_status", axis=1)
y = df["loan_status"]

# 3.4 Train-Test Split
# stratify=y keeps the class proportions of loan_status the same in the
# training and test sets, so the evaluation is not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3.5 Scaling
# Applied to every feature column (including the label-encoded ones).
# The scaler is fitted on the training set only and then reused on the test
# set, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# 4. Modeling (Training)
# Fit each candidate classifier on the training set, evaluate it on the test
# set, print its metrics and record its accuracy in `results`.
models = {
    # A higher iteration cap than the default guards against the solver
    # stopping before convergence.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Fixed seeds make the tree-based models reproducible between runs,
    # matching the seeded train/test split.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
}

# Maps model name -> test-set accuracy.
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test, pred)
    results[name] = acc

    print("\n====================================")
    print(f"MODEL: {name}")
    print("Accuracy :", acc)
    print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
    print("Classification Report:\n", classification_report(y_test, pred))

# 5. Model Performance Summary
# Print one "<model name>: <accuracy>" line per entry recorded in `results`.
print("\n======= Summary Accuracy ======")
for model_name in results:
    accuracy = results[model_name]
    print(f"{model_name}: {accuracy}")

===== HEAD =====
   person_age person_gender person_education  person_income  person_emp_exp  \
0        22.0        female           Master        71948.0               0   
1        21.0        female      High School        12282.0               0   
2        25.0        female      High School        12438.0               3   
3        23.0        female         Bachelor        79753.0               0   
4        24.0          male           Master        66135.0               1   

  person_home_ownership  loan_amnt loan_intent  loan_int_rate  \
0                  RENT    35000.0    PERSONAL          16.02   
1                   OWN     1000.0   EDUCATION          11.14   
2              MORTGAGE     5500.0     MEDICAL          12.87   
3                  RENT    35000.0     MEDICAL          15.23   
4                  RENT    35000.0     MEDICAL          14.27   

   loan_percent_income  cb_person_cred_hist_length  credit_score  \
0                 0.49                         3.

Machine Learning Workflow (Tanpa Model Tuning, Deployment, Monitoring)
1. Problem Definition

Menentukan tujuan analisis dan jenis permasalahan yang akan diselesaikan.
Contoh: Memprediksi apakah seorang nasabah layak diberi pinjaman (Loan Approval Classification).

2. Data Collection

Mengumpulkan dataset dari berbagai sumber seperti file CSV, database, API, atau web scraping.
Data yang digunakan harus relevan, lengkap, dan mencerminkan kondisi nyata.

3. Data Understanding

Memahami struktur data:
- Jumlah baris & kolom
- Tipe fitur (numerik, kategorik, tanggal, teks)
- Distribusi data
- Outlier
- Korelasi antar fitur

Biasanya dilakukan dengan:
- .head(), .info(), .describe()
- Visualisasi (histogram, heatmap, scatter plot)

4. Data Preprocessing

Tahap pembersihan dan persiapan data sebelum modeling.

4.1 Handling Missing Values

Menghapus baris/kolom yang tidak informatif

Mengisi nilai dengan mean/median (numerik) atau modus (kategorik)

4.2 Handling Outliers

Menggunakan IQR

Winsorizing

Clipping

4.3 Encoding Fitur Kategorik

Label Encoding

One-Hot Encoding

4.4 Scaling / Normalization

StandardScaler (mean=0, std=1)

Min-Max Scaler (rentang 0–1)

4.5 Train–Test Split

Memisahkan data untuk pelatihan dan pengujian.