# Pipeline End-to-End untuk Klasifikasi dengan Model Machine Learning

## 1. Pengumpulan & Pembersihan Data

### Import library yang diperlukan

In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, VarianceThreshold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc, roc_auc_score, classification_report, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn whitegrid theme and a 12x8 default figure
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

### Memuat data

In [2]:
# Mount Google Drive so the dataset stored there can be read from Colab
drive.mount('/content/drive')

# Location of the dataset on the mounted drive (single place to change it)
DATA_PATH = '/content/drive/MyDrive/Belajar (PTA TA)/Tempat Belajar/KlasifikasiUTS.csv'
df = pd.read_csv(DATA_PATH)

# Show the schema, the summary statistics and the first rows.
# A notebook cell only renders its last expression automatically, so the
# describe() table is passed to display() (provided by the notebook kernel);
# otherwise its result would be computed and silently thrown away.
df.info()
display(df.describe())
df.head()

Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-nul

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


Keluaran `df.info()` di atas tidak menunjukkan adanya missing values, sehingga tidak perlu dilakukan pembersihan data.

## 2. Feature Selection

In [3]:
# The 'Class' column is the target; every remaining column is a candidate feature
y = df['Class']
X = df.drop(columns='Class')

### Constant dan Quasi-Constant Feature Removal

In [4]:
# Fit a variance filter (threshold 0.01) and report the columns it rejects
constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(X)
kept_mask = constant_filter.get_support()
constant_columns = list(X.columns[~kept_mask])
print(f"Constant/Quasi-constant features: {constant_columns or 'Tidak ada'}")

# Keep only the retained columns; transform() returns a bare array, so the
# surviving column names are re-attached when rebuilding the DataFrame
feature_names = X.columns[kept_mask]
X_filtered = pd.DataFrame(constant_filter.transform(X), columns=feature_names)

Constant/Quasi-constant features: Tidak ada


### Korelasi antar fitur

In [5]:
# Pairwise correlation between the retained features
correlation_matrix = X_filtered.corr()

# Boolean mask that hides the upper triangle (diagonal included) so each pair
# is drawn once. The mask is built from ones rather than from the correlation
# values themselves: masking by value would leave any exactly-zero
# correlation in the upper triangle visible.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(correlation_matrix, annot=False, mask=mask, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
fig.savefig('correlation_matrix.png')
plt.close(fig)

### Identifikasi fitur

In [6]:
# Collect every feature pair whose absolute correlation exceeds the threshold.
# Only the upper triangle (j > i) is scanned, so each pair appears once.
high_corr_threshold = 0.8
corr_columns = correlation_matrix.columns
n_corr_features = len(corr_columns)

high_corr_pairs = [
    (corr_columns[row], corr_columns[col], correlation_matrix.iloc[row, col])
    for row in range(n_corr_features)
    for col in range(row + 1, n_corr_features)
    if abs(correlation_matrix.iloc[row, col]) > high_corr_threshold
]

print(f"Fitur dengan korelasi tinggi (>0.8): {high_corr_pairs or 'Tidak ada'}")

# From each correlated pair, drop the second feature and keep the first
features_to_drop = {second for _, second, _ in high_corr_pairs}

X_filtered = X_filtered.drop(columns=features_to_drop, errors='ignore')
print(f"Fitur setelah menghapus korelasi tinggi: {X_filtered.shape[1]}")

Fitur dengan korelasi tinggi (>0.8): Tidak ada
Fitur setelah menghapus korelasi tinggi: 30


### Mutual Information untuk fitur kategorik (tidak ada fitur kategorik di sini)

In [7]:
# Mutual information between each feature and the target.
# The estimator is randomised, so a fixed random_state is passed to make the
# scores (and therefore the ranking below) reproducible between runs.
mi_scores = mutual_info_classif(X_filtered, y, random_state=42)
mi_scores = pd.Series(mi_scores, index=X_filtered.columns)
mi_scores = mi_scores.sort_values(ascending=False)
print("Top 10 fitur berdasarkan Mutual Information:")
print(mi_scores.head(10))

# Horizontal bar chart of all scores, sorted ascending so that the
# highest-scoring feature ends up at the top of the chart
fig, ax = plt.subplots(figsize=(12, 8))
mi_scores.sort_values().plot.barh(ax=ax)
ax.set_title('Mutual Information Scores (Korelasi dengan Target)')
ax.set_xlabel('Mutual Information')
fig.tight_layout()
fig.savefig('mutual_information.png')
plt.close(fig)

Top 10 fitur berdasarkan Mutual Information:
V17    0.008258
V14    0.008136
V12    0.007601
V10    0.007530
V11    0.006831
V16    0.006144
V4     0.004978
V3     0.004952
V18    0.004317
V9     0.004277
dtype: float64


### ANOVA (f_classif) untuk fitur numerik

In [8]:
# ANOVA F-test of every feature against the target; p_values is kept
# alongside the scores although only the F-scores are reported here
f_scores, p_values = f_classif(X_filtered, y)
f_scores = pd.Series(f_scores, index=X_filtered.columns).sort_values(ascending=False)
print("Top 10 fitur berdasarkan ANOVA F-Test:")
print(f_scores.head(10))

# Horizontal bar chart of all F-scores, sorted ascending so that the
# highest-scoring feature ends up at the top of the chart
fig, ax = plt.subplots(figsize=(12, 8))
f_scores.sort_values().plot.barh(ax=ax)
ax.set_title('ANOVA F-Scores (Korelasi dengan Target)')
ax.set_xlabel('F-Score')
fig.tight_layout()
fig.savefig('anova_f_scores.png')
plt.close(fig)

Top 10 fitur berdasarkan ANOVA F-Test:
V17    33979.168593
V14    28695.547788
V12    20749.822361
V10    14057.979985
V16    11443.349428
V3     11014.508305
V7     10349.605408
V11     6999.355047
V4      5163.832114
V18     3584.380605
dtype: float64


### Pemilihan top fitur

In [9]:
# Keep the top_k features ranked by the ANOVA F-test
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split done in a later cell — consider fitting on training data only.
top_k = 15  # number of features to keep
selector = SelectKBest(f_classif, k=top_k)
X_new = selector.fit_transform(X_filtered, y)
selected_features = X_filtered.columns[selector.get_support()]
print(f"\nSelected top {top_k} features:")
print(selected_features.tolist())

# Take an explicit copy: the feature-engineering cell assigns to and drops
# columns of X_selected, which must not operate on a slice of X_filtered
X_selected = X_filtered[selected_features].copy()


Selected top 15 features:
['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V9', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18']


## 3. Feature Engineering
### Transformasi fitur Time dan Amount jika ada dalam fitur terpilih

In [10]:
# Rebuild X_selected from the filtered features on every run. This makes the
# cell idempotent (re-running it does not rescale Amount a second time) and
# guarantees the assignments below act on an independent copy, not a slice.
X_selected = X_filtered[selected_features].copy()

if 'Time' in X_selected.columns:
    # Replace Time by a cyclic hour-of-day feature
    # (assumes Time is expressed in seconds — TODO confirm)
    X_selected = X_selected.assign(Hour=df['Time'] / 3600 % 24).drop(columns='Time')

if 'Amount' in X_selected.columns:
    # RobustScaler is used for Amount because it contains outliers
    amount_scaler = RobustScaler()
    X_selected['Amount'] = amount_scaler.fit_transform(X_selected[['Amount']]).ravel()

# Standardise every selected feature.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# done in a later cell, so test-set statistics influence the scaling —
# consider fitting it on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)
X_scaled = pd.DataFrame(X_scaled, columns=X_selected.columns)

print(f"Dataset setelah transformasi: {X_scaled.shape}")

Dataset setelah transformasi: (284807, 15)


### Split data and Handle Class Imbalance

In [11]:
# Stratified 70/30 split so both parts keep the original class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"\nTrain set: {X_train.shape}, Test set: {X_test.shape}")
for split_label, split_target in (("train", y_train), ("test", y_test)):
    print(f"Distribusi kelas pada {split_label}: {pd.Series(split_target).value_counts()}")

# Oversample with SMOTE on the training split only; the test split is left untouched
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"Data setelah SMOTE: {X_train_resampled.shape}")
print(f"Distribusi kelas setelah SMOTE: {pd.Series(y_train_resampled).value_counts()}")


Train set: (199364, 15), Test set: (85443, 15)
Distribusi kelas pada train: Class
0    199020
1       344
Name: count, dtype: int64
Distribusi kelas pada test: Class
0    85295
1      148
Name: count, dtype: int64
Data setelah SMOTE: (398040, 15)
Distribusi kelas setelah SMOTE: Class
0    199020
1    199020
Name: count, dtype: int64


## 4. Model Training dan Evaluasi
### Mendefinisikan tiap fungsi

In [34]:
# Seed shared by every estimator that accepts one
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used throughout the notebook
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'Random Forest (Bagging)': RandomForestClassifier(random_state=RANDOM_STATE),
    'AdaBoost (Boosting)': AdaBoostClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
}

# Hyperparameter grids searched by GridSearchCV, keyed like `models`
param_grids = {
    'Logistic Regression': dict(C=[0.01, 0.1, 1, 10, 100]),
    'Decision Tree': dict(max_depth=[5, 10, 15, 20, None]),
    'KNN': dict(n_neighbors=[3, 5, 7, 9, 11]),
    # NOTE(review): C=1e-9 is the only value searched for the SVM; presumably
    # chosen to shorten training (the recorded SVM run was interrupted) —
    # confirm the intended range.
    'SVM': dict(C=[1e-9], kernel=['rbf', 'linear']),
    'Random Forest (Bagging)': dict(n_estimators=[50, 100], max_depth=[10, 20, None]),
    'AdaBoost (Boosting)': dict(n_estimators=[50, 100], learning_rate=[0.01, 0.1, 1.0]),
    'Gradient Boosting': dict(n_estimators=[50, 100], learning_rate=[0.01, 0.1, 1.0]),
}

# Containers filled while the models are trained:
#   results     -> per-model metrics and test-set predictions
#   best_models -> per-model best estimator found by the grid search
#   metrics_df  -> one row of metrics per trained model
results = {}
best_models = {}
METRIC_COLUMNS = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']
metrics_df = pd.DataFrame(columns=METRIC_COLUMNS)

# Save a confusion-matrix heatmap for one model
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` and save it as a PNG.

    The image is written to the working directory as
    ``confusion_matrix_<name>.png``, where ``<name>`` is ``model_name``
    lower-cased with spaces replaced by underscores. The figure is closed
    after saving and nothing is returned.
    """
    file_stem = model_name.replace(" ", "_").lower()

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt='d',
        cmap='Blues',
        cbar=False,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    fig.savefig(f'confusion_matrix_{file_stem}.png')
    plt.close(fig)

# Save the ROC curve for one model and report its area
def plot_roc_curve(y_true, y_prob, model_name):
    """Plot the ROC curve for ``y_prob`` against ``y_true`` and save it as a PNG.

    The image is written to the working directory as ``roc_curve_<name>.png``,
    where ``<name>`` is ``model_name`` lower-cased with spaces replaced by
    underscores. The figure is closed after saving.

    Returns:
        The area under the plotted ROC curve.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_prob)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    file_stem = model_name.replace(" ", "_").lower()

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(false_pos_rate, true_pos_rate, label=f'ROC curve (area = {roc_auc:.3f})')
    # Dashed diagonal as a visual reference line
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - {model_name}')
    ax.legend(loc="lower right")
    fig.savefig(f'roc_curve_{file_stem}.png')
    plt.close(fig)
    return roc_auc

# Save the precision-recall curve for one model and report its area
def plot_pr_curve(y_true, y_prob, model_name):
    """Plot the precision-recall curve for ``y_prob`` and save it as a PNG.

    The image is written to the working directory as ``pr_curve_<name>.png``,
    where ``<name>`` is ``model_name`` lower-cased with spaces replaced by
    underscores. The figure is closed after saving.

    Returns:
        The area under the plotted precision-recall curve.
    """
    precision_values, recall_values, _thresholds = precision_recall_curve(y_true, y_prob)
    pr_auc = auc(recall_values, precision_values)
    file_stem = model_name.replace(" ", "_").lower()

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(recall_values, precision_values, label=f'PR curve (area = {pr_auc:.3f})')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(f'Precision-Recall Curve - {model_name}')
    ax.legend(loc="lower left")
    fig.savefig(f'pr_curve_{file_stem}.png')
    plt.close(fig)
    return pr_auc

# Tune, fit and evaluate a single model
def train_model(model_name):
    """Grid-search one model, evaluate it on the test set and record the outcome.

    The estimator and its parameter grid are looked up by ``model_name`` in the
    module-level ``models`` and ``param_grids`` dictionaries. The search is
    fitted on the SMOTE-resampled training data; the best estimator is then
    scored on ``X_test`` / ``y_test``.

    Side effects:
        * stores the best estimator in ``best_models[model_name]``
        * stores metrics and test-set predictions in ``results[model_name]``
        * prints the best parameters and the metrics
        * saves confusion-matrix, ROC and precision-recall plots as PNG files

    Args:
        model_name: key present in both ``models`` and ``param_grids``.

    Returns:
        A one-row DataFrame with the columns Model, Accuracy, Precision,
        Recall, F1-Score and AUC.

    Raises:
        KeyError: if ``model_name`` is missing from ``models`` or ``param_grids``.
    """
    if model_name not in models or model_name not in param_grids:
        raise KeyError(
            f"Unknown model '{model_name}'. Available models: {list(models)}"
        )

    print(f"\nTraining {model_name}...")

    # Hyperparameter search with 3-fold CV, optimising F1.
    # NOTE(review): the folds are cut from already-resampled data, so synthetic
    # SMOTE samples can sit in both the fitting and validation folds —
    # consider resampling inside each fold instead.
    grid_search = GridSearchCV(
        models[model_name], param_grids[model_name], cv=3, scoring='f1', n_jobs=-1
    )
    grid_search.fit(X_train_resampled, y_train_resampled)

    # Keep the best estimator found by the search
    best_model = grid_search.best_estimator_
    best_models[model_name] = best_model

    # Hard predictions and the predicted probability of the second class
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    # Compute every metric once and reuse it for printing, storing and returning
    scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_prob),
    }

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Accuracy: {scores['accuracy']:.4f}")
    print(f"Precision: {scores['precision']:.4f}")
    print(f"Recall: {scores['recall']:.4f}")
    print(f"F1-Score: {scores['f1']:.4f}")
    print(f"AUC: {scores['auc']:.4f}")

    # Metrics plus raw predictions, used later for the combined ROC plot
    results[model_name] = {**scores, 'y_pred': y_pred, 'y_prob': y_prob}

    # Per-model diagnostic plots, each saved to its own PNG
    plot_confusion_matrix(y_test, y_pred, model_name)
    plot_roc_curve(y_test, y_prob, model_name)
    plot_pr_curve(y_test, y_prob, model_name)

    # One-row frame for the caller to append to metrics_df
    return pd.DataFrame({
        'Model': [model_name],
        'Accuracy': [scores['accuracy']],
        'Precision': [scores['precision']],
        'Recall': [scores['recall']],
        'F1-Score': [scores['f1']],
        'AUC': [scores['auc']]
    })

### Training untuk setiap model
#### Logistic Regression

In [28]:
# Train/evaluate the model, then replace any row left by an earlier run of
# this cell so that re-running it does not duplicate the model in metrics_df
logreg_metrics = train_model('Logistic Regression')
metrics_df = pd.concat(
    [metrics_df[metrics_df['Model'] != 'Logistic Regression'], logreg_metrics],
    ignore_index=True,
)


Training Logistic Regression...
Best parameters: {'C': 1}
Accuracy: 0.9744
Precision: 0.0561
Recall: 0.8716
F1-Score: 0.1053
AUC: 0.9603


#### Decision Tree

In [29]:
# Train/evaluate the model, then replace any row left by an earlier run of
# this cell so that re-running it does not duplicate the model in metrics_df
tree_metrics = train_model('Decision Tree')
metrics_df = pd.concat(
    [metrics_df[metrics_df['Model'] != 'Decision Tree'], tree_metrics],
    ignore_index=True,
)


Training Decision Tree...
Best parameters: {'max_depth': None}
Accuracy: 0.9973
Precision: 0.3639
Recall: 0.7500
F1-Score: 0.4901
AUC: 0.8739


#### KNN


In [30]:
# Train/evaluate the model, then replace any row left by an earlier run of
# this cell so that re-running it does not duplicate the model in metrics_df
knn_metrics = train_model('KNN')
metrics_df = pd.concat(
    [metrics_df[metrics_df['Model'] != 'KNN'], knn_metrics],
    ignore_index=True,
)


Training KNN...
Best parameters: {'n_neighbors': 3}
Accuracy: 0.9985
Precision: 0.5360
Recall: 0.8041
F1-Score: 0.6432
AUC: 0.9117


#### SVM

In [35]:
# Train/evaluate the model, then replace any row left by an earlier run of
# this cell so that re-running it does not duplicate the model in metrics_df
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# no SVM row was produced in that session.
svm_metrics = train_model('SVM')
metrics_df = pd.concat(
    [metrics_df[metrics_df['Model'] != 'SVM'], svm_metrics],
    ignore_index=True,
)


Training SVM...


KeyboardInterrupt: 

#### Random Forest (Bagging)

In [36]:
# Train/evaluate the model, then replace any row left by an earlier run of
# this cell so that re-running it does not duplicate the model in metrics_df
forest_metrics = train_model('Random Forest (Bagging)')
metrics_df = pd.concat(
    [metrics_df[metrics_df['Model'] != 'Random Forest (Bagging)'], forest_metrics],
    ignore_index=True,
)


Training Random Forest (Bagging)...
Best parameters: {'max_depth': None, 'n_estimators': 100}
Accuracy: 0.9992
Precision: 0.7785
Recall: 0.7838
F1-Score: 0.7811
AUC: 0.9617


#### AdaBoost (Boosting)

In [37]:
# Train/evaluate the model, then replace any row left by an earlier run of
# this cell so that re-running it does not duplicate the model in metrics_df
adaboost_metrics = train_model('AdaBoost (Boosting)')
metrics_df = pd.concat(
    [metrics_df[metrics_df['Model'] != 'AdaBoost (Boosting)'], adaboost_metrics],
    ignore_index=True,
)


Training AdaBoost (Boosting)...
Best parameters: {'learning_rate': 1.0, 'n_estimators': 100}
Accuracy: 0.9761
Precision: 0.0604
Recall: 0.8784
F1-Score: 0.1129
AUC: 0.9575


#### Gradient Boosting

In [38]:
# Train/evaluate the model, then replace any row left by an earlier run of
# this cell so that re-running it does not duplicate the model in metrics_df
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# no Gradient Boosting row was produced in that session.
gboost_metrics = train_model('Gradient Boosting')
metrics_df = pd.concat(
    [metrics_df[metrics_df['Model'] != 'Gradient Boosting'], gboost_metrics],
    ignore_index=True,
)


Training Gradient Boosting...


KeyboardInterrupt: 

## 5. Perbandingan Model

In [40]:
# One bar chart per metric, comparing all trained models; each chart gets its
# own figure, which is saved and closed before the next one is created.
# (No figure is opened ahead of the loop: an extra one would stay empty and
# be left open as a blank "0 Axes" figure.)
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']
for metric in metrics:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x='Model', y=metric, data=metrics_df, ax=ax)
    ax.set_title(f'Perbandingan {metric} antar Model')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    fig.savefig(f'comparison_{metric.lower().replace("-", "_")}.png')
    plt.close(fig)

# ROC curves of every model that has an entry in `results`, on shared axes
fig, ax = plt.subplots(figsize=(12, 10))
for model_name, result in results.items():
    fpr, tpr, _ = roc_curve(y_test, result['y_prob'])
    ax.plot(fpr, tpr, label=f"{model_name} (AUC = {result['auc']:.3f})")
# Dashed diagonal as a visual reference line
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves - All Models')
ax.legend(loc="lower right")
fig.savefig('roc_curves_all_models.png')
plt.close(fig)

<Figure size 1400x1000 with 0 Axes>