In [1]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import OneClassSVM

# Load the dataset (replace with your own research dataset).
# The location can be overridden without editing the notebook by setting the
# RBA_DATASET_PATH environment variable; when it is unset, the original
# hard-coded path is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "RBA_DATASET_PATH",
    "D://PoltekSSN/KULIAH/TA/ML RBA/ML/Sebelumnya/rba-dataset-fix.csv",
)
df = pd.read_csv(DATA_PATH)


In [2]:
print("▶️ Analisis Time Cost EDA pada 1 Juta Baris Data...")

# Draw a reproducible random sample of at most 1,000,000 rows. When the
# dataset has fewer rows than that, every row is used.
df_eda = df.sample(n=min(1_000_000, len(df)), random_state=42)

# Maps each EDA step name to its elapsed time in seconds.
eda_times = {}

# Durations are measured with time.perf_counter() instead of time.time():
# it is monotonic and uses the highest-resolution clock available, so short
# steps are not distorted by wall-clock adjustments or a coarse clock tick.

# 1. DataFrame summary (the measured time includes printing the report).
start = time.perf_counter()
df_eda.info()
eda_times['info'] = time.perf_counter() - start

# 2. Descriptive statistics for every column, numeric and non-numeric.
start = time.perf_counter()
desc = df_eda.describe(include='all')
eda_times['describe'] = time.perf_counter() - start

# 3. Number of missing values per column.
start = time.perf_counter()
missing = df_eda.isnull().sum()
eda_times['missing_values'] = time.perf_counter() - start

# 4. Number of distinct values per column.
start = time.perf_counter()
unique_counts = df_eda.nunique()
eda_times['unique_counts'] = time.perf_counter() - start

# 5. Pairwise correlation between the numeric columns.
start = time.perf_counter()
correlations = df_eda.corr(numeric_only=True)
eda_times['correlation'] = time.perf_counter() - start

print("\n⏱️ Waktu yang dibutuhkan untuk setiap langkah EDA (dalam detik):")
for step, t in eda_times.items():
    print(f"- {step}: {t:.4f} detik")

▶️ Analisis Time Cost EDA pada 1 Juta Baris Data...
<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 1551603 to 1686537
Data columns (total 16 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   index                     1000000 non-null  int64  
 1   Login Timestamp           1000000 non-null  object 
 2   User ID                   1000000 non-null  int64  
 3   Round-Trip Time [ms]      46281 non-null    float64
 4   IP Address                1000000 non-null  object 
 5   Country                   1000000 non-null  object 
 6   Region                    999422 non-null   object 
 7   City                      999574 non-null   object 
 8   ASN                       1000000 non-null  int64  
 9   User Agent String         1000000 non-null  object 
 10  Browser Name and Version  1000000 non-null  object 
 11  OS Name and Version       1000000 non-null  object 
 12  Device Type               99995

In [3]:
print("▶️ Analisis Time Cost EDA pada Seluruh Dataset...")

# Work on a copy of the complete dataset (no sampling is performed here).
df_full = df.copy()

# Maps each EDA step name to its elapsed time in seconds.
eda_times_full = {}

# Durations are measured with time.perf_counter() instead of time.time():
# it is monotonic and uses the highest-resolution clock available, so short
# steps are not distorted by wall-clock adjustments or a coarse clock tick.

# 1. DataFrame summary (the measured time includes printing the report).
start = time.perf_counter()
df_full.info()
eda_times_full['info'] = time.perf_counter() - start

# 2. Descriptive statistics for every column, numeric and non-numeric.
start = time.perf_counter()
desc_full = df_full.describe(include='all')
eda_times_full['describe'] = time.perf_counter() - start

# 3. Number of missing values per column.
start = time.perf_counter()
missing_full = df_full.isnull().sum()
eda_times_full['missing_values'] = time.perf_counter() - start

# 4. Number of distinct values per column.
start = time.perf_counter()
unique_counts_full = df_full.nunique()
eda_times_full['unique_counts'] = time.perf_counter() - start

# 5. Pairwise correlation between the numeric columns.
start = time.perf_counter()
correlations_full = df_full.corr(numeric_only=True)
eda_times_full['correlation'] = time.perf_counter() - start

print("\n⏱️ Waktu yang dibutuhkan untuk setiap langkah EDA pada seluruh data (dalam detik):")
for step, t in eda_times_full.items():
    print(f"- {step}: {t:.4f} detik")

▶️ Analisis Time Cost EDA pada Seluruh Dataset...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500000 entries, 0 to 2499999
Data columns (total 16 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   index                     int64  
 1   Login Timestamp           object 
 2   User ID                   int64  
 3   Round-Trip Time [ms]      float64
 4   IP Address                object 
 5   Country                   object 
 6   Region                    object 
 7   City                      object 
 8   ASN                       int64  
 9   User Agent String         object 
 10  Browser Name and Version  object 
 11  OS Name and Version       object 
 12  Device Type               object 
 13  Login Successful          bool   
 14  Is Attack IP              bool   
 15  Is Account Takeover       bool   
dtypes: bool(3), float64(1), int64(3), object(9)
memory usage: 255.1+ MB

⏱️ Waktu yang dibutuhkan untuk setiap langkah EDA pada seluru

In [8]:
print("▶️ Memulai Preprocessing Data...")

# Per-stage elapsed times in seconds, one entry per repetition.
cleaning_times = []
encoding_times = []
normalization_times = []
splitting_times = []

# The whole pipeline is repeated 5 times so that an average time per stage can
# be reported. Every repetition starts again from the untouched `df`, so the
# resulting X_train / X_test are the same after each pass.
# Durations use time.perf_counter() (monotonic, high resolution) rather than
# the adjustable wall clock returned by time.time().
for _ in range(5):
    # Cleaning: drop every row that has at least one missing value.
    # NOTE(review): the df.info() output of the 1M-row sample in this notebook
    # reports only 46,281 non-null "Round-Trip Time [ms]" values, so this step
    # discards the large majority of rows -- confirm that is intended.
    start_cleaning = time.perf_counter()
    df_cleaned = df.dropna()
    end_cleaning = time.perf_counter()
    cleaning_times.append(end_cleaning - start_cleaning)

    # Encoding: replace each object-dtype column with integer label codes.
    # The codes are collected first and swapped in with assign(), which
    # returns a new frame whose encoded columns have an integer dtype.
    # Writing them through `df_cleaned.loc[:, col] = ...` sets values in place
    # on recent pandas, leaving the columns as object dtype and forcing the
    # scaler below to coerce an object array.
    start_encoding = time.perf_counter()
    label_encoders = {}
    encoded_columns = {}
    for col in df_cleaned.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        encoded_columns[col] = le.fit_transform(df_cleaned[col])
        label_encoders[col] = le  # kept so the codes can be mapped back
    df_cleaned = df_cleaned.assign(**encoded_columns)
    end_encoding = time.perf_counter()
    encoding_times.append(end_encoding - start_encoding)

    # Normalization: standardize every column of the encoded frame.
    # NOTE(review): the scaler is fitted on all rows before the train/test
    # split, so test rows contribute to the scaling statistics; all columns
    # (including the boolean flag columns) are treated as features -- confirm
    # both are acceptable for this time-cost study.
    start_normalization = time.perf_counter()
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df_cleaned)
    end_normalization = time.perf_counter()
    normalization_times.append(end_normalization - start_normalization)

    # Splitting: 80% train / 20% test with a fixed seed.
    start_splitting = time.perf_counter()
    X_train, X_test = train_test_split(df_scaled, test_size=0.2, random_state=42)
    end_splitting = time.perf_counter()
    splitting_times.append(end_splitting - start_splitting)

# Average time per preprocessing stage over the repetitions.
avg_cleaning_time = np.mean(cleaning_times)
avg_encoding_time = np.mean(encoding_times)
avg_normalization_time = np.mean(normalization_times)
avg_splitting_time = np.mean(splitting_times)

print(f"✅ Rata-rata waktu cleaning: {avg_cleaning_time:.2f} detik")
print(f"✅ Rata-rata waktu encoding: {avg_encoding_time:.2f} detik")
print(f"✅ Rata-rata waktu normalisasi: {avg_normalization_time:.2f} detik")
print(f"✅ Rata-rata waktu splitting: {avg_splitting_time:.2f} detik")

▶️ Memulai Preprocessing Data...
✅ Rata-rata waktu cleaning: 0.87 detik
✅ Rata-rata waktu encoding: 0.83 detik
✅ Rata-rata waktu normalisasi: 0.18 detik
✅ Rata-rata waktu splitting: 0.03 detik


In [9]:
print("\n▶️ Memulai Pelatihan Model...")

# Average fit / predict time in seconds, keyed by model name.
model_training_times = {}
model_testing_times = {}

# Local Outlier Factor is created with novelty=True so that it can be fitted
# on the training data and then asked to predict on separate data; all three
# estimators are therefore driven through the same fit()/predict() calls.
models = {
    "Isolation Forest": IsolationForest(contamination=0.1, random_state=42),
    "One-Class SVM": OneClassSVM(nu=0.1, kernel="rbf"),
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=20, novelty=True)
}

for model_name, model in models.items():
    training_times = []
    testing_times = []

    # Repeat 5 times to average out run-to-run noise. The same estimator
    # object is refitted on every repetition.
    # Durations use time.perf_counter() (monotonic, high resolution) rather
    # than the adjustable wall clock returned by time.time().
    for _ in range(5):
        # Training
        start_train = time.perf_counter()
        model.fit(X_train)
        end_train = time.perf_counter()
        training_times.append(end_train - start_train)

        # Testing (inference on the held-out split)
        start_test = time.perf_counter()
        predictions = model.predict(X_test)
        end_test = time.perf_counter()
        testing_times.append(end_test - start_test)

    # Average training and testing time for this model.
    model_training_times[model_name] = np.mean(training_times)
    model_testing_times[model_name] = np.mean(testing_times)

# Report the averaged timings for every model.
for model_name in models:
    print(f"✅ {model_name} - Rata-rata waktu training: {model_training_times[model_name]:.2f} detik")
    print(f"✅ {model_name} - Rata-rata waktu testing: {model_testing_times[model_name]:.2f} detik")



▶️ Memulai Pelatihan Model...
✅ Isolation Forest - Rata-rata waktu training: 0.61 detik
✅ Isolation Forest - Rata-rata waktu testing: 0.09 detik
✅ One-Class SVM - Rata-rata waktu training: 159.34 detik
✅ One-Class SVM - Rata-rata waktu testing: 13.77 detik
✅ Local Outlier Factor - Rata-rata waktu training: 10.52 detik
✅ Local Outlier Factor - Rata-rata waktu testing: 2.67 detik


In [8]:
print("\n▶️ Memulai Simulasi Deployment...")

# Elapsed time in seconds of each simulated request.
inference_times = []
# Model used for the deployment simulation (hard-coded to Isolation Forest).
best_model = models["Isolation Forest"]

# A single row stands in for one prediction request. It is prepared once,
# outside the timed region, so only the predict() call is measured.
sample_input = X_test[:1]

# A single prediction takes only a few milliseconds, so the timer must have
# fine resolution: time.perf_counter() is monotonic and uses the most precise
# clock available, whereas time.time() may tick too coarsely for durations
# this short.
for _ in range(5):  # repeated to obtain an average inference time
    start_time = time.perf_counter()
    prediction = best_model.predict(sample_input)
    end_time = time.perf_counter()
    inference_times.append(end_time - start_time)

# Average inference time per request.
avg_inference_time = np.mean(inference_times)
print(f"✅ Rata-rata waktu inferensi per request: {avg_inference_time:.4f} detik")


▶️ Memulai Simulasi Deployment...
✅ Rata-rata waktu inferensi per request: 0.0034 detik
