# Outlier Detection
## Koneksi ke Database

In [50]:
import pandas as pd
import pymysql 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.anomaly import *

# Connect to the MySQL server. User, password, host and port can each be
# overridden through an environment variable so that real credentials never
# have to be written into the notebook; the fallbacks are the values this
# notebook originally hard-coded, so behaviour is unchanged when none are set.
import os

conn = pymysql.connect(
    user=os.environ.get("PSD_DB_USER", "root"),
    password=os.environ.get("PSD_DB_PASSWORD", ""),
    host=os.environ.get("PSD_DB_HOST", "127.0.0.1"),
    port=int(os.environ.get("PSD_DB_PORT", "3306")),  # env values are strings
    db="psd",  # kept fixed: the query below is qualified with this schema name
)
# NOTE(review): conn is never closed in the visible cells - call conn.close()
# once no later cell needs the connection.

# Load the whole Iris table into a DataFrame and show its columns and first rows.
df = pd.read_sql("SELECT * FROM psd.Iris", conn)
print(f"Columns: {list(df.columns)}")
print(df.head())

Columns: ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


## Persiapan data untuk outlier detection
Hapus kolom `Id` dan `Species` agar analisis berfokus pada fitur numerik.

In [51]:
# Build the frame used for outlier detection by dropping the Id and Species
# columns from the loaded table; `df` itself is left untouched.
df_numeric = df.drop(columns=['Id', 'Species'])
print("Dataset untuk outlier detection (hanya fitur numerik):")
print(f"Shape: {df_numeric.shape}")
print(f"Columns: {list(df_numeric.columns)}")
print("\nInfo dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the report.
df_numeric.info()
print("\nStatistik deskriptif:")
print(df_numeric.describe())

Dataset untuk outlier detection (hanya fitur numerik):
Shape: (150, 4)
Columns: ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

Info dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB
None

Statistik deskriptif:
       SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count     150.000000    150.000000     150.000000    150.000000
mean        5.843333      3.054000       3.758667      1.198667
std         0.828066      0.433594       1.764420      0.763161
min         4.300000      2.000000       1.000000      0.100000
25%         5.100000      2.800000       1.600000      0.300000
50%         5.8000

## Setup PyCaret untuk Anomaly Detection

In [52]:
print("Setting up PyCaret for Anomaly Detection...")
# Initialise the PyCaret anomaly experiment on the numeric-only frame.
# session_id is fixed and verbose output is switched off.
anomaly_setup = setup(
    df_numeric,
    session_id=123,
    verbose=False,
)
print("Setup berhasil!")

Setting up PyCaret for Anomaly Detection...
Setup berhasil!


## 1. ABOD (Angle-based Outlier Detection)

In [53]:
print("IMPLEMENTASI ABOD (Angle-based Outlier Detection)")

# Build the ABOD detector; fraction=0.1 means 10% of the rows are treated as outliers.
abod_model = create_model('abod', fraction=0.1)
print("ABOD model berhasil dibuat!")

# Attach the anomaly label and anomaly score of every row to the data.
abod_results = assign_model(abod_model)
print("\nHasil ABOD:")
print(f"Shape: {abod_results.shape}")
print(f"Columns: {list(abod_results.columns)}")

# Distribution of the anomaly label, as counts and as a share of all rows.
abod_flagged = abod_results['Anomaly'] == 1
abod_outlier_pct = abod_flagged.sum() / len(abod_results) * 100
print("\nDistribusi Anomaly ABOD:")
print(abod_results['Anomaly'].value_counts())
print(f"Percentage of outliers: {abod_outlier_pct:.2f}%")

# Flagged rows only, ordered from the highest anomaly score downwards.
abod_outliers = abod_results.loc[abod_flagged].sort_values(by='Anomaly_Score', ascending=False)
abod_preview_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Anomaly_Score']
print("\nTop 5 Outliers (ABOD):")
print(abod_outliers[abod_preview_cols].head())

IMPLEMENTASI ABOD (Angle-based Outlier Detection)


ABOD model berhasil dibuat!

Hasil ABOD:
Shape: (150, 6)
Columns: ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Anomaly', 'Anomaly_Score']

Distribusi Anomaly ABOD:
Anomaly
0    135
1     15
Name: count, dtype: int64
Percentage of outliers: 10.00%

Top 5 Outliers (ABOD):
     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Anomaly_Score
106            4.9           2.5            4.5           1.7      -0.050388
108            6.7           2.5            5.8           1.8      -0.084224
41             4.5           2.3            1.3           0.3      -0.088998
117            7.7           3.8            6.7           2.2      -0.129286
131            7.9           3.8            6.4           2.0      -0.137417


## 2. KNN (K-Nearest Neighbors) untuk Outlier Detection

In [54]:
print("IMPLEMENTASI KNN untuk Outlier Detection")

# Build the KNN detector; fraction=0.1 means 10% of the rows are treated as outliers.
knn_model = create_model('knn', fraction=0.1)
print("KNN model berhasil dibuat!")

# Attach the anomaly label and anomaly score of every row to the data.
knn_results = assign_model(knn_model)
print("\nHasil KNN:")
print(f"Shape: {knn_results.shape}")

# Distribution of the anomaly label, as counts and as a share of all rows.
knn_flagged = knn_results['Anomaly'] == 1
knn_outlier_pct = knn_flagged.sum() / len(knn_results) * 100
print("\nDistribusi Anomaly KNN:")
print(knn_results['Anomaly'].value_counts())
print(f"Percentage of outliers: {knn_outlier_pct:.2f}%")

# Flagged rows only, ordered from the highest anomaly score downwards.
knn_outliers = knn_results.loc[knn_flagged].sort_values(by='Anomaly_Score', ascending=False)
knn_preview_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Anomaly_Score']
print("\nTop 5 Outliers (KNN):")
print(knn_outliers[knn_preview_cols].head())

IMPLEMENTASI KNN untuk Outlier Detection


KNN model berhasil dibuat!

Hasil KNN:
Shape: (150, 6)

Distribusi Anomaly KNN:
Anomaly
0    135
1     15
Name: count, dtype: int64
Percentage of outliers: 10.00%

Top 5 Outliers (KNN):
     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Anomaly_Score
131            7.9           3.8            6.4           2.0       1.024695
117            7.7           3.8            6.7           2.2       1.019804
118            7.7           2.6            6.9           2.3       0.964365
106            4.9           2.5            4.5           1.7       0.883176
98             5.1           2.5            3.0           1.1       0.818535


## 3. LOF (Local Outlier Factor) untuk Outlier Detection

In [55]:
print("IMPLEMENTASI LOF (Local Outlier Factor)")

# Build the LOF detector; fraction=0.1 means 10% of the rows are treated as outliers.
lof_model = create_model('lof', fraction=0.1)
print("LOF model berhasil dibuat!")

# Attach the anomaly label and anomaly score of every row to the data.
lof_results = assign_model(lof_model)
print("\nHasil LOF:")
print(f"Shape: {lof_results.shape}")

# Distribution of the anomaly label, as counts and as a share of all rows.
lof_flagged = lof_results['Anomaly'] == 1
lof_outlier_pct = lof_flagged.sum() / len(lof_results) * 100
print("\nDistribusi Anomaly LOF:")
print(lof_results['Anomaly'].value_counts())
print(f"Percentage of outliers: {lof_outlier_pct:.2f}%")

# Flagged rows only, ordered from the highest anomaly score downwards.
lof_outliers = lof_results.loc[lof_flagged].sort_values(by='Anomaly_Score', ascending=False)
lof_preview_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Anomaly_Score']
print("\nTop 5 Outliers (LOF):")
print(lof_outliers[lof_preview_cols].head())

IMPLEMENTASI LOF (Local Outlier Factor)


LOF model berhasil dibuat!

Hasil LOF:
Shape: (150, 6)

Distribusi Anomaly LOF:
Anomaly
0    135
1     15
Name: count, dtype: int64
Percentage of outliers: 10.00%

Top 5 Outliers (LOF):
     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Anomaly_Score
41             4.5           2.3            1.3           0.3       1.777233
15             5.7           4.4            1.5           0.4       1.653626
118            7.7           2.6            6.9           2.3       1.624653
117            7.7           3.8            6.7           2.2       1.572990
131            7.9           3.8            6.4           2.0       1.547288


## Identifikasi dan Hapus 2 Outliers Teratas dari Setiap Algoritma

In [56]:
print("IDENTIFIKASI DAN PENGHAPUSAN 2 OUTLIERS TERATAS")


def _relabelled_like_df(results):
    """Return a copy of ``results`` whose index is replaced by ``df.index``."""
    relabelled = results.copy()
    relabelled.index = df.index
    return relabelled


def _top2_flagged(indexed_results):
    """Return the two rows with the largest Anomaly_Score among rows where Anomaly == 1."""
    flagged = indexed_results.loc[indexed_results['Anomaly'] == 1]
    return flagged.nlargest(2, 'Anomaly_Score')


# Columns shown when reporting each algorithm's top outliers.
_top2_report_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Anomaly_Score']

# Keep a copy of every algorithm's result, labelled with the index of `df`.
abod_results_indexed = _relabelled_like_df(abod_results)
knn_results_indexed = _relabelled_like_df(knn_results)
lof_results_indexed = _relabelled_like_df(lof_results)

# Pick and report the two highest-scoring flagged rows of each algorithm.
print("\n1. Top 2 Outliers dari ABOD:")
top2_abod = _top2_flagged(abod_results_indexed)
top2_abod_indices = top2_abod.index.tolist()
print(f"Indices: {top2_abod_indices}")
print(top2_abod[_top2_report_cols])

print("\n2. Top 2 Outliers dari KNN:")
top2_knn = _top2_flagged(knn_results_indexed)
top2_knn_indices = top2_knn.index.tolist()
print(f"Indices: {top2_knn_indices}")
print(top2_knn[_top2_report_cols])

print("\n3. Top 2 Outliers dari LOF:")
top2_lof = _top2_flagged(lof_results_indexed)
top2_lof_indices = top2_lof.index.tolist()
print(f"Indices: {top2_lof_indices}")
print(top2_lof[_top2_report_cols])

IDENTIFIKASI DAN PENGHAPUSAN 2 OUTLIERS TERATAS

1. Top 2 Outliers dari ABOD:
Indices: [106, 108]
     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Anomaly_Score
106            4.9           2.5            4.5           1.7      -0.050388
108            6.7           2.5            5.8           1.8      -0.084224

2. Top 2 Outliers dari KNN:
Indices: [131, 117]
     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Anomaly_Score
131            7.9           3.8            6.4           2.0       1.024695
117            7.7           3.8            6.7           2.2       1.019804

3. Top 2 Outliers dari LOF:
Indices: [41, 15]
    SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Anomaly_Score
41            4.5           2.3            1.3           0.3       1.777233
15            5.7           4.4            1.5           0.4       1.653626


In [57]:
# Merge the index labels selected by the three algorithms; going through a set
# collapses any label that was picked by more than one of them.
selected_indices = top2_abod_indices + top2_knn_indices + top2_lof_indices
all_outlier_indices = list(set(selected_indices))
n_outliers_removed = len(all_outlier_indices)
print(f"\n4. Semua indeks outliers yang akan dihapus: {sorted(all_outlier_indices)}")
print(f"Total outliers unik yang akan dihapus: {n_outliers_removed}")

# Drop those rows from the original dataset and renumber the remaining rows.
rows_kept = df.drop(index=all_outlier_indices)
df_cleaned = rows_kept.reset_index(drop=True)
print(f"\nDataset sebelum pembersihan: {df.shape}")
print(f"Dataset setelah pembersihan: {df_cleaned.shape}")
print(f"Jumlah data yang dihapus: {n_outliers_removed}")

# Show the removed rows, ordered by their original index label.
print("\n5. Detail Outliers yang Dihapus:")
outliers_removed = df.loc[all_outlier_indices].copy().sort_index()
print(outliers_removed)

# Descriptive statistics of the remaining rows, without the Id and Species columns.
df_cleaned_numeric = df_cleaned.drop(columns=['Id', 'Species'])
print("\n6. Statistik Dataset Setelah Pembersihan:")
print(df_cleaned_numeric.describe())


4. Semua indeks outliers yang akan dihapus: [15, 41, 106, 108, 117, 131]
Total outliers unik yang akan dihapus: 6

Dataset sebelum pembersihan: (150, 6)
Dataset setelah pembersihan: (144, 6)
Jumlah data yang dihapus: 6

5. Detail Outliers yang Dihapus:
      Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
15    16            5.7           4.4            1.5           0.4   
41    42            4.5           2.3            1.3           0.3   
106  107            4.9           2.5            4.5           1.7   
108  109            6.7           2.5            5.8           1.8   
117  118            7.7           3.8            6.7           2.2   
131  132            7.9           3.8            6.4           2.0   

            Species  
15      Iris-setosa  
41      Iris-setosa  
106  Iris-virginica  
108  Iris-virginica  
117  Iris-virginica  
131  Iris-virginica  

6. Statistik Dataset Setelah Pembersihan:
       SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWi