In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load and prepare the dataset
# load_iris() returns a container exposing the feature matrix (.data), the
# class labels (.target) and the column names (.feature_names).
data = load_iris()
X, y = data.data, data.target
feature_names = data.feature_names
# Print the column names so later results can be related to the features.
print(feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [None]:
# Display the target vector: one integer class label per sample.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# Sanity check: features and labels must have the same number of samples.
print(X.shape, "   ", y.shape)

(150, 4)     (150,)


In [None]:
# Standardize the dataset / Z score
# The scaler is fit on the full feature matrix X (no train/test split has
# happened yet); X_scaled is the array handed to the outlier-removal functions.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Function to detect and remove outliers using Z-Score
def remove_outliers_zscore(X, threshold=3):
    """Drop rows in which any feature's absolute z-score reaches ``threshold``.

    Z-scores are computed per column from the column mean and the column
    standard deviation as returned by ``np.mean`` / ``np.std``.

    Parameters
    ----------
    X : 2-D NumPy array of shape (n_samples, n_features)
        Data to filter; it is not modified.
    threshold : float, default 3
        A row is kept only when every feature satisfies ``|z| < threshold``.

    Returns
    -------
    X_filtered : the rows of ``X`` that were kept.
    mask : boolean array of length n_samples, True for the kept rows, so the
        same selection can be applied to a label vector.
    """
    center = np.mean(X, axis=0)
    spread = np.std(X, axis=0)
    # A constant column has zero spread. Dividing by it gives 0/0 = NaN, and
    # NaN < threshold is False, which would silently discard every row.
    # Such a column cannot contain outliers, so divide by 1 instead and let
    # its z-scores come out as 0.
    safe_spread = np.where(spread == 0, 1.0, spread)
    z_scores = np.abs((X - center) / safe_spread)
    mask = (z_scores < threshold).all(axis=1)
    return X[mask], mask

In [None]:
# Function to detect and remove outliers using IQR
def remove_outliers_iqr(X, factor=1.5):
    """Drop rows in which any feature lies outside the IQR fences.

    For every column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th percentiles and
    ``IQR = Q3 - Q1``. Values exactly on a fence are kept.

    Parameters
    ----------
    X : 2-D NumPy array of shape (n_samples, n_features)
        Data to filter; it is not modified.
    factor : float, default 1.5
        Fence multiplier. The default reproduces the conventional 1.5 * IQR
        rule; larger values keep more rows.

    Returns
    -------
    X_filtered : the rows of ``X`` that were kept.
    mask : boolean selector over the rows of ``X``, True for the kept rows.
    """
    # One percentile call yields both quartiles, stacked along the first axis.
    q1, q3 = np.percentile(X, [25, 75], axis=0)
    fence = factor * (q3 - q1)
    mask = ((X >= q1 - fence) & (X <= q3 + fence)).all(axis=1)
    return X[mask], mask

In [None]:
# Function to detect and remove outliers using Isolation Forest
def remove_outliers_isolation_forest(X, contamination=0.1, random_state=42):
    """Drop the rows that an Isolation Forest labels as outliers.

    Parameters
    ----------
    X : 2-D NumPy array of shape (n_samples, n_features)
        Data to filter; it is not modified.
    contamination : default 0.1
        Passed to ``IsolationForest``; the expected share of outliers.
    random_state : default 42
        Seed passed to ``IsolationForest``. The forest is built from random
        splits, so without a seed the removed rows (and every metric computed
        afterwards) change from run to run. Pass ``None`` to restore the
        unseeded behaviour.

    Returns
    -------
    X_filtered : the rows of ``X`` that were kept.
    mask : boolean array of length n_samples, True for the kept rows.
    """
    iso = IsolationForest(contamination=contamination, random_state=random_state)
    # fit_predict labels outliers as -1; everything else is kept.
    mask = iso.fit_predict(X) != -1
    return X[mask], mask

In [None]:
# Function to detect and remove outliers using Local Outlier Factor
def remove_outliers_lof(X):
    """Filter ``X`` with a default-configured Local Outlier Factor model.

    Parameters
    ----------
    X : 2-D NumPy array of shape (n_samples, n_features)
        Data to filter; it is not modified.

    Returns
    -------
    X_filtered : the rows of ``X`` that were kept.
    keep : boolean array of length n_samples, True for the kept rows.
    """
    # The model is fit and queried in a single step; a label of -1 marks an outlier.
    labels = LocalOutlierFactor().fit_predict(X)
    keep = labels != -1
    return X[keep], keep

In [None]:
# Function to detect and remove outliers using Elliptic Envelope
def remove_outliers_elliptic_envelope(X, contamination=0.1, random_state=42):
    """Drop the rows that an Elliptic Envelope labels as outliers.

    Parameters
    ----------
    X : 2-D NumPy array of shape (n_samples, n_features)
        Data to filter; it is not modified.
    contamination : default 0.1
        Passed to ``EllipticEnvelope``; the expected share of outliers.
    random_state : default 42
        Seed passed to ``EllipticEnvelope``. Its robust covariance fit draws
        random subsets, so an unseeded run is not guaranteed to flag the same
        rows twice. Pass ``None`` to restore the unseeded behaviour.

    Returns
    -------
    X_filtered : the rows of ``X`` that were kept.
    mask : boolean array of length n_samples, True for the kept rows.
    """
    envelope = EllipticEnvelope(contamination=contamination, random_state=random_state)
    # fit_predict labels outliers as -1; everything else is kept.
    mask = envelope.fit_predict(X) != -1
    return X[mask], mask


In [None]:
# Registry of outlier detection methods
# Maps a display label (used in the printed reports) to the function that
# filters the data. Every function is called with the feature matrix alone
# and returns the pair (filtered rows, boolean keep-mask).
outlier_methods = {
    'Z-Score': remove_outliers_zscore,
    'IQR': remove_outliers_iqr,
    'Isolation Forest': remove_outliers_isolation_forest,
    'LOF': remove_outliers_lof,
    'Elliptic Envelope': remove_outliers_elliptic_envelope
}

In [None]:
# Function to train and evaluate the model
def train_evaluate(X_train, X_test, y_train, y_test, model_name):
    """Fit a random forest on the training split and print its test metrics.

    Parameters
    ----------
    X_train, y_train : features and labels used to fit the classifier.
    X_test, y_test : held-out features and labels used for scoring.
    model_name : str
        Label placed in front of each printed result.

    Returns
    -------
    None. The accuracy and the classification report are printed.
    """
    # Fixed seed so the classifier itself is identical across calls.
    clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Compute both metrics before printing anything.
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, digits=4)

    print(f"\n{model_name} - Accuracy: {accuracy}\n")
    print(f"{model_name} - Classification Report:\n{report}\n")

# Apply outlier detection methods and evaluate the model
# For each registered method: filter the standardized features, keep the
# matching labels, split 70/30 with a fixed seed, then train and report.
for method_name, method_func in outlier_methods.items():
    # Each method returns the kept rows plus a boolean mask over all samples.
    X_filtered, mask = method_func(X_scaled)
    # Reuse the mask so the labels stay aligned with the surviving rows.
    y_filtered = y[mask]
    # NOTE(review): X_scaled was standardized and filtered on the full dataset
    # before this split, so the held-out rows took part in preprocessing —
    # consider splitting first if an unbiased test estimate is needed.
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_filtered, test_size=0.3, random_state=42)
    train_evaluate(X_train, X_test, y_train, y_test, method_name)


Z-Score - Accuracy: 0.9111111111111111

Z-Score - Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        19
           1     0.9091    0.7692    0.8333        13
           2     0.8000    0.9231    0.8571        13

    accuracy                         0.9111        45
   macro avg     0.9030    0.8974    0.8968        45
weighted avg     0.9160    0.9111    0.9106        45



IQR - Accuracy: 0.9545454545454546

IQR - Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        17
           1     0.9091    0.9091    0.9091        11
           2     0.9375    0.9375    0.9375        16

    accuracy                         0.9545        44
   macro avg     0.9489    0.9489    0.9489        44
weighted avg     0.9545    0.9545    0.9545        44



Isolation Forest - Accuracy: 0.926829268292683

Isolation Forest - Classification Report:
     