# In [1]:
# Question: Multivariate Outlier Detection Using Mahalanobis Distance
# Description: Implement Mahalanobis distance to detect multivariate outliers in a dataset.
import numpy as np
import pandas as pd
from scipy.stats import chi2

def mahalanobis_outliers(data, threshold=0.99):
    """Flag multivariate outliers using the Mahalanobis distance.

    Each row's distance from the sample mean is measured in units of the
    sample covariance, and a row is reported as an outlier when its distance
    exceeds the square root of the chi-square quantile at ``threshold`` with
    one degree of freedom per feature.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Observations in rows and features in columns. Nested lists, NumPy
        arrays and DataFrames are accepted. A 1-D input is treated as
        samples of a single feature.
    threshold : float, default 0.99
        Chi-square cumulative probability used as the cut-off; must lie in
        the closed interval [0, 1].

    Returns
    -------
    outlier_indices : list of int
        Row positions whose distance is strictly greater than the cut-off.
    distances : list of float
        Mahalanobis distance of every row, in input order.

    Raises
    ------
    ValueError
        If ``data`` is not 1-D/2-D, has fewer than two rows or no columns,
        or if ``threshold`` is outside [0, 1].

    Notes
    -----
    The mean and covariance are estimated from ``data`` itself, so for ``n``
    rows no distance can exceed ``(n - 1) / sqrt(n)``. With very small
    samples an extreme point may therefore never cross the cut-off.
    """
    samples = np.asarray(data, dtype=float)
    if samples.ndim == 1:
        samples = samples.reshape(-1, 1)
    if samples.ndim != 2:
        raise ValueError("data must be a 1-D or 2-D array of observations")

    n_samples, n_features = samples.shape
    if n_samples < 2 or n_features < 1:
        raise ValueError(
            "data must contain at least two observations and one feature"
        )
    # Written so that NaN also fails the check.
    if not 0.0 <= threshold <= 1.0:
        raise ValueError("threshold must be a probability in [0, 1]")

    centered = samples - samples.mean(axis=0)

    # np.cov collapses to a 0-d array for a single feature; keep it 2-D.
    cov_matrix = np.atleast_2d(np.cov(samples, rowvar=False))
    # The pseudo-inverse matches the inverse for a full-rank covariance and
    # still works when features are collinear (singular covariance).
    inv_cov_matrix = np.linalg.pinv(cov_matrix)

    # Row-wise quadratic form diff^T * S^-1 * diff, without a Python loop.
    squared = np.sum((centered @ inv_cov_matrix) * centered, axis=1)
    # Round-off can push a zero distance slightly negative; clamp before sqrt.
    distance_arr = np.sqrt(np.maximum(squared, 0.0))

    # Chi-square threshold
    chi2_thresh = np.sqrt(chi2.ppf(threshold, df=n_features))

    outlier_indices = np.flatnonzero(distance_arr > chi2_thresh).tolist()
    return outlier_indices, list(distance_arr)

# Example usage: four points on the line y = x + 1 plus one far-away point.
# NOTE: the far-away row is NOT reported as an outlier here. The mean and
# covariance are estimated from these same five rows, and for n samples the
# resulting distance cannot exceed (n - 1) / sqrt(n) (about 1.79 for n = 5),
# which is below the roughly 3.03 cut-off for two features at threshold=0.99.
data = np.array([[k, k + 1] for k in range(1, 5)] + [[100, 200]])

outliers, distances = mahalanobis_outliers(data)

print("Outlier indices:", outliers)
print("Mahalanobis distances:", distances)



# Output:
# Outlier indices: []
# Mahalanobis distances: [np.float64(1.414213562373517), np.float64(0.6324555320337604), np.float64(0.6324555320337688), np.float64(1.4142135623736352), np.float64(1.7888543819998666)]
