# In [None]:
# Step 1: Import necessary libraries
import numpy as np
import pandas as pd
from scipy.stats import chi2
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def mahalanobis_distances(X, mean, inv_cov):
    """Return the Mahalanobis distance of every row of ``X`` from ``mean``.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        mean: 1-D array of length n_features; the point distances are
            measured from.
        inv_cov: (n_features, n_features) inverse (or pseudo-inverse) of the
            covariance matrix.

    Returns:
        1-D NumPy array with one non-negative distance per row. These are
        distances, not squared distances.
    """
    centered = np.asarray(X, dtype=float) - mean
    # Row-wise quadratic form d_i^2 = c_i @ inv_cov @ c_i, computed without a
    # Python-level loop.
    squared = ((centered @ inv_cov) * centered).sum(axis=1)
    # Floating-point round-off can push a near-zero value slightly negative,
    # which would turn into NaN under the square root.
    return np.sqrt(np.clip(squared, 0.0, None))


def chi2_distance_threshold(n_features, alpha=0.05):
    """Return the Mahalanobis *distance* cut-off at significance ``alpha``.

    For multivariate-normal data the *squared* Mahalanobis distance follows a
    chi-square distribution with ``n_features`` degrees of freedom, so the
    chi-square quantile is square-rooted to put the cut-off on the same scale
    as the (unsquared) distances.

    Args:
        n_features: Number of variables (chi-square degrees of freedom).
        alpha: Significance level; 0.05 corresponds to 95% confidence.

    Returns:
        The distance above which a point is considered an outlier.
    """
    return np.sqrt(chi2.ppf(1 - alpha, df=n_features))


if __name__ == "__main__":
    # Step 2: Load your dataset
    # Example: Replace with your actual dataset path or source
    df = pd.read_csv('your_dataset.csv')

    # Step 3: Standardize the dataset
    # Only numeric columns can be scaled; rows are left untouched so the
    # outlier mask computed below still lines up with ``df``.
    numeric_df = df.select_dtypes(include="number")
    if numeric_df.shape[1] == 0:
        raise ValueError("Dataset has no numeric columns to analyse.")
    if numeric_df.isna().to_numpy().any():
        # NaNs would propagate into the covariance matrix and make every
        # distance NaN, silently hiding all outliers.
        raise ValueError(
            "Dataset contains missing values; impute or drop them first."
        )
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(numeric_df)

    # Step 4: Calculate the covariance matrix and its inverse
    # atleast_2d: np.cov returns a 0-d array for a single feature.
    cov_matrix = np.atleast_2d(np.cov(X_scaled, rowvar=False))
    # pinv instead of inv: constant or collinear columns make the covariance
    # matrix singular, where a plain inverse raises or returns garbage.
    inv_cov_matrix = np.linalg.pinv(cov_matrix)

    # Step 5: Compute Mahalanobis Distance for each data point
    mean = np.mean(X_scaled, axis=0)
    mahal_dist = mahalanobis_distances(X_scaled, mean, inv_cov_matrix)

    # Step 6: Calculate the threshold for outliers using Chi-Square distribution
    alpha = 0.05  # Significance level (5% for 95% confidence)
    # The threshold is expressed as a distance (square root of the chi-square
    # quantile) so it is directly comparable with ``mahal_dist``.
    # NOTE: if the covariance matrix is rank-deficient the true degrees of
    # freedom are its rank, so using the column count is slightly conservative.
    threshold = chi2_distance_threshold(X_scaled.shape[1], alpha)

    # Step 7: Identify outliers (where Mahalanobis distance is greater than threshold)
    outliers = mahal_dist > threshold

    # Step 8: Visualize the result (optional)
    plt.figure(figsize=(10, 6))
    plt.hist(mahal_dist, bins=30, edgecolor='black')
    plt.axvline(x=threshold, color='red', linestyle='--')
    plt.title('Mahalanobis Distance Distribution')
    plt.xlabel('Mahalanobis Distance')
    plt.ylabel('Frequency')
    plt.show()

    # Step 9: Display the outlier data points
    outlier_data = df[outliers]
    print("Outlier Data:\n", outlier_data)