In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2
from scipy.spatial.distance import mahalanobis

# Step 1: Build a reproducible synthetic two-feature sample
np.random.seed(42)  # seed the global RNG so every run draws the same points
mean = [0, 0]
cov = [[1, 0.5], [0.5, 1]]  # unit variances, 0.5 covariance between features
data = np.random.multivariate_normal(mean, cov, 500)

# Append 10 points drawn uniformly from [-6, 6) per feature to act as outliers
outliers = np.random.uniform(-6, 6, (10, 2))
data_with_outliers = np.concatenate((data, outliers), axis=0)

# Wrap the combined sample in a DataFrame with named feature columns
df = pd.DataFrame(data_with_outliers, columns=['Feature1', 'Feature2'])

# Step 2: Estimate the centre and scatter from the combined sample
# (rows are observations, hence rowvar=False)
mean_vector = df.mean().to_numpy()
cov_matrix = np.cov(df.to_numpy(), rowvar=False)
inv_cov_matrix = np.linalg.inv(cov_matrix)

# Step 3: Calculate Mahalanobis distance for each observation
def compute_mahalanobis(row, mean_vector, inv_cov_matrix):
    """Return the Mahalanobis distance of one or many observations.

    Parameters
    ----------
    row : array-like, shape (n_features,) or (n_samples, n_features)
        A single observation, or a batch with one observation per row.
    mean_vector : array-like, shape (n_features,)
        Centre that the observations are measured from.
    inv_cov_matrix : array-like, shape (n_features, n_features)
        Inverse of the covariance matrix defining the metric.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for a 1-D ``row``; an array of shape ``(n_samples,)``
        for a 2-D batch.
    """
    diff = np.asarray(row, dtype=float) - np.asarray(mean_vector, dtype=float)
    inv_cov = np.asarray(inv_cov_matrix, dtype=float)

    # Quadratic form diff^T * inv_cov * diff, reduced over the feature axis so
    # the same expression serves a single vector and a batch of row vectors.
    squared = np.sum((diff @ inv_cov) * diff, axis=-1)

    # Floating-point round-off can push the quadratic form marginally below
    # zero; clamp so the square root yields 0.0 instead of NaN.
    return np.sqrt(np.maximum(squared, 0.0))

# Evaluate the distance one observation at a time over the two feature columns
feature_values = df[['Feature1', 'Feature2']].to_numpy()
df['Mahalanobis_Distance'] = [
    compute_mahalanobis(observation, mean_vector, inv_cov_matrix)
    for observation in feature_values
]

# Step 4: Flag observations whose squared distance exceeds the cut-off
# Cut-off is the 97.5% chi-square quantile with 2 degrees of freedom
# (one per feature)
threshold = chi2.ppf(0.975, df=2)
squared_distance = df['Mahalanobis_Distance'].pow(2)
df['Outlier'] = squared_distance.gt(threshold)

# Step 5: Scatter plot of the two features, coloured by the outlier flag
outlier_colours = {True: 'red', False: 'blue'}
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x='Feature1',
    y='Feature2',
    hue='Outlier',
    data=df,
    palette=outlier_colours,
    alpha=0.6,
)
plt.title('Mahalanobis Distance-Based Outlier Detection')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend(title='Outlier')
plt.show()

# Step 6: Report how many observations were flagged
num_outliers = df['Outlier'].sum()
print(f"Number of outliers detected: {num_outliers}")