# Utils

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import f
import plotly.express as px

In [2]:
def calculate_mean(df: pd.DataFrame) -> np.ndarray:
    """Return the per-column mean of ``df`` as a 1-D NumPy array.

    Each column total is divided by the total number of rows in ``df``.
    Note that ``DataFrame.sum`` skips missing values by default while the
    divisor still counts every row, so callers should impute NaNs first.
    """
    row_count = len(df)
    column_totals = df.sum().to_numpy()
    return column_totals / row_count

In [3]:
def calculate_cov_matrix(df: pd.DataFrame) -> np.ndarray:
    """Return the sample covariance matrix of the columns of ``df``.

    The rows are centered with ``calculate_mean`` and the sum of their outer
    products is divided by ``len(df) - 1``.

    Args:
        df: Numeric observations, one row per observation.

    Returns:
        A ``(p, p)`` array, where ``p`` is the number of columns of ``df``.
    """
    centered = (df - calculate_mean(df)).to_numpy()
    # The sum over rows of outer(row, row) equals centered.T @ centered; one
    # matrix product replaces a Python-level np.outer call per row and the
    # object-dtype Series those calls were collected into.
    scatter = centered.T @ centered
    return scatter / (len(df) - 1)

In [4]:
def calculate_mahalanobis_square_distance(x: np.ndarray, df: pd.DataFrame) -> np.float64:
    """Return the squared Mahalanobis distance from ``x`` to the mean of ``df``.

    The distance is ``d @ S^-1 @ d`` where ``d = x - calculate_mean(df)`` and
    ``S = calculate_cov_matrix(df)``.

    Args:
        x: A single observation with one value per column of ``df``.
        df: Reference sample used to estimate the mean and covariance.

    Returns:
        The squared distance as a scalar.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix is singular.
    """
    # Coerce to float so an object-dtype row (e.g. taken from a mixed-dtype
    # frame) is still accepted by the linear solver.
    deviation = np.asarray(x - calculate_mean(df), dtype=float)
    cov_matrix = calculate_cov_matrix(df)
    # Solve S @ y = d instead of forming S^-1 explicitly: same result, but
    # cheaper and numerically better conditioned.
    return deviation.dot(np.linalg.solve(cov_matrix, deviation))

In [5]:
def calculate_statistics(x: np.ndarray, df: pd.DataFrame) -> np.float64:
    """Return the scaled squared Mahalanobis distance of ``x`` from ``df``.

    With ``n`` rows and ``p`` columns in ``df`` the distance is multiplied by
    ``(n - p) * n / ((n ** 2 - 1) * p)``. ``calculate_p_value`` evaluates the
    result against an F distribution with ``(p, n - p)`` degrees of freedom.
    """
    sample_size, dimension = df.shape
    numerator = (sample_size - dimension) * sample_size
    denominator = (sample_size ** 2 - 1) * dimension
    scale = numerator / denominator
    return scale * calculate_mahalanobis_square_distance(x, df)

In [6]:
def calculate_p_value(index: int, df: pd.DataFrame) -> np.float64:
    """Return the leave-one-out p-value of the row labelled ``index``.

    The row is removed from ``df``, the statistic from
    ``calculate_statistics`` is computed for it against the remaining rows,
    and the upper-tail probability of that statistic under an F distribution
    with ``(p, n - p)`` degrees of freedom is returned, where ``n`` and ``p``
    are the row and column counts of the reduced frame.

    Args:
        index: Label of the row in ``df`` to test.
        df: Numeric observations, one row per observation.

    Returns:
        The upper-tail probability; small values mean the row is far from
        the rest of the sample.
    """
    x = df.loc[index].to_numpy()
    reduced_df = df.drop([index])

    statistics = calculate_statistics(x, reduced_df)

    n, p = reduced_df.shape

    # Use the survival function rather than ``1 - cdf``: the subtraction
    # cancels catastrophically in the upper tail and yields exactly 0 for
    # extreme rows, which makes their p-values indistinguishable.
    return f.sf(statistics, dfn=p, dfd=n - p)

In [7]:
def mark_outliers(df: pd.DataFrame, alpha: float = 0.01) -> None:
    """Iteratively flag outlying rows of ``df`` in its ``is_outlier`` column.

    ``df`` must already contain a boolean ``is_outlier`` column. On each pass
    a leave-one-out p-value is computed (via ``calculate_p_value``) for every
    row not yet flagged. If the smallest p-value is below ``alpha`` that row
    is printed and flagged in place, and the p-values are recomputed without
    it; otherwise the function returns.

    Args:
        df: Frame to inspect; modified in place.
        alpha: Threshold below which the most extreme row is flagged.
    """
    flag_column = 'is_outlier'
    while True:
        # Only rows that are still considered inliers take part in this pass.
        candidates = df.loc[~df[flag_column]].drop(columns=[flag_column])
        p_values = candidates.apply(
            lambda row: calculate_p_value(row.name, candidates),
            axis=1,
        )

        if p_values.min() >= alpha:
            return

        worst_index = p_values.idxmin()
        worst_values = df.loc[worst_index].drop("is_outlier").to_numpy()
        print(f'{worst_index}: {worst_values}')
        df.loc[worst_index, flag_column] = True

# Outliers analysis

In [8]:
# Read the sheet at position 1 (integer sheet names are zero-based positions).
df = pd.read_excel('addicts0.xls', sheet_name=1)
necessary_columns = ['asi1_med', 'asi2_emp', 'asi4_dr']
# Take an explicit copy of the column subset so that later mutations (adding
# the flag column, filling values) act on an independent frame and do not
# trigger SettingWithCopyWarning.
df = df[necessary_columns].copy()
# Replace missing values with the mean of their column.
df = df.fillna(df.mean())
df

Unnamed: 0,asi1_med,asi2_emp,asi4_dr
0,0.19,0.70,0.30
1,0.44,0.23,0.27
2,0.50,1.00,0.30
3,0.00,0.80,0.26
4,0.00,0.75,0.23
...,...,...,...
275,0.10,0.59,0.25
276,0.00,1.00,0.25
277,0.00,1.00,0.21
278,0.00,1.00,0.29


In [9]:
# Start with every row treated as an inlier; mark_outliers() then flips the
# flag in place and prints each row it flags.
df.loc[:, 'is_outlier'] = False
mark_outliers(df)

46: [0.0 1.0 0.66]
268: [0.0 1.0 0.09]
98: [0.97 0.5 0.19]
219: [0.0 1.0 0.11]
85: [0.0 0.0 0.2]
170: [1.0 1.0 0.38]
145: [0.83 1.0 0.41]
147: [0.0 1.0 0.12]
222: [0.0 0.85 0.12]
6: [0.0 1.0 0.42]


In [10]:
# 3-D scatter of the three selected columns, coloured by the outlier flag.
px.scatter_3d(
    df,
    x='asi1_med',
    y='asi2_emp',
    z='asi4_dr',
    color='is_outlier',
)

Unsupported