# Задание 2

## Описание

По данным из файла addicts0.xls проанализировать индексы тяжести на предмет выбросов.

## Подготовка

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import f
import plotly.express as px

In [2]:
# Read the sheet at position 1 of the workbook (integer sheet_name is a
# zero-based position). Reading .xls files needs the optional `xlrd` package.
df = pd.read_excel('addicts0.xls', sheet_name=1)
necessary_columns = ['asi1_med', 'asi2_emp', 'asi4_dr']
# Keep only the columns analysed below.
df = df.loc[:, necessary_columns]
# Replace missing values in each column with that column's mean.
df = df.fillna(df.mean())

ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 2.0.1 for xls Excel support Use pip or conda to install xlrd.

## Решение

In [None]:


def calculate_mean(df: pd.DataFrame) -> np.ndarray:
    """Return the per-column mean of ``df`` as a NumPy array.

    Each column total is divided by the total number of rows in ``df``.
    ``DataFrame.sum`` skips NaN cells by default, while the divisor still
    counts every row, so the caller is expected to pass data without gaps.

    Args:
        df: Numeric observations, one row per observation.

    Returns:
        One value per column, in column order.
    """
    row_count = len(df)
    column_totals = df.sum().to_numpy()
    return column_totals / row_count
     

def calculate_cov_matrix(df: pd.DataFrame) -> np.ndarray:
    """Return the unbiased sample covariance matrix of the columns of ``df``.

    The matrix is the sum of the outer products of the centred rows divided
    by ``n - 1``. That sum equals ``Xc.T @ Xc`` for the centred data matrix
    ``Xc``, so it is computed with a single matrix product instead of
    building one ``p x p`` matrix per row.

    Args:
        df: Numeric observations, one row per observation.

    Returns:
        A ``(p, p)`` array, where ``p`` is the number of columns of ``df``.
    """
    # Subtracting the 1-D mean vector aligns it with the columns of ``df``.
    centered = (df - calculate_mean(df)).to_numpy()
    return centered.T @ centered / (len(df) - 1)
     

def calculate_mahalanobis_square_distance(x: np.ndarray, df: pd.DataFrame) -> np.float64:
    """Return the squared Mahalanobis distance from ``x`` to the sample ``df``.

    The distance is ``d' S^-1 d``, where ``d`` is the deviation of ``x`` from
    the column means of ``df`` and ``S`` is the sample covariance matrix of
    ``df``.

    Args:
        x: Observation with one value per column of ``df``.
        df: Reference sample, one row per observation.

    Returns:
        The squared distance as a scalar.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix is singular.
    """
    deviation = x - calculate_mean(df)
    cov_matrix = calculate_cov_matrix(df)
    # Solve S z = d rather than forming S^-1 explicitly: it is cheaper and
    # numerically better conditioned, and gives the same quadratic form.
    return deviation.dot(np.linalg.solve(cov_matrix, deviation))
     

def calculate_statistics(x: np.ndarray, df: pd.DataFrame) -> np.float64:
    """Return the scaled squared Mahalanobis distance of ``x`` from ``df``.

    With ``n`` rows and ``p`` columns in ``df``, the squared distance is
    multiplied by ``n * (n - p) / (p * (n**2 - 1))``.
    NOTE(review): this looks like the scaling under which the statistic is
    compared against an F distribution with ``(p, n - p)`` degrees of
    freedom; confirm against the course material.

    Args:
        x: Observation with one value per column of ``df``.
        df: Reference sample, one row per observation.

    Returns:
        The scaled statistic as a scalar.
    """
    n_obs, n_vars = df.shape
    scale = (n_obs * (n_obs - n_vars)) / (n_vars * (n_obs ** 2 - 1))
    return scale * calculate_mahalanobis_square_distance(x, df)
     

def calculate_p_value(index: int, df: pd.DataFrame) -> np.float64:
    """Return the p-value for row ``index`` tested against the other rows.

    The row labelled ``index`` is removed from ``df``, the statistic for that
    row is computed against the remaining ``n`` rows, and the upper-tail
    probability of an F distribution with ``(p, n - p)`` degrees of freedom
    is returned, where ``p`` is the number of columns.

    Args:
        index: Label of the row under test.
        df: All observations, including the row under test.

    Returns:
        The upper-tail probability; small values indicate that the row is
        far from the rest of the sample.
    """
    x = df.loc[index].to_numpy()
    reduced_df = df.drop([index])
    n, p = reduced_df.shape

    statistics = calculate_statistics(x, reduced_df)

    # Use the survival function instead of ``1 - cdf``: for extreme
    # statistics the cdf rounds to 1.0 and the difference collapses to 0.0,
    # which makes very small p-values indistinguishable from each other.
    return f.sf(statistics, dfn=p, dfd=n - p)
     

def mark_outliers(df: pd.DataFrame, alpha: float = 0.01) -> None:
    """Flag outlying rows of ``df`` in its boolean ``is_outlier`` column.

    Repeatedly computes a leave-one-out p-value for every row that is not
    yet flagged, and flags the row with the smallest p-value while that
    p-value is below ``alpha``. Each flagged row is printed as
    ``<label>: <values>``. The loop ends when no remaining row has a p-value
    below ``alpha`` or when too few unflagged rows remain to run the test.

    Args:
        df: Observations; modified in place. If the ``is_outlier`` column is
            missing it is created with all values ``False``.
        alpha: Significance level below which a row is flagged.
    """
    if 'is_outlier' not in df.columns:
        df['is_outlier'] = False

    while True:
        filtered_df = df[~df['is_outlier']].drop(columns=['is_outlier'])

        # Each test removes one row and needs more remaining rows than
        # columns; otherwise the covariance matrix is singular and the
        # F distribution has no positive denominator degrees of freedom.
        if len(filtered_df) - 1 <= len(filtered_df.columns):
            break

        p_values = filtered_df.apply(lambda row: calculate_p_value(row.name, filtered_df), axis=1)

        if p_values.min() >= alpha:
            break

        outlier_index = p_values.idxmin()
        print(f'{outlier_index}: {df.loc[outlier_index].drop("is_outlier").to_numpy()}')
        df.loc[outlier_index, 'is_outlier'] = True
     
