In [107]:
import pandas as pd, numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from scipy.stats import skew, kurtosis
import numpy as np
import plotly.express as px


In [145]:
def normalize_numerical_data(series):
    """Standardize a numeric Series with scikit-learn's ``StandardScaler``.

    Parameters
    ----------
    series : pd.Series
        Numeric values to standardize.

    Returns
    -------
    tuple
        ``(standardized, mean, std)``. ``standardized`` is a Series carrying the
        same name and index as ``series``; ``mean`` and ``std`` are the fitted
        scaler parameters (``mean_[0]`` and ``scale_[0]``) that are needed to map
        standardized values back to the original scale.
    """
    scaler = StandardScaler()
    # The scaler expects a 2-D (n_samples, n_features) array, hence the reshape.
    standardized_data = scaler.fit_transform(series.values.reshape(-1, 1)).flatten()
    mean = scaler.mean_[0]
    std = scaler.scale_[0]

    # Re-attach the caller's index so the result stays aligned with `series`
    # (a fresh RangeIndex would silently misalign non-default indexes).
    return pd.Series(standardized_data, index=series.index, name=series.name), mean, std

def inverse_filter_ranges(filter_ranges, mean, std):
    """Map standardized bounds back to the original scale.

    Parameters
    ----------
    filter_ranges : pd.DataFrame
        Frame with ``lower_bound`` and ``upper_bound`` columns expressed in
        standardized units. It is not modified.
    mean, std : float
        Location and scale used for the standardization; each bound is
        converted as ``bound * std + mean``.

    Returns
    -------
    pd.DataFrame
        A new frame where ``lower_bound`` / ``upper_bound`` are replaced by
        ``lower_bound_gamma`` / ``upper_bound_gamma`` (rounded to 2 decimals).
    """
    # `assign` works on a copy, so the caller's frame keeps its original columns.
    return (
        filter_ranges
        .assign(
            lower_bound_gamma=lambda df: (df['lower_bound'] * std + mean).round(2),
            upper_bound_gamma=lambda df: (df['upper_bound'] * std + mean).round(2),
        )
        .drop(['lower_bound', 'upper_bound'], axis=1)
    )

def iqr_approach(series, feature):
    """Compute 1.5 x IQR outlier fences for one feature.

    Parameters
    ----------
    series : pd.Series
        Numeric observations of the feature.
    feature : str
        Feature label stored in the ``feature`` column of the result.

    Returns
    -------
    pd.DataFrame
        Single-row frame with ``feature``, ``lower_bound_iqr`` and
        ``upper_bound_iqr`` (both rounded to 2 decimals).
    """
    stats = series.describe()
    q1, q3 = stats['25%'], stats['75%']
    iqr = q3 - q1

    fences = {
        'feature': [feature],
        'lower_bound_iqr': round(q1 - 1.5 * iqr, 2),
        'upper_bound_iqr': round(q3 + 1.5 * iqr, 2),
    }
    return pd.DataFrame(fences)

def z_score_approach(series, feature, alpha: int = 3):
    """Compute ``mean +/- alpha * std`` bounds for one feature.

    Parameters
    ----------
    series : pd.Series
        Numeric observations of the feature.
    feature : str
        Feature label stored in the ``feature`` column of the result.
    alpha : int, default 3
        Number of standard deviations allowed on either side of the mean.

    Returns
    -------
    pd.DataFrame
        Single-row frame with ``feature``, ``lower_bound_z_score`` and
        ``upper_bound_z_score`` (both rounded to 2 decimals).
    """
    stats = series.describe()
    center = stats['mean']
    half_width = alpha * stats['std']

    bounds = {
        'feature': [feature],
        'lower_bound_z_score': round(center - half_width, 2),
        'upper_bound_z_score': round(center + half_width, 2),
    }
    return pd.DataFrame(bounds)

def modified_z_score_approach(series, feature, alpha: int = 3):
    """Compute ``median +/- alpha * MAD`` bounds for one feature.

    MAD is the median of the absolute deviations from the median.

    Parameters
    ----------
    series : pd.Series
        Numeric observations of the feature.
    feature : str
        Feature label stored in the ``feature`` column of the result.
    alpha : int, default 3
        Number of MADs allowed on either side of the median.

    Returns
    -------
    pd.DataFrame
        Single-row frame with ``feature``, ``lower_bound_mod_z_score`` and
        ``upper_bound_mod_z_score`` (both rounded to 2 decimals). When the MAD
        is 0 both bounds collapse to the (rounded) median.
    """
    median = series.median()
    mad = np.abs(series - median).median()

    # No special case is needed for mad == 0: the formula already yields the
    # median, and going through the same path keeps the rounding consistent.
    half_width = alpha * mad

    return pd.DataFrame({
        'feature': [feature],
        'lower_bound_mod_z_score': round(median - half_width, 2),
        'upper_bound_mod_z_score': round(median + half_width, 2)
    })

def f1(z: pd.Series, beta_1: float, beta_2: float) -> pd.Series:
    """Flag values lying outside the open interval ``(-beta_1, beta_2)``.

    Parameters
    ----------
    z : pd.Series
        Values to test (the notebook passes standardized data).
    beta_1 : float
        Magnitude of the lower limit; the limit itself is ``-beta_1``.
    beta_2 : float
        Upper limit.

    Returns
    -------
    pd.Series
        Boolean mask, ``True`` where the value is an anomaly. Values equal to
        either limit are flagged because both comparisons are strict.
    """
    inside = (-beta_1 < z) & (z < beta_2)
    return ~inside

def f2(z: pd.Series, beta_1: float, beta_2: float, gamma: float = 3, random_state=None) -> pd.Series:
    """Flag values that both ``f1`` (with widened limits) and an Isolation Forest reject.

    Parameters
    ----------
    z : pd.Series
        Values to test.
    beta_1, beta_2 : float
        Base limits; they are multiplied by ``gamma`` before being passed to ``f1``.
    gamma : float, default 3
        Widening factor applied to both limits.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``IsolationForest`` so the forest can be made reproducible
        independently of NumPy's global seed. ``None`` keeps the previous behaviour.

    Returns
    -------
    pd.Series
        Boolean mask, ``True`` only where both detectors mark the value as an anomaly.
    """
    mask_f1 = f1(z, gamma * beta_1, gamma * beta_2)

    # scikit-learn estimators expect a 2-D (n_samples, n_features) array.
    samples = z.values.reshape(-1, 1)
    isolation_forest = IsolationForest(random_state=random_state)
    isolation_forest.fit(samples)
    mask_iforest = isolation_forest.predict(samples) == -1  # -1 indicates outliers

    return mask_f1 & mask_iforest

def gamma_outlier(series, feature, alphas: int = 6, alphak: int = 30):
    """Derive bounds from the data left after masking anomalies via ``f1`` or ``f2``.

    The absolute skewness and absolute kurtosis (``scipy.stats.kurtosis``
    default) of ``series`` are used as the limits ``beta_1`` / ``beta_2``.
    If both statistics are below their thresholds the mask comes from ``f1``;
    otherwise it comes from ``f2`` with ``gamma=3``.

    Parameters
    ----------
    series : pd.Series
        Values to analyse (the notebook passes standardized data).
    feature : str
        Feature label stored in the ``feature`` column of the result.
    alphas : int, default 6
        Threshold on the absolute skewness.
    alphak : int, default 30
        Threshold on the absolute kurtosis.

    Returns
    -------
    pd.DataFrame
        Single-row frame with ``feature``, ``lower_bound`` and ``upper_bound``
        (min / max of the unmasked values, rounded to 2 decimals). If every
        value is masked, both bounds are NaN instead of the function returning
        ``None``, so downstream code always receives the same columns.
    """
    skew_result, kurt_result = abs(skew(series)), abs(kurtosis(series))

    if skew_result < alphas and kurt_result < alphak:
        mask = f1(series, beta_1=skew_result, beta_2=kurt_result)
    else:
        mask = f2(series, beta_1=skew_result, beta_2=kurt_result, gamma=3)

    filtered_data = series[~mask]

    if len(filtered_data) > 0:
        lower_bound = round(filtered_data.min(), 2)
        upper_bound = round(filtered_data.max(), 2)
    else:
        # Nothing survived the mask: report undefined bounds explicitly.
        lower_bound = upper_bound = float('nan')

    return pd.DataFrame({
        'feature': [feature],
        'lower_bound': lower_bound,
        'upper_bound': upper_bound
    })

def jaccard_index(lower_a, upper_a, lower_b, upper_b):
    """Jaccard similarity of the intervals ``[lower_a, upper_a]`` and ``[lower_b, upper_b]``.

    The numerator is the length of the intersection (0 when the intervals do
    not overlap) and the denominator is the length of the span from the
    smallest lower bound to the largest upper bound.

    Returns
    -------
    float or int
        ``intersection / span``, or ``0`` when the span has zero length.
    """
    intersection = max(0, min(upper_b, upper_a) - max(lower_b, lower_a))
    span = max(0, max(upper_b, upper_a) - min(lower_b, lower_a))

    if span == 0:
        return 0
    return intersection / span



In [146]:
np.random.seed(42)

num_features = 10
num_data_points = 100
num_anomalies = 5

data = {}
min_values = []
max_values = []

# For every feature: draw the normal points first, then the injected anomalies.
# The draw order matters because both come from the same seeded global RNG.
for feature_idx in range(1, num_features + 1):
    inliers = np.random.normal(loc=10, scale=5, size=num_data_points)
    outliers = np.random.uniform(low=50, high=100, size=num_anomalies)

    # The reference range of a feature is the min / max of its normal draws only.
    min_values.append(inliers.min())
    max_values.append(inliers.max())

    data[f'Feature_{feature_idx}'] = np.concatenate([inliers, outliers])

features_list = list(data)
values_list = [feature_values.tolist() for feature_values in data.values()]

# One row per feature: all observations as a list plus the reference range.
final_df = pd.DataFrame({
    'feature': features_list,
    'values': values_list,
    'lower': min_values,
    'upper': max_values
})

final_df

Unnamed: 0,feature,values,lower,upper
0,Feature_1,"[12.483570765056164, 9.308678494144077, 13.238...",-3.098726,19.261391
1,Feature_2,"[11.074694150473775, -0.11157466597867582, 5.2...",-0.111575,23.672111
2,Feature_3,"[15.415256215876385, 15.269010260174515, 3.111...",-6.206337,29.263657
3,Feature_4,"[6.653986359093935, 15.809686588733873, 17.764...",-2.261323,21.378806
4,Feature_5,"[15.896485920319133, 10.337592407050545, 20.30...",-1.509606,25.394404
5,Feature_6,"[7.177214983652124, 4.989766381856329, 11.3987...",-3.704774,23.918887
6,Feature_7,"[10.278624561443474, 15.470957592354743, 1.537...",-3.484433,22.866799
7,Feature_8,"[1.8520350701183634, 12.759318276547504, 20.02...",-2.423058,20.406573
8,Feature_9,"[14.921611992382918, 8.930055778872456, 9.7526...",-0.99403,22.634662
9,Feature_10,"[8.887936107722172, 4.265206856332804, 9.42704...",2.042977,24.825052


In [147]:
# Per-method accumulators: each receives one single-row DataFrame per feature
# and is concatenated into a table in a later cell.
filter_ranges_iqr = []
filter_ranges_z = []
filter_ranges_modified_z = []
filter_ranges_gamma = []

for feature, raw_values in zip(final_df['feature'], final_df['values']):
    # Each 'values' entry is a plain list; wrap it so the helpers get a Series.
    values_series = pd.Series(raw_values)

    # These three methods work directly on the original scale.
    filter_row_iqr = iqr_approach(values_series, feature)
    filter_row_z = z_score_approach(values_series, feature)
    filter_row_modified_z = modified_z_score_approach(values_series, feature)

    # The gamma method runs on standardized values; its bounds are then
    # mapped back to the original scale with the fitted mean / std.
    values_standardized, mean, std = normalize_numerical_data(values_series)
    filter_ranges_gamma_norm = gamma_outlier(values_standardized, feature)
    filter_row_gamma = inverse_filter_ranges(filter_ranges_gamma_norm, mean, std)

    filter_ranges_iqr.append(filter_row_iqr)
    filter_ranges_z.append(filter_row_z)
    filter_ranges_modified_z.append(filter_row_modified_z)
    filter_ranges_gamma.append(filter_row_gamma)
    

{False}
{False}
{False}
{False}
{False}
{False}
{False}
{False}
{False}
{False}


In [148]:
# Stack the per-feature single-row frames into one table. This rebinds the name
# from a list to a DataFrame, so the cell cannot be re-run without first
# re-running the loop cell that fills the list.
filter_ranges_iqr = pd.concat(filter_ranges_iqr, ignore_index=True)

In [149]:
# Stack the per-feature z-score rows into one table (rebinds the list name to a
# DataFrame; re-run the loop cell before re-running this one).
filter_ranges_z = pd.concat(filter_ranges_z, ignore_index=True)

In [150]:
# Stack the per-feature modified z-score rows into one table (rebinds the list
# name to a DataFrame; re-run the loop cell before re-running this one).
filter_ranges_modified_z = pd.concat(filter_ranges_modified_z, ignore_index=True)

In [151]:
# Stack the per-feature gamma rows into one table (rebinds the list name to a
# DataFrame; re-run the loop cell before re-running this one).
filter_ranges_gamma = pd.concat(filter_ranges_gamma, ignore_index=True)

In [152]:
# Show the concatenated IQR bounds, one row per feature.
filter_ranges_iqr

Unnamed: 0,feature,lower_bound_iqr,upper_bound_iqr
0,Feature_1,-1.62,21.86
1,Feature_2,-3.77,25.29
2,Feature_3,-4.54,25.52
3,Feature_4,-2.42,25.26
4,Feature_5,-6.43,25.9
5,Feature_6,-3.13,24.51
6,Feature_7,-2.18,23.01
7,Feature_8,-6.11,26.12
8,Feature_9,-4.28,27.1
9,Feature_10,-0.98,22.37


In [153]:
dfs = [filter_ranges_iqr, filter_ranges_z, filter_ranges_modified_z, filter_ranges_gamma]

# Attach each method's bound columns to final_df, keyed on 'feature'.
for bounds_df in dfs:
    # Drop bound columns left over from a previous run of this cell first;
    # otherwise a re-run would create duplicated '_x' / '_y' columns.
    stale_columns = bounds_df.columns.difference(['feature'])
    final_df = (
        final_df
        .drop(columns=stale_columns, errors='ignore')
        .merge(bounds_df, on=['feature'], how='left')
    )

In [154]:
# Show final_df with the bound columns of all four methods merged in.
final_df

Unnamed: 0,feature,values,lower,upper,lower_bound_iqr,upper_bound_iqr,lower_bound_z_score,upper_bound_z_score,lower_bound_mod_z_score,upper_bound_mod_z_score,lower_bound_gamma,upper_bound_gamma
0,Feature_1,"[12.483570765056164, 9.308678494144077, 13.238...",-3.098726,19.261391,-1.62,21.86,-29.98,54.74,1.33,18.31,-3.08,97.12
1,Feature_2,"[11.074694150473775, -0.11157466597867582, 5.2...",-0.111575,23.672111,-3.77,25.29,-35.31,63.28,-1.06,22.08,-0.08,99.51
2,Feature_3,"[15.415256215876385, 15.269010260174515, 3.111...",-6.206337,29.263657,-4.54,25.52,-36.49,63.87,-0.72,21.85,-6.28,99.58
3,Feature_4,"[6.653986359093935, 15.809686588733873, 17.764...",-2.261323,21.378806,-2.42,25.26,-30.24,57.84,0.93,21.86,-2.27,88.6
4,Feature_5,"[15.896485920319133, 10.337592407050545, 20.30...",-1.509606,25.394404,-6.43,25.9,-35.81,61.98,-2.33,21.71,-1.51,96.46
5,Feature_6,"[7.177214983652124, 4.989766381856329, 11.3987...",-3.704774,23.918887,-3.13,24.51,-35.53,62.48,0.65,21.0,-3.76,99.31
6,Feature_7,"[10.278624561443474, 15.470957592354743, 1.537...",-3.484433,22.866799,-2.18,23.01,-27.85,52.17,-0.32,19.78,-3.5,95.78
7,Feature_8,"[1.8520350701183634, 12.759318276547504, 20.02...",-2.423058,20.406573,-6.11,26.12,-34.53,60.14,-1.1,22.44,-2.43,97.44
8,Feature_9,"[14.921611992382918, 8.930055778872456, 9.7526...",-0.99403,22.634662,-4.28,27.1,-30.99,59.57,-0.46,22.95,-1.03,97.5
9,Feature_10,"[8.887936107722172, 4.265206856332804, 9.42704...",2.042977,24.825052,-0.98,22.37,-28.8,56.04,1.72,19.02,2.08,99.32


In [155]:
# Calculate the Jaccard index between each method's bounds and the reference
# range ('lower' / 'upper') for every feature.

# Result column -> (lower-bound column, upper-bound column) in final_df.
jaccard_bound_columns = {
    'IQR': ('lower_bound_iqr', 'upper_bound_iqr'),
    'Z-SCORE': ('lower_bound_z_score', 'upper_bound_z_score'),
    'MOD_Z': ('lower_bound_mod_z_score', 'upper_bound_mod_z_score'),
    'GAMMA': ('lower_bound_gamma', 'upper_bound_gamma'),
}

jaccard_results = {'feature': []}
for method in jaccard_bound_columns:
    jaccard_results[method] = []

# A single pass over the rows fills every method column at once.
for index, row in final_df.iterrows():
    jaccard_results['feature'].append(row['feature'])
    for method, (lower_column, upper_column) in jaccard_bound_columns.items():
        jaccard_value = jaccard_index(
            lower_a=row[lower_column],
            upper_a=row[upper_column],
            lower_b=row['lower'],
            upper_b=row['upper'],
        )
        jaccard_results[method].append(jaccard_value)


In [156]:
# Show the raw dict: one list of Jaccard values per method, ordered like final_df.
jaccard_results

{'feature': ['Feature_1',
  'Feature_2',
  'Feature_3',
  'Feature_4',
  'Feature_5',
  'Feature_6',
  'Feature_7',
  'Feature_8',
  'Feature_9',
  'Feature_10'],
 'IQR': [0.8366369070182103,
  0.818433773616221,
  0.8474768805941009,
  0.8540509210409928,
  0.8321685699336256,
  0.9586781345238681,
  0.9453608164882374,
  0.7083348178484263,
  0.7529857206012912,
  0.7877148795372693],
 'Z-SCORE': [0.26392960862834525,
  0.24123831485229114,
  0.35342760216838354,
  0.26839384076310946,
  0.2751202563243084,
  0.2818453355753879,
  0.32930807586799743,
  0.24114958465464015,
  0.2609175343691312,
  0.26852987972754655],
 'MOD_Z': [0.7593878163957736,
  0.8972778283925554,
  0.6363124815372208,
  0.8477481306584621,
  0.8375150567085582,
  0.7366872823162394,
  0.7627726788573753,
  0.8650011343849852,
  0.9645269546761182,
  0.7347753822347408],
 'GAMMA': [0.22292631248823983,
  0.23842336235847697,
  0.33506512519949905,
  0.26015329035341345,
  0.2746147786664705,
  0.26800874492814

In [157]:
# Turn the dict of equally long lists into a table with one row per feature.
# Note that the name is rebound from a dict to a DataFrame here.
jaccard_results = pd.DataFrame(jaccard_results)


In [158]:
# Attach the Jaccard columns to final_df. Columns from a previous run of this
# cell are dropped first so a re-run does not produce '_x' / '_y' duplicates.
final_df = (
    final_df
    .drop(columns=jaccard_results.columns.difference(['feature']), errors='ignore')
    .merge(jaccard_results, on=['feature'], how='left')
)

In [159]:
# Show final_df with the per-method Jaccard columns appended.
final_df

Unnamed: 0,feature,values,lower,upper,lower_bound_iqr,upper_bound_iqr,lower_bound_z_score,upper_bound_z_score,lower_bound_mod_z_score,upper_bound_mod_z_score,lower_bound_gamma,upper_bound_gamma,IQR,Z-SCORE,MOD_Z,GAMMA
0,Feature_1,"[12.483570765056164, 9.308678494144077, 13.238...",-3.098726,19.261391,-1.62,21.86,-29.98,54.74,1.33,18.31,-3.08,97.12,0.836637,0.26393,0.759388,0.222926
1,Feature_2,"[11.074694150473775, -0.11157466597867582, 5.2...",-0.111575,23.672111,-3.77,25.29,-35.31,63.28,-1.06,22.08,-0.08,99.51,0.818434,0.241238,0.897278,0.238423
2,Feature_3,"[15.415256215876385, 15.269010260174515, 3.111...",-6.206337,29.263657,-4.54,25.52,-36.49,63.87,-0.72,21.85,-6.28,99.58,0.847477,0.353428,0.636312,0.335065
3,Feature_4,"[6.653986359093935, 15.809686588733873, 17.764...",-2.261323,21.378806,-2.42,25.26,-30.24,57.84,0.93,21.86,-2.27,88.6,0.854051,0.268394,0.847748,0.260153
4,Feature_5,"[15.896485920319133, 10.337592407050545, 20.30...",-1.509606,25.394404,-6.43,25.9,-35.81,61.98,-2.33,21.71,-1.51,96.46,0.832169,0.27512,0.837515,0.274615
5,Feature_6,"[7.177214983652124, 4.989766381856329, 11.3987...",-3.704774,23.918887,-3.13,24.51,-35.53,62.48,0.65,21.0,-3.76,99.31,0.958678,0.281845,0.736687,0.268009
6,Feature_7,"[10.278624561443474, 15.470957592354743, 1.537...",-3.484433,22.866799,-2.18,23.01,-27.85,52.17,-0.32,19.78,-3.5,95.78,0.945361,0.329308,0.762773,0.265423
7,Feature_8,"[1.8520350701183634, 12.759318276547504, 20.02...",-2.423058,20.406573,-6.11,26.12,-34.53,60.14,-1.1,22.44,-2.43,97.44,0.708335,0.24115,0.865001,0.228593
8,Feature_9,"[14.921611992382918, 8.930055778872456, 9.7526...",-0.99403,22.634662,-4.28,27.1,-30.99,59.57,-0.46,22.95,-1.03,97.5,0.752986,0.260918,0.964527,0.239812
9,Feature_10,"[8.887936107722172, 4.265206856332804, 9.42704...",2.042977,24.825052,-0.98,22.37,-28.8,56.04,1.72,19.02,2.08,99.32,0.787715,0.26853,0.734775,0.233817


In [160]:
length_df = len(final_df)

# Average Jaccard value per method over all features, computed as
# column sum divided by the number of rows.
iqr_mean, z_score_mean, mod_z_mean, gamma_mean = (
    final_df[method_column].sum() / length_df
    for method_column in ('IQR', 'Z-SCORE', 'MOD_Z', 'GAMMA')
)

# Single-row summary table, one column per method.
df_eval = pd.DataFrame(
    {
        'IQR': [iqr_mean],
        'Z-SCORE': [z_score_mean],
        'MOD_Z': [mod_z_mean],
        'GAMMA': [gamma_mean],
    },
    index=['mean jaccard coeff'],
)

In [161]:
# Select the Jaccard columns by name rather than by position, so the heatmap
# stays correct even if more columns are appended to final_df later.
df_jaccard = final_df[['IQR', 'Z-SCORE', 'MOD_Z', 'GAMMA']]

# Rows are the rows of final_df (one per feature), columns are the methods.
fig = px.imshow(df_jaccard,
                text_auto=".2f",
                color_continuous_scale="YlGnBu",
                labels=dict(color="Jaccard Index"))

fig.update_layout(
    title="Jaccard Index Heatmap",
    xaxis_title="Methods",
    yaxis_title="Runs"
)


fig.show()


In [162]:
# Show the mean Jaccard value per method.
df_eval

Unnamed: 0,IQR,Z-SCORE,MOD_Z,GAMMA
mean jaccard coeff,0.834184,0.278386,0.8042,0.256684


In [163]:
# For the single row of df_eval, pick the column (method) with the highest mean
# Jaccard value. The result is a one-element Series indexed by 'mean jaccard coeff'.
best_method = df_eval.idxmax(axis=1)

In [164]:
# best_method is a one-element Series; .values[0] extracts the method name.
print(f'Best Method based on labeled data: {best_method.values[0]}')

Best Method based on labeled data: IQR
