In [1]:
import pandas as pd, numpy as np
from scipy.stats import skew, kurtosis
import numpy as np
import plotly.express as px
import time
import ast

In [2]:
def iqr_approach(series, feature, k:float):
    """Compute interquartile-range fences for a single feature.

    Parameters
    ----------
    series : pandas.Series
        Observations of the feature; quartiles are read from ``describe()``.
    feature : str
        Label stored in the 'Feature Name' column of the result.
    k : float
        Multiplier applied to the interquartile range (Q3 - Q1).

    Returns
    -------
    pandas.DataFrame
        One row with 'Feature Name', 'Lower Bound IQR' (Q1 - k * IQR) and
        'Upper Bound IQR' (Q3 + k * IQR); both bounds rounded to 2 decimals.
    """
    stats = series.describe()
    q1, q3 = stats['25%'], stats['75%']

    lower_fence = round(q1 - k * (q3 - q1), 2)
    upper_fence = round(q3 + k * (q3 - q1), 2)

    return pd.DataFrame({
        'Feature Name': [feature],
        'Lower Bound IQR': lower_fence,
        'Upper Bound IQR': upper_fence,
    })

def std_approach(series, feature, alpha: float):
    """Compute mean +/- alpha * standard-deviation bounds for a single feature.

    Parameters
    ----------
    series : pandas.Series
        Observations of the feature; mean and std are read from ``describe()``.
    feature : str
        Label stored in the 'Feature Name' column of the result.
    alpha : float
        Number of standard deviations placed on each side of the mean.

    Returns
    -------
    pandas.DataFrame
        One row with 'Feature Name', 'Lower Bound STD' and 'Upper Bound STD';
        both bounds rounded to 2 decimals.
    """
    stats = series.describe()
    centre, spread = stats['mean'], stats['std']

    lower_limit = round(centre - alpha * spread, 2)
    upper_limit = round(centre + alpha * spread, 2)

    return pd.DataFrame({
        'Feature Name': [feature],
        'Lower Bound STD': lower_limit,
        'Upper Bound STD': upper_limit,
    })

def modified_z_score_approach(series, feature, threshold: float = 3.5):
    """Compute bounds from the median and the median absolute deviation (MAD).

    Parameters
    ----------
    series : pandas.Series
        Observations of the feature.
    feature : str
        Label stored in the 'Feature Name' column of the result.
    threshold : float, default 3.5
        Scale factor applied to MAD / 0.6745 on each side of the median.

    Returns
    -------
    pandas.DataFrame
        One row with 'Feature Name', 'Lower Bound Mod-Z Score' and
        'Upper Bound Mod-Z Score'; both bounds rounded to 2 decimals.
        When the MAD is 0 both bounds equal the median.
    """
    scale_constant = 0.6745  # divisor used by the original formula

    centre = series.median()
    mad = np.abs(series - centre).median()

    half_width = threshold * mad / scale_constant

    # Return the results as a DataFrame
    return pd.DataFrame({
        'Feature Name': [feature],
        'Lower Bound Mod-Z Score': [round(centre - half_width, 2)],
        'Upper Bound Mod-Z Score': [round(centre + half_width, 2)],
    })
    

def gamma_method_modified(series, feature, alphas: int = 6, alphak: int = 30, beta_1=2, beta_2=2, gamma=2):
    """Compute asymmetric mean/std bounds, widened when the shape limits are exceeded.

    The series is standardised, and the absolute skewness and absolute
    kurtosis (scipy defaults) of the z-scores are compared with ``alphas`` and
    ``alphak``. If both are below their limit the bounds are
    ``mean - beta_1 * std`` and ``mean + beta_2 * std``; otherwise both widths
    are additionally multiplied by ``gamma``.

    Parameters
    ----------
    series : pandas.Series
        Observations of the feature.
    feature : str
        Label stored in the 'Feature Name' column of the result.
    alphas, alphak : numeric
        Upper limits for absolute skewness and absolute kurtosis.
    beta_1, beta_2 : numeric
        Std multipliers for the lower and upper bound respectively.
    gamma : numeric
        Extra widening factor used when a limit is reached or the statistics
        are NaN (a NaN comparison is false, so that case takes the widened path).

    Returns
    -------
    pandas.DataFrame
        One row with 'Feature Name', 'Lower Bound Modified Gamma' and
        'Upper Bound Modified Gamma'; both bounds rounded to 2 decimals.
    """
    mean = series.mean()
    sigma = series.std()

    standardized = (series - mean) / sigma
    abs_skew = abs(skew(standardized))
    abs_kurt = abs(kurtosis(standardized))

    within_limits = abs_skew < alphas and abs_kurt < alphak
    if within_limits:
        lower_width = beta_1 * sigma
        upper_width = beta_2 * sigma
    else:
        lower_width = gamma * beta_1 * sigma
        upper_width = gamma * beta_2 * sigma

    return pd.DataFrame({
        'Feature Name': [feature],
        'Lower Bound Modified Gamma': round(mean - lower_width, 2),
        'Upper Bound Modified Gamma': round(mean + upper_width, 2),
    })


def jaccard_index(lower_a, upper_a, lower_b, upper_b):
    """Return the overlap ratio of the intervals [lower_a, upper_a] and [lower_b, upper_b].

    The numerator is the length of the intersection (0 when the intervals are
    disjoint); the denominator is the length from the smallest lower bound to
    the largest upper bound. Returns 0 when that length is 0.
    """
    intersection = min(upper_b, upper_a) - max(lower_b, lower_a)
    overlap = max(0, intersection)

    span = max(upper_b, upper_a) - min(lower_b, lower_a)
    distance = max(0, span)

    if distance != 0:
        return overlap / distance
    return 0

In [3]:
# Load the labeled test dataset from a path relative to the notebook.
# NOTE(review): the file has an .xls extension but is parsed with read_csv —
# confirm it really is comma-separated text and not an Excel workbook.
df = pd.read_csv('../TestData/combined_labeled_test_dataset.xls')

In [4]:
# Show the loaded frame as the cell output.
df

Unnamed: 0,Dataset-ID,Dataset Name,Dataset Description,Feature Name,Feature Data,Unique / distinct values,Average value,Median,Quantile 25% (Q1),Quantile 75% (Q3),...,rating_kevin,url_research,add_info,lower_bound_mean,lower_bound_median,upper_bound_mean,upper_bound_median,rating_mean,lower_bound_final,upper_bound_final
0,43342,German-House-Prices,Context\r\nProjects are a great way to learn d...,Price,"[498000.0, 495000.0, 749000.0, 259000.0, 46900...",1411,556685.086716,405215.000000,250000.000000,655000.000000,...,2.0,https://www.statista.com/statistics/1393405/de...,,100000.333333,1.0,2.203333e+07,15500000.0,2.000000,0.0,15750000.0
1,43358,Earthquakes-Data-NZ,Context\r\nNew Zealand lies on a fault-line th...,longitude,"[174.9236145, 176.07489009999998, 168.484024, ...",19970,168.815327,175.843956,174.445881,176.780491,...,3.0,https://de.wikipedia.org/wiki/Geographische_Ko...,,-180.000000,-180.0,1.800000e+02,180.0,3.000000,-180.0,180.0
2,43358,Earthquakes-Data-NZ,Context\r\nNew Zealand lies on a fault-line th...,_latitude,"[-40.47462845, -38.65804291, -44.22292328, -40...",20450,-39.695796,-39.410406,-40.739138,-38.446671,...,3.0,https://de.wikipedia.org/wiki/Geographische_Ko...,,-90.000000,-90.0,9.000000e+01,90.0,3.000000,-90.0,90.0
3,43371,New-Cases-of-COVID-19-In-World-Countries,Has the curve flattened?\r\nCountries around t...,lat,"[23.7, 15.0, 37.0902, 29.1832, 24.974, 31.8257...",254,23.383159,27.514200,9.945600,42.315400,...,3.0,https://de.wikipedia.org/wiki/Geographische_Ko...,,-120.000000,-90.0,1.200000e+02,90.0,3.000000,-90.0,90.0
4,43371,New-Cases-of-COVID-19-In-World-Countries,Has the curve flattened?\r\nCountries around t...,long,"[121.0, 101.0, -95.7129, 120.0934, 101.487, 11...",257,29.244353,24.966800,-23.041800,106.520150,...,3.0,https://de.wikipedia.org/wiki/Geographische_Ko...,,-150.000000,-180.0,1.500000e+02,180.0,3.000000,-180.0,180.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,43617,Medical-Appointment,Context\r\nThe No Show problem is one of the b...,creacion_hora_d,"[14, 11, 10, 8, 10, 11, 12, 19, 17, 12, 13, 16...",24,13.424249,13.000000,10.000000,16.000000,...,3.0,,,0.000000,0.0,2.300000e+01,23.0,1.666667,0.0,23.0
463,43617,Medical-Appointment,Context\r\nThe No Show problem is one of the b...,creacion_hora_c,"[-0.87, -0.97, -0.87, -0.5, -0.87, -0.97, -1.0...",13,-0.533509,-0.710000,-0.970000,-0.260000,...,0.0,,,,,,,1.000000,-1.0,1.0
464,43617,Medical-Appointment,Context\r\nThe No Show problem is one of the b...,latencia,"[18, 17, 20, 34, 17, 14, 21, 34, 24, 49, 35, 3...",129,9.044516,4.000000,1.000000,11.000000,...,0.0,,,,,,,0.500000,0.0,160.0
465,43618,Credit-Card-Dataset-for-Clustering,This case requires to develop a customer segme...,CASH_ADVANCE_FREQUENCY,"[0.0, 0.25, 0.0, 0.083333, 0.0, 0.0, 0.0, 0.0,...",54,0.135144,0.000000,0.000000,0.222222,...,3.0,,,0.000000,0.0,1.166667e+00,1.0,2.000000,0.0,1.0


In [5]:
# Keep only the identifying columns, the raw feature values and the final
# labeled bounds; .copy() detaches the result from df so later edits do not touch df.
test_df = df[['Dataset-ID', 'Dataset Name', 'Feature Name', 'Feature Data', 'lower_bound_final', 'upper_bound_final']].copy()

In [6]:
# Rename the final labeled bounds to the ground-truth ("GT") column names used below.
gt_column_names = {'lower_bound_final': 'GT Lower Bound', 'upper_bound_final': 'GT Upper Bound'}
test_df = test_df.rename(columns=gt_column_names)

In [7]:
# Show the reduced frame with the renamed ground-truth columns.
test_df

Unnamed: 0,Dataset-ID,Dataset Name,Feature Name,Feature Data,GT Lower Bound,GT Upper Bound
0,43342,German-House-Prices,Price,"[498000.0, 495000.0, 749000.0, 259000.0, 46900...",0.0,15750000.0
1,43358,Earthquakes-Data-NZ,longitude,"[174.9236145, 176.07489009999998, 168.484024, ...",-180.0,180.0
2,43358,Earthquakes-Data-NZ,_latitude,"[-40.47462845, -38.65804291, -44.22292328, -40...",-90.0,90.0
3,43371,New-Cases-of-COVID-19-In-World-Countries,lat,"[23.7, 15.0, 37.0902, 29.1832, 24.974, 31.8257...",-90.0,90.0
4,43371,New-Cases-of-COVID-19-In-World-Countries,long,"[121.0, 101.0, -95.7129, 120.0934, 101.487, 11...",-180.0,180.0
...,...,...,...,...,...,...
462,43617,Medical-Appointment,creacion_hora_d,"[14, 11, 10, 8, 10, 11, 12, 19, 17, 12, 13, 16...",0.0,23.0
463,43617,Medical-Appointment,creacion_hora_c,"[-0.87, -0.97, -0.87, -0.5, -0.87, -0.97, -1.0...",-1.0,1.0
464,43617,Medical-Appointment,latencia,"[18, 17, 20, 34, 17, 14, 21, 34, 24, 49, 35, 3...",0.0,160.0
465,43618,Credit-Card-Dataset-for-Clustering,CASH_ADVANCE_FREQUENCY,"[0.0, 0.25, 0.0, 0.083333, 0.0, 0.0, 0.0, 0.0,...",0.0,1.0


In [8]:
# Inspect the Python type of the 'Feature Data' entry at index label 0
# to see how the column was parsed on load.
type(test_df['Feature Data'][0])

str

In [9]:
# Parse a stringified list without executing arbitrary code
def safe_eval(val):
    """Convert a string such as "[1, 2.5]" into a Python object via ``ast.literal_eval``.

    Returns an empty list when ``val`` is not a string, is not wrapped in
    square brackets, or cannot be parsed (ValueError / SyntaxError).
    """
    looks_like_list = isinstance(val, str) and val.startswith("[") and val.endswith("]")
    if not looks_like_list:
        # Anything that is not a bracketed string is treated as "no data"
        return []

    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        # Malformed literal: fall back to "no data"
        return []
    return parsed

In [10]:
# Replace each stringified list in 'Feature Data' with the parsed Python list;
# entries safe_eval cannot parse become [].
test_df['Feature Data'] = test_df['Feature Data'].apply(safe_eval)

In [11]:
# Drop rows whose 'Feature Data' is an empty list (including entries that failed to parse).
test_df = test_df[test_df['Feature Data'].apply(lambda x: x != [])]

In [12]:
# One record per evaluated parameter setting (every row, every method).
error_tracking = []

# Parameter grids searched for every feature.
iqr_params = {'k': [1, 1.5, 2, 2.5, 3, 3.5]}
std_params = {'alpha': [1, 1.5, 2, 2.5, 3, 3.5]}
modified_z_params = {'alpha': [1, 1.5, 2, 2.5, 3, 3.5, 4]}
gamma_params = {
    'alpha_s': [3, 6],
    'alpha_k': [15, 30],
    'beta_1': [2, 3],
    'beta_2': [2, 3],
    'gamma': [2]
}

# One summary record per feature (best bounds, score and parameters per method).
final_results = []


def _grid_search(candidates, method, lower_col, upper_col, feature, gt_bounds, error_log):
    """Score every candidate against the ground-truth interval and keep the best.

    Parameters
    ----------
    candidates : iterable of (logged_parameter, params, bounds_df)
        ``logged_parameter`` is stored in the error log, ``params`` is the dict
        reported for the winner, ``bounds_df`` is the single-row frame returned
        by the bounding method.
    method : str
        Method label written to the error log.
    lower_col, upper_col : str
        Names of the bound columns inside ``bounds_df``.
    feature : str
        Feature name written to the error log.
    gt_bounds : tuple
        (ground-truth lower bound, ground-truth upper bound).
    error_log : list
        Receives one dict per candidate; 'error' is ``1 - score``.

    Returns
    -------
    (best_score, best_bounds_df, best_params)
        ``(-inf, None, {})`` when ``candidates`` is empty. Ties keep the first
        candidate because the comparison is strict.
    """
    best_score, best_bounds, best_params = -float('inf'), None, {}
    for step, (logged_parameter, params, bounds_df) in enumerate(candidates):
        lower, upper = bounds_df[lower_col].iloc[0], bounds_df[upper_col].iloc[0]
        score = jaccard_index(lower, upper, gt_bounds[0], gt_bounds[1])
        error_log.append({'step': step, 'method': method, 'error': 1 - score, 'feature': feature, 'parameter': logged_parameter})
        if score > best_score:
            best_score, best_bounds, best_params = score, bounds_df, params
    return best_score, best_bounds, best_params


def _first_value(bounds_df, column):
    """Return the single value of ``column``, or None when no candidate was evaluated."""
    return bounds_df[column].iloc[0] if bounds_df is not None else None


start_time = time.time()

for index, row in test_df.iterrows():
    values_series = pd.Series(row['Feature Data'])
    dataset_id = row['Dataset-ID']  # not named `id`, to avoid shadowing the builtin
    dataset = row['Dataset Name']
    feature = row['Feature Name']
    baseline_bounds = (row['GT Lower Bound'], row['GT Upper Bound'])

    # All "best" state is produced per row by _grid_search, so nothing can
    # carry over from the previous feature.

    # Grid search for IQR
    best_iqr_score, best_iqr, best_iqr_params = _grid_search(
        ((k, {'k': k}, iqr_approach(values_series, feature, k)) for k in iqr_params['k']),
        'IQR', 'Lower Bound IQR', 'Upper Bound IQR', feature, baseline_bounds, error_tracking,
    )

    # Grid search for Standard Deviation
    best_std_score, best_std, best_std_params = _grid_search(
        ((alpha, {'alpha': alpha}, std_approach(values_series, feature, alpha)) for alpha in std_params['alpha']),
        'STD', 'Lower Bound STD', 'Upper Bound STD', feature, baseline_bounds, error_tracking,
    )

    # Grid search for Modified Z-Score
    best_mod_z_score, best_mod_z, best_mod_z_params = _grid_search(
        ((alpha, {'alpha': alpha}, modified_z_score_approach(values_series, feature, alpha)) for alpha in modified_z_params['alpha']),
        'MOD_Z', 'Lower Bound Mod-Z Score', 'Upper Bound Mod-Z Score', feature, baseline_bounds, error_tracking,
    )

    # Grid search for Gamma Approach: the for-clauses iterate in the same order
    # as nested loops (alpha_s outermost, gamma innermost).
    gamma_candidates = (
        (
            (alpha_s, alpha_k, beta_1, beta_2, gamma),
            {'alpha_s': alpha_s, 'alpha_k': alpha_k, 'beta_1': beta_1, 'beta_2': beta_2, 'gamma': gamma},
            gamma_method_modified(values_series, feature, alpha_s, alpha_k, beta_1, beta_2, gamma),
        )
        for alpha_s in gamma_params['alpha_s']
        for alpha_k in gamma_params['alpha_k']
        for beta_1 in gamma_params['beta_1']
        for beta_2 in gamma_params['beta_2']
        for gamma in gamma_params['gamma']
    )
    best_gamma_score, best_gamma, best_gamma_params = _grid_search(
        gamma_candidates,
        'GAMMA', 'Lower Bound Modified Gamma', 'Upper Bound Modified Gamma', feature, baseline_bounds, error_tracking,
    )

    # Key order defines the column order of final_df_result; keep it stable.
    result = {
        'Dataset-ID': dataset_id,
        'Dataset Name': dataset,
        'Feature Name': feature,
        'Feature Data': row['Feature Data'],
        'GT Lower Bound': row['GT Lower Bound'],
        'GT Upper Bound': row['GT Upper Bound'],
        'Lower Bound IQR': _first_value(best_iqr, 'Lower Bound IQR'),
        'Upper Bound IQR': _first_value(best_iqr, 'Upper Bound IQR'),
        'Lower Bound STD': _first_value(best_std, 'Lower Bound STD'),
        'Upper Bound STD': _first_value(best_std, 'Upper Bound STD'),
        'Lower Bound Mod-Z Score': _first_value(best_mod_z, 'Lower Bound Mod-Z Score'),
        'Upper Bound Mod-Z Score': _first_value(best_mod_z, 'Upper Bound Mod-Z Score'),
        'Lower Bound Modified Gamma': _first_value(best_gamma, 'Lower Bound Modified Gamma'),
        'Upper Bound Modified Gamma': _first_value(best_gamma, 'Upper Bound Modified Gamma'),
        'IQR': best_iqr_score,
        'STD': best_std_score,
        'MOD_Z': best_mod_z_score,
        'GAMMA': best_gamma_score,
        'Best IQR Approach Params': best_iqr_params,
        'Best STD Approach Params': best_std_params,
        'Best Mod-Z Approach Params': best_mod_z_params,
        'Best Modified Gamma Params': best_gamma_params
    }
    final_results.append(result)

final_df_result = pd.DataFrame(final_results)

end_time = time.time()

error_df = pd.DataFrame(error_tracking)

0         498000.0
1         495000.0
2         749000.0
3         259000.0
4         469000.0
           ...    
10547    1495000.0
10548     449000.0
10549     678000.0
10550     419900.0
10551     699000.0
Length: 10552, dtype: float64
count    1.055200e+04
mean     5.566851e+05
std      6.087410e+05
min      0.000000e+00
25%      2.500000e+05
50%      4.052150e+05
75%      6.550000e+05
max      1.300000e+07
dtype: float64
0         498000.0
1         495000.0
2         749000.0
3         259000.0
4         469000.0
           ...    
10547    1495000.0
10548     449000.0
10549     678000.0
10550     419900.0
10551     699000.0
Length: 10552, dtype: float64
count    1.055200e+04
mean     5.566851e+05
std      6.087410e+05
min      0.000000e+00
25%      2.500000e+05
50%      4.052150e+05
75%      6.550000e+05
max      1.300000e+07
dtype: float64
0         498000.0
1         495000.0
2         749000.0
3         259000.0
4         469000.0
           ...    
10547    1495000.0
10548  

In [56]:
# Show the per-feature grid-search summary (best bounds, scores and parameters per method).
final_df_result

Unnamed: 0,Dataset-ID,Dataset Name,Feature Name,Feature Data,GT Lower Bound,GT Upper Bound,Lower Bound IQR,Upper Bound IQR,Lower Bound STD,Upper Bound STD,...,Lower Bound Modified Gamma,Upper Bound Modified Gamma,IQR,STD,MOD_Z,GAMMA,Best IQR Approach Params,Best STD Approach Params,Best Mod-Z Approach Params,Best Modified Gamma Params
0,43342,German-House-Prices,Price,"[498000.0, 495000.0, 749000.0, 259000.0, 46900...",0.0,15750000.0,-1167500.00,2072500.00,-1573908.42,2687278.60,...,-1878278.92,4209131.10,0.122506,0.155120,0.091770,0.238772,{'k': 3.5},{'alpha': 3.5},{'alpha': 4},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 2, 'be..."
1,43358,Earthquakes-Data-NZ,longitude,"[174.9236145, 176.07489009999998, 168.484024, ...",-180.0,180.0,166.27,184.95,1.26,336.37,...,-118.42,360.30,0.037622,0.346147,0.029740,0.552323,{'k': 3.5},{'alpha': 3.5},{'alpha': 4},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 3, 'be..."
2,43358,Earthquakes-Data-NZ,_latitude,"[-40.47462845, -38.65804291, -44.22292328, -40...",-90.0,90.0,-48.76,-30.42,-47.30,-32.09,...,-46.22,-33.18,0.101889,0.084500,0.074556,0.072444,{'k': 3.5},{'alpha': 3.5},{'alpha': 4},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 3, 'be..."
3,43371,New-Cases-of-COVID-19-In-World-Countries,lat,"[23.7, 15.0, 37.0902, 29.1832, 24.974, 31.8257...",-90.0,90.0,-87.16,139.42,-64.48,111.24,...,-51.92,98.69,0.772208,0.767641,0.736484,0.752133,{'k': 3},{'alpha': 3.5},{'alpha': 4},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 3, 'be..."
4,43371,New-Cases-of-COVID-19-In-World-Countries,long,"[121.0, 101.0, -95.7129, 120.0934, 101.487, 11...",-180.0,180.0,-152.60,236.08,-161.01,219.50,...,-199.07,181.45,0.799366,0.853592,0.868714,0.946074,{'k': 1},{'alpha': 2.5},{'alpha': 1.5},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 3, 'be..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,43617,Medical-Appointment,creacion_hora_d,"[14, 11, 10, 8, 10, 11, 12, 19, 17, 12, 13, 16...",0.0,23.0,1.00,25.00,1.14,25.71,...,1.14,21.61,0.880000,0.850253,0.875622,0.890000,{'k': 1.5},{'alpha': 3},{'alpha': 2.5},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 3, 'be..."
366,43617,Medical-Appointment,creacion_hora_c,"[-0.87, -0.97, -0.87, -0.5, -0.87, -0.97, -1.0...",-1.0,1.0,-2.04,0.80,-2.10,1.03,...,-1.58,1.03,0.592105,0.638978,0.563077,0.766284,{'k': 1.5},{'alpha': 3},{'alpha': 4},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 2, 'be..."
367,43617,Medical-Appointment,latencia,"[18, 17, 20, 34, 17, 14, 21, 34, 24, 49, 35, 3...",0.0,160.0,-34.00,46.00,-37.89,55.98,...,-44.59,89.50,0.237113,0.282884,0.154240,0.437460,{'k': 3.5},{'alpha': 3.5},{'alpha': 4},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 2, 'be..."
368,43618,Credit-Card-Dataset-for-Clustering,CASH_ADVANCE_FREQUENCY,"[0.0, 0.25, 0.0, 0.083333, 0.0, 0.0, 0.0, 0.0,...",0.0,1.0,-0.78,1.00,-0.57,0.84,...,-0.27,0.74,0.561798,0.535032,0.000000,0.582677,{'k': 3.5},{'alpha': 3.5},{'alpha': 1},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 2, 'be..."


In [13]:
# Wall-clock duration between the start_time and end_time stamps, converted from seconds to minutes.
elapsed_time = (end_time - start_time)/60

print(f"Elapsed time: {elapsed_time:.2f} minutes")

Elapsed time: 0.14 minutes


In [14]:
# Show the log of every evaluated parameter setting ('error' is 1 - Jaccard score).
error_df

Unnamed: 0,step,method,error,feature,parameter
0,0,IQR,0.933354,Price,1
1,1,IQR,0.921620,Price,1.5
2,2,IQR,0.910178,Price,2
3,3,IQR,0.899016,Price,2.5
4,4,IQR,0.888124,Price,3
...,...,...,...,...,...
12945,11,GAMMA,0.724953,CASH_ADVANCE_TRX,"(6, 15, 3, 3, 2)"
12946,12,GAMMA,0.792248,CASH_ADVANCE_TRX,"(6, 30, 2, 2, 2)"
12947,13,GAMMA,0.699422,CASH_ADVANCE_TRX,"(6, 30, 2, 3, 2)"
12948,14,GAMMA,0.809894,CASH_ADVANCE_TRX,"(6, 30, 3, 2, 2)"


In [16]:
# Preview the first rows of the per-feature grid-search summary.
final_df_result.head()


Unnamed: 0,Dataset-ID,Dataset Name,Feature Name,Feature Data,GT Lower Bound,GT Upper Bound,Lower Bound IQR,Upper Bound IQR,Lower Bound STD,Upper Bound STD,...,Lower Bound Modified Gamma,Upper Bound Modified Gamma,IQR,STD,MOD_Z,GAMMA,Best IQR Approach Params,Best STD Approach Params,Best Mod-Z Approach Params,Best Modified Gamma Params
0,43342,German-House-Prices,Price,"[498000.0, 495000.0, 749000.0, 259000.0, 46900...",0.0,15750000.0,-1167500.0,2072500.0,-1573908.42,2687278.6,...,-1878278.92,4209131.1,0.122506,0.15512,0.09177,0.238772,{'k': 3.5},{'alpha': 3.5},{'alpha': 4},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 2, 'be..."
1,43358,Earthquakes-Data-NZ,longitude,"[174.9236145, 176.07489009999998, 168.484024, ...",-180.0,180.0,166.27,184.95,1.26,336.37,...,-118.42,360.3,0.037622,0.346147,0.02974,0.552323,{'k': 3.5},{'alpha': 3.5},{'alpha': 4},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 3, 'be..."
2,43358,Earthquakes-Data-NZ,_latitude,"[-40.47462845, -38.65804291, -44.22292328, -40...",-90.0,90.0,-48.76,-30.42,-47.3,-32.09,...,-46.22,-33.18,0.101889,0.0845,0.074556,0.072444,{'k': 3.5},{'alpha': 3.5},{'alpha': 4},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 3, 'be..."
3,43371,New-Cases-of-COVID-19-In-World-Countries,lat,"[23.7, 15.0, 37.0902, 29.1832, 24.974, 31.8257...",-90.0,90.0,-87.16,139.42,-64.48,111.24,...,-51.92,98.69,0.772208,0.767641,0.736484,0.752133,{'k': 3},{'alpha': 3.5},{'alpha': 4},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 3, 'be..."
4,43371,New-Cases-of-COVID-19-In-World-Countries,long,"[121.0, 101.0, -95.7129, 120.0934, 101.487, 11...",-180.0,180.0,-152.6,236.08,-161.01,219.5,...,-199.07,181.45,0.799366,0.853592,0.868714,0.946074,{'k': 1},{'alpha': 2.5},{'alpha': 1.5},"{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 3, 'be..."


In [17]:
# Number of evaluated features; used as the denominator of every mean below.
length_df = final_df_result.shape[0]

# Mean best-case Jaccard score per method: column sum divided by the row count.
# (z_score_mean holds the mean of the 'STD' column.)
iqr_mean, z_score_mean, mod_z_mean, gamma_mean = (
    final_df_result[score_column].sum() / length_df
    for score_column in ('IQR', 'STD', 'MOD_Z', 'GAMMA')
)

# Single-row summary frame, one column per method.
df_eval = pd.DataFrame(
    [[iqr_mean, z_score_mean, mod_z_mean, gamma_mean]],
    columns=['IQR', 'STD', 'MOD_Z', 'GAMMA'],
    index=['mean jaccard coeff'],
)

In [54]:
import plotly.io as pio
import plotly.express as px
import pandas as pd

# Set Plotly renderer for VS Code
# NOTE(review): this renderer name is editor-specific; adjust it when running elsewhere.
pio.renderers.default = "vscode"

# Select the four per-method Jaccard score columns by name. A positional
# slice would silently pick other columns if the summary frame's layout changes.
method_columns = ['IQR', 'STD', 'MOD_Z', 'GAMMA']
df_jaccard = final_df_result[method_columns]

# Ensure data is numeric; anything non-numeric or missing is shown as 0
df_jaccard = df_jaccard.apply(pd.to_numeric, errors='coerce').fillna(0)

# Limit to first 20 features (rows)
df_jaccard = df_jaccard.iloc[:20, :]

# Create heatmap (rows = features, columns = methods)
fig = px.imshow(
    df_jaccard,
    text_auto=".2f",
    color_continuous_scale="YlGnBu",
    labels=dict(color="Jaccard Index")
)

# Update layout with explicit tick positions and labels; the y ticks show the
# row index of each feature in final_df_result.
fig.update_layout(
    title="Jaccard Index Heatmap (First 20 Features)",
    xaxis_title="Methods",
    yaxis_title="Features",
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(df_jaccard.shape[1])),
        ticktext=list(df_jaccard.columns)
    ),
    yaxis=dict(
        tickmode='array',
        tickvals=list(range(df_jaccard.shape[0])),
        ticktext=final_df_result.index[:df_jaccard.shape[0]].tolist()
    )
)

# Show figure
fig.show()


In [24]:
# Show the mean Jaccard coefficient per method.
df_eval

Unnamed: 0,IQR,STD,MOD_Z,GAMMA
mean jaccard coeff,0.358263,0.391481,0.334343,0.398435


In [25]:
# For each row of df_eval, the column (method) label holding the highest value;
# the result is a Series indexed by df_eval's row labels.
best_method = df_eval.idxmax(axis=1)

In [26]:
# Report the winning method of the first (only) summary row.
print(f'Best Method based on labeled data: {best_method.values[0]}')

Best Method based on labeled data: GAMMA


In [34]:
# Inspect the best gamma parameter dict stored for the row with index label 0.
final_df_result['Best Modified Gamma Params'][0]

{'alpha_s': 3, 'alpha_k': 15, 'beta_1': 2, 'beta_2': 3, 'gamma': 2}

In [35]:
# Flatten each per-feature parameter dict into a list of its values, in the
# dict's insertion order, so identical combinations can be counted.
all_values = [list(d.values()) for d in final_df_result['Best Modified Gamma Params']]

In [39]:
from collections import Counter

# Tally how often each parameter combination won; lists are turned into
# tuples because Counter keys must be hashable.
counter = Counter(map(tuple, all_values))

# Most frequent combination and its number of occurrences.
top_combination, count = counter.most_common(1)[0]

# Expose the winner as a list again.
most_common_list = list(top_combination)


In [40]:
# Show the most frequently selected gamma parameter combination.
most_common_list

[3, 15, 2, 3, 2]

In [42]:
# Evaluate the modified gamma method with one fixed parameter set on every
# feature and score it against the ground-truth bounds.
# Records are collected in a list and turned into a frame once, instead of
# growing a DataFrame with pd.concat inside the loop.
best_gamma_records = []

for index, row in test_df.iterrows():
    dataset_id = row['Dataset-ID']  # not named `id`, to avoid shadowing the builtin
    dataset = row['Dataset Name']
    values_series = pd.Series(row['Feature Data'])
    feature = row['Feature Name']
    baseline_lower, baseline_upper = row['GT Lower Bound'], row['GT Upper Bound']

    # Fixed parameters, positional order: alphas, alphak, beta_1, beta_2, gamma.
    # NOTE(review): presumably the most common winning combination from the
    # grid search — confirm and consider passing it in instead of hard-coding.
    best_results = gamma_method_modified(values_series, feature, 3, 15, 2, 3, 2)

    lower_bound_gamma = best_results['Lower Bound Modified Gamma'].iloc[0]
    upper_bound_gamma = best_results['Upper Bound Modified Gamma'].iloc[0]

    score = jaccard_index(lower_bound_gamma, upper_bound_gamma, baseline_lower, baseline_upper)

    best_gamma_records.append({
        'Dataset-ID': dataset_id,
        'Dataset Name': dataset,
        'Feature Name': feature,
        'Feature Data': values_series.tolist(),
        'Lower Bound Modified Gamma': lower_bound_gamma,
        'Upper Bound Modified Gamma': upper_bound_gamma,
        'GT Lower Bound': baseline_lower,
        'GT Upper Bound': baseline_upper,
        'Jaccard Score': score
    })

results_best_approach = pd.DataFrame(best_gamma_records)

results_best_approach


Unnamed: 0,Dataset-ID,Dataset Name,Feature Name,Feature Data,Lower Bound Modified Gamma,Upper Bound Modified Gamma,GT Lower Bound,GT Upper Bound,Jaccard Score
0,43342,German-House-Prices,Price,"[498000.0, 495000.0, 749000.0, 259000.0, 46900...",-1878278.92,4209131.10,0.0,15750000.0,0.238772
1,43358,Earthquakes-Data-NZ,longitude,"[174.9236145, 176.07489009999998, 168.484024, ...",-22.67,456.05,-180.0,180.0,0.318638
2,43358,Earthquakes-Data-NZ,_latitude,"[-40.47462845, -38.65804291, -44.22292328, -40...",-44.04,-33.18,-90.0,90.0,0.060333
3,43371,New-Cases-of-COVID-19-In-World-Countries,lat,"[23.7, 15.0, 37.0902, 29.1832, 24.974, 31.8257...",-26.82,98.69,-90.0,90.0,0.619111
4,43371,New-Cases-of-COVID-19-In-World-Countries,long,"[121.0, 101.0, -95.7129, 120.0934, 101.487, 11...",-122.96,257.55,-180.0,180.0,0.692401
...,...,...,...,...,...,...,...,...,...
365,43617,Medical-Appointment,creacion_hora_d,"[14, 11, 10, 8, 10, 11, 12, 19, 17, 12, 13, 16...",5.23,25.71,0.0,23.0,0.691171
366,43617,Medical-Appointment,creacion_hora_c,"[-0.87, -0.97, -0.87, -0.5, -0.87, -0.97, -1.0...",-1.58,1.03,-1.0,1.0,0.766284
367,43617,Medical-Appointment,latencia,"[18, 17, 20, 34, 17, 14, 21, 34, 24, 49, 35, 3...",-44.59,89.50,0.0,160.0,0.437460
368,43618,Credit-Card-Dataset-for-Clustering,CASH_ADVANCE_FREQUENCY,"[0.0, 0.25, 0.0, 0.083333, 0.0, 0.0, 0.0, 0.0,...",-0.27,0.74,0.0,1.0,0.582677


In [48]:
# Number of features evaluated with the fixed gamma parameters.
length_df = len(results_best_approach)

# Mean Jaccard score of the fixed-parameter gamma run (column sum / row count).
gamma_mean_opt = results_best_approach['Jaccard Score'].sum() / length_df

# Add the result as an extra column of the summary frame for side-by-side comparison.
df_eval['GAMMA OPT PARAMS'] = gamma_mean_opt

In [49]:
# Show the summary including the fixed-parameter gamma column.
df_eval

Unnamed: 0,IQR,STD,MOD_Z,GAMMA,GAMMA OPT PARAMS
mean jaccard coeff,0.358263,0.391481,0.334343,0.398435,0.356103
