In [None]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway

# Load the long-format metrics table; the plots below rely on the columns
# 'per_trace_metric', 'value' and 'trace_section'.
df = pd.read_csv("data/per_trace_metrics.csv")

print(df.head(10))

# The metrics are split over three figures: everything except the five
# listed metrics, then 'zero_crossing' alone, then the remaining four.
# NOTE(review): the shared title mentions "means" and "first, second, third",
# while each figure is a box plot grouped by trace_section — confirm wording.
_overview_filters = (
    "per_trace_metric not in ['zero_crossing', 'peak_count', 'kurtosis', 'shannon_entropy', 'psd_high_ratio']",
    "per_trace_metric in ['zero_crossing']",
    "per_trace_metric in ['peak_count', 'kurtosis', 'shannon_entropy', 'psd_high_ratio']",
)

for _overview_filter in _overview_filters:
    plt.figure(figsize=(12, 6))
    sns.boxplot(
        data=df.query(_overview_filter),
        x='per_trace_metric',
        y='value',
        hue='trace_section',
    )
    plt.xticks(rotation=90)
    plt.title("per_trace_metric means across first, second, third")
    plt.show()

# One-way ANOVA per metric: test whether each per_trace_metric differs significantly between trace sections
def anova_and_visualize(dataframe, sections=('heavy_1', 'sleep_1', 'heavy_2', 'sleep_2', 'heavy_3')):
    """Run a one-way ANOVA per metric across trace sections and box-plot every metric.

    For each distinct 'per_trace_metric' the 'value' column is split by
    'trace_section' and the samples are passed to scipy.stats.f_oneway. The
    p-values are displayed as a table sorted ascending, then one box plot per
    metric is drawn in that same order. No significance threshold is applied:
    every metric is listed and plotted.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Long-format table with 'per_trace_metric', 'trace_section' and
        'value' columns.
    sections : sequence of str, optional
        Trace sections compared by the ANOVA. Defaults to the five sections
        this analysis was originally written for.

    Returns
    -------
    None
        Output is produced through print/display and matplotlib figures.

    Notes
    -----
    Sections that have no rows for a metric are left out of that metric's
    test instead of being handed to f_oneway as empty samples (which yields
    NaN). When fewer than two sections have data, the p-value is recorded as
    NaN. The box plots show whatever sections are present in ``dataframe``;
    they are not restricted to ``sections``.
    """
    features = []
    for per_trace_metric in dataframe['per_trace_metric'].unique():
        # Filter by metric once instead of rebuilding a full-frame mask per section.
        metric_rows = dataframe[dataframe['per_trace_metric'] == per_trace_metric]
        samples = [
            metric_rows.loc[metric_rows['trace_section'] == trace_section, 'value']
            for trace_section in sections
        ]
        samples = [sample for sample in samples if len(sample) > 0]

        if len(samples) >= 2:
            _, p_val = f_oneway(*samples)
        else:
            # ANOVA needs at least two groups; keep the metric but mark it untestable.
            p_val = float('nan')
        features.append((per_trace_metric, p_val))

    # sort_values places NaN p-values last by default.
    significant_dataframe = pd.DataFrame(features, columns=['per_trace_metric', 'p_value']).sort_values('p_value')
    print("Significant differentiating per_trace_metrics:")
    display(significant_dataframe)

    # Plot every metric, most significant first.
    top_per_trace_metrics = significant_dataframe['per_trace_metric']
    for per_trace_metric in top_per_trace_metrics:
        plt.figure(figsize=(8, 4))
        sns.boxplot(x='trace_section', y='value', data=dataframe[dataframe['per_trace_metric'] == per_trace_metric])
        plt.title(f"per_trace_metric: {per_trace_metric}")
        plt.show()

# print("Full dataset")
# anova_and_visualize(df)

# Repeat the ANOVA + box plots separately for each category subset.
for _heading, _category_key in (("Category 1", "category_1"), ("Category 2", "category_2")):
    print(_heading)
    anova_and_visualize(df[df["category"] == _category_key])

In [None]:
import numpy as np
# Lift the pandas display limits so the tables in this cell print in full.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

# ***** Cohen's d per capacitor and metric
# Collapse the numbered sections into two states so heavy and sleep can be
# compared directly.
section_to_state = {
    **dict.fromkeys(('heavy_1', 'heavy_2', 'heavy_3'), 'heavy'),
    **dict.fromkeys(('sleep_1', 'sleep_2'), 'sleep'),
}

category_1 = df[df["category"] == "category_1"].assign(
    state=lambda frame: frame['trace_section'].map(section_to_state)
)

rows = []

for (cap_name, metric_name), cell_rows in category_1.groupby(['capacitor_name', 'per_trace_metric']):
    heavy_values = cell_rows.loc[cell_rows['state'] == 'heavy', 'value']
    sleep_values = cell_rows.loc[cell_rows['state'] == 'sleep', 'value']

    # A sample standard deviation needs at least two observations per state.
    if len(heavy_values) <= 1 or len(sleep_values) <= 1:
        continue

    # NOTE(review): this is the unweighted root-mean-square of the two SDs;
    # the classical pooled SD weights each variance by (n - 1). The two agree
    # only when both states have the same sample count — confirm this variant
    # is intended.
    pooled_sd = np.sqrt(
        (heavy_values.std(ddof=1) ** 2 + sleep_values.std(ddof=1) ** 2) / 2
    )

    # Skips zero spread and also NaN (any comparison with NaN is False).
    if not pooled_sd > 0:
        continue

    rows.append((cap_name, metric_name, (heavy_values.mean() - sleep_values.mean()) / pooled_sd))

effect_df = pd.DataFrame(rows, columns=['capacitor_name', 'per_trace_metric', 'cohens_d'])
# Largest effects first, regardless of sign.
effect_df = effect_df.sort_values('cohens_d', key=np.abs, ascending=False)

display(effect_df)
# ***** Cohen's d per capacitor and metric




# ***** Average cohen's d per metric
# Average the effect-size magnitude over capacitors, one row per metric,
# strongest metric first.
_abs_effects = effect_df.assign(abs_cohens_d=lambda frame: frame['cohens_d'].abs())
_mean_abs_by_metric = _abs_effects.groupby('per_trace_metric', as_index=False).agg(
    mean_abs_cohens_d=('abs_cohens_d', 'mean'),
)
metric_effect_df = _mean_abs_by_metric.sort_values('mean_abs_cohens_d', ascending=False)

display(metric_effect_df)
# ***** Average cohen's d per metric




# ***** Ranking capacitors total cohen's d per selected metrics
def rank_capacitors_per_selected_metrics(eff_df, sel_metrics):
    """Rank capacitors by |Cohen's d| on each selected metric and display the ranks.

    Parameters
    ----------
    eff_df : pandas.DataFrame
        Long-format table with 'capacitor_name', 'per_trace_metric' and
        'cohens_d' columns. Each (capacitor, metric) pair may appear at most
        once, because the table is pivoted.
    sel_metrics : list-like of str
        Metric names to include in the ranking.

    Returns
    -------
    None
        The selection is printed and the rank table is shown via display().

    Notes
    -----
    Rank 1 is the largest absolute effect within a metric; ties share the
    lowest rank (method='min'). 'mean_rank' averages a capacitor's ranks over
    the selected metrics, and the table is sorted by it ascending.
    """
    # Echo the argument itself rather than a same-named notebook global, so
    # the printed selection always matches the table that follows.
    print(sel_metrics)

    filtered = eff_df[eff_df['per_trace_metric'].isin(sel_metrics)]

    # One row per capacitor, one column per selected metric.
    pivot_df = filtered.pivot(
        index='capacitor_name',
        columns='per_trace_metric',
        values='cohens_d',
    )

    rank_df = pivot_df.abs().rank(
        axis=0,
        ascending=False,
        method='min',
    )

    rank_df['mean_rank'] = rank_df.mean(axis=1)
    ranked_captors = rank_df.sort_values('mean_rank')
    display(ranked_captors)

# Rank the capacitors for three progressively wider metric selections.
_metric_selections = (
    ['zero_crossing', 'iqr', 'mad'],
    ['zero_crossing', 'iqr', 'mad', 'env_mean', 'rms_mean'],
    ['zero_crossing', 'iqr', 'mad', 'env_mean', 'rms_mean', 'psd_high_ratio', 'psd_mean'],
)

# `selected_metrics` is rebound on every pass, so after the loop it holds
# the widest selection.
for selected_metrics in _metric_selections:
    rank_capacitors_per_selected_metrics(eff_df=effect_df, sel_metrics=selected_metrics)

# ***** Ranking capacitors total cohen's d per selected metrics




# # ***** Cohen's d per metric only
# rows = []

# for metric, sub in category_1.groupby('per_trace_metric'):
#     heavy = sub[sub['state'] == 'heavy']['value']
#     sleep = sub[sub['state'] == 'sleep']['value']

#     if len(heavy) > 1 and len(sleep) > 1:
#         heavy_std = heavy.std(ddof=1)
#         sleep_std = sleep.std(ddof=1)

#         pooled_std = np.sqrt((heavy_std**2 + sleep_std**2) / 2)

#         if pooled_std > 0:
#             d = (heavy.mean() - sleep.mean()) / pooled_std
#             rows.append((metric, d))

# effect_df = (
#     pd.DataFrame(rows, columns=['per_trace_metric', 'cohens_d'])
#     .sort_values('cohens_d', key=np.abs, ascending=False)
# )

# display(effect_df)
# # ***** Cohen's d per metric only



# Put the pandas display limits changed earlier in this cell back to their defaults.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.reset_option(_display_option)

In [None]:
# ANOVA + box plots for two hand-picked capacitors.
category_1 = df[df["category"] == "category_1"].copy()
capacitors = list(category_1["capacitor_name"].unique())

# NOTE(review): `category_1` and `capacitors` are not used by the loop below,
# which filters the full `df` for two fixed names — confirm whether the
# category_1 subset (or the full capacitor list) was meant here.
for capacitor_name in ("C21", "C53"):
    print(capacitor_name)
    anova_and_visualize(df.loc[df["capacitor_name"] == capacitor_name])
