In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy

# CSV with the glitch measurements, given relative to the notebook's working directory.
file_path = "../data_more_1p_glitches.csv"

# Read the table and replace every missing cell with the sentinel -1 in one step;
# the plotting cells below look for -1 to mark missing measurements.
df = pd.read_csv(file_path).fillna(-1)

# Columns that calc_correlations skips when correlating columns against `distance`.
META_KNOWLEDGE = (
    "distance",
    "num_states",
    "num_traces",
    "automaton",
    "glitched_delta_freq",
    "dominant_delta_freq",
)

In [None]:
# Scatter every non-metadata column against `distance`, one figure per column.
# Cells equal to the -1 sentinel are drawn in red so they stand apart from the
# real measurements.
distance = df['distance']

for column in df.columns:
    # Use the shared META_KNOWLEDGE tuple instead of a hand-copied list of the
    # same names, so the two exclusion lists cannot drift apart.
    if column in META_KNOWLEDGE:
        continue

    fig, ax = plt.subplots(figsize=(8, 5))

    is_minus_one = df[column] == -1
    is_not_minus_one = ~is_minus_one

    # actual values
    ax.scatter(distance[is_not_minus_one], df[column][is_not_minus_one],
               color='blue', label=f'{column}', alpha=0.7)

    # -1 sentinel
    ax.scatter(distance[is_minus_one], df[column][is_minus_one],
               color='red', label=f'{column} is None', alpha=0.7)

    # Reference line at distance 0.
    ax.axvline(0, color='red', linestyle='--', linewidth=0.8)
    ax.set_title(f"{column} vs Distance")
    ax.set_xlabel("Distance")
    ax.set_ylabel(column)
    ax.grid(alpha=0.4)
    # Pad the x-range by one unit on each side of the observed distances.
    ax.set_xlim(distance.min() - 1, distance.max() + 1)
    ax.legend()
    fig.tight_layout()
    plt.show()
    # Release the figure explicitly; this loop can create many of them.
    plt.close(fig)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# One colour per automaton, cycling through the tab20 palette when there are
# more automata than palette entries.
uuids = df['automaton'].unique()
colors = cm.tab20.colors
color_map = {key: colors[position % len(colors)] for position, key in enumerate(uuids)}

distance = df['distance']
x_limits = (distance.min() - 1, distance.max() + 1)

# Only the transition-frequency columns are drawn in this cell.
trans_freq_columns = [
    name for name in df.columns
    if name.endswith(("_dominant_trans_freq", "_glitch_trans_freq"))
]

for column in trans_freq_columns:
    plt.figure(figsize=(8, 5))

    for automaton_id in uuids:
        rows = df.loc[df['automaton'] == automaton_id]
        shade = color_map[automaton_id]

        xs = rows['distance']
        ys = rows[column]
        sentinel_mask = ys == -1
        value_mask = ~sentinel_mask

        # Connecting line through every row of this automaton (sentinels included).
        plt.plot(xs, ys, color=shade, alpha=0.6, label=f'UUID: {automaton_id}')

        # Real measurements, in the automaton's colour.
        plt.scatter(xs[value_mask], ys[value_mask],
                    color=shade, alpha=0.7, label=f'{column} (UUID: {automaton_id})')

        # -1 sentinels: red fill, outlined in the automaton's colour.
        plt.scatter(xs[sentinel_mask], ys[sentinel_mask],
                    color='red', edgecolor=shade, alpha=0.7,
                    label=f'{column} (-1, UUID: {automaton_id})')

    # Reference line at distance 0.
    plt.axvline(0, color='black', linestyle='--', linewidth=0.8)

    plt.title(f"{column} vs Distance")
    plt.xlabel("Distance")
    plt.ylabel(column)
    plt.grid(alpha=0.4)
    plt.xlim(*x_limits)

    plt.show()


In [None]:
def calc_correlations(method, threshold=None, data=None, exclude=None):
    """Print the mean per-automaton correlation of each column with ``distance``.

    For every column that is not excluded, the correlation between ``distance``
    and that column is computed separately inside each ``automaton`` group.
    Groups where the column takes a single value, or where the coefficient is
    NaN, are skipped. The remaining coefficients are averaged and printed.

    Args:
        method: Correlation coefficient forwarded to ``Series.corr``
            ("pearson", "spearman" or "kendall"). Previously this value was
            only printed in the header and Pearson was always computed.
        threshold: If given, only columns whose mean coefficient has an
            absolute value of at least ``threshold`` are printed. Columns with
            no usable coefficient are then omitted as well.
        data: Frame to analyse; must provide ``automaton`` and ``distance``
            columns. Defaults to the notebook-level ``df``.
        exclude: Column names to skip. Defaults to the notebook-level
            ``META_KNOWLEDGE``.

    Returns:
        None. The report is written to stdout.
    """
    frame = df if data is None else data
    skipped = META_KNOWLEDGE if exclude is None else exclude

    grouped = frame.groupby('automaton')
    final_correlations = {}

    for column in frame.columns:
        if column in skipped:
            continue

        correlations = []
        for _, group in grouped:
            # A column that is constant within the group has no defined correlation.
            if group[column].nunique() > 1:
                corr = group["distance"].corr(group[column], method=method)
                if not np.isnan(corr):
                    correlations.append(corr)

        # None marks "no group produced a usable coefficient".
        final_correlations[column] = np.mean(correlations) if correlations else None

    print(f"\n{method} correlations:\n".title())
    for column, corr in final_correlations.items():
        # A missing coefficient cannot satisfy a threshold; guarding on None also
        # avoids calling abs(None), which used to raise TypeError here.
        if threshold is not None and (corr is None or abs(corr) < threshold):
            continue
        print(f"{column}: {corr}")
        
# Print one report per correlation coefficient, hiding columns whose
# mean coefficient is below 0.4 in absolute value.
for corr_method in ("pearson", "spearman", "kendall"):
    calc_correlations(corr_method, threshold=0.4)


In [None]:
import ast
import scipy
from numpy import nan

def find_outliers(list_as_string: str, method: str, threshold: float = None):
    """Return the low-side outliers of a list stored as its string representation.

    Args:
        list_as_string: A Python list literal such as ``"[1, 2, 3]"``. Missing
            markers (``None``, the ``-1`` fill sentinel, the string ``"nan"``
            or any float NaN) yield an empty result.
        method: ``"iqr"`` keeps values below ``Q1 - 1.5 * IQR``;
            ``"zscore"`` keeps values whose z-score is below ``threshold``.
        threshold: Z-score cut-off; required for ``"zscore"`` and ignored for
            ``"iqr"``.

    Returns:
        A list with the outlying values, in their original order.

    Raises:
        NotImplementedError: If ``method`` is neither ``"iqr"`` nor ``"zscore"``.
        AssertionError: If ``method`` is ``"zscore"`` and ``threshold`` is None.
    """
    # Imported here so the submodule is loaded explicitly; a bare `import scipy`
    # does not guarantee that `scipy.stats` is available as an attribute.
    from scipy import stats

    # NaN never compares equal to itself, so test for it directly. A tuple
    # membership test only recognises the one `numpy.nan` object, and any other
    # NaN would reach ast.literal_eval and raise ValueError.
    is_float_nan = isinstance(list_as_string, float) and list_as_string != list_as_string
    if list_as_string is None or is_float_nan or list_as_string in ("nan", -1):
        return []

    freq_list = ast.literal_eval(list_as_string)

    if method == "iqr":
        if not freq_list:
            return []
        values = pd.Series(freq_list)
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        # Only the lower fence is applied: just unusually small values count.
        lower_bound = q1 - 1.5 * (q3 - q1)
        return [value for value in freq_list if value < lower_bound]

    if method == "zscore":
        # Validate before computing anything.
        assert threshold is not None, "threshold is required for the zscore method"
        if not freq_list:
            return []
        z_scores = stats.zscore(freq_list)
        return [value for value, z in zip(freq_list, z_scores) if z < threshold]

    raise NotImplementedError(f"{method} method not supported")

In [None]:
# Derive one outlier-list column per detection setting from the
# `dominant_delta_freq` lists: three z-score cut-offs plus the IQR rule.
dominant_freq_lists = df["dominant_delta_freq"]

zscore_cutoffs = {
    "outliers_zscore_m2": -2,
    "outliers_zscore_m1": -1,
    "outliers_zscore_m05": -0.5,
}
for target_column, cutoff in zscore_cutoffs.items():
    df[target_column] = dominant_freq_lists.apply(find_outliers, method="zscore", threshold=cutoff)

df["outliers_iqr"] = dominant_freq_lists.apply(find_outliers, method="iqr")


In [None]:
import matplotlib.pyplot as plt

# Number of outliers found in each row, per detection setting.
distances = df['distance']
outliers_zscore_count_m2 = df['outliers_zscore_m2'].apply(len)
outliers_zscore_count_m1 = df['outliers_zscore_m1'].apply(len)
outliers_zscore_count_m05 = df['outliers_zscore_m05'].apply(len)
outliers_iqr_count = df['outliers_iqr'].apply(len)

# (counts, colour, legend text) for each scatter layer, in drawing order.
scatter_layers = [
    (outliers_zscore_count_m2, 'darkblue', 'Z-score Outliers -2'),
    (outliers_zscore_count_m1, 'mediumblue', 'Z-score Outliers -1'),
    (outliers_zscore_count_m05, 'lightblue', 'Z-score Outliers -0.5'),
    (outliers_iqr_count, 'red', 'IQR Outliers'),
]

# Scatter plot: one point per row and detection setting.
plt.figure(figsize=(10, 6))
for counts, shade, legend_text in scatter_layers:
    plt.scatter(distances, counts, color=shade, alpha=0.7, label=legend_text)

plt.xlabel('Distance', fontsize=12)
plt.ylabel('Number of Outliers', fontsize=12)
plt.title('Number of Outliers vs. Distance', fontsize=14)
# Baseline at a count of zero.
plt.axhline(0, color='black', linewidth=0.5, linestyle='--')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

In [None]:
# For each detection setting, collect the distances of the rows that contain
# at least one outlier.
outliers_zscore_m2_distances = df.loc[df['outliers_zscore_m2'].apply(len) > 0, 'distance']
outliers_zscore_m1_distances = df.loc[df['outliers_zscore_m1'].apply(len) > 0, 'distance']
outliers_zscore_m05_distances = df.loc[df['outliers_zscore_m05'].apply(len) > 0, 'distance']
outliers_iqr_distances = df.loc[df['outliers_iqr'].apply(len) > 0, 'distance']

box_samples = [
    outliers_zscore_m2_distances,
    outliers_zscore_m1_distances,
    outliers_zscore_m05_distances,
    outliers_iqr_distances,
]
box_labels = [
    'Z-score Outliers -2',
    'Z-score Outliers -1',
    'Z-score Outliers -0.5',
    'IQR Outliers',
]

# One box per detection setting.
plt.figure(figsize=(10, 6))
plt.boxplot(box_samples, tick_labels=box_labels)

plt.ylabel('Distance', fontsize=12)
plt.title('Distribution of Distance for Outliers', fontsize=14)
plt.grid(alpha=0.3)
plt.show()

In [None]:
import numpy as np

distance_bins = df['distance'].round(1)  # Adjust binning if necessary


def _outliers_per_bin(column):
    """Total number of outliers stored in `column`, summed within each distance bin."""
    return df.groupby(distance_bins)[column].apply(
        lambda lists: sum(len(outliers) for outliers in lists)
    )


zscore_counts_m2 = _outliers_per_bin('outliers_zscore_m2')
zscore_counts_m1 = _outliers_per_bin('outliers_zscore_m1')
zscore_counts_m05 = _outliers_per_bin('outliers_zscore_m05')
iqr_counts = _outliers_per_bin('outliers_iqr')

x = np.arange(len(zscore_counts_m2.index))  # centre position of each bin's bar group
width = 0.8  # total width taken by one group of bars

# (counts, legend text, colour) for each bar series, left to right within a group.
bar_series = [
    (zscore_counts_m2, 'Z-score Outliers -2', 'darkblue'),
    (zscore_counts_m1, 'Z-score Outliers -1', 'mediumblue'),
    (zscore_counts_m05, 'Z-score Outliers -0.5', 'lightblue'),
    (iqr_counts, 'IQR Outliers', 'red'),
]
bar_width = width / len(bar_series)

plt.figure(figsize=(10, 6))
for position, (counts, legend_text, shade) in enumerate(bar_series):
    # Place the series side by side around the bin centre. Drawing them all at
    # the same x, with the IQR bars stacked on one z-score series only, made the
    # bars cover each other and the heights unreadable.
    offset = (position - (len(bar_series) - 1) / 2) * bar_width
    plt.bar(x + offset, counts, bar_width, label=legend_text, color=shade, alpha=0.7)

# Add labels, title, and legend
plt.xlabel('Distance', fontsize=12)
plt.ylabel('Number of Outliers', fontsize=12)
plt.title('Outliers by Distance', fontsize=14)
plt.xticks(ticks=x, labels=zscore_counts_m2.index, rotation=45)
plt.legend()
plt.grid(alpha=0.3)
plt.show()


In [None]:
import seaborn as sns
# Add one per-row outlier-count column to `df` for every outlier-list column.
count_sources = {
    'zscore_m2_outlier_count': 'outliers_zscore_m2',
    'zscore_m1_outlier_count': 'outliers_zscore_m1',
    'zscore_m05_outlier_count': 'outliers_zscore_m05',
    'iqr_outlier_count': 'outliers_iqr',
}
for count_column, list_column in count_sources.items():
    df[count_column] = df[list_column].apply(len)

heatmap_data = df[['distance', *count_sources]]

# Long format: one row per (distance, outlier type) pair.
heatmap_melted = heatmap_data.melt(
    id_vars='distance', var_name='Outlier Type', value_name='Count'
)

# Back to a distance x outlier-type grid. pivot_table aggregates with its
# default function, so rows sharing a distance are combined into one cell.
heatmap_pivot = heatmap_melted.pivot_table(
    index='distance', columns='Outlier Type', values='Count', fill_value=0
)

plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_pivot, cmap='coolwarm', annot=True, fmt='.2f', cbar=True)

plt.title('Heatmap of Outlier Counts vs. Distance', fontsize=14)
plt.ylabel('Distance', fontsize=12)
plt.xlabel('Outlier Type', fontsize=12)
plt.show()