In [16]:
import re
from ast import literal_eval

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

def add_missing_nodes(deg_weight, deg_unweight):
    """Pad the unweighted degree sequence with nodes that only appear in the weighted one.

    Parameters
    ----------
    deg_weight : list of (node, degree) pairs, or the string repr of such a list
    deg_unweight : list of (node, degree) pairs, or the string repr of such a list

    Returns
    -------
    list of (node, degree) pairs
        ``deg_unweight`` unchanged when both sequences already have the same
        length; otherwise a new list with ``(node, 0)`` appended for every node
        of ``deg_weight`` that is absent from ``deg_unweight``. Missing nodes
        are appended at the end, so the result is NOT guaranteed to follow the
        node order of ``deg_weight``.
    """
    # Parse both inputs up front: comparing lengths before parsing would compare
    # a node count against the character count of the string repr.
    if isinstance(deg_unweight, str):
        deg_unweight = literal_eval(deg_unweight)
    if isinstance(deg_weight, str):
        deg_weight = literal_eval(deg_weight)

    if len(deg_unweight) == len(deg_weight):
        return deg_unweight

    # Set lookup instead of a linear scan per node.
    known_nodes = {node for node, _ in deg_unweight}
    missing = [(node, 0) for node, _ in deg_weight if node not in known_nodes]
    # Build a new list so a list passed in by the caller is not mutated.
    return list(deg_unweight) + missing


# `literal_eval` only accepts Python literals, so the bare name `nan` has to be
# substituted before parsing. Match it only as a standalone token (optionally
# followed by the imaginary suffix, as in `nanj`) so that words inside text
# values, e.g. "banana", are left untouched.
_NAN_TOKEN = re.compile(r"\bnan(?=j?\b)")


def _parse_literal_cell(text):
    """Parse one CSV cell holding a Python literal, reading `nan` entries as 0."""
    return literal_eval(_NAN_TOKEN.sub("0", text))


data = pd.read_csv("data/distribution_statistics_degree_seq.csv")

# Columns stored in the CSV as string reprs of Python literals.
for list_cols in ['input_question', 'layers', 'heterogenity', 'eigen_values_adj', 'eigen_values_lap']:
    data[list_cols] = data[list_cols].apply(_parse_literal_cell)

# Degree sequences: parse the weighted one, then pad the unweighted one with
# zero-degree entries for nodes it is missing.
data['deg_weight'] = data['deg_weight'].apply(literal_eval)
data['deg_unweight'] = data.apply(lambda x: add_missing_nodes(x.deg_weight, x.deg_unweight), axis=1)

# Distinct experiment settings, in order of first appearance in the file.
model_list = data["model"].unique()
few_shot_list = data["few_shot"].unique()
threshold_list = data["threshold"].unique()

# For every eigenvalue list in a row: absolute values sorted ascending.
data["sorted_eigen_values_adj"] = data['eigen_values_adj'].apply(
    lambda x: [sorted([abs(e) for e in list_eigen]) for list_eigen in x]
)
# Spectral gap = largest minus second-largest absolute eigenvalue.
# Indexing [-2] means every eigenvalue list must hold at least two values.
data["spectral_gap_adj"] = data['sorted_eigen_values_adj'].apply(
    lambda x: [list_eigen[-1] - list_eigen[-2] for list_eigen in x]
)



In [18]:
# Set to True to draw one Heterogeneity / Spectral Gap figure per row while
# debugging. Off by default: it opens one figure for every row of `data`.
SHOW_PER_ROW_PLOTS = False

# Columns read for every row, in the order they are unpacked in the loop below.
_ROW_COLUMNS = ["input_question", "layers", "heterogenity", "spectral_gap_adj", "equation", "difficulty"]

analyze_plot = []
for model in model_list:
    for few_shot in few_shot_list:
        for threshold in threshold_list:
            filtered_data = data[
                (data["model"] == model)
                & (data["few_shot"] == few_shot)
                & (data["threshold"] == threshold)
            ]
            row_values = zip(*(filtered_data[col].tolist() for col in _ROW_COLUMNS))
            for questions, layers, heterogenity_value, spectral_gap, equation, difficulty in row_values:
                # Standard deviation of the first-order differences of each series.
                heterogeneity_std_dev = np.std(np.diff(heterogenity_value))
                spectral_gap_std_dev = np.std(np.diff(spectral_gap))

                if SHOW_PER_ROW_PLOTS:
                    fig, (hetero, eigen) = plt.subplots(figsize=(12, 4), ncols=2)

                    hetero.plot(layers, heterogenity_value)
                    hetero.set_ylabel("Heterogeneity")
                    hetero.set_xlabel("Layers")
                    hetero.set_title(heterogeneity_std_dev)

                    eigen.plot(layers, spectral_gap)
                    eigen.set_ylabel("Spectral Gap")
                    eigen.set_xlabel("Layers")
                    eigen.set_title(spectral_gap_std_dev)

                    fig.suptitle(f"{model} | {few_shot}| {threshold} | {questions}")
                    plt.show()
                    # Release the figure; otherwise one stays open per row.
                    plt.close(fig)

                analyze_plot.append({
                    "model": model,
                    "few_shot": few_shot,
                    "threshold": threshold,
                    "difficulty": difficulty,
                    "equation": equation,
                    "questions": questions,
                    "std_heterogenity": heterogeneity_std_dev,
                    "std_spectral_gap": spectral_gap_std_dev,
                })

# One record per row of `data`, built once from the accumulated list.
df = pd.DataFrame(analyze_plot)

In [None]:
def _plot_std_panels(frame, x, hue, suptitle, figsize=(10, 4), legend_title=None):
    """Draw the two-panel scatter figure used throughout this cell.

    Left panel: ``std_heterogenity`` against ``x``; right panel:
    ``std_spectral_gap`` against ``x``. Both panels are coloured by ``hue``.

    Parameters
    ----------
    frame : DataFrame containing ``x``, ``hue``, ``std_heterogenity`` and
        ``std_spectral_gap`` columns.
    x : column name for the horizontal axis.
    hue : column name used for colouring.
    suptitle : figure title.
    figsize : figure size passed to ``plt.subplots``.
    legend_title : optional title for the legend.
    """
    fig, (ax0, ax1) = plt.subplots(figsize=figsize, ncols=2)

    # Both panels use the same hue column, so only the right panel gets a
    # legend. Suppressing it at draw time avoids calling .remove() on a legend
    # that may not exist.
    sns.scatterplot(data=frame, x=x, y="std_heterogenity", hue=hue, legend=False, ax=ax0)
    sns.scatterplot(data=frame, x=x, y="std_spectral_gap", hue=hue, ax=ax1)

    ax0.set_title("Heterogenity")
    ax0.set_ylabel("std of change in Heterogenity")
    ax1.set_title("Spectral Gap")
    ax1.set_ylabel("std of change in Spectral Gap")

    fig.suptitle(suptitle)
    # Place the legend just outside the right edge of the right panel.
    ax1.legend(title=legend_title, loc=(1.04, 0))

    plt.show()
    plt.close(fig)


# Whole-dataset views: std of change against the number of few-shot examples.
_plot_std_panels(df, x="few_shot", hue="model", suptitle="Model")
_plot_std_panels(df, x="few_shot", hue="difficulty", suptitle="Difficulty")
_plot_std_panels(df, x="few_shot", hue="questions", suptitle="Question")

# Per-model views: std of change against difficulty, one hue column at a time.
# Each entry is (hue column, figure title template, legend title).
_PER_MODEL_VIEWS = [
    ("equation", "Equation type | {model}", None),
    ("few_shot", "Few Shot {model}", "Number of Few Shot: "),
    ("questions", "Question | {model}", "Question"),
]
for hue_column, title_template, legend_title in _PER_MODEL_VIEWS:
    for model in model_list:
        _plot_std_panels(
            df[df["model"] == model],
            x="difficulty",
            hue=hue_column,
            suptitle=title_template.format(model=model),
            figsize=(14, 4),
            legend_title=legend_title,
        )

In [None]:
def get_degree_sequence(degree_sequence_1, degree_sequence_2):
    """Compare two degree sequences node by node.

    Parameters
    ----------
    degree_sequence_1, degree_sequence_2 : mapping of node -> degree, or an
        iterable of ``(node, degree)`` pairs.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Node`` with columns ``Degree1``, ``Degree2`` and
        ``Degree Difference`` (``Degree1 - Degree2``). A node present in only
        one sequence gets degree 0 in the other.
    """
    # dict() accepts both mappings and (node, degree) pair lists.
    df1 = pd.DataFrame(list(dict(degree_sequence_1).items()), columns=['Node', 'Degree1']).set_index('Node')
    df2 = pd.DataFrame(list(dict(degree_sequence_2).items()), columns=['Node', 'Degree2']).set_index('Node')

    # Outer join keeps every node from either side; absent degrees become 0.
    df_diff = df1.join(df2, how='outer').fillna(0)

    df_diff['Degree Difference'] = df_diff['Degree1'] - df_diff['Degree2']
    return df_diff

In [None]:
def _align_degree_lists(weighted, unweighted):
    """Return (weighted, unweighted) pair lists covering the same nodes in the same order.

    Nodes follow the order of ``weighted``, then any node found only in
    ``unweighted``. A node absent from one list gets degree 0 there.
    """
    weighted_by_node = dict(weighted)
    unweighted_by_node = dict(unweighted)
    nodes = list(weighted_by_node) + [n for n in unweighted_by_node if n not in weighted_by_node]
    return (
        [(n, weighted_by_node.get(n, 0)) for n in nodes],
        [(n, unweighted_by_node.get(n, 0)) for n in nodes],
    )


# Multi-column explode pairs list elements by POSITION, and `add_missing_nodes`
# appends missing nodes at the end of `deg_unweight`. Align both lists by node
# first so each row carries the two degrees of the same node.
aligned_pairs = [
    _align_degree_lists(weighted, unweighted)
    for weighted, unweighted in zip(data["deg_weight"], data["deg_unweight"])
]
aligned = data.assign(
    deg_weight=pd.Series([w for w, _ in aligned_pairs], index=data.index, dtype=object),
    deg_unweight=pd.Series([u for _, u in aligned_pairs], index=data.index, dtype=object),
)

# Expand both `deg_unweight` and `deg_weight` so that each node gets its own row
expanded_df = aligned.explode(['deg_unweight', 'deg_weight']).reset_index(drop=True)

# Split the (node, degree) tuples; non-tuple entries (e.g. NaN produced by an
# empty list) are passed through unchanged.
expanded_df['node'] = [p[0] if isinstance(p, tuple) else p for p in expanded_df['deg_unweight']]
expanded_df['degree_unweighted'] = [p[1] if isinstance(p, tuple) else p for p in expanded_df['deg_unweight']]
expanded_df['degree_weighted'] = expanded_df['deg_weight'].apply(lambda x: x[1] if isinstance(x, tuple) else x)

# Drop the now-unnecessary original `deg_unweight` column
expanded_df = expanded_df.drop(columns=['deg_unweight'])
print(expanded_df)

# Group by parameters and node, and calculate summary statistics for both degree types
#grouped_stats = expanded_df.groupby(['few_shot', 'input_question', 'difficulty', 'equation', 'threshold', 'node'])[['degree_unweighted', 'degree_weighted']].agg(['mean', 'std', 'min', 'max'])

# Display the comparison DataFrame
#print(grouped_stats)