In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os as os

In [None]:
# --- Notebook configuration -------------------------------------------------
# Folder holding the result CSVs and the column used as "disconnected pairs".
source_folder = '../csv_data/08-12/'
disconnect_pairs = "continuous_disconnected_pairs"

# Each entry is one subgraph split that was tested.
subgraph_percentages_tests = [
    [0.6, 0.4],
    [0.25, 0.25, 0.25, 0.25],
    [0.75, 0.25],
    [0.25, 0.75],
    [0.33, 0.33, 0.34],
]
prune_quantiles = [0.25, 0.5, 0.75, 1]

# Underscore-joined label per split (e.g. "0.6_0.4"); used as the category
# order of the "percentile" column in the plots below.
order = ['_'.join(str(share) for share in split) for split in subgraph_percentages_tests]

In [None]:
# Load every CSV in source_folder and group the resulting frames by measure.
# File names are parsed as <city>_<measure>_<percentile parts...>.csv.
measureDict = {}

# os.listdir returns entries in arbitrary order; sorting keeps the per-measure
# frame lists (and the preview at the end of this cell) reproducible.
file_names = sorted(
    file_name for file_name in os.listdir(source_folder) if file_name.endswith('.csv')
)

for file_name in file_names:
    # Strip only the trailing extension, then split the stem into its parts
    stem = os.path.splitext(file_name)[0]
    parts = stem.split('_')
    if len(parts) < 3:
        # Not enough parts to hold city, measure and at least one percentile
        print("skipping file with unexpected name: " + file_name)
        continue
    city = parts[0]  # The first part is the city
    measure = parts[1]  # The second part is the measure

    # The rest are the percentiles, kept as one underscore-joined label
    percentiles = '_'.join(parts[2:])
    df = pd.read_csv(os.path.join(source_folder, file_name))
    df['percentile'] = percentiles
    measureDict.setdefault(measure, []).append(df)

# Report how many files were loaded for each measure
for measure_name, frames in measureDict.items():
    print(measure_name)
    print(len(frames))

measureDict["random"][0].head()

In [None]:

# One figure per measure:
#   bars (left axis)        -> mean sum_of_errors per percentile split
#   red points (right axis) -> mean disconnected pairs per percentile split
for measure in measureDict:
    # Every per-file frame carries its own row labels; renumber them so the
    # merged frame has unique labels.
    mergedPercentileDf = pd.concat(measureDict[measure], ignore_index=True)
    mergedPercentileDf['mean_error'] = mergedPercentileDf.groupby('percentile')['sum_of_errors'].transform('mean')
    mergedPercentileDf['mean_disconnected_pairs'] = mergedPercentileDf.groupby('percentile')[disconnect_pairs].transform('mean')

    # Ordered categorical so the x-axis follows `order`; labels that are not
    # listed in `order` become NaN here.
    mergedPercentileDf["percentile"] = pd.Categorical(
        mergedPercentileDf["percentile"],
        categories=order,
        ordered=True,
    )

    # Create a figure and axes
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Bar chart for mean_error. The value is constant within each percentile,
    # so bootstrapped error bars carry no information and are switched off.
    sns.barplot(
        data=mergedPercentileDf,
        x="percentile",
        y="mean_error",
        ax=ax1,
        hue="percentile",
        errorbar=None,
    )
    ax1.set_ylabel("Mean Error", fontsize=12)
    ax1.set_xlabel("Percentile", fontsize=12)
    ax1.tick_params(axis="y", labelsize=10)

    # Add a secondary y-axis for mean_disconnected_pairs
    ax2 = ax1.twinx()

    # One value per percentile category. `observed` is passed explicitly
    # because grouping on a categorical without it warns in recent pandas;
    # False keeps the behaviour the original call relied on.
    mean_disconnected = (
        mergedPercentileDf.groupby("percentile", observed=False)["mean_disconnected_pairs"].mean()
    )

    # Plot points on top of bars
    sns.scatterplot(
        x=mean_disconnected.index,
        y=mean_disconnected.values,
        ax=ax2,
        color="red",
        s=100,
        label="Avg. Disconnected Pairs",
        zorder=5,
    )

    ax2.set_ylabel("Avg. Disconnected Pairs", fontsize=12, color="black")
    ax2.tick_params(axis="y", labelcolor="black", labelsize=10)

    # Improve layout
    ax1.grid(axis="y", linestyle="--", alpha=0.7)
    ax1.set_title(measure+": Mean Error and Disconnected Pairs by Percentile ", fontsize=14)

    # Add legend for the scatter points
    ax2.legend(loc="upper right", fontsize=10)

    # Show the plot, then release the figure so the loop does not pile them up
    plt.tight_layout()
    plt.show()
    plt.close(fig)

In [None]:
# Bar chart of the mean error per percentile split, one figure per measure.
# TODO: add a line for mean_disconnected_pairs on a second y-axis.
for measure in measureDict:
    mergedPercentileDf = pd.concat(measureDict[measure])

    # Per-percentile means, broadcast back onto every row
    by_percentile = mergedPercentileDf.groupby('percentile')
    error_means = by_percentile['sum_of_errors'].transform('mean')
    disconnected_means = by_percentile[disconnect_pairs].transform('mean')
    mergedPercentileDf['mean_error'] = error_means
    mergedPercentileDf['mean_disconnected_pairs'] = disconnected_means

    # Ordered categorical so the hue levels follow `order`
    ordered_percentile = pd.Categorical(
        mergedPercentileDf["percentile"], categories=order, ordered=True
    )
    mergedPercentileDf["percentile"] = ordered_percentile

    bar_grid = sns.catplot(
        data=mergedPercentileDf,
        kind="bar",
        y="mean_error",
        hue="percentile",
        errorbar=None,
    )
    bar_grid.set_axis_labels(measure, "Mean error")

In [None]:
# Sanity check on a single result file: distribution and mean of sum_of_errors.
meantestdf = pd.read_csv(os.path.join(source_folder, "copenhagen_betweenness_0.33_0.33_0.34.csv"))
# Inspect the file that was just loaded. The previous code printed `df`, the
# loop variable left over from the loading cell, i.e. some other CSV.
print(meantestdf["sum_of_errors"].value_counts())

#get mean
mean = meantestdf['sum_of_errors'].mean()
print("sanity check that betweeness 0.33_0.33_0.34 mean is in ballpark: ", mean)


In [None]:
# Count the rows of the sanity-check frame that belong to prune_quantile 0.5
# and compare against the full frame's shape.
is_half_quantile = meantestdf["prune_quantile"] == 0.5
dftest = meantestdf.loc[is_half_quantile].copy()
numberofrows = len(dftest)
print("number of rows in copenhagen_betweenness_0.33_0.33_0.34.csv where prune_quantile is 0.5: ", numberofrows)
print(meantestdf.shape)

In [None]:
# def calculate_custom_bins(data, num_bins=10, method="equal"):
#     if method == "equal":
#         # Equal-width bins
#         min_val, max_val = data.min(), data.max()
#         bins = np.linspace(min_val, max_val, num_bins + 1)
#     elif method == "quantile":
#         # Equal-count bins
#         bins = np.quantile(data, np.linspace(0, 1, num_bins + 1))
#     else:
#         raise ValueError("Invalid method. Choose 'equal' or 'quantile'.")
#     return bins

def erros_vs_discon_points(df, measure, string_p, prune_quantile=0.5):
    """Save a histogram of ``sum_of_errors`` overlaid with disconnected pairs per bin.

    The errors are cut into 10 equal-width bins. The histogram is drawn on the
    left axis; on the right axis a line shows, for every non-empty bin, the
    mean of the ``disconnect_pairs`` column plotted at the mean error of the
    rows in that bin. The figure is written as a PNG below ``plots/`` and then
    closed; nothing is shown inline.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sum_of_errors`` and the column named by the
        module-level ``disconnect_pairs``. The frame is not modified.
    measure : object
        Used only in the title, file name and log message (via ``str``).
    string_p : str
        Percentile label; used only in the title, file name and log message.
    prune_quantile : float, default 0.5
        Used only for labelling. The function does NOT filter ``df`` by it;
        the caller is expected to pass rows that are already filtered.

    Returns
    -------
    None
    """
    plot_label = str(measure) + " and " + string_p + " Prune Quantile: " + str(prune_quantile)

    if df.empty:
        # pd.cut raises "Cannot cut empty array" on empty input, so there is
        # nothing meaningful to plot for this combination.
        print("no rows for " + plot_label + ", plot skipped")
        return

    # Bin the data
    bins = 10  # Number of bins
    # assign() works on a copy, so the caller's frame does not gain a column
    binned_source = df.assign(error_bin=pd.cut(df["sum_of_errors"], bins=bins))

    # Aggregate disconnected pairs per bin. "bin_center" is the mean error of
    # the rows in the bin, not the geometric centre of the interval.
    binned_data = (
        binned_source.groupby("error_bin", observed=True)
        .agg(
            avg_disconnected=(disconnect_pairs, "mean"),
            bin_center=("sum_of_errors", "mean"),
        )
        .reset_index()
    )

    # Create the plot
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Plot the histogram of sum_of_errors on the primary y-axis
    # (a histogram does not depend on row order, so no sorting is needed)
    sns.histplot(df["sum_of_errors"], bins=bins, alpha=0.5, ax=ax1, label="Histogram of Errors")
    ax1.set_xlabel("Sum of Errors")
    ax1.set_ylabel("Frequency of Errors", color="black")
    ax1.tick_params(axis="y", labelcolor="black")

    # Create a secondary y-axis for disconnected points per bin
    ax2 = ax1.twinx()
    ax2.plot(
        binned_data["bin_center"],
        binned_data["avg_disconnected"],
        marker="o",
        label="Avg. Disconnected Pairs",
        color="blue",
    )
    ax2.set_ylabel("Avg. Disconnected Pairs", color="black")
    ax2.tick_params(axis="y", labelcolor="blue")

    # Add legend
    ax1.legend(loc="upper left", fontsize=10)
    ax2.legend(loc="upper right", fontsize=10)

    ax2.set_title("Disconnected Points vs. Histogram of Sum of Errors with " + plot_label)
    plt.tight_layout()

    # Save the plot; create the target folder first so savefig cannot fail on
    # a fresh checkout.
    # NOTE(review): the file name contains ": ", which is not a valid file
    # name on Windows; left unchanged to keep existing output paths stable.
    output_dir = "plots/Disconnected_Points_vs._Histogram_of_Sum_of_Errors with"
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(output_dir + "/" + plot_label + ".png")
    print("plot saved for " + plot_label)
    plt.close(fig)
# d = meantestdf.copy()

# print("d: " + str(d["sum_of_errors"].value_counts()))
# df =d[d["prune_quantile"] == 0.5].copy()
# #print count unique values in column
# print("df: " + str(df["sum_of_errors"].value_counts()))
# df.head()
# #erros_vs_discon_points(df=df, measure="Betweenness", string_p="0.33_0.33_0.34", prune_quantile=0.5)
# # df["error_bin"] = pd.cut(df["sum_of_errors"], bins=10)
# # df.head()
# # print(df["sum_of_errors"].describe())
# # print(df["error_bin"].value_counts())
# #df.shape


In [None]:
# Save one error-vs-disconnected-pairs figure for every
# (measure, percentile split, prune quantile) combination.
for measure, frames in measureDict.items():
    for frame in frames:
        if frame.empty:
            # No rows means no percentile label to read and nothing to plot
            continue
        # Positional access: does not depend on the frame having a row label 0
        percentile = frame["percentile"].iloc[0]
        for prune_quantile in prune_quantiles:
            pruned = frame[frame["prune_quantile"] == prune_quantile].copy()
            if pruned.empty:
                # Binning an empty selection raises inside the plot helper
                print("no rows for", measure, percentile, prune_quantile, "- skipped")
                continue
            erros_vs_discon_points(df=pruned, measure=measure, string_p=percentile, prune_quantile=prune_quantile)

In [None]:
# Preview the first rows of all betweenness frames stacked together
betweenness_frames = measureDict["betweenness"]
pd.concat(betweenness_frames).head()

In [None]:
def heatmap(df):
    """Show an annotated heatmap of the pairwise correlations in ``df``.

    Only numeric and boolean columns take part in the correlation. Recent
    pandas versions raise when ``DataFrame.corr`` meets a non-numeric column,
    so such columns are left out up front instead of failing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated. It is not modified.

    Returns
    -------
    None
    """
    # Calculate the correlation matrix on the numeric part of the frame
    numeric_df = df.select_dtypes(include=["number", "bool"])
    corr_matrix = numeric_df.corr()

    # Plot the heatmap
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
    ax.set_title("Correlation Heatmap of Variables")
    plt.show()

    
def change_strings_to_numbers(df):
    """Replace the percentile labels in ``df['percentile']`` with integer codes.

    Mapping: '0.75_0.25' -> 1, '0.25_0.75' -> 2, '0.25_0.25_0.25_0.25' -> 3,
    '0.33_0.33_0.34' -> 4. Any label outside this mapping becomes NaN.

    The column is replaced in place on the frame that is passed in, and that
    same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``percentile`` column of label strings.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    # A mapping for the measure column was sketched here but never enabled:
    #df['measure'] = df['measure'].map({'betweenness': 1, 'closeness': 2, 'random': 3,})

    # NOTE(review): '0.6_0.4' has no code and therefore maps to NaN;
    # confirm whether it should get one.
    percentile_codes = {
        '0.75_0.25': 1,
        # The key used to be '_0.25_0.75'; the stray leading underscore meant
        # the label never matched and was turned into NaN.
        '0.25_0.75': 2,
        '0.25_0.25_0.25_0.25': 3,
        '0.33_0.33_0.34': 4,
    }
    df['percentile'] = df['percentile'].map(percentile_codes)
    return df
# Correlation heatmap over the stacked betweenness frames, after removing the
# columns that are not needed for it.
columns_to_drop = [
    'bikengrowth_disconnected_pairs',
    'continuous_abstract_edges',
    'bikengrowth_abstract_edges',
    'bikengrowth_vertices',
    'percentile',
]
heatdf = pd.concat(measureDict["betweenness"]).drop(columns=columns_to_drop)
heatmap(heatdf)


A correlation of 0.85 suggests a strong positive relationship.
A correlation of -0.75 indicates a strong negative relationship.
A correlation near 0 implies no linear dependence.

In [None]:

def density_bell_error(df, percentile):
    """Show KDE curves of ``sum_of_errors``, one filled curve per ``prune_quantile``.

    ``percentile`` is only appended to the figure title; ``df`` is plotted
    exactly as passed in.
    """
    kde_options = dict(x="sum_of_errors", hue="prune_quantile", kind="kde", fill=True)
    sns.displot(data=df, **kde_options)
    title = "Density Plot of Sum of Errors by prune quantile for percentile: " + percentile
    plt.title(title)
    plt.show()
# Single-file example first, then one density plot per (measure, percentile).
density_bell_error(meantestdf, "0.33_0.33_0.34")

for measure in measureDict:
    # Renumber rows so the merged frame has unique row labels
    mergedPercentileDf = pd.concat(measureDict[measure], ignore_index=True)
    for percentile in order:
        # Plot only the rows of this percentile. Passing the whole merged
        # frame produced the same figure for every percentile, with only the
        # title changing.
        percentile_df = mergedPercentileDf[mergedPercentileDf["percentile"] == percentile]
        if percentile_df.empty:
            # No file was loaded for this split under this measure
            continue
        density_bell_error(percentile_df, percentile)

In [None]:
def denisity_bell_errors_hue_percentile(df, measure, prune_quantile):
    """Show KDE curves of ``sum_of_errors``, one filled curve per ``percentile``.

    ``measure`` and ``prune_quantile`` are only used to build the title;
    ``df`` is plotted exactly as passed in.
    """
    kde_options = dict(x="sum_of_errors", hue="percentile", kind="kde", fill=True)
    sns.displot(data=df, **kde_options)
    title = "Density Plot of Sum of Errors by percentile with " + measure + " and prune_quantile: " + str(prune_quantile)
    plt.title(title)
    plt.show()
    
# One density plot per (measure, prune quantile), coloured by percentile split.
for measure in measureDict:
    # Renumber rows so the merged frame has unique row labels
    mergedPercentileDf = pd.concat(measureDict[measure], ignore_index=True)
    for prune_quantile in prune_quantiles:
        # Boolean indexing already returns a new frame, so copying the whole
        # merged frame on every iteration is unnecessary. A dedicated name
        # also avoids overwriting the notebook-level `df`.
        quantile_df = mergedPercentileDf[mergedPercentileDf["prune_quantile"] == prune_quantile]
        denisity_bell_errors_hue_percentile(quantile_df, measure, prune_quantile)


In [None]:
def density_bell_disconnected(df, percentile):
    """Show KDE curves of the ``disconnect_pairs`` column, one filled curve per ``prune_quantile``.

    ``percentile`` is only appended to the figure title; ``df`` is plotted
    exactly as passed in.
    """
    kde_options = dict(x=disconnect_pairs, hue="prune_quantile", kind="kde", fill=True)
    sns.displot(data=df, **kde_options)
    title = "Density Plot of Disconnected Pairs by prune quantile for percentile: " + percentile
    plt.title(title)
    plt.show()


# Example on the single sanity-check file
density_bell_disconnected(meantestdf, "0.33_0.33_0.34")

In [None]:
# `grouped` was never defined anywhere in this notebook, so this cell failed
# with a NameError on a fresh kernel. Build it here: all loaded frames stacked,
# each row tagged with the measure its file belongs to.
# NOTE(review): this assumes `grouped` was meant to be the raw rows of all
# measures; confirm against the cell that originally created it.
grouped = pd.concat(
    [
        frame.assign(measure=measure_name)
        for measure_name, frames in measureDict.items()
        for frame in frames
    ],
    ignore_index=True,
)

sns.displot(data=grouped, x="sum_of_errors", hue="measure", kind="kde", fill=True)
# The curves are split by measure, so the title now says so
plt.title("Density Plot of Sum of Errors by Measure")
plt.show()