In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os as os

In [None]:
# Name of the CSV column that the plotting helpers below read as the
# "disconnected pairs" series.
# Original note: Joachim's data mixes bikengrowth_disconnected_pairs and
# continuous_disconnected_pairs -- presumably both column names occur in the
# CSV files and this constant selects which one to analyse; TODO confirm.
disconnect_pairs = "bikengrowth_disconnected_pairs"

In [None]:
# Index the CSV result files by measure.
# File names are split on '_' as "<city>_<measure>_<percentiles...>.csv",
# e.g. "bern_betweenness_0.33_0.33_0.34.csv".
measureDict = {}
source_folder = '../csv_data'

# os.listdir returns entries in arbitrary order; sort them so the load order
# (and with it the row and plot order downstream) is the same on every run.
file_names = sorted(
    entry for entry in os.listdir(source_folder) if entry.endswith('.csv')
)

for file_name in file_names:
    # Second '_'-separated part of the file name is the measure.
    measure = file_name.split('_')[1]
    # Group the file names under their measure: {measure: [file_name, ...]}.
    measureDict.setdefault(measure, []).append(file_name)

In [None]:
# Load every CSV into its own DataFrame, keyed by file name.
df_dict = {
    file_name: pd.read_csv(os.path.join(source_folder, file_name))
    for file_name in file_names
}

# Tag each frame with the measure and percentile split parsed from its file
# name ("<city>_<measure>_<percentiles...>.csv") and collect them for stacking.
tagged_frames = []
for key, df in df_dict.items():
    parts = key.split('_')
    measure = parts[1]  # second part is the measure
    # Everything after the measure, minus the extension, is the percentile split.
    percentiles = '_'.join(parts[2:]).replace('.csv', '')
    # The columns are added in place, so the frames stored in df_dict carry them too.
    df['measure'] = measure
    df['percentile'] = percentiles
    tagged_frames.append(df)

# Concatenate once: growing megadf with pd.concat inside the loop re-copies all
# previous rows on every iteration. pd.concat raises on an empty list, so fall
# back to an empty frame when no CSV files were found.
megadf = pd.concat(tagged_frames, ignore_index=True) if tagged_frames else pd.DataFrame()

# Working copy that later cells extend with derived columns.
grouped = megadf.copy()

In [None]:
# Attach to every row the average sum_of_errors of its (measure, percentile) group.
# transform('mean') yields one value per original row, so the result can be
# assigned directly as a new column.
errors_by_group = megadf.groupby(['measure', 'percentile'])['sum_of_errors']
grouped['mean_error'] = errors_by_group.transform('mean')

In [None]:
# Bar chart of the mean error: one cluster per measure, one bar per percentile split.
sns.catplot(
    data=grouped,
    x="measure",
    y="mean_error",
    hue="percentile",
    kind="bar",
    errorbar=None,
)

In [None]:
# Sanity check: compute the mean sum_of_errors straight from the file
# bern_betweenness_0.33_0.33_0.34.csv and print it for comparison by eye.
# The file is read from source_folder rather than a second hard-coded path, so
# this cell follows the same data directory as the rest of the notebook.
meantestdf = pd.read_csv(os.path.join(source_folder, "bern_betweenness_0.33_0.33_0.34.csv"))
mean = meantestdf['sum_of_errors'].mean()
print("sanity check that betweeness 0.33_0.33_0.34 mean is in ballpark: ", mean)

In [None]:
# Mean error per percentile split, one bar per measure.
# At the default catplot size the percentile labels are unreadable, so enlarge
# the facet: height is in inches and the facet width is height * aspect.
sns.catplot(
    data=grouped,
    kind="bar",
    y="mean_error",
    x="percentile",
    hue="measure",
    errorbar=None,
    height=6,
    aspect=2,
)
# Rotate the x tick labels so the long percentile strings do not overlap.
plt.xticks(rotation=45)
plt.ylabel("Mean error")
plt.xlabel("Percentile")
plt.show()

In [None]:
def erros_vs_discon_points(file_name, name):
    """Plot one result file's error histogram together with its disconnected pairs.

    Reads ``file_name`` from ``source_folder`` and draws a histogram of
    ``sum_of_errors`` on the left y-axis. On a secondary right y-axis it plots
    the column selected by the module-level ``disconnect_pairs`` against
    ``sum_of_errors``.

    Parameters
    ----------
    file_name : str
        CSV file name inside ``source_folder``. The file must contain a
        ``sum_of_errors`` column and the column named by ``disconnect_pairs``.
    name : str
        Text appended to the plot title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    df = pd.read_csv(os.path.join(source_folder, file_name))
    # Sort by sum_of_errors (descending) so the line on the secondary axis is
    # drawn monotonically along x instead of zig-zagging in file order.
    df_sorted = df.sort_values(by="sum_of_errors", ascending=False).reset_index(drop=True)

    # Legend text is derived from the plotted column so it stays correct
    # whichever disconnected-pairs column is configured.
    pairs_label = disconnect_pairs.replace("_", " ").title()

    # Create the figure and primary axis
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Histogram of sum_of_errors on the primary y-axis
    sns.histplot(df_sorted["sum_of_errors"], bins=10, color="gray", alpha=0.5, ax=ax1, label="Histogram of Errors")
    # NOTE(review): the axis itself is numeric; "Sorted Highest to Lowest"
    # describes the row order of the data, not the axis direction.
    ax1.set_xlabel("Sum of Errors (Sorted Highest to Lowest)")
    ax1.set_ylabel("Frequency of Errors", color="gray")
    ax1.tick_params(axis="y", labelcolor="gray")

    # Secondary y-axis for the disconnected pairs
    ax2 = ax1.twinx()
    ax2.plot(df_sorted["sum_of_errors"], df_sorted[disconnect_pairs], marker="o", label=pairs_label, color="blue")
    ax2.set_ylabel("Disconnected Points", color="black")
    ax2.tick_params(axis="y", labelcolor="black")

    # Merge the legend entries of both axes into a single legend
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines1 + lines2, labels1 + labels2, loc="upper left")

    # Title and horizontal grid (applied to the current axes, i.e. ax2)
    plt.title("Disconnected Points vs. Histogram of Sum of Errors with " + name)
    plt.grid(axis="y", linestyle="--", alpha=0.5)

    plt.tight_layout()
    plt.show()

In [None]:
def scatterplot(file_path, name):
    """Scatter the configured disconnected-pairs column against ``sum_of_errors``.

    Draws a seaborn regression plot (blue points, red fitted line) of the
    column selected by the module-level ``disconnect_pairs`` versus
    ``sum_of_errors``.

    Parameters
    ----------
    file_path : str
        Path to the CSV file, passed to ``pd.read_csv`` as given (it is not
        joined with ``source_folder``). The file must contain
        ``sum_of_errors`` and the column named by ``disconnect_pairs``.
    name : str
        Text appended to the plot title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    df = pd.read_csv(file_path)

    # Axis and title text are derived from the plotted column so they stay
    # correct whichever disconnected-pairs column is configured.
    pairs_label = disconnect_pairs.replace("_", " ").title()

    plt.figure(figsize=(12, 6))
    sns.regplot(x="sum_of_errors", y=disconnect_pairs, data=df, scatter_kws={'color':'blue'}, line_kws={'color':'red'})
    plt.xlabel("Sum of Errors")
    plt.ylabel(pairs_label)
    plt.title("Sum of Errors vs. " + pairs_label + " with " + name)
    plt.show()

In [None]:
def heatmap(df):
    """Show an annotated correlation heatmap of the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated pairwise.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Correlate numeric columns only: on pandas 2.x, DataFrame.corr() raises
    # "could not convert string to float" when a string column is present.
    corr_matrix = df.select_dtypes(include="number").corr()

    # Plot the heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
    plt.title("Correlation Heatmap of Variables")
    plt.show()
def change_strings_to_numbers(df):
    """Replace the string ``measure`` and ``percentile`` columns with integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``measure`` and ``percentile``. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both columns mapped to numbers. Values
        missing from the lookup tables become NaN (``Series.map`` behaviour).
    """
    measure_codes = {'betweenness': 1, 'closeness': 2, 'random': 3}
    # NOTE(review): '_0.25_0.75' has a leading underscore, unlike the other
    # keys. It is kept as-is because a file name with a double underscore would
    # yield exactly this label -- confirm against the files in the data folder.
    percentile_codes = {
        '0.75_0.25': 1,
        '_0.25_0.75': 2,
        '0.25_0.25_0.25_0.25': 3,
        '0.33_0.33_0.34': 4,
    }

    # Map the frame's own columns instead of the global megadf, so the function
    # works on any frame passed in and does not depend on index alignment.
    df['measure'] = df['measure'].map(measure_codes)
    df['percentile'] = df['percentile'].map(percentile_codes)
    return df

# Encode the string columns on a copy, so megadf keeps its readable labels,
# then draw the correlation heatmap of the encoded frame.
encoded_df = change_strings_to_numbers(megadf.copy())
heatmap(encoded_df)

In [None]:
# Note kept from earlier experiments: calling heatmap(megadf) on the raw frame
# failed with "ValueError: could not convert string to float: 'closeness'",
# so the heatmap is drawn from a numerically encoded copy instead.

# Kernel density estimate of sum_of_errors, one filled curve per percentile split.
sns.displot(
    data=grouped,
    x="sum_of_errors",
    hue="percentile",
    kind="kde",
    fill=True,
)
plt.title("Density Plot of Sum of Errors by Percentile")
plt.show()


In [None]:
# Kernel density estimate of sum_of_errors, one filled curve per measure.
sns.displot(data=grouped, x="sum_of_errors", hue="measure", kind="kde", fill=True)
# The curves are split by measure here, so the title says so (it was copied
# from the percentile plot and still read "by Percentile").
plt.title("Density Plot of Sum of Errors by Measure")
plt.show()

In [None]:
# Pairwise scatter plots of the columns in megadf, coloured by measure,
# with KDE curves on the diagonal.
sns.pairplot(
    megadf,
    hue="measure",
    kind="scatter",
    diag_kind="kde",
)