# Statistical analysis of the distributions obtained by MCMC and GFN

We split the document into two main categories, following the classification in this [link](https://towardsdatascience.com/how-to-compare-two-or-more-distributions-9b06ee4d30bf): visualization and quantitative analysis.

In [2]:
import os
import sys

# The literal string "__file__" (not the module attribute) acts as a dummy path
# component, so ROOT_DIR resolves to the parent of the current working directory.
ROOT_DIR = os.path.abspath("__file__/../../")
# Give that directory top priority on the import path.
sys.path.insert(0, ROOT_DIR)

In [9]:
import seaborn as sns
import scipy.stats
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tableone

## Visualization

### Boxplot

For the box plot, we can have on the x-axis the theoretical distribution along with the empirical ones, and on the y-axis the respective frequencies for the grid coordinates. We can then check whether the simulations overestimate grid coordinates in general or whether they follow a "smoother" distribution.

In [5]:
# Assuming a pd dataframe with three columns (distribution type, grid coordinates, frequency/reward) and 3 * (number of coordinates) rows

In [None]:
# One box per value of the 'Distribution' column, summarising 'Frequency/Reward'.
box_ax = sns.boxplot(x='Distribution', y='Frequency/Reward', data=df)
box_ax.set_title("Boxplot")
plt.show()
# Clear the current figure so the next cell starts from a clean canvas.
plt.clf()


### Histogram

The same applies to the histogram: three histograms (one for each distribution), one on top of the other, where we plot the respective frequencies/rewards for each grid coordinate.

In [None]:
# One histogram of 'Grid Coordinate' per value of the 'Distribution' column.
# Probably it is better to do the relative frequencies.
hist_ax = sns.histplot(x='Grid Coordinate', hue='Distribution', data=df)
hist_ax.set_title("Histogram")
plt.show()
# Clear the current figure so the next cell starts from a clean canvas.
plt.clf()

### Kernel Density

The same as the histogram, but as a continuous approximation.

In [None]:
# Kernel density estimate of 'Grid Coordinate', one curve per 'Distribution'.
# common_norm=False asks seaborn to normalise each curve on its own
# (see the seaborn kdeplot documentation).
kde_ax = sns.kdeplot(
    data=df,
    x='Grid Coordinate',
    hue='Distribution',
    common_norm=False,
)
kde_ax.set_title("Kernel Density Function")
plt.show()
# Clear the current figure so the next cell starts from a clean canvas.
plt.clf()

### Cumulative Distributions

The same as in the histogram, but for a continuous cumulative distribution.

In [None]:
# Empirical cumulative distribution of 'Grid Coordinate' per 'Distribution',
# drawn as unfilled step curves with one bin per row of df.
cdf_options = dict(
    bins=len(df),
    stat="density",
    element="step",
    fill=False,
    cumulative=True,
    common_norm=False,
)
cdf_ax = sns.histplot(data=df, x='Grid Coordinate', hue='Distribution', **cdf_options)
cdf_ax.set_title("Cumulative distribution function")
plt.show()
# Clear the current figure so the next cell starts from a clean canvas.
plt.clf()


### Q-Q Plot

Here we are going to have two plots: one for MCMC with respect to the theoretical distribution and one for the GFN with respect to the theoretical distribution.

In [None]:
# Split the grid coordinates by distribution; `grid_coordinates` pools every row.
coordinate_column = df['Grid Coordinate']
distribution_column = df['Distribution']
grid_coordinates = coordinate_column.values
grid_coordinates_theoretical = coordinate_column[distribution_column == 'Theoretical'].values
grid_coordinates_MCMC = coordinate_column[distribution_column == 'MCMC'].values
grid_coordinates_GFN = coordinate_column[distribution_column == 'GFN'].values

# Percentiles 0..99 of each sample, one column per distribution, for the Q-Q plot.
percentile_levels = range(100)
df_pct = pd.DataFrame({
    'q_theoretical': np.percentile(grid_coordinates_theoretical, percentile_levels),
    'q_MCMC': np.percentile(grid_coordinates_MCMC, percentile_levels),
    'q_GFN': np.percentile(grid_coordinates_GFN, percentile_levels),
})

In [None]:
# Two side-by-side Q-Q panels (tick marks hidden): sampler percentiles against
# theoretical percentiles, with the identity line as the reference for a perfect fit.
fig, axs = plt.subplots(
    nrows=1,
    ncols=2,
    figsize=(9, 6),
    subplot_kw={'xticks': [], 'yticks': []},
)

for panel, sampler in zip(axs, ('MCMC', 'GFN')):
    panel.scatter(x='q_theoretical', y=f'q_{sampler}', data=df_pct, label=sampler)
    sns.lineplot(
        ax=panel,
        data=df_pct,
        x='q_theoretical',
        y='q_theoretical',
        color='r',
        label='Line of perfect fit',
    )
    panel.set_title(sampler)

plt.tight_layout()
plt.show()
plt.clf()


## Quantitative Analysis

For this part there are two ideas. One is to just compute the respective numbers for the final cases of the empirical distributions. The other (just for the GFN case) is, if we have data for each iteration (or batch of iterations), to plot the respective number for each iteration (batch).

In [6]:
# Assuming that we have multiple dataframes: the final states (which is the same as in the previous section) and some for the intermediate states.

### T-test / Welch's t-test

In [None]:
# Two-sample t-tests of each sampler against the theoretical distribution.
# Rule of thumb used here: the pooled-variance (Student) t-test is applied only
# when neither variance ratio reaches 4; otherwise Welch's t-test is applied
# (equal_var=False).
# The two ratio checks must be combined with `and`: with `or` the condition
# holds for any pair of positive variances (one of the two ratios is always
# <= 1), so Welch's test would never be selected.
var_theoretical = np.var(grid_coordinates_theoretical)

var_MCMC = np.var(grid_coordinates_MCMC)
equal_var_MCMC = bool(var_theoretical / var_MCMC < 4 and var_MCMC / var_theoretical < 4)

stat_MCMC, p_value_MCMC = scipy.stats.ttest_ind(
    grid_coordinates_theoretical, grid_coordinates_MCMC, equal_var=equal_var_MCMC
)
print(f"t-test for the MCMC: statistic={stat_MCMC:.4f}, p-value={p_value_MCMC:.4f}")

var_GFN = np.var(grid_coordinates_GFN)
equal_var_GFN = bool(var_theoretical / var_GFN < 4 and var_GFN / var_theoretical < 4)

stat_GFN, p_value_GFN = scipy.stats.ttest_ind(
    grid_coordinates_theoretical, grid_coordinates_GFN, equal_var=equal_var_GFN
)
print(f"t-test for the GFN: statistic={stat_GFN:.4f}, p-value={p_value_GFN:.4f}")

In [None]:
# For the intermediate states:
# Run the t-test (Student or Welch, chosen per batch by the variance-ratio rule)
# on every intermediate GFN batch and plot the p-value against the batch number.

# TODO: fill in the CSV files holding the intermediate states, in batch order.
# The iterable was left blank in the original cell (`for file in :`), which is
# a syntax error; an empty list keeps the cell runnable until the data exists.
batch_files = []

data = []
for batch, file in enumerate(batch_files, start=1):
    # Use a separate name so the final-state `df` used by other cells is not overwritten.
    df_batch = pd.read_csv(file, delimiter=',')
    grid_coordinates_theoretical_batch = df_batch.loc[df_batch.Distribution == 'Theoretical', 'Grid Coordinate'].values
    grid_coordinates_GFN_batch = df_batch.loc[df_batch.Distribution == 'GFN', 'Grid Coordinate'].values

    var_theoretical_batch = np.var(grid_coordinates_theoretical_batch)
    var_GFN_batch = np.var(grid_coordinates_GFN_batch)
    # Variances are treated as equal unless one exceeds the other by more than a factor of 4.
    equal_var_GFN_batch = not (
        var_theoretical_batch / var_GFN_batch > 4
        or var_GFN_batch / var_theoretical_batch > 4
    )

    stat_GFN_batch, p_value_GFN_batch = scipy.stats.ttest_ind(
        grid_coordinates_theoretical_batch,
        grid_coordinates_GFN_batch,
        equal_var=equal_var_GFN_batch,
    )
    # Record this batch's own p-value (not the final-state `p_value_GFN`).
    data.append([batch, p_value_GFN_batch])

if data:
    # plt.plot needs explicit x/y sequences; passing only `data=` draws nothing.
    batch_numbers, p_values = zip(*data)
    plt.plot(batch_numbers, p_values, label="t-test for everyt batch of the GFN")
    plt.xlabel("Batch")
    plt.ylabel("p-value")
    plt.legend()
plt.show()
plt.clf()

### Standardized Mean Difference

In [None]:
# The one in the website is up to python 3.9, so I used another one, where the documentation is not that good. I will do the rest of the plots after we get the data.
# Build one summary table per sampler, each restricted to the theoretical rows
# plus that sampler's rows.
# Boolean filtering is used instead of `df.drop(..., inplace=True)`: the in-place
# drop returns None (so the TableOne input was None) and it also removed rows from
# `df` itself, leaving nothing of the first sampler for the second table.
# NOTE(review): no `groupby`/`smd` options are passed, so these tables do not yet
# report a standardized mean difference -- confirm the intended TableOne options.
df_MCMC = df[df['Distribution'] != "GFN"]
table_one_MCMC = tableone.TableOne(df_MCMC)
# `tabulate` is a method of the TableOne object, not of the DataFrame.
print(table_one_MCMC.tabulate(tablefmt="github"))

df_GFN = df[df['Distribution'] != "MCMC"]
table_one_GFN = tableone.TableOne(df_GFN)
print(table_one_GFN.tabulate(tablefmt="github"))

### Mann–Whitney U Test / Brunner-Munzel test

In [None]:
# Rank-based comparison of each sampler against the theoretical distribution:
# Mann–Whitney U when the variances were judged equal in the t-test cell,
# Brunner-Munzel otherwise.
# The reference sample is `grid_coordinates_theoretical`, as in the t-test cell.
# The pooled `grid_coordinates` array holds the rows of every distribution,
# including the sampler being tested, so it is not a valid reference sample.
if equal_var_MCMC:
    stat_MCMC, p_value_MCMC = scipy.stats.mannwhitneyu(grid_coordinates_theoretical, grid_coordinates_MCMC)
    print(f" Mann–Whitney U Test: statistic={stat_MCMC:.4f}, p-value={p_value_MCMC:.4f}")
else:
    stat_MCMC, p_value_MCMC = scipy.stats.brunnermunzel(grid_coordinates_theoretical, grid_coordinates_MCMC)
    print(f"  Brunner-Munzel test: statistic={stat_MCMC:.4f}, p-value={p_value_MCMC:.4f}")

if equal_var_GFN:
    stat_GFN, p_value_GFN = scipy.stats.mannwhitneyu(grid_coordinates_theoretical, grid_coordinates_GFN)
    print(f" Mann–Whitney U Test: statistic={stat_GFN:.4f}, p-value={p_value_GFN:.4f}")
else:
    stat_GFN, p_value_GFN = scipy.stats.brunnermunzel(grid_coordinates_theoretical, grid_coordinates_GFN)
    print(f"  Brunner-Munzel test: statistic={stat_GFN:.4f}, p-value={p_value_GFN:.4f}")

In [None]:
# For the intermediate states:
# Run the Mann–Whitney U test (or Brunner-Munzel when the batch variances differ
# by more than a factor of 4) on every intermediate GFN batch and plot the
# p-value against the batch number.

# TODO: fill in the CSV files holding the intermediate states, in batch order.
# The iterable was left blank in the original cell (`for file in :`), which is
# a syntax error; an empty list keeps the cell runnable until the data exists.
batch_files = []

data = []
label = None
for batch, file in enumerate(batch_files, start=1):
    # Use a separate name so the final-state `df` used by other cells is not overwritten.
    df_batch = pd.read_csv(file, delimiter=',')
    grid_coordinates_theoretical_batch = df_batch.loc[df_batch.Distribution == 'Theoretical', 'Grid Coordinate'].values
    grid_coordinates_GFN_batch = df_batch.loc[df_batch.Distribution == 'GFN', 'Grid Coordinate'].values

    var_theoretical_batch = np.var(grid_coordinates_theoretical_batch)
    var_GFN_batch = np.var(grid_coordinates_GFN_batch)
    equal_var_GFN_batch = not (
        var_theoretical_batch / var_GFN_batch > 4
        or var_GFN_batch / var_theoretical_batch > 4
    )

    # Test this batch's own samples and use this batch's own variance flag.
    # Reusing the final-state `equal_var_GFN`, `grid_coordinates` and
    # `grid_coordinates_GFN` would give the same p-value for every batch.
    if equal_var_GFN_batch:
        stat_GFN_batch, p_value_GFN_batch = scipy.stats.mannwhitneyu(
            grid_coordinates_theoretical_batch, grid_coordinates_GFN_batch
        )
        label = "Mann–Whitney U Test for each batch of the GFN"
    else:
        stat_GFN_batch, p_value_GFN_batch = scipy.stats.brunnermunzel(
            grid_coordinates_theoretical_batch, grid_coordinates_GFN_batch
        )
        label = "Brunner-Munzel test for each batch of the GFN"
    data.append([batch, p_value_GFN_batch])

if data:
    # plt.plot needs explicit x/y sequences; passing only `data=` draws nothing.
    # NOTE(review): the legend label names the test chosen for the last batch;
    # earlier batches may have used the other test.
    batch_numbers, p_values = zip(*data)
    plt.plot(batch_numbers, p_values, label=label)
    plt.xlabel("Batch")
    plt.ylabel("p-value")
    plt.legend()
plt.show()
plt.clf()

### Permutation Tests

### Chi-Squared Test

### Kolmogorov-Smirnov Test / Lilliefors test / Anderson-Darling test / Cramér-von Mises test