<h1> LTestChiSquaredTests.ipynb </h1>

This code is prepared so that it condenses all the information from the code files inside TestAModelFolder/ML. 

First copy and paste the folders that CrystallineMLAlone.ipynb and AmorphousMLAlone.ipynb have created inside LTestVisualCheck (at the same level as the ipynb files).
Then run the first cell of LTestChiSquaredTests.ipynb if you want to obtain information using the chi squared metric. This cell will create a subfolder called ChiSquared where only three files are important:
 - Chi2_rank_vertical.png: For every experiment, the chi squared value of the predictions of each model is obtained. Then the chi squared values of the different models in each experiment are compared. The model with the smallest chi squared value obtains +1, the one with the second smallest +2, and so on. This process is repeated for each experiment and these integer values are summed. At the end, the model with the smallest total score has performed best overall.
 - Chi2_AccumulativeScores.txt has the same information as the graph, but in a txt file.
 - Chi2_NumberOfExperimentsWhereEachModelOutperforms.txt, as its name indicates, counts only the number of experiments where each model obtains the smallest chi squared value. Sometimes, models that overfit perform better on this test (as they pass through all the points).

Run the second cell if you want the same information using a metric similar to the Mean Absolute Error (MAE). Instead of summing the squared differences divided by the squared uncertainty of the prediction, here we sum the absolute values of the differences divided by the uncertainty (not squared). By running this cell you create the subfolder Lvalues that contains the same type of information (Lvalues_rank_vertical.png, Lvalues_TotalScores.txt with the information of the graph, and Lvalues_WinnerScores, which is the equivalent of Chi2_NumberOfExperimentsWhereEachModelOutperforms.txt but with the L metric).

If you run the third one, it will automatically combine the predictions of twelve models for each experiment in a single image. This way it is easier to spot the differences between models at a glance. The code also arranges them by complexity (each consecutive row has a higher complexity) and by number of augmentations (each consecutive column has more augmentation). This way the models are organized from simplest (top left) to most complex (bottom right).

<H1> CHI SQUARED TEST </H1>

In [None]:
import os
import pandas as pd
import numpy as np
from collections import OrderedDict
import matplotlib.pyplot as plt
import numpy as np
def long_path(path):
    """
    Return *path* in Windows extended-length form when running on Windows.

    On any other operating system the argument is returned untouched.
    On Windows the path is made absolute and, unless it already carries
    the extended-length prefix, that prefix is prepended.
    """
    if os.name != 'nt':
        return path

    prefix = '\\\\?\\'
    absolute = os.path.abspath(path)
    return absolute if absolute.startswith(prefix) else prefix + absolute
def compute_chi2_for_experiment(df_real, df_pred):
    """
    Compute the reduced chi-squared value for one experiment:

        χ² = (1/N) * Σ [ (pred - real)² / (err_real)² ]

    Parameters
    ----------
    df_real : table with the columns "RealPolarizationD3" and
        "ErrRealPolarizationD3".
    df_pred : table with the column "PredictedPolarizationD3", row-aligned
        with ``df_real``.

    Returns
    -------
    The reduced chi-squared as a float. Uncertainties equal to zero are
    replaced by 1e-10 for the computation only; the input tables are left
    unmodified.

    Raises
    ------
    ValueError
        If the two tables do not have the same number of rows.
    """
    if len(df_real) != len(df_pred):
        raise ValueError("Real and predicted files must have the same length.")

    N = len(df_real)
    predicted = np.asarray(df_pred["PredictedPolarizationD3"], dtype=float)
    real = np.asarray(df_real["RealPolarizationD3"], dtype=float)

    # np.array(..., dtype=float) always copies: the zero replacement below must
    # not write into the caller's DataFrame, and on an integer column the
    # value 1e-10 would otherwise be truncated back to 0.
    err = np.array(df_real["ErrRealPolarizationD3"], dtype=float)
    err[err == 0] = 1e-10  # avoid division by zero

    diff = predicted - real
    chi2 = np.sum((diff**2) / (err**2)) / N
    return chi2


def load_data(experiment_folder):
    """
    Load the txt files containing real and predicted data
    for a MissingPolarizationD3_* experiment.

    Parameters
    ----------
    experiment_folder : path of the experiment folder to scan.

    Returns
    -------
    (df_real, df_pred) : the two tab-separated files read with pandas.

    Raises
    ------
    FileNotFoundError
        If the folder holds no file with one of the expected name prefixes.
    """
    experiment_folder = long_path(experiment_folder)
    # Sorted so the selected file is reproducible when several share a prefix
    names = sorted(os.listdir(experiment_folder))

    def _first_with_prefix(prefix):
        # Full (long) path of the first file whose name starts with `prefix`
        for name in names:
            if name.startswith(prefix):
                return long_path(os.path.join(experiment_folder, name))
        # An explicit error instead of a bare StopIteration with no message
        raise FileNotFoundError(
            f"No file starting with '{prefix}' in {experiment_folder}"
        )

    real_file = _first_with_prefix("RawData_PolarizationD3_MissingPolarizationD3_")
    pred_file = _first_with_prefix("PredictedPoints_PolarizationD3_MissingPolarizationD3_")

    df_real = pd.read_csv(real_file, sep="\t")
    df_pred = pd.read_csv(pred_file, sep="\t")
    return df_real, df_pred


def count_missing_folders(model_path):
    """
    List, in sorted order, the names of the MissingPolarizationD3_*
    subfolders found directly inside the given model folder.
    """
    model_path = long_path(model_path)

    experiment_dirs = []
    for entry in os.listdir(model_path):
        if not entry.startswith("MissingPolarizationD3_"):
            continue
        # Keep directories only; plain files with the same prefix are ignored
        if os.path.isdir(long_path(os.path.join(model_path, entry))):
            experiment_dirs.append(entry)

    experiment_dirs.sort()
    return experiment_dirs


def save_chi2_values_to_txt(model_folder_name, missing_folders, chi2_values, output_dir):
    """
    Write one "<folder name>: <chi2>" line per experiment to
    ChiSquared/Chi2_values_<model_folder_name>.txt under output_dir.

    The ChiSquared folder is created when missing. Names and values are
    paired positionally, so both sequences must be in the same order.
    """
    target_dir = long_path(os.path.join(output_dir, "ChiSquared"))
    os.makedirs(target_dir, exist_ok=True)  # ensure folder exists

    filepath = long_path(
        os.path.join(target_dir, f"Chi2_values_{model_folder_name}.txt")
    )

    with open(filepath, "w") as f:
        f.writelines(
            f"{folder_name}: {chi2:.6f}\n"
            for folder_name, chi2 in zip(missing_folders, chi2_values)
        )

    print(f"File saved: {filepath}")


def main_all_models(root_folder):
    """
    Main loop over all model folders in the given root_folder.
    Each folder should contain MissingPolarizationD3_* subfolders.

    For every model folder the chi-squared value of each experiment is
    computed and the results are written through save_chi2_values_to_txt.
    Experiments that raise an error are reported on stdout and left out of
    the saved file.
    """
    root_folder = long_path(root_folder)
    all_model_folders = [
        d for d in os.listdir(root_folder)
        if os.path.isdir(long_path(os.path.join(root_folder, d)))
    ]

    if not all_model_folders:
        print("No model folders found.")
        return

    for model_folder in all_model_folders:
        model_path = long_path(os.path.join(root_folder, model_folder))
        missing_folders = count_missing_folders(model_path)

        if not missing_folders:
            print(f" No MissingPolarizationD3_ folders in {model_folder}")
            continue

        # Names and values are collected together so they stay aligned even
        # when some experiments fail; pairing the full folder list with a
        # shorter value list would label values with the wrong experiment.
        scored_folders = []
        chi2_values = []
        for missing_folder in missing_folders:
            experiment_path = long_path(os.path.join(model_path, missing_folder))
            try:
                df_real, df_pred = load_data(experiment_path)
                chi2 = compute_chi2_for_experiment(df_real, df_pred)
            except Exception as e:
                # Best effort: report the failure and go on with the next one
                print(f"   Error in {missing_folder}: {e}")
            else:
                scored_folders.append(missing_folder)
                chi2_values.append(chi2)

        if chi2_values:
            save_chi2_values_to_txt(model_folder, scored_folders, chi2_values, root_folder)
        else:
            print(f"No valid χ² values computed for {model_folder}.")

# === Entry point: compute chi-squared files for every model folder in "." ===
main_all_models(r".")


# Location of the chi-squared text files written by the call above
folder_path = long_path("ChiSquared")

# Early warning when that location does not exist
if not os.path.isdir(folder_path):
    print(
        f"Folder not found: {folder_path}",
        "   Tip: run the computation cell first so it creates ChiSquared/ and the files.",
        sep="\n",
    )
    
# Helper to recognize result files from either naming style
def is_result_file(name: str) -> bool:
    """Return True for .txt names starting with Chi2_values_ or ChiSquared_values_."""
    if not name.endswith(".txt"):
        return False
    accepted_prefixes = ("Chi2_values_", "ChiSquared_values_")
    return name.startswith(accepted_prefixes)

# Containers filled while reading the result files
Chi2Values = {}                  # result file name -> {experiment name: chi2}
ExperimentName = OrderedDict()   # experiment name -> "exp<i>" label

# Gather the result files (as long paths) when the folder exists
result_files = []
if os.path.isdir(folder_path):
    result_files = [
        long_path(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
        if is_result_file(entry)
    ]

# Sort for stable order
result_files.sort()

if not result_files:
    print("No results found in ChiSquared/ matching 'Chi2_values_*.txt' or 'ChiSquared_values_*.txt'.")
else:
    for full_path in result_files:
        model_key = os.path.basename(full_path)
        with open(full_path, "r") as f:
            lines = f.readlines()

        parsed = {}
        Chi2Values[model_key] = parsed
        for line in lines:
            if ":" not in line:
                continue
            try:
                # Split on the first colon only: "<experiment>: <value>"
                experiment_name, value = line.split(":", 1)
                parsed[experiment_name.strip()] = float(value.strip())
            except ValueError:
                print(f"Skipping malformed line in {full_path}: {line.strip()}")

        # The experiment list (and its exp1, exp2, ... labels) is taken from
        # the first file that yields at least one value
        if not ExperimentName and parsed:
            for i, experiment_name in enumerate(parsed, 1):
                ExperimentName[experiment_name] = f"exp{i}"

# === Plotting ===
# One line per result file: chi-squared value against experiment position.
plt.figure(figsize=(14, 7))

# Short labels (exp1, exp2, ...) in the order the experiments were registered
exp_ids = list(ExperimentName.values())
assert len(Chi2Values) >= 2, "You need at least two Chi² values files"
files = list(Chi2Values.keys())

# Look the colormap up through the registry, falling back to pyplot
try:
    cmap = plt.colormaps.get_cmap('tab20')  # new API
except AttributeError:
    cmap = plt.get_cmap('tab20')  # old API

# Colours and line styles are cycled with a modulo in the loop below
colors = cmap.colors
line_styles = ['-', '--', ':', '-.', (0, (5, 1)), (0, (3, 1, 1, 1))]
line_widths = [2] * len(files)

for j, filename in enumerate(files):
    # Values of this model in registered experiment order
    chi2_values = [
        Chi2Values[filename].get(exp_name, np.nan)  # Use NaN if missing
        for exp_name in ExperimentName.keys()
    ]
    
    plt.plot(
        range(len(exp_ids)),
        chi2_values,
        color=colors[j % len(colors)],
        linestyle=line_styles[j % len(line_styles)],
        linewidth=line_widths[j],
        label=filename
    )

# Tick only every 10th experiment (exp10, exp20, ...)
xticks_pos = [i for i in range(len(exp_ids)) if (i + 1) % 10 == 0]
xticks_labels = [exp_ids[i] for i in xticks_pos]
plt.xticks(xticks_pos, xticks_labels, rotation=45, ha='right')

plt.xlabel("Experiment", fontsize=12)
plt.ylabel(r"$\chi^2$", fontsize=12)
plt.title(r"Chi-squared values per experiment (model comparison)", fontsize=14)
plt.grid(alpha=0.2, linestyle='--')
# Legend anchored outside the axes, to the right
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


# Normalize keys (replace "-" by "_")
def normalize_key(k):
    """Return *k* with every hyphen turned into an underscore."""
    return "_".join(k.split("-"))

# Same nested mapping as Chi2Values, with normalized experiment names
Chi2Values_norm = {
    model: {normalize_key(exp): val for exp, val in per_experiment.items()}
    for model, per_experiment in Chi2Values.items()
}


# --- Normalized experiment name -> original experiment name ---
ExperimentName_norm = {normalize_key(exp): exp for exp in ExperimentName}

# 2D array: one row per model, one column per (normalized) experiment;
# NaN marks an experiment a model has no value for
all_chi2_values = np.array([
    [per_experiment.get(exp, np.nan) for exp in ExperimentName_norm]
    for per_experiment in Chi2Values_norm.values()
])

# Original experiment names, in the column order of the matrix
exp_ids = list(ExperimentName_norm.values())
# Model names, in the row order of the matrix
model_ids = list(Chi2Values_norm)


# --- Build Chi² matrix ---
# Rows follow `files` (one result file per model), columns follow `exp_ids`.
files = list(Chi2Values.keys())   # Model names (files)
nb_models = len(files)
exp_ids = list(ExperimentName.keys())  # Experiments in original order

all_chi2_values = []
for f in files:
    model_vals = []
    for exp in exp_ids:
        if exp not in Chi2Values[f]:
            print(f"Warning: experiment '{exp}' missing for model '{f}'")
            model_vals.append(np.nan)  # fill missing with NaN
        else:
            model_vals.append(Chi2Values[f][exp])
    all_chi2_values.append(model_vals)
all_chi2_values = np.array(all_chi2_values)

# --- Compute average Chi² across models for each experiment ---
avg_chi2 = np.nanmean(all_chi2_values, axis=0)  # ignore NaN when averaging

# --- Colors ---
try:
    cmap = plt.colormaps['tab20']   # new API (>=3.5)
except (AttributeError, TypeError):
    cmap = plt.get_cmap('tab20')    # old API

# Wrap the indices around the number of colormap entries: an integer index
# beyond the last entry is clipped by the colormap, which would give every
# model past the palette size the same colour.
colors = cmap(np.arange(nb_models) % cmap.N)

# --- Plot deviation from the average ---
plt.figure(figsize=(18, 12))
for j in range(nb_models):
    diff_from_avg = all_chi2_values[j] - avg_chi2
    plt.plot(
        range(1, len(exp_ids) + 1),  # experiments indexed starting at 1
        diff_from_avg,
        linewidth=2,
        color=colors[j],
        label=files[j]
    )

# Tick every 10th experiment
xticks_pos = [k + 1 for k in range(len(exp_ids)) if (k + 1) % 10 == 0]
plt.xticks(xticks_pos, rotation=0)
plt.xlabel("Experiment Index", fontsize=12)
plt.ylabel("ΔChi² (model - average)", fontsize=12)
plt.title("Deviation of Chi² values from experiment-wise average", fontsize=14)
plt.grid(alpha=0.3)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


# --- Count, per model, the experiments in which it has the lowest Chi² ---
output_folder = long_path("ChiSquared")
os.makedirs(output_folder, exist_ok=True)

model_scores = dict.fromkeys(files, 0)
for chi2_vals in all_chi2_values.T:
    # An experiment without a single value has no winner
    if np.isnan(chi2_vals).all():
        continue
    winning_model = files[np.nanargmin(chi2_vals)]
    model_scores[winning_model] += 1

# Most wins first
sorted_scores = sorted(model_scores.items(), key=lambda item: item[1], reverse=True)

output_file = long_path(os.path.join(output_folder, "Chi2_NumberOfExperimentsWhereEachModelOutperforms.txt"))
with open(output_file, "w") as f:
    f.write("Approval Rate (Number of wins by minimal Chi². The higher the better):\n\n")
    f.writelines(f"{model}: {score} points\n" for model, score in sorted_scores)
print(f"Results saved to {output_file}")


# --- Rank-based accumulated scores: 1 point for the best model of an
# --- experiment, 2 for the second best, and so on; totals are summed ---
all_chi2_values = np.array([
    [Chi2Values[f].get(exp, np.nan) for exp in exp_ids]
    for f in files
])

model_scores = dict.fromkeys(files, 0)
for chi_values in all_chi2_values.T:
    # argsort lists the model indices from the lowest Chi² to the highest
    for position, model_idx in enumerate(np.argsort(chi_values)):
        model_scores[files[model_idx]] += position + 1

# Lowest total first
sorted_scores = sorted(model_scores.items(), key=lambda item: item[1])

output_file = long_path(os.path.join(output_folder, "Chi2_AccumulativeScores.txt"))
with open(output_file, "w") as f:
    f.write("Approval Rate (Score by rank. Lower is better):\n\n")
    f.writelines(f"{model}: {score} points\n" for model, score in sorted_scores)
print(f"Scores saved to {output_file}")


# --- Ensure output folder exists for plots ---
output_folder = long_path("ChiSquared")
os.makedirs(output_folder, exist_ok=True)


# Relative score of a model in one experiment: (best Chi² of the experiment)
# divided by (Chi² of the model), i.e. 1.0 for the best model and less for
# the others. The scores are summed over the experiments, then averaged.
model_scores = {f: 0.0 for f in files}

# For each experiment (iterate over columns of the chi² matrix)
for chi2_vals in all_chi2_values.T:
    if np.all(np.isnan(chi2_vals)):
        continue
    min_chi2 = np.nanmin(chi2_vals)
    for idx, val in enumerate(chi2_vals):
        if np.isnan(val):
            score = 0.0
        elif val == 0:
            # A Chi² of exactly 0 is necessarily the best value of the
            # experiment; dividing would give 0/0 = NaN and turn the whole
            # total of this model into NaN.
            score = 1.0
        else:
            score = min_chi2 / val  # relative to best
        model_name = files[idx]
        model_scores[model_name] += score

# Normalize by number of experiments (nothing to normalize when there are none)
if exp_ids:
    for model in model_scores:
        model_scores[model] /= len(exp_ids)

# Sort models (higher = better)
sorted_scores = sorted(model_scores.items(), key=lambda x: x[1], reverse=True)

# --- Save results ---
output_file = long_path(os.path.join(output_folder, "Chi2_RelativeNormalizedScores.txt"))
with open(output_file, "w") as f:
    f.write("Relative Score (Normalized Chi² per experiment. Higher is better):\n\n")
    for model, score in sorted_scores:
        f.write(f"{model}: {score:.4f}\n")
print(f"Normalized relative scores saved to {output_file}")


# -------------------------------
# Vertical bar chart: Rank-based scoring (lower = better)
# -------------------------------
# At this point `model_scores` holds the relative normalized scores, not the
# rank totals, so the rank totals are accumulated again here from the chi²
# matrix: in every experiment the model with the lowest Chi² earns 1 point,
# the next one 2 points, and so on (argsort places NaN entries last).
rank_scores = {name: 0 for name in files}
for chi_values in all_chi2_values.T:
    for rank, model_idx in enumerate(np.argsort(chi_values), start=1):
        rank_scores[files[model_idx]] += rank

sorted_scores_rank = sorted(rank_scores.items(), key=lambda x: x[1])
models = [m for m, _ in sorted_scores_rank]
scores = [s for _, s in sorted_scores_rank]

# Reversed Red-Yellow-Green map: low (good) totals are green, high ones red
norm = plt.Normalize(min(scores), max(scores))
try:
    cmap = plt.colormaps['RdYlGn_r']
except (AttributeError, TypeError):
    cmap = plt.get_cmap('RdYlGn_r')

colors = cmap(norm(scores))
# Shorten the file names for the x axis: drop the common prefix and the
# extension, then keep the last two underscore-separated parts
short_models = [m.replace("Chi2_values_AllTestsFolder_", "").replace(".txt", "") for m in models]
short_models = ["_".join(m.split("_")[-2:]) for m in short_models]

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(short_models, scores, color=colors)

# Write the (integer) rank total slightly above each bar
for bar, score in zip(bars, scores):
    ax.text(
        bar.get_x() + bar.get_width()/2,
        score + score * 0.02,
        f"{score}",
        ha='center', va='bottom',
        fontsize=8
    )

sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array(scores)
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label("Rank Score (Lower = Better)")

ax.set_ylabel("Total Rank Score", fontsize=12)
ax.set_xlabel("Model", fontsize=12)
ax.set_title("Approval Rate: Rank-Based Scoring (Chi², Green=Better, Red=Worse)", fontsize=14)
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(long_path(os.path.join(output_folder, "Chi2_rank_vertical.png")), dpi=600, bbox_inches='tight')
plt.show()


# -------------------------------
# Vertical bar chart: Relative normalized scoring (higher = better)
# -------------------------------
# `model_scores` holds the relative normalized scores here; the bars are
# ordered from the highest score to the lowest.
sorted_scores_rel = sorted(model_scores.items(), key=lambda x: x[1], reverse=True)
models = [m for m, _ in sorted_scores_rel]
scores = [s for _, s in sorted_scores_rel]

# Shorten the file names for the x axis: drop the common prefix and the
# extension, then keep the last two underscore-separated parts.
# `short_models` and `scores` are read again by the following cells.
short_models = [m.replace("Chi2_values_AllTestsFolder_", "").replace(".txt", "") for m in models]
short_models = ["_".join(m.split("_")[-2:]) for m in short_models]

# Colour scale spanning the observed score range
norm = plt.Normalize(min(scores), max(scores))
try:
    cmap = plt.colormaps['RdYlGn']
except (AttributeError, TypeError):
    cmap = plt.get_cmap('RdYlGn')

colors = cmap(norm(scores))

fig, ax = plt.subplots(figsize=(14, 6))
bars = ax.bar(short_models, scores, color=colors, edgecolor='black')

# Write the score slightly above each bar
for bar, score in zip(bars, scores):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        score + score * 0.02,
        f"{score:.3f}".replace(',', '.'),
        ha='center', va='bottom',
        fontsize=8
    )

# Colour bar built from the same colormap and normalization as the bars
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array(scores)
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label("Relative Score (Higher = Better)")

ax.set_ylabel("Total Relative Score (averaged over experiments)", fontsize=12)
ax.set_xlabel("Model", fontsize=12)
ax.set_title("Approval Rate: Normalized Relative Scoring (Chi², Green = Best, Red = Worst)", fontsize=14)
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=45, ha='right')
# Head room above the tallest bar so its label fits
ax.set_ylim(0, max(scores) * 1.1)

plt.tight_layout()
plt.savefig(long_path(os.path.join(output_folder, "Chi2_relative_vertical.png")), dpi=600, bbox_inches='tight')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# Pair each short model name with its score and order the pairs from the
# lowest score to the highest. `short_models` and `scores` are the lists
# left behind by the last bar chart of the previous cell.
model_scores = list(zip(short_models, scores))
model_scores_sorted = sorted(model_scores, key=lambda x: x[1])

# Print sorted names with scores, followed by the number of models
print("Models sorted by score (lowest → highest):")
for model, score in model_scores_sorted:
    print(f"{model}: {score:.3f}")
i = len(model_scores_sorted)
print(i)

# Unzip sorted models and scores
sorted_models, sorted_scores = zip(*model_scores_sorted)

# Define colormap (red → yellow → green). The registry / pyplot lookup is
# used because matplotlib.cm.get_cmap no longer exists in recent Matplotlib.
try:
    cmap = plt.colormaps["RdYlGn"]
except (AttributeError, TypeError):
    cmap = plt.get_cmap("RdYlGn")
norm = mcolors.Normalize(vmin=min(sorted_scores), vmax=max(sorted_scores))

# Assign colors based on scores
colors = [cmap(norm(s)) for s in sorted_scores]

# Create figure (height grows with the number of models)
fig, ax = plt.subplots(figsize=(10, len(sorted_models) * 0.15))

# Plot horizontal bars with mapped colors
bars = ax.barh(sorted_models, sorted_scores, color=colors, edgecolor="black")

# Labels and title
ax.set_xlabel("Total Relative Score", fontsize=12)
ax.set_ylabel("Model", fontsize=12)
ax.set_title("Approval Rate: Sorted Scores (Red = Worst, Green = Best)", fontsize=14)
ax.grid(axis="x", alpha=0.3)

# Make tick labels smaller
ax.tick_params(axis="x", labelsize=9)
ax.tick_params(axis="y", labelsize=7)

# Add score labels at the end of each bar
for bar, score in zip(bars, sorted_scores):
    ax.text(
        bar.get_width() + max(sorted_scores) * 0.01,
        bar.get_y() + bar.get_height() / 2,
        f"{score:.3f}".replace(',', '.'),
        va="center", ha="left", fontsize=8
    )

# Remove top/bottom empty space
ax.set_ylim(-0.5, len(sorted_models) - 0.5)

# Extend X-axis so labels fit
ax.set_xlim(0, max(sorted_scores) * 1.1)

# Optional colorbar legend (disabled):
# sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
# sm.set_array([])
# cbar = fig.colorbar(sm, ax=ax)
# cbar.set_label("Relative Score (Higher = Better)")

plt.tight_layout()
# NOTE(review): the next cell saves its figure under this same file name,
# so running both leaves only the later figure on disk.
plt.savefig("FullVerticalChiCrystalline.png", dpi=900, bbox_inches="tight")

plt.show()




In [None]:
import re
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# Helper: extract numeric suffix
def extract_number(model_name):
    """
    Return the integer that follows the last underscore at the end of
    *model_name*, or float('inf') when the name has no such numeric suffix.
    """
    match = re.search(r'_(\d+)$', model_name)
    if match is None:
        return float('inf')
    return int(match.group(1))

# Pair models with scores (`short_models` and `scores` come from an earlier cell)
model_scores = list(zip(short_models, scores))

# Sort by numeric suffix (ascending: 1 → N); names without a numeric suffix
# sort last because extract_number returns infinity for them
model_scores_sorted = sorted(model_scores, key=lambda x: extract_number(x[0]))

# Unzip into separate lists
sorted_models, sorted_scores = zip(*model_scores_sorted)

# Define colormap (red → yellow → green). The registry / pyplot lookup is
# used because matplotlib.cm.get_cmap no longer exists in recent Matplotlib.
try:
    cmap = plt.colormaps["RdYlGn"]
except (AttributeError, TypeError):
    cmap = plt.get_cmap("RdYlGn")
norm = mcolors.Normalize(vmin=min(sorted_scores), vmax=max(sorted_scores))

# Assign colors based on scores
colors = [cmap(norm(s)) for s in sorted_scores]

# Create figure (height grows with the number of models)
fig, ax = plt.subplots(figsize=(10, len(sorted_models) * 0.5))

# Plot horizontal bars with mapped colors
bars = ax.barh(sorted_models, sorted_scores, color=colors, edgecolor="black")

# Labels and title
ax.set_xlabel("Total Relative Score", fontsize=12)
ax.set_ylabel("Model", fontsize=12)
ax.set_title("Approval Rate: Ordered by Model Number (Red = Worst, Green = Best)", fontsize=14)
ax.grid(axis="x", alpha=0.3)

# Make tick labels smaller
ax.tick_params(axis="x", labelsize=9)
ax.tick_params(axis="y", labelsize=7)

# Add score labels at the end of each bar
for bar, score in zip(bars, sorted_scores):
    ax.text(
        bar.get_width() + max(sorted_scores) * 0.01,
        bar.get_y() + bar.get_height() / 2,
        f"{score:.3f}".replace(',', '.'),
        va="center", ha="left", fontsize=8
    )

# Remove top/bottom empty space
ax.set_ylim(-0.5, len(sorted_models) - 0.5)

# Extend X-axis so labels fit
ax.set_xlim(0, max(sorted_scores) * 1.1)

# If you want a colorbar legend, uncomment this block:
# sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
# sm.set_array([])
# cbar = fig.colorbar(sm, ax=ax)
# cbar.set_label("Relative Score (Higher = Better)")

plt.tight_layout()
# NOTE(review): the previous cell saves its figure under this same file
# name, so this call overwrites that figure.
plt.savefig("FullVerticalChiCrystalline.png", dpi=900, bbox_inches="tight")

plt.show()


<h1> L TEST </h1>

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def long_path(path):
    """
    Return *path* in Windows extended-length form when running on Windows.

    Elsewhere the argument comes back unchanged. On Windows the path is made
    absolute and receives the extended-length prefix unless it has it already.
    """
    if os.name != 'nt':
        return path

    prefix = '\\\\?\\'
    absolute = os.path.abspath(path)
    if absolute.startswith(prefix):
        return absolute
    return prefix + absolute

def compute_L_for_experiment(df_real, df_pred):
    """
    Compute the L value (mean absolute normalized residual) for one experiment:

        L = (1/N) * Σ abs[(pred - real) / (err_real)]

    Parameters
    ----------
    df_real : table with the columns "RealPolarizationD3" and
        "ErrRealPolarizationD3".
    df_pred : table with the column "PredictedPolarizationD3", row-aligned
        with ``df_real``.

    Returns
    -------
    The L value as a float. Uncertainties equal to zero are replaced by
    1e-10 for the computation only; the input tables are left unmodified.

    Raises
    ------
    ValueError
        If the two tables do not have the same number of rows.
    """
    if len(df_real) != len(df_pred):
        raise ValueError("Real and predicted files must have the same length.")

    N = len(df_real)
    predicted = np.asarray(df_pred["PredictedPolarizationD3"], dtype=float)
    real = np.asarray(df_real["RealPolarizationD3"], dtype=float)

    # np.array(..., dtype=float) always copies: the zero replacement below must
    # not write into the caller's DataFrame, and on an integer column the
    # value 1e-10 would otherwise be truncated back to 0.
    err = np.array(df_real["ErrRealPolarizationD3"], dtype=float)
    err[err == 0] = 1e-10  # avoid division by zero

    diff = predicted - real
    L = np.sum(np.abs(diff) / (err)) / N
    return L

def load_data(experiment_folder):
    """
    Load the txt files containing real and predicted data
    for a MissingPolarizationD3_* experiment.

    Parameters
    ----------
    experiment_folder : path of the experiment folder to scan.

    Returns
    -------
    (df_real, df_pred) : the two tab-separated files read with pandas.

    Raises
    ------
    FileNotFoundError
        If the folder holds no file with one of the expected name prefixes.
    """
    experiment_folder = long_path(experiment_folder)
    # Sorted so the selected file is reproducible when several share a prefix
    files = sorted(os.listdir(experiment_folder))

    def _first_with_prefix(prefix):
        # Name of the first file that starts with `prefix`
        for name in files:
            if name.startswith(prefix):
                return name
        # An explicit error instead of a bare StopIteration with no message
        raise FileNotFoundError(
            f"No file starting with '{prefix}' in {experiment_folder}"
        )

    real_file = _first_with_prefix("RawData_PolarizationD3_MissingPolarizationD3_")
    pred_file = _first_with_prefix("PredictedPoints_PolarizationD3_MissingPolarizationD3_")

    df_real = pd.read_csv(long_path(os.path.join(experiment_folder, real_file)), sep="\t")
    df_pred = pd.read_csv(long_path(os.path.join(experiment_folder, pred_file)), sep="\t")
    return df_real, df_pred

def count_missing_folders(model_path):
    """
    List, in sorted order, the names of the MissingPolarizationD3_*
    subfolders found directly inside the given model folder.
    """
    model_path = long_path(model_path)

    def _is_experiment_dir(entry):
        # Prefix test first, so the filesystem is only queried for candidates
        if not entry.startswith("MissingPolarizationD3_"):
            return False
        return os.path.isdir(long_path(os.path.join(model_path, entry)))

    return sorted(filter(_is_experiment_dir, os.listdir(model_path)))

def save_L_values_to_txt(model_folder_name, missing_folders, L_values, output_dir):
    """
    Write one "<folder name>: <L>" line per experiment to
    L/L_values_<model_folder_name>.txt under output_dir.

    The L folder is created when missing. Names and values are paired
    positionally, so both sequences must be in the same order.
    """
    target_dir = long_path(os.path.join(output_dir, "L"))
    os.makedirs(target_dir, exist_ok=True)  # ensure folder exists

    filepath = long_path(
        os.path.join(target_dir, f"L_values_{model_folder_name}.txt")
    )

    with open(filepath, "w") as f:
        f.writelines(
            f"{folder_name}: {L:.6f}\n"
            for folder_name, L in zip(missing_folders, L_values)
        )

    print(f"File saved: {filepath}")

def main_all_models(root_folder):
    """
    Main loop over all model folders in the given root_folder.
    Each folder should contain MissingPolarizationD3_* subfolders.

    For every model folder the L value of each experiment is computed and
    the results are written through save_L_values_to_txt. Experiments that
    raise an error are reported on stdout and left out of the saved file.
    """
    root_folder = long_path(root_folder)
    all_model_folders = [
        d for d in os.listdir(root_folder)
        if os.path.isdir(long_path(os.path.join(root_folder, d)))
    ]

    if not all_model_folders:
        print("No model folders found.")
        return

    for model_folder in all_model_folders:
        model_path = long_path(os.path.join(root_folder, model_folder))
        missing_folders = count_missing_folders(model_path)

        if not missing_folders:
            print(f" No MissingPolarizationD3_ folders in {model_folder}")
            continue

        # Names and values are collected together so they stay aligned even
        # when some experiments fail; pairing the full folder list with a
        # shorter value list would label values with the wrong experiment.
        scored_folders = []
        L_values = []
        for missing_folder in missing_folders:
            experiment_path = long_path(os.path.join(model_path, missing_folder))
            try:
                df_real, df_pred = load_data(experiment_path)
                L = compute_L_for_experiment(df_real, df_pred)
            except Exception as e:
                # Best effort: report the failure and go on with the next one
                print(f"   Error in {missing_folder}: {e}")
            else:
                scored_folders.append(missing_folder)
                L_values.append(L)

        if L_values:
            save_L_values_to_txt(model_folder, scored_folders, L_values, root_folder)
        else:
            print(f"No valid L values computed for {model_folder}.")

# === Example call ===
# Compute and save the L values of every model folder found in the
# current directory.
main_all_models(r".")
import os
from collections import OrderedDict

def long_path(path):
    """
    Return *path* in Windows extended-length form when running on Windows.

    Other systems get the argument back unchanged. On Windows the absolute
    path is returned, with the extended-length prefix added when missing.
    """
    on_windows = os.name == 'nt'
    if not on_windows:
        return path

    prefix = '\\\\?\\'
    absolute = os.path.abspath(path)
    return absolute if absolute.startswith(prefix) else prefix + absolute

# Location of the L value text files written by the computation above
folder_path = long_path("L")

# Early warning when that location does not exist
if not os.path.isdir(folder_path):
    print(
        f"Folder not found: {folder_path}",
        "   Tip: run the computation cell first so it creates L/ and the files.",
        sep="\n",
    )
    
# Helper to recognize result files from either naming style
def is_result_file(name: str) -> bool:
    """Return True for .txt file names that start with L_values_."""
    # The same prefix used to be tested twice (left over from a version
    # with two naming styles); a single test is equivalent.
    return name.endswith(".txt") and name.startswith("L_values_")

# Initialize dictionaries
LValues = {}                     # result file name -> {experiment name: L value}
ExperimentName = OrderedDict()   # experiment name -> "exp<i>" label

# Collect files
result_files = []
if os.path.isdir(folder_path):
    for filename in os.listdir(folder_path):
        if is_result_file(filename):
            result_files.append(filename)

# Sort for stable order
result_files.sort()

if not result_files:
    # The message used to read "matLng 'L_values_*.txt' or 'L_values_*.txt'"
    print("No results found in L/ matching 'L_values_*.txt'.")
else:
    for filename in result_files:
        full_path = long_path(os.path.join(folder_path, filename))
        with open(full_path, "r") as f:
            lines = f.readlines()

        LValues[filename] = {}
        for line in lines:
            if ":" in line:
                try:
                    # Split on the first colon only: "<experiment>: <value>"
                    experiment_name, value = line.split(":", 1)
                    experiment_name = experiment_name.strip()
                    value = float(value.strip())
                    LValues[filename][experiment_name] = value
                except ValueError:
                    print(f"Skipping malformed line in {filename}: {line.strip()}")

        # Fill ExperimentName only from the first processed file
        if not ExperimentName and LValues[filename]:
            for i, experiment_name in enumerate(LValues[filename].keys(), 1):
                ExperimentName[experiment_name] = f"exp{i}"





# One line per result file: L value against experiment position.
plt.figure(figsize=(14, 7))

# List of experiment identifiers (exp1, exp2, ...)
exp_ids = list(ExperimentName.values())

# Ensure we have at least 2 models
assert len(LValues) >= 2, "You need at least two L values files"

# List of files to plot (model names)
files = list(LValues.keys())

# Look the colormap up through the registry, falling back to pyplot
try:
    cmap = plt.colormaps.get_cmap('tab20')  # new API
except AttributeError:
    cmap = plt.get_cmap('tab20')  # old API

# Colours and line styles are cycled with a modulo in the loop below
colors = cmap.colors
line_styles = ['-', '--', ':', '-.', (0, (5, 1)), (0, (3, 1, 1, 1))]
line_widths = [2] * len(files)

# Plot the curves
for j, filename in enumerate(files):
    # Values of this model in registered experiment order
    L_values = [
        LValues[filename].get(exp_name, np.nan)  # Use NaN if missing
        for exp_name in ExperimentName.keys()
    ]
    
    plt.plot(
        range(len(exp_ids)),
        L_values,
        color=colors[j % len(colors)],
        linestyle=line_styles[j % len(line_styles)],
        linewidth=line_widths[j],
        label=filename
    )

# Show xticks every 10 experiments
xticks_pos = [i for i in range(len(exp_ids)) if (i + 1) % 10 == 0]
xticks_labels = [exp_ids[i] for i in xticks_pos]
plt.xticks(xticks_pos, xticks_labels, rotation=45, ha='right')

# Labels and title
plt.xlabel("Experiment", fontsize=12)
plt.ylabel(r"$L$", fontsize=12)
plt.title(r"L values per experiment (model comparison)", fontsize=14)
plt.grid(alpha=0.2, linestyle='--')

# Legend outside the plot
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()
import numpy as np

# Normalize function (replace "-" by "_")
def normalize_key(k):
    """Return *k* with every hyphen turned into an underscore."""
    return "_".join(k.split("-"))

# Same nested mapping as LValues, with normalized experiment names
LValues_norm = {
    model: {normalize_key(exp): val for exp, val in per_experiment.items()}
    for model, per_experiment in LValues.items()
}

# Normalized experiment name -> original experiment name
ExperimentName_norm = {normalize_key(exp): exp for exp in ExperimentName}

# 2D array: one row per model, one column per (normalized) experiment;
# NaN marks an experiment a model has no value for
all_L_values = np.array([
    [per_experiment.get(exp, np.nan) for exp in ExperimentName_norm]
    for per_experiment in LValues_norm.values()
])

# Original experiment names, in the column order of the matrix
exp_ids = list(ExperimentName_norm.values())

# Model names, in the row order of the matrix
model_ids = list(LValues_norm)


import matplotlib.pyplot as plt
import numpy as np

# --- Build L matrix ---
# Rows follow `files` (one result file per model), columns follow `exp_ids`.
files = list(LValues.keys())   # Model names (files)
nb_models = len(files)

exp_ids = list(ExperimentName.keys())  # Experiments in original order

all_L_values = []
for f in files:
    model_vals = []
    for exp in exp_ids:
        if exp not in LValues[f]:
            print(f"Warning: experiment '{exp}' missing for model '{f}'")
            model_vals.append(np.nan)  # fill missing with NaN
        else:
            model_vals.append(LValues[f][exp])
    all_L_values.append(model_vals)

all_L_values = np.array(all_L_values)

# --- Compute average L across models for each experiment ---
avg_L = np.nanmean(all_L_values, axis=0)  # ignore NaN when averaging

# --- Colors ---
try:
    cmap = plt.colormaps['tab20']   # new API (>=3.5)
except (AttributeError, TypeError):
    cmap = plt.get_cmap('tab20')    # old API

# Wrap the indices around the number of colormap entries: an integer index
# beyond the last entry is clipped by the colormap, which would give every
# model past the palette size the same colour.
colors = cmap(np.arange(nb_models) % cmap.N)

# --- Plot deviation from the average ---
plt.figure(figsize=(18, 12))

for j in range(nb_models):
    diff_from_avg = all_L_values[j] - avg_L
    plt.plot(
        range(1, len(exp_ids) + 1),  # experiments indexed starting at 1
        diff_from_avg,
        linewidth=2,
        color=colors[j],
        label=files[j]
    )

# X-ticks every 10 experiments
xticks_pos = [k + 1 for k in range(len(exp_ids)) if (k + 1) % 10 == 0]
plt.xticks(xticks_pos, rotation=0)

plt.xlabel("Experiment Index", fontsize=12)
plt.ylabel("ΔL (model - average)", fontsize=12)
plt.title("Deviation of L values from experiment-wise average", fontsize=14)
plt.grid(alpha=0.3)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
import os

# Destination folder for the L-metric summaries
output_folder = long_path("L")
os.makedirs(output_folder, exist_ok=True)

# Number of experiments won by each model, starting at zero
model_scores = dict.fromkeys(files, 0)

# One column of all_L_values per experiment: the model with the smallest
# non-NaN L wins it. Experiments with no data for any model are skipped.
for column in all_L_values.T:
    if not np.all(np.isnan(column)):
        model_scores[files[np.nanargmin(column)]] += 1

# Most wins first (ties keep the order of `files`)
sorted_scores = sorted(model_scores.items(), key=lambda item: -item[1])

# Write the ranking to a text file
output_file = os.path.join(output_folder, "L_NumberOfExperimentsWhereEachModelOutperforms.txt")
with open(output_file, "w") as f:
    f.write("Approval Rate (Number of wins by minimal L. The higher the better):\n\n")
    f.writelines(f"{model}: {score} points\n" for model, score in sorted_scores)

print(f"Results saved to {output_file}")
import os
import numpy as np

# Make sure the output folder for the L metric exists
output_folder = long_path("L")
os.makedirs(output_folder, exist_ok=True)

# Rebuild the L matrix from LValues
files = list(LValues.keys())            # model names (matrix rows)
exp_ids = list(ExperimentName.keys())   # experiments in original order (columns)

# Experiments a model has no value for are stored as NaN
all_L_values = np.array([
    [LValues[name].get(exp, np.nan) for exp in exp_ids]
    for name in files
])

# Accumulated rank per model: 1 for the smallest L of an experiment,
# 2 for the next one, and so on
model_scores = dict.fromkeys(files, 0)

for column in all_L_values.T:
    # argsort places NaN last, so missing values receive the worst ranks.
    # NOTE(review): a column that is NaN for every model still hands out
    # ranks here, unlike the win-count cell which skips such experiments.
    ordering = np.argsort(column)
    for position, model_idx in enumerate(ordering, start=1):
        model_scores[files[model_idx]] += position

# Lowest accumulated rank first (lower is better)
sorted_scores = sorted(model_scores.items(), key=lambda item: item[1])

# Write the ranking to a text file
output_file = long_path(os.path.join(output_folder, "L_AccumulativeScores.txt"))
with open(output_file, "w") as f:
    f.write("Approval Rate (Score by rank. Lower is better):\n\n")
    f.writelines(f"{model}: {score} points\n" for model, score in sorted_scores)

print(f"Scores saved to: {output_file}")
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import os

# Make sure the output folder exists
output_folder = long_path("L")
os.makedirs(output_folder, exist_ok=True)

# Relative score per model. For every experiment the best model (smallest L)
# contributes 1, every other model contributes min_L / L (< 1), and a model
# without a value contributes 0.
model_scores = dict.fromkeys(files, 0.0)

for column in all_L_values.T:
    # Experiments with no data for any model contribute nothing
    if np.all(np.isnan(column)):
        continue

    best_L = np.nanmin(column)
    for model_name, value in zip(files, column):
        contribution = 0.0 if np.isnan(value) else best_L / value
        model_scores[model_name] += contribution

# Average over the total number of experiments
n_experiments = len(exp_ids)
for model_name in model_scores:
    model_scores[model_name] = model_scores[model_name] / n_experiments

# Highest relative score first (higher is better)
sorted_scores = sorted(model_scores.items(), key=lambda item: item[1], reverse=True)

# Write the ranking to a text file
output_file = os.path.join(output_folder, "L_RelativeNormalizedScores.txt")
with open(output_file, "w") as f:
    f.write("Relative Score (Normalized L per experiment. Higher is better):\n\n")
    f.writelines(f"{model}: {score:.4f}\n" for model, score in sorted_scores)

print(f"Normalized relative scores saved to {output_file}")

# -------------------------------
# Vertical bar chart: Rank-based scoring (lower=better)
# -------------------------------
# `model_scores` was overwritten earlier in this cell with the *relative*
# scores (higher = better), so it must not be plotted here. Recompute the
# accumulated rank per model from the L matrix: within each experiment the
# smallest L gets rank 1, the next one rank 2, and so on (NaN sorts last).
rank_scores = {name: 0 for name in files}
for L_column in all_L_values.T:
    for rank, model_idx in enumerate(np.argsort(L_column), start=1):
        rank_scores[files[model_idx]] += rank

sorted_scores_rank = sorted(rank_scores.items(), key=lambda x: x[1])  # best (lowest) first
models = [m for m, _ in sorted_scores_rank]
scores = [s for _, s in sorted_scores_rank]

norm = plt.Normalize(min(scores), max(scores))
try:
    cmap = plt.colormaps['RdYlGn_r']   # new API (>=3.5)
except (AttributeError, TypeError):
    cmap = plt.get_cmap('RdYlGn_r')    # old API

# Reversed colormap: low (good) rank totals are green, high ones red
colors = cmap(norm(scores))
# Extract short names for X-axis (last two "_"-separated parts of the name)
short_models = [m.replace("L_values_AllTestsFolder_", "").replace(".txt", "") for m in models]
short_models = ["_".join(m.split("_")[-2:]) for m in short_models]

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(short_models, scores, color=colors)

# Add score labels above bars
for bar, score in zip(bars, scores):
    ax.text(
        bar.get_x() + bar.get_width() / 2,     # center above bar
        score + score * 0.02,                  # offset above bar
        f"{score:d}",                          # rank totals are integers
        ha='center', va='bottom',
        fontsize=8                             # smaller font
    )
ymax = max(scores)
ax.set_ylim(0, ymax * 1.1)
# Colorbar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array(scores)
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label("Rank Score (Lower = Better)")

ax.set_ylabel("Total Rank Score", fontsize=12)
ax.set_xlabel("Model", fontsize=12)
ax.set_title("Approval Rate: Rank-Based Scoring (L, Green=Better, Red=Worse)", fontsize=14)
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(long_path(os.path.join(output_folder, "L_rank_vertical.png")), dpi=600, bbox_inches='tight')
plt.show()
import matplotlib.pyplot as plt
import numpy as np

# --- Data for the relative scoring graph ---
# model_scores is expected to hold the normalized/relative scores here;
# the best (highest) model comes first
sorted_scores_rel = sorted(model_scores.items(), key=lambda item: item[1], reverse=True)

models = [entry[0] for entry in sorted_scores_rel]
scores = [entry[1] for entry in sorted_scores_rel]

# X-axis labels: drop the common prefix and the extension, then keep only
# the last two "_"-separated parts (Name_number)
short_models = [
    "_".join(name.replace("L_values_AllTestsFolder_", "").replace(".txt", "").split("_")[-2:])
    for name in models
]

# Colour scale spanning the observed scores: green = high, red = low
norm = plt.Normalize(min(scores), max(scores))
try:
    cmap = plt.colormaps['RdYlGn']   # new API (>=3.5)
except (AttributeError, TypeError):
    cmap = plt.get_cmap('RdYlGn')    # old API
colors = cmap(norm(scores))

# --- Figure ---
fig, ax = plt.subplots(figsize=(14, 6))  # wide, to leave room for long names
bars = ax.bar(short_models, scores, color=colors, edgecolor='black')

# Value label slightly above each bar (3 decimals, decimal comma)
for bar, score in zip(bars, scores):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = score + score * 0.02
    label_text = f"{score:.3f}".replace('.', ',')
    ax.text(label_x, label_y, label_text, ha='center', va='bottom', fontsize=8)

# Colorbar tied to the same colormap and normalization as the bars
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array(scores)
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label("Relative Score (Higher = Better)")

# Labels, title and grid
ax.set_ylabel("Total Relative Score (averaged over experiments)", fontsize=12)
ax.set_xlabel("Model", fontsize=12)
ax.set_title("Approval Rate: Normalized Relative Scoring (L, Green = Best, Red = Worst)", fontsize=14)
ax.grid(axis='y', alpha=0.3)

# Slanted tick labels for readability
plt.xticks(rotation=45, ha='right')

# Head-room above the tallest bar for its label
ax.set_ylim(0, max(scores) * 1.1)

plt.tight_layout()
plt.savefig(long_path(os.path.join(output_folder, "L_relative_vertical.png")), dpi=600, bbox_inches='tight')
plt.show()


<h1> PLOTS COMPARING THE MODELS</h1>

In [None]:
import os, glob, re
from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image
import math
# Search from the current working directory; collages go into a subfolder
search_root = Path.cwd()
output_dir = search_root.joinpath("ExperimentComparison")
output_dir.mkdir(exist_ok=True)

# Recursively collect every Missing*.jpg below the search root
image_pattern = str(search_root.joinpath("**", "Missing*.jpg"))
all_images = glob.glob(image_pattern, recursive=True)
print(f"Found {len(all_images)} images total.")
def best_subplot_shape(n):
    """
    Given n images, return (rows, cols) for a balanced subplot grid.

    The number of columns is the ceiling of sqrt(n) and the number of rows
    is the smallest count that still fits n panels, so rows * cols >= n.
    For n < 1 a 1x1 grid is returned, so callers always get a shape that
    can be passed to ``plt.subplots``.
    """
    if n < 1:
        # Nothing to show: avoid dividing by zero columns below
        return 1, 1
    # Start with square root
    cols = math.ceil(math.sqrt(n))
    rows = math.ceil(n / cols)
    return rows, cols
#"Naif834", "Naif838", "Naif8316", "Naif8332", "Naif854", "Naif858", "Naif8516", "Naif8532", "Naif8104", "Naif8108", "Naif81016", "Naif81032",
# , "Naif2434", "Naif2438", "Naif24316", "Naif24332", "Naif2454", "Naif2458", "Naif24516", "Naif24532", "Naif24104", "Naif24108", "Naif241016", "Naif241032"

# Model identifiers in the order their panels should appear in each collage
desired_order = ["NaifTwice1D483316", "NaifTwice1D483332", "NaifTwice1D483516", "NaifTwice1D483532", "NaifTwice1D485316", "NaifTwice1D485332", "NaifTwice1D485516", "NaifTwice1D485532", "NaifTwice1D443316", "NaifTwice1D443332", "NaifTwice1D443516", "NaifTwice1D443532", "NaifTwice1D445316", "NaifTwice1D445332", "NaifTwice1D445516", "NaifTwice1D445532", "NaifTwice1D883316", "NaifTwice1D883332", "NaifTwice1D883516", "NaifTwice1D883532", "NaifTwice1D885316", "NaifTwice1D885332", "NaifTwice1D885516", "NaifTwice1D885532", "NaifTwice1D843316", "NaifTwice1D843332", "NaifTwice1D843516", "NaifTwice1D843532", "NaifTwice1D845316", "NaifTwice1D845332", "NaifTwice1D845516", "NaifTwice1D845532"]

# Alternative orderings kept for reference (other model families):
#["NaifTwice1D883316", "NaifTwice1D883332", "NaifTwice1D883516", "NaifTwice1D883532", "NaifTwice1D885316", "NaifTwice1D885332", "NaifTwice1D885516", "NaifTwice1D885532", "NaifTwice1D843316", "NaifTwice1D843332", "NaifTwice1D843516", "NaifTwice1D843532", "NaifTwice1D845316", "NaifTwice1D845332", "NaifTwice1D845516", "NaifTwice1D845532"]

#["Naif1634", "Naif1638", "Naif16316", "Naif16332", "Naif1654", "Naif1658", "Naif16516", "Naif16532", "Naif16104", "Naif16108", "Naif161016", "Naif161032"]

# Position of every model identifier within desired_order (name -> index)
order_index = {name: i for i, name in enumerate(desired_order)}

def extract_naif_name(folder_name: str) -> "str | None":
    """Extract the model identifier that starts with 'Naif' from a folder name.

    The identifier is 'Naif' followed by letters/digits and ending in a
    digit, e.g. 'Naif81016' from 'CrystallineAllTestsFolder_Naif81016_3' or
    'NaifTwice1D483316' from 'CrystallineAllTestsFolder_NaifTwice1D483316_3'.
    The previous pattern (``Naif\\d+``) required digits directly after
    'Naif' and therefore never matched the 'NaifTwice1D...' identifiers.

    Returns None when the folder name contains no such identifier.
    """
    import re
    # Underscores are not part of the identifier, so a trailing run suffix
    # such as '_3' is left out of the match.
    match = re.search(r"Naif[A-Za-z0-9]*\d+", folder_name)
    return match.group(0) if match else None

def shorten_exp_id(exp_id: str) -> str:
    """Return a shortened experiment id.

    The id is split on "_"; the first part is dropped and everything from
    the "MillerIndex" part onwards (when present) is dropped as well. The
    remaining parts are joined back with "_".
    """
    tokens = exp_id.split("_")
    stop = tokens.index("MillerIndex") if "MillerIndex" in tokens else len(tokens)
    return "_".join(tokens[1:stop])

# --- Group images by experiment ---
# The experiment id is the parent folder's name when that folder starts with
# "Missing"; otherwise it is the image's own file name without extension.
groups = {}
for image_file in map(Path, all_images):
    folder_name = image_file.parent.name
    exp_id = folder_name if folder_name.startswith("Missing") else image_file.stem
    if exp_id not in groups:
        groups[exp_id] = []
    groups[exp_id].append(image_file)

print(f"Found {len(groups)} different experiments.")


# Current working directory (assumed to be the folder the notebook lives in)
ssearch_root = Path.cwd()

# Immediate sub-directories only (plain files are ignored)
sall_folders = [entry for entry in ssearch_root.iterdir() if entry.is_dir()]

# Every folder whose name contains "AllTestsFolder" counts as one model
DIM = len([folder for folder in sall_folders if "AllTestsFolder" in folder.name])
print(f"Found {DIM} different models.")

# --- Build collages: one figure per experiment, one panel per image ---
# Windows extended-length path prefix. It is only valid on Windows; on other
# systems prefixing would make every path point to a non-existent file.
_win_prefix = "\\\\?\\" if os.name == "nt" else ""
# Sort position for images whose model is not listed in desired_order
_unlisted_rank = len(order_index)

for exp_id, paths in groups.items():
    # Order panels following desired_order; images whose model is not listed
    # go last. sorted() is stable, so images sharing a position keep the
    # order in which they were found.
    paths_sorted = sorted(
        paths,
        key=lambda p: order_index.get(extract_naif_name(p.parents[1].name), _unlisted_rank),
    )

    # Normally one panel per model folder (DIM), so all collages share the
    # same grid. Grow the grid if an experiment has more images than that,
    # instead of silently dropping them, and never request an empty grid.
    count = max(DIM, len(paths_sorted), 1)
    rows, cols = best_subplot_shape(count)

    # squeeze=False always returns a 2D array of Axes, even for a 1x1 grid
    fig, axes = plt.subplots(rows, cols, figsize=(cols, rows), squeeze=False)
    axes = axes.flatten()

    for ax, img_path in zip(axes, paths_sorted):
        lp = _win_prefix + os.path.abspath(str(img_path))
        if not os.path.exists(lp):
            print(f"WARNING: File not found: {lp}")
            ax.axis("off")
            continue

        # Close the image file as soon as it has been drawn
        with Image.open(lp) as img:
            ax.imshow(img)
        ax.axis("off")

    # Hide the panels that received no image
    for ax in axes[len(paths_sorted):]:
        ax.axis("off")

    short_id = shorten_exp_id(exp_id)
    fig.tight_layout()
    save_name = output_dir / f"{short_id}.png"
    save_path = _win_prefix + os.path.abspath(str(save_name))
    fig.savefig(save_path, bbox_inches="tight", dpi=1200)
    plt.close(fig)

    print(f"Saved {save_name}")


