In [None]:
import numpy as np
from scipy import stats, signal
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import glob
import pandas as pd
from sklearn.utils import shuffle

In [None]:
# Collect every .xlsx workbook under /content in sorted (deterministic) order.
# Adjust the glob pattern if the files live somewhere else.
file_list = sorted(glob.glob('/content/*.xlsx'))

In [None]:
# Load the spectra from the first workbook found and drop the two non-spectral
# columns.
# index_col=0 turns the workbook's unnamed first column into the frame index.
# Without it pandas loads that column as data named "Unnamed: 0" (it holds the
# row counter 0, 1, 2, ... in the preview), and it would then be treated as a
# spectral channel by the per-row min-max scaling and by every metric.
real_spectras = pd.read_excel(
    file_list[0], engine='openpyxl', sheet_name='realSpectra', index_col=0
).drop(columns=['Output', 'Real'])
# NOTE(review): this reads the same 'realSpectra' sheet as real_spectras, so the
# "synthetic" frame is a copy of the real data. This looks like a copy-paste
# slip - confirm the name of the sheet holding the synthetic spectra and use it
# here.
synt_spectras = pd.read_excel(
    file_list[0], engine='openpyxl', sheet_name='realSpectra', index_col=0
).drop(columns=['Output', 'Real'])

In [None]:
# Preview the first five rows of the real spectra as a sanity check.
real_spectras.head(n=5)

Unnamed: 0,467.3957824707031,467.5306091308594,467.6654052734375,467.8002014160156,467.9349975585938,468.0697631835938,468.2045288085938,468.3392639160156,468.4739990234375,468.6087036132812,...,708.0553588867188,708.1543579101562,708.2533569335938,708.3523559570312,708.4512939453125,708.5502319335938,708.649169921875,708.748046875,708.846923828125,708.9457397460938
0,772.0,738.5,767.5,744.0,774.5,747.5,751.0,766.5,769.5,741.0,...,1193.5,1134.5,1194.0,1185.5,1158.0,1137.5,1194.0,1147.0,1211.5,1167.5
1,641.25,624.5,641.0,633.5,631.75,649.25,622.0,649.0,638.0,644.75,...,815.0,805.5,777.5,760.75,791.0,796.5,818.0,795.5,782.5,814.75
2,683.25,709.5,669.25,708.0,703.75,694.25,691.0,701.75,721.25,706.75,...,946.0,929.25,936.0,1002.75,922.0,951.25,961.5,972.0,954.25,932.25
3,679.0,690.75,710.25,672.75,715.0,713.25,710.25,660.25,711.75,687.25,...,1030.75,1023.75,995.75,1043.0,1008.0,1001.5,983.25,1056.0,1032.75,977.25
4,666.25,659.25,677.75,704.0,671.25,689.0,688.5,679.5,677.25,679.25,...,875.5,898.25,885.0,902.25,909.5,881.0,886.0,868.75,893.0,889.25


In [None]:
# Shuffle the synthetic rows with a fixed seed so that a fresh
# "Restart & Run All" reproduces exactly the same row order.
SHUFFLE_SEED = 42
synt_spectras = shuffle(synt_spectras, random_state=SHUFFLE_SEED)

In [None]:
# Preprocessing pipeline
def preprocess(spectra):
    """Min-max scale every spectrum (row) independently.

    Parameters
    ----------
    spectra : array-like, shape (n_spectra, n_channels)
        Anything ``np.asarray`` can turn into a 2-D numeric array
        (ndarray, DataFrame, nested lists).

    Returns
    -------
    numpy.ndarray
        Float array of the same shape. Each row is mapped so that its minimum
        becomes 0 and its maximum becomes 1; a row whose values are all equal
        is returned as all zeros.
    """
    # Work in floating point. With an integer input the 1e-8 fallback below
    # would be truncated to 0 on assignment, and constant rows would then be
    # divided by zero and come back as NaN.
    spectra = np.asarray(spectra, dtype=float)
    min_val = np.min(spectra, axis=1, keepdims=True)
    max_val = np.max(spectra, axis=1, keepdims=True)
    denom = max_val - min_val
    # Constant rows have zero spread; use a tiny divisor so they map to 0.
    denom[denom == 0] = 1e-8
    return (spectra - min_val) / denom

# Apply the per-spectrum scaling to both datasets (real first, then synthetic).
real_spectra, synthetic_set = preprocess(real_spectras), preprocess(synt_spectras)

def calculate_metrics(spectra, reference=None):
    """Score every spectrum against a reference spectrum.

    Parameters
    ----------
    spectra : array-like, shape (n_spectra, n_channels)
        One spectrum per row (ndarray, DataFrame or nested lists).
    reference : array-like, shape (n_channels,), optional
        Spectrum to compare against. Defaults to the channel-wise mean of
        ``spectra``.

    Returns
    -------
    dict of str -> list
        One value per input spectrum under each key:

        * ``'RMSE'`` - root-mean-square error between reference and spectrum.
        * ``'Mean'`` - mean value of the spectrum.
        * ``'Chi2'`` - despite the key name, this is the Kullback-Leibler
          divergence KL(reference || spectrum) of the sum-normalised spectra
          after clipping to [1e-8, 1 - 1e-8]. The key is kept unchanged
          because other code reads it.
        * ``'Frequency'`` - cosine similarity between the FFT magnitudes of
          reference and spectrum (0.0 when either magnitude vector is all
          zeros, where the similarity is undefined).

    Raises
    ------
    ValueError
        If ``reference`` does not have one value per channel of ``spectra``.
    """
    metrics = {'RMSE': [], 'Mean': [], 'Chi2': [], 'Frequency': []}

    # Convert once so lists and DataFrames behave like arrays (iterating a
    # DataFrame directly would yield column labels, not rows).
    spectra = np.asarray(spectra, dtype=float)
    if spectra.size == 0:
        return metrics

    if reference is None:
        reference = spectra.mean(axis=0)
    reference = np.asarray(reference, dtype=float)
    if reference.shape != spectra.shape[1:]:
        raise ValueError(
            f"reference has shape {reference.shape}, expected {spectra.shape[1:]}"
        )

    # Everything that depends only on the reference is computed once here
    # instead of once per spectrum.
    ref_total = reference.sum()
    r_norm = reference / ref_total if ref_total > 0 else np.zeros_like(reference)
    # Clip so that neither the ratio nor the logarithm below can hit zero.
    r_norm = np.clip(r_norm, 1e-8, 1 - 1e-8)
    fft_real = np.abs(np.fft.fft(reference))
    fft_real_norm = np.linalg.norm(fft_real)

    for s in spectra:
        # RMSE (vs reference spectrum)
        metrics['RMSE'].append(np.sqrt(np.mean((reference - s) ** 2)))

        metrics['Mean'].append(np.mean(s))

        # KL divergence (vs reference spectrum), stored under the 'Chi2' key
        s_total = s.sum()
        s_norm = s / s_total if s_total > 0 else np.zeros_like(s)
        s_norm = np.clip(s_norm, 1e-8, 1 - 1e-8)
        metrics['Chi2'].append(np.sum(r_norm * np.log(r_norm / s_norm)))

        # Frequency similarity (vs reference spectrum)
        fft_synth = np.abs(np.fft.fft(s))
        norm_product = fft_real_norm * np.linalg.norm(fft_synth)
        if norm_product == 0:
            # An all-zero magnitude vector would give 0/0 = NaN and poison
            # every average computed from this list.
            cos_sim = 0.0
        else:
            cos_sim = np.dot(fft_real, fft_synth) / norm_product
        metrics['Frequency'].append(cos_sim)
    return metrics

def _scale_to_real_range(value, low, high):
    """Map ``value`` linearly so that ``low`` -> 0 and ``high`` -> 1 (0.5 if low == high)."""
    span = high - low
    if span == 0:
        return 0.5
    return (value - low) / span


def compare_datasets(real_metrics, synth_metrics):
    """Print a per-metric and composite comparison of two metric dictionaries.

    Parameters
    ----------
    real_metrics, synth_metrics : dict of str -> sequence of float
        Per-spectrum metric values. Both must contain the keys ``'RMSE'``,
        ``'Chi2'``, ``'Frequency'`` and ``'Mean'``, and ``synth_metrics`` must
        contain every key present in ``real_metrics``.

    Returns
    -------
    None
        The comparison is written to standard output only.

    Notes
    -----
    Each dataset's average is scaled with the min/max of the *real* per-spectrum
    values, so the synthetic scores can fall outside [0, 1]. The composite is
    the weighted *sum* of the four scaled averages (it is not divided by the
    total weight).
    """
    # Calculate averages
    real_avg = {k: np.mean(v) for k, v in real_metrics.items()}
    synth_avg = {k: np.mean(v) for k, v in synth_metrics.items()}

    # Scale both datasets' averages with the range observed in the real data.
    metrics = ['RMSE', 'Chi2', 'Frequency', 'Mean']
    min_real = {m: min(real_metrics[m]) for m in metrics}
    max_real = {m: max(real_metrics[m]) for m in metrics}
    real_norm = {
        m: _scale_to_real_range(real_avg[m], min_real[m], max_real[m]) for m in metrics
    }
    synth_norm = {
        m: _scale_to_real_range(synth_avg[m], min_real[m], max_real[m]) for m in metrics
    }

    # Composite metric (weighted sum of the scaled averages).
    # NOTE(review): all four terms are added with the same sign even though the
    # metrics need not share a "higher is better" direction - confirm intended.
    weights = {'RMSE': 1, 'Chi2': 1, 'Frequency': 1, 'Mean': 1}  # Adjust weights as needed
    real_composite = sum(weights[m] * real_norm[m] for m in metrics)
    synth_composite = sum(weights[m] * synth_norm[m] for m in metrics)

    # Print comparison with composite metric
    header = f"{'Metric':<20} {'Real Avg':<10} {'Synth Avg':<10} {'Difference':<10}"
    print("Dataset Comparison:")
    print(header)
    print("-" * len(header))
    for metric in real_avg.keys():
        diff = synth_avg[metric] - real_avg[metric]
        # Pad the numeric cells to the header's column widths so the table
        # lines up; the last cell is left unpadded to avoid trailing spaces.
        print(f"{metric:<20} {real_avg[metric]:<10.6f} {synth_avg[metric]:<10.6f} {diff:.6f}")

    print("\nComposite Metrics:")
    print(f"Real Composite: {real_composite:.6f}")
    print(f"Synthetic Composite: {synth_composite:.6f}")
    print(f"Difference: {synth_composite - real_composite:.6f}")

# Script-style entry point: calls main() when this code runs as the top-level
# module.
# NOTE(review): no main() is defined in the cells visible here. Unless it is
# defined elsewhere, this call raises NameError - confirm, or replace it with
# the intended calculate_metrics / compare_datasets calls.
if __name__ == "__main__":
    main()