# In [None]:
"""
Train a MultiSensorRegressor on *all* fold CSVs in `folds_dir`
and evaluate on the official leaderboard set
`TASK1_Leaderboard_ActualValue.csv`, reporting row-wise cosine distance
and row-wise Pearson correlation.

Only minimal code changes compared with the original cross‑validation script:
    • concatenate all folds into one training DataFrame
    • new validator that runs on the leaderboard file
    • loop over the hyper-parameter grid (threshold, top_k, n_estimators,
      max_depth, GSLF version) instead of over folds

Everything else stays the same.
"""

# ────────────────────────────────────────────────────────────────────────────────
# Imports
# ────────────────────────────────────────────────────────────────────────────────
import itertools
import os
import tempfile

import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity  # noqa: F401 (kept for compatibility)
from tqdm.notebook import tqdm

# ────────────────────────────────────────────────────────────────────────────────
# Helper functions
# ────────────────────────────────────────────────────────────────────────────────
def load_all_folds(folds_dir: str) -> pd.DataFrame:
    """Read every CSV file in `folds_dir` and stack them into one DataFrame.

    Files are selected by a case-insensitive ``.csv`` suffix and read in
    sorted path order; the stacked result gets a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: if `folds_dir` contains no CSV file.
    """
    fold_paths = []
    for entry in os.listdir(folds_dir):
        if entry.lower().endswith(".csv"):
            fold_paths.append(os.path.join(folds_dir, entry))

    # Guard clause: nothing to train on.
    if len(fold_paths) == 0:
        raise FileNotFoundError(f"No CSV files found in {folds_dir}")

    # Sort so the row order of the result does not depend on listing order.
    fold_paths.sort()
    frames = list(map(pd.read_csv, fold_paths))
    return pd.concat(frames, ignore_index=True)


def _rowwise_cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Return the cosine distance between matching rows of `a` and `b`.

    Only the N paired rows are compared (no N x N matrix is built).
    A row with zero norm yields NaN for that pair.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    dots = (a * b).sum(axis=1)
    norms = np.sqrt((a * a).sum(axis=1)) * np.sqrt((b * b).sum(axis=1))
    with np.errstate(divide="ignore", invalid="ignore"):
        similarity = dots / norms
    # Clip guards against rounding pushing |cos| marginally above 1.
    return 1.0 - np.clip(similarity, -1.0, 1.0)


def _safe_pearson(u: np.ndarray, v: np.ndarray) -> float:
    """Return Pearson's r of `u` and `v`, or 0.0 when it is undefined."""
    try:
        corr, _ = pearsonr(u, v)
    except ValueError:
        # pearsonr rejects inputs it cannot handle (e.g. fewer than 2 values).
        return 0.0
    # Constant input gives NaN; report it as "no correlation".
    return 0.0 if np.isnan(corr) else float(corr)


def validate_on_leaderboard(model: "MultiSensorRegressor", leaderboard_path: str) -> dict:
    """
    Predict sensors for the leaderboard set and score them row by row.

    Args:
        model: object exposing `combined_csv_path`, `stimulus_file` and
            `predict_new_data(csv_path, return_df=True)`.
        leaderboard_path: CSV with a `stimulus` column plus the true sensor
            columns.

    Returns:
        dict with keys
        ``cosine_distance_per_stimulus`` (stimulus -> cosine distance),
        ``mean_cosine_distance``,
        ``pearson_correlation_per_stimulus`` (stimulus -> Pearson r) and
        ``mean_pearson_correlation``.

    Raises:
        ValueError: if `model.combined_csv_path` is not set, if predictions
            and ground truth share no sensor column, or if they differ in
            row count (row-wise metrics would be misaligned).
    """
    if not model.combined_csv_path:
        raise ValueError("`combined_csv_path` not set. Call "
                         "`create_combined_csv_path_all_folds()` first.")

    # 1. Load leaderboard file and attach labels
    test_df = pd.read_csv(leaderboard_path)
    stim_def = pd.read_csv(model.stimulus_file)[['stimulus', 'molecule', 'Intensity_label']]
    test_df = test_df.merge(stim_def, on='stimulus', how='left')

    # Ground truth = every numeric column that is not a label/rating column.
    true_sensor_cols = (test_df
                        .drop(columns=['Intensity', 'Pleasantness',
                                       'stimulus', 'molecule', 'Intensity_label'],
                              errors='ignore')
                        .select_dtypes(include=[np.number]))

    # 2. Reserve a temp CSV path; the handle is closed before anything is
    #    written so the path can be reopened by name.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as tmp:
        tmp_path = tmp.name

    try:
        # The write sits inside the try so a failure cannot leak the file.
        # The CSV holds only the stimulus column (what predict_new_data is given).
        test_df[['stimulus']].to_csv(tmp_path, index=False)

        # 3. Run inference via existing function
        pred_df = model.predict_new_data(tmp_path, return_df=True)

        common_cols = [c for c in true_sensor_cols.columns if c in pred_df.columns]
        if not common_cols:
            raise ValueError(
                "Predictions share no sensor columns with the leaderboard file."
            )
        if len(pred_df) != len(test_df):
            raise ValueError(
                f"Prediction row count ({len(pred_df)}) does not match "
                f"leaderboard row count ({len(test_df)}); cannot score row-wise."
            )

        true_vals = true_sensor_cols[common_cols].to_numpy(dtype=float)
        pred_vals = pred_df[common_cols].to_numpy(dtype=float)

        # Row-wise cosine distance (0 = identical, 2 = opposite)
        row_cos_dists = _rowwise_cosine_distance(true_vals, pred_vals)

        # Row-wise Pearson correlation (0.0 where undefined)
        row_pearson_corrs = np.array(
            [_safe_pearson(t, p) for t, p in zip(true_vals, pred_vals)]
        )

        return {
            "cosine_distance_per_stimulus": dict(zip(test_df["stimulus"], row_cos_dists)),
            "mean_cosine_distance": float(row_cos_dists.mean()),
            "pearson_correlation_per_stimulus": dict(zip(test_df["stimulus"], row_pearson_corrs)),
            "mean_pearson_correlation": float(row_pearson_corrs.mean())
        }
    finally:
        os.unlink(tmp_path)


# ────────────────────────────────────────────────────────────────────────────────
# Paths & constants
# ────────────────────────────────────────────────────────────────────────────────
# Input/output locations passed to the model below; all are relative paths.
base_paths = {
    "stimulus_file": "TASK1_Stimulus_definition.csv",
    "folds_dir": "folds",
    "descriptors_file": "pom_v11.csv",
    "task2_path": "T2_Combined_Dataset.csv",
    "output_folder": "DATA",
}

# Leaderboard CSV holding the actual values the predictions are scored against.
LEADERBOARD_PATH = "TASK1_Leaderboard_ActualValue.csv"



# ────────────────────────────────────────────────────────────────────────────────
# Training + validation
# ────────────────────────────────────────────────────────────────────────────────
# Per-run leaderboard metrics; one entry is appended per parameter combination.
means_cosine_dist = []
means_pearson_corr = []

# Hyper-parameter grid: every list holds the candidate values for one knob.
top_ks = [47]
thresholds = [0.27]
n_estimatorss = [220]
max_depths = [24]
GSLF_versions = ['']

# Cartesian product of the grid as (threshold, top_k, n_estimators, max_depth,
# GSLF_version) tuples; `threshold` varies slowest, `GSLF_version` fastest.
parameters = list(
    itertools.product(thresholds, top_ks, n_estimatorss, max_depths, GSLF_versions)
)

for threshold, top_k, n_estimators, max_depth, GSLF_version in tqdm(parameters, desc="parameters"):
    print("\n══════════════════════════════════════════════════════════════════════")
    print(f"Training on ALL folds with parameters={(threshold, top_k, n_estimators, max_depth, GSLF_version)}")
    print("══════════════════════════════════════════════════════════════════════")

    # Pre-computed GSLF dataframe for this GSLF version.
    # NOTE(review): machine-specific absolute path, unlike the relative
    # entries in `base_paths` — confirm/relocate before running elsewhere.
    processed_gslf_df = pd.read_csv(
        rf'C:\Users\Asus\Desktop\food\DREAM25\processed_GSLF{GSLF_version}.csv'
    )

    # 1. Create and configure the model
    model = MultiSensorRegressor(
        stimulus_file    = base_paths['stimulus_file'],
        folds_dir        = base_paths['folds_dir'],
        descriptors_file = base_paths['descriptors_file'],
        rf_params        = {'n_estimators': n_estimators, 'random_state': 42, 'max_depth': max_depth}
    )

    # 2. Load EVERY fold as one training set
    train_df = load_all_folds(base_paths['folds_dir'])

    # 3. Reserve a temp CSV path for the training set (the model is given a
    #    path, not a DataFrame). The handle is closed before writing.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as tmp:
        train_csv_path = tmp.name

    try:
        # Written inside the try so a failed write cannot leak the temp file.
        train_df.to_csv(train_csv_path, index=False)

        # 4. Prepare combined CSV that model uses internally
        model.create_combined_csv_path_all_folds(
            task2_path        = base_paths['task2_path'],
            training_path     = train_csv_path,
            stimulus_def_path = base_paths['stimulus_file'],
            descriptors_path  = base_paths['descriptors_file'],
            output_folder     = base_paths['output_folder']
        )

        # 5. Fit RF + merge GSLF (unchanged logic)
        # os.path.join keeps the output inside `output_folder` on every OS;
        # a literal backslash is only a path separator on Windows.
        # NOTE(review): the file name depends on `threshold` only, so grid
        # points sharing a threshold use the same output path.
        model.regress_gslf(
            processed_gslf_df,
            output_path=os.path.join(
                base_paths['output_folder'],
                f"combined_with_gslf_allfolds_thr{threshold}.csv"
            ),
            threshold=threshold,
            top_k=top_k
        )

        # 6. Validate predictions on the leaderboard set
        metrics = validate_on_leaderboard(model, LEADERBOARD_PATH)
        mean_cos_dist = metrics["mean_cosine_distance"]
        mean_pearson = metrics["mean_pearson_correlation"]
        means_cosine_dist.append(mean_cos_dist)
        means_pearson_corr.append(mean_pearson)

        print(f"Mean cosine distance (leaderboard) = {mean_cos_dist:.4f}")
        print(f"Mean Pearson correlation (leaderboard) = {mean_pearson:.4f}")

    finally:
        # Always remove the temp training CSV, even if training/validation fails.
        os.unlink(train_csv_path)

# ────────────────────────────────────────────────────────────────────────────────
# Summary
# ────────────────────────────────────────────────────────────────────────────────
# One line per evaluated parameter combination, in run order.
print("\n======================= FINAL RESULTS =======================")
for run_params, dist, corr in zip(parameters, means_cosine_dist, means_pearson_corr):
    # Print the full tuple (including GSLF_version), matching the per-run
    # training banner, so runs differing only in GSLF version stay distinguishable.
    print(f"{tuple(run_params)} → cosine distance = {dist:.4f}, Pearson correlation = {corr:.4f}")

print("-------------------------------------------------------------")