# Data Preparation
This notebook partitions hyperspectral plant disease detection data into training and validation sets. It loads CSV files containing spectral data for diseased and healthy plants, separates the spectral measurements from the wavelength labels, and assigns class labels (1 for diseased, 0 for healthy). For each client (tray), the data is combined and transposed so that each row is one sample, then split 80/20 into training and validation sets using stratified sampling to maintain class balance. After the split, the data is transposed back and the labels are reattached as the last row. The processed data, along with the wavelength labels, is saved separately for each tray and seed. The notebook also consolidates the data across all trays, generating centralized training and validation sets. This is repeated for a total of 30 random seeds (saved in folders `Seed_0` to `Seed_29` inside the `No_MSC` folder).

In [None]:
# Dependencies
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [None]:
def load_and_split_data(diseased_file_path, healthy_file_path, seed=42):
    """Split one tray's diseased and healthy spectra into stratified train/validation sets.

    Both CSV files are read with the ``pd.read_csv`` defaults, so their first
    line is consumed as a header. The first column of the diseased file is kept
    as the wavelength labels; every other column of either file is treated as
    one sample.

    Parameters
    ----------
    diseased_file_path : str
        CSV holding the diseased samples (labelled 1).
    healthy_file_path : str
        CSV holding the healthy samples (labelled 0).
    seed : int, default 42
        ``random_state`` forwarded to ``train_test_split``.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(train, validation)``. In each frame the first column holds the
        wavelength labels, every remaining column is one sample, and the last
        row holds the class labels. That last row has no wavelength, so its
        first cell is NaN.

    Raises
    ------
    ValueError
        If the two files do not have the same number of rows.
    """
    diseased_data = pd.read_csv(diseased_file_path)
    healthy_data = pd.read_csv(healthy_file_path)

    # Concatenating along axis=1 aligns on the row index, so a row-count
    # mismatch would silently pad the shorter file with NaN instead of failing.
    if len(diseased_data) != len(healthy_data):
        raise ValueError(
            f"Row count mismatch: {diseased_file_path} has {len(diseased_data)} rows "
            f"but {healthy_file_path} has {len(healthy_data)}; "
            "both files must cover the same wavelengths."
        )

    # Wavelength labels are taken from the diseased file only.
    # NOTE(review): the healthy file's first column is assumed to hold the same
    # wavelengths in the same order; only the row count is checked - confirm.
    wavelength_labels = diseased_data.iloc[:, 0]

    diseased_spectra = diseased_data.iloc[:, 1:]
    healthy_spectra = healthy_data.iloc[:, 1:]
    combined_spectra = pd.concat([diseased_spectra, healthy_spectra], axis=1)

    # One label per sample column: diseased columns first, then healthy ones.
    labels = np.array([1] * diseased_spectra.shape[1] + [0] * healthy_spectra.shape[1])

    # Transpose so that each row is one sample and can be picked by split index.
    samples_as_rows = combined_spectra.T

    # 80% training / 20% validation, stratified on the class labels.
    train_indices, val_indices = train_test_split(
        np.arange(len(labels)), test_size=0.2, random_state=seed, stratify=labels
    )

    def _assemble(sample_indices):
        # Transpose the selected samples back to the original orientation,
        # append their labels as the last row, then put the wavelength column
        # in front (it is one row shorter, hence the NaN on the label row).
        spectra = samples_as_rows.iloc[sample_indices].T
        spectra_with_labels = np.vstack([spectra, labels[sample_indices]])
        return pd.concat([wavelength_labels, pd.DataFrame(spectra_with_labels)], axis=1)

    return _assemble(train_indices), _assemble(val_indices)

In [None]:
def save_data(train_data, val_data, output_dir, tray_number):
    """Write one client's training and validation frames as header-less CSV files.

    ``output_dir`` is created if it does not exist. The files are named
    ``Client_<tray_number>_Train.csv`` and ``Client_<tray_number>_Validation.csv``
    and are written without the index and without a header row. Both
    destinations are printed once both files have been written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Resolve both destinations first, then write, then report.
    train_path, val_path = (
        os.path.join(output_dir, file_name)
        for file_name in (
            f"Client_{tray_number}_Train.csv",
            f"Client_{tray_number}_Validation.csv",
        )
    )

    for frame, destination in ((train_data, train_path), (val_data, val_path)):
        frame.to_csv(destination, index=False, header=False)

    print(f"Training data saved to {train_path}")
    print(f"Validation data saved to {val_path}")

In [None]:
def process_seed(csv_files, root_output_dir, seed, seed_index):
    """Create the per-client and combined train/validation files for one seed.

    Files whose name contains ``'diseased'`` are paired, in list order, with the
    files whose name contains ``'healthy'``. Each pair is split with
    ``load_and_split_data`` and written with ``save_data`` as client
    ``0, 1, ...`` into ``<root_output_dir>/Seed_<seed_index>``. The sample
    columns of all clients are then placed side by side and written to
    ``Combined_Train.csv`` and ``Combined_Validation.csv`` in the same folder.

    Parameters
    ----------
    csv_files : list of str
        Paths of the diseased and healthy CSV files.
    root_output_dir : str
        Directory under which the ``Seed_<seed_index>`` folder is created.
    seed : int
        Random seed forwarded to ``load_and_split_data``.
    seed_index : int
        Index used in the output folder name.

    Raises
    ------
    ValueError
        If there are no diseased files, or the numbers of diseased and healthy
        files differ.
    """
    diseased_files = [f for f in csv_files if 'diseased' in f]
    healthy_files = [f for f in csv_files if 'healthy' in f]

    # zip() would silently drop the unpaired trays, so reject bad input up front.
    if not diseased_files or len(diseased_files) != len(healthy_files):
        raise ValueError(
            "Expected the same non-zero number of diseased and healthy files, "
            f"got {len(diseased_files)} diseased and {len(healthy_files)} healthy."
        )

    # Everything for this seed goes into one folder; build its path only once.
    output_dir = os.path.join(root_output_dir, f"Seed_{seed_index}")
    os.makedirs(output_dir, exist_ok=True)

    all_train_data = []
    all_val_data = []
    wavelength_labels = None

    for tray_number, (diseased_file, healthy_file) in enumerate(zip(diseased_files, healthy_files)):
        train_data, val_data = load_and_split_data(diseased_file, healthy_file, seed)
        save_data(train_data, val_data, output_dir, tray_number)

        if wavelength_labels is None:
            # The first tray's split already carries the wavelength column of
            # the first diseased file, so the CSV need not be read again.
            wavelength_labels = train_data.iloc[:, 0]

        all_train_data.append(train_data.iloc[:, 1:])  # Exclude the wavelength column for combining
        all_val_data.append(val_data.iloc[:, 1:])      # Exclude the wavelength column for combining

    def _combine(per_tray_frames):
        # Samples of all trays side by side, with the wavelength column in front.
        samples = pd.concat(per_tray_frames, axis=1).reset_index(drop=True)
        return pd.concat([wavelength_labels.reset_index(drop=True), samples], axis=1)

    combined_train = _combine(all_train_data)
    combined_val = _combine(all_val_data)

    combined_train_path = os.path.join(output_dir, "Combined_Train.csv")
    combined_val_path = os.path.join(output_dir, "Combined_Validation.csv")

    combined_train.to_csv(combined_train_path, index=False, header=False)
    combined_val.to_csv(combined_val_path, index=False, header=False)

    print(f"Combined training data saved to {combined_train_path}")
    print(f"Combined validation data saved to {combined_val_path}")

In [9]:
if __name__ == "__main__":
    # Input files: one diseased and one healthy CSV per tray, listed in the
    # same tray order so that process_seed pairs them correctly.
    csv_files = [
        "Tray1_day5_diseased.csv", "Tray2_day5_diseased.csv", "Tray3_day5_diseased.csv", 
        "Tray4_day5_diseased.csv", "Tray5_day5_diseased.csv", 
        "Tray1_day5_healthy.csv", "Tray2_day5_healthy.csv", "Tray3_day5_healthy.csv", 
        "Tray4_day5_healthy.csv", "Tray5_day5_healthy.csv"
    ]

    root_output_dir = "No_MSC"

    # Draw the split seeds from a generator with a fixed master seed so that a
    # fresh kernel regenerates exactly the same partitions. Sampling without
    # replacement guarantees that no two Seed_<i> folders share a split seed.
    MASTER_SEED = 42
    N_SEEDS = 30
    rng = np.random.default_rng(MASTER_SEED)
    seeds = rng.choice(np.arange(1, 100000), size=N_SEEDS, replace=False)
    print(f"Split seeds (master seed {MASTER_SEED}): {seeds.tolist()}")

    for seed_index, seed in enumerate(seeds):
        # Pass a plain Python int rather than a NumPy scalar as the random state.
        process_seed(csv_files, root_output_dir, int(seed), seed_index)

Training data saved to No_MSC/Seed_0/Client_0_Train.csv
Validation data saved to No_MSC/Seed_0/Client_0_Validation.csv
Training data saved to No_MSC/Seed_0/Client_1_Train.csv
Validation data saved to No_MSC/Seed_0/Client_1_Validation.csv
Training data saved to No_MSC/Seed_0/Client_2_Train.csv
Validation data saved to No_MSC/Seed_0/Client_2_Validation.csv
Training data saved to No_MSC/Seed_0/Client_3_Train.csv
Validation data saved to No_MSC/Seed_0/Client_3_Validation.csv
Training data saved to No_MSC/Seed_0/Client_4_Train.csv
Validation data saved to No_MSC/Seed_0/Client_4_Validation.csv
Combined training data saved to No_MSC/Seed_0/Combined_Train.csv
Combined validation data saved to No_MSC/Seed_0/Combined_Validation.csv
Training data saved to No_MSC/Seed_1/Client_0_Train.csv
Validation data saved to No_MSC/Seed_1/Client_0_Validation.csv
Training data saved to No_MSC/Seed_1/Client_1_Train.csv
Validation data saved to No_MSC/Seed_1/Client_1_Validation.csv
Training data saved to No_MSC/