# In [1]:

import os
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from PIL import Image

# Define paths
# data_dir is the dataset root whose sub-directories are walked by
# preprocess_entire_dataset; labels_file is the CSV holding the labels.
# NOTE(review): both values look like placeholders - set them before running.
data_dir = "/path/to/dataset"
labels_file = "/path/to/labels.csv"

# Load labels
# The resulting DataFrame is read later through its 'stage' column (label
# encoding) and its 'tree_id' column (merge with the image paths).
labels = pd.read_csv(labels_file)

# Function to resize images
def resize_image(image_path, target_size=(256, 256)):
    """Read an image from disk and resize it to ``target_size``.

    Args:
        image_path: Path of the image file to read.
        target_size: Output size handed to ``cv2.resize`` as its ``dsize``
            argument, i.e. ``(width, height)``.

    Returns:
        The resized image array produced by ``cv2.resize``.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    # NOTE: with its default flag cv2.imread yields a 3-channel 8-bit BGR
    # array (per the OpenCV docs), so extra bands or bit depth in the input
    # file are not preserved by this loader.
    image = cv2.imread(image_path)

    # cv2.imread reports failure by returning None rather than raising;
    # without this check cv2.resize would fail with an opaque OpenCV error
    # that does not mention the offending path.
    if image is None:
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path}")
        raise ValueError(f"Could not decode image: {image_path}")

    return cv2.resize(image, target_size)

# Function to normalize images
def normalize_image(image):
    """Return ``image`` divided by 255.

    For 8-bit pixel data (values 0..255) the result lies in ``[0.0, 1.0]``.
    The input is not modified; the division produces a new object.

    Args:
        image: Array-like object supporting true division by a float.

    Returns:
        The element-wise quotient ``image / 255.0``.
    """
    max_pixel_value = 255.0
    return image / max_pixel_value

# Function to preprocess RGB images
def preprocess_rgb_images(image_dir, target_size=(256, 256)):
    """Load, resize and normalize every JPG/PNG file found in ``image_dir``.

    Only direct entries of ``image_dir`` are considered (no recursion).

    Args:
        image_dir: Directory to scan for ``.jpg`` / ``.png`` files.
        target_size: Size forwarded to ``resize_image``.

    Returns:
        A ``(processed_images, image_paths)`` tuple of two parallel lists:
        the normalized images and the path each one was read from.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the image
    # order (and any seeded split built on top of it) reproducible.
    # Extensions are compared case-insensitively so files such as "IMG.JPG"
    # are not silently skipped.
    image_paths = [
        os.path.join(image_dir, fname)
        for fname in sorted(os.listdir(image_dir))
        if fname.lower().endswith(('.jpg', '.png'))
    ]
    processed_images = [
        normalize_image(resize_image(path, target_size))
        for path in image_paths
    ]
    return processed_images, image_paths

# Function to preprocess multispectral images
def preprocess_multispectral_images(image_dir, target_size=(256, 256)):
    """Load, resize and normalize every ``.tif`` file found in ``image_dir``.

    Only direct entries of ``image_dir`` are considered (no recursion).

    Args:
        image_dir: Directory to scan for ``.tif`` files.
        target_size: Size forwarded to ``resize_image``.

    Returns:
        A ``(processed_images, image_paths)`` tuple of two parallel lists:
        the normalized images and the path each one was read from.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the image
    # order (and any seeded split built on top of it) reproducible.
    # The extension is compared case-insensitively so "BAND.TIF" is kept.
    image_paths = [
        os.path.join(image_dir, fname)
        for fname in sorted(os.listdir(image_dir))
        if fname.lower().endswith('.tif')
    ]
    processed_images = [
        normalize_image(resize_image(path, target_size))
        for path in image_paths
    ]
    return processed_images, image_paths

# Function to preprocess aerial images
def preprocess_aerial_images(image_dir, target_size=(256, 256)):
    """Load, resize and normalize every ``.tif`` file found in ``image_dir``.

    Only direct entries of ``image_dir`` are considered (no recursion).

    Args:
        image_dir: Directory to scan for ``.tif`` files.
        target_size: Size forwarded to ``resize_image``.

    Returns:
        A ``(processed_images, image_paths)`` tuple of two parallel lists:
        the normalized images and the path each one was read from.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the image
    # order (and any seeded split built on top of it) reproducible.
    # The extension is compared case-insensitively so "TILE.TIF" is kept.
    image_paths = [
        os.path.join(image_dir, fname)
        for fname in sorted(os.listdir(image_dir))
        if fname.lower().endswith('.tif')
    ]
    processed_images = [
        normalize_image(resize_image(path, target_size))
        for path in image_paths
    ]
    return processed_images, image_paths

# Encode labels
# Stage names are numbered in order of appearance: Healthy -> 0 ... Stage3 -> 3.
# Series.map leaves NaN for any 'stage' value that is not a key of label_map.
label_map = {
    stage_name: stage_code
    for stage_code, stage_name in enumerate(("Healthy", "Stage1", "Stage2", "Stage3"))
}
labels['encoded_stage'] = labels['stage'].map(label_map)

# Preprocess data for a specific date
def preprocess_data_for_date(date_dir, target_size=(256, 256)):
    """Preprocess every image of one acquisition date and attach its label.

    Images are read from the ``Ground_Photos``, ``Multispectral`` and
    ``Aerial_photos`` sub-directories of ``date_dir``.  The tree id of an
    image is the part of its file name before the first ``.``; it is
    inner-joined against the ``tree_id`` column of the module-level
    ``labels`` DataFrame.

    Args:
        date_dir: Directory holding the three image sub-directories.
        target_size: Size forwarded to the per-type preprocessing functions.

    Returns:
        A ``(images, encoded_stages)`` tuple.  ``images`` is a list and
        ``encoded_stages`` a pandas Series; both have the same length and
        the same order, one entry per (image, label row) match.
    """
    # Preprocess RGB images
    rgb_images, rgb_paths = preprocess_rgb_images(
        os.path.join(date_dir, "Ground_Photos"), target_size)

    # Preprocess multispectral images
    multispectral_images, multispectral_paths = preprocess_multispectral_images(
        os.path.join(date_dir, "Multispectral"), target_size)

    # Preprocess aerial images
    aerial_images, aerial_paths = preprocess_aerial_images(
        os.path.join(date_dir, "Aerial_photos"), target_size)

    # Combine all images into a single list and create a DataFrame for paths.
    # '_image_pos' remembers where each path sits in all_images so the images
    # can be re-selected after the merge.
    all_images = rgb_images + multispectral_images + aerial_images
    all_paths = rgb_paths + multispectral_paths + aerial_paths
    images_df = pd.DataFrame({
        'image_path': all_paths,
        '_image_pos': range(len(all_paths)),
    })

    # Merge with labels
    images_df['tree_id'] = images_df['image_path'].apply(
        lambda x: os.path.basename(x).split('.')[0])
    # Ids taken from file names are strings; compare the label ids as strings
    # too so ids that pandas parsed as numbers do not make the merge fail on
    # mismatched key dtypes.
    label_rows = labels.assign(tree_id=labels['tree_id'].astype(str))
    merged_df = pd.merge(images_df, label_rows, on='tree_id')

    # The inner merge drops images without a label row and repeats images
    # matched by several rows, so rebuild the image list from the merged rows
    # to keep images and labels aligned one-to-one.
    matched_images = [all_images[pos] for pos in merged_df['_image_pos']]

    return matched_images, merged_df['encoded_stage']

# Function to split the dataset into train and test sets
def split_dataset(images, labels, test_size=0.2):
    """Split images and labels into train and test subsets.

    The split is seeded (``random_state=42``), so the same inputs in the
    same order always produce the same partition.

    Args:
        images: Sequence of samples.
        labels: Sequence of labels, parallel to ``images``.
        test_size: Forwarded to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        images, labels, test_size=test_size, random_state=42)
    return X_train, X_test, y_train, y_test

# Main function to preprocess the entire dataset
def preprocess_entire_dataset(data_dir, target_size=(256, 256)):
    """Preprocess every date sub-directory of ``data_dir`` and split the result.

    Args:
        data_dir: Dataset root; each sub-directory is treated as one date.
            Entries that are not directories are ignored.
        target_size: Size every image is resized to.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``split_dataset``.

    Raises:
        ValueError: If no labelled image is found under ``data_dir``.
    """
    all_images = []
    all_labels = []

    # Sorted so the sample order fed to the seeded split is reproducible;
    # os.listdir alone returns entries in arbitrary order.
    for date_dir in sorted(os.listdir(data_dir)):
        date_path = os.path.join(data_dir, date_dir)
        if not os.path.isdir(date_path):
            continue
        # Named stage_labels so the module-level `labels` DataFrame is not
        # shadowed inside this function.
        images, stage_labels = preprocess_data_for_date(date_path, target_size)
        all_images.extend(images)
        all_labels.extend(stage_labels)

    if not all_images:
        raise ValueError(f"No labelled images found under: {data_dir}")

    return split_dataset(all_images, all_labels)

# Run preprocessing
# Walks data_dir, preprocesses every date sub-directory and splits the
# combined samples into train and test subsets.
X_train, X_test, y_train, y_test = preprocess_entire_dataset(data_dir)

# Report how many samples ended up in each subset.
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
