In [None]:
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error

In [None]:
# Root directory that contains the `preprocessed_data/` folder.
# Set the RF_DATA_ROOT environment variable to run on another machine; the
# fallback is the original hard-coded location.
root_dir = os.environ.get('RF_DATA_ROOT', '/home/ayadav7/')

In [None]:
# Indices of the preprocessed X_<i>.npy / y_<i>.npy pairs to use.
# NOTE(review): range(1, 2) selects only pair 1, and a file-level train/test
# split cannot be made from a single pair -- widen this range to cover the
# files that actually exist.
FILE_INDICES = range(1, 2)

preprocessed_dir = os.path.join(root_dir, 'preprocessed_data')
x_files = [os.path.join(preprocessed_dir, f'X_{i}.npy') for i in FILE_INDICES]
y_files = [os.path.join(preprocessed_dir, f'y_{i}.npy') for i in FILE_INDICES]

# With fewer than two samples train_test_split fails with a generic
# "resulting train set will be empty" error; fail early with an actionable one.
if len(x_files) < 2:
    raise ValueError(
        f"Need at least 2 X/y file pairs for a file-level train/test split, "
        f"got {len(x_files)}; widen FILE_INDICES."
    )

# Split file paths into training and testing sets (X and y stay paired)
x_train_files, x_test_files, y_train_files, y_test_files = train_test_split(
    x_files, y_files, test_size=0.2, random_state=42
)

In [None]:
def min_max_normalize_channel_wise(data):
    """Scale every channel (last axis) of ``data`` to the range [0, 1].

    Each channel is normalized independently using its own minimum and
    maximum taken over all other axes: ``(x - min) / (max - min)``.

    Args:
        data: Array-like whose last axis indexes channels.

    Returns:
        A new array with the same shape as ``data``. Inexact (float or
        complex) inputs keep their dtype; integer and boolean inputs are
        converted to float64 so the fractional results are not truncated.
        A channel whose values are all equal (max == min) is returned as
        all zeros instead of NaN.
    """
    data = np.asarray(data)
    out_dtype = data.dtype if np.issubdtype(data.dtype, np.inexact) else np.float64
    normalized_data = np.empty(data.shape, dtype=out_dtype)

    # Work one channel at a time so temporaries stay the size of a single
    # channel rather than the whole array.
    for i in range(data.shape[-1]):
        # Cast before subtracting so narrow integer types cannot overflow.
        channel = data[..., i].astype(out_dtype, copy=False)
        min_val = np.min(channel)
        value_range = np.max(channel) - min_val
        if value_range == 0:
            # Constant channel: 0/0 would produce NaN, so map it to 0.
            normalized_data[..., i] = 0
        else:
            normalized_data[..., i] = (channel - min_val) / value_range
    return normalized_data

def load_and_normalize_data(x_files, y_files):
    """Load paired feature/label ``.npy`` files into flat arrays.

    For every (x, y) path pair the feature array is loaded, NaNs are replaced
    with 0.0, each channel is min-max normalized (per file, not across
    files), and the array is flattened to 2-D with the channel axis kept as
    columns. The label array is flattened to 1-D. Results from all files are
    concatenated in input order.

    Args:
        x_files: Sequence of paths to feature arrays (channels on last axis).
        y_files: Sequence of paths to label arrays, parallel to ``x_files``.

    Returns:
        Tuple ``(X_combined, y_combined)``: a 2-D array with one column per
        channel, and a 1-D array holding every label value.

    Raises:
        ValueError: If ``x_files`` and ``y_files`` differ in length or are
            empty.
    """
    if len(x_files) != len(y_files):
        raise ValueError(
            f"x_files and y_files must be paired: got {len(x_files)} and {len(y_files)} paths."
        )
    if len(x_files) == 0:
        raise ValueError("No files to load: x_files is empty.")

    X_combined = []
    y_combined = []

    file_pairs = tqdm(
        zip(x_files, y_files), total=len(x_files), desc="Loading and Normalizing Data"
    )
    for x_path, y_path in file_pairs:
        # np.nan_to_num also maps +/-inf to very large finite values, which
        # then take part in the per-channel min/max below.
        X = np.nan_to_num(np.load(x_path), nan=0.0)
        y = np.load(y_path)

        # Normalize each channel separately
        X = min_max_normalize_channel_wise(X)

        # Flatten every leading axis into rows, e.g.
        # (30, 896, 896, 18) -> (30 * 896 * 896, 18)
        X_combined.append(X.reshape(-1, X.shape[-1]))

        # Flatten labels completely, e.g. (1, 896, 896, 1) -> (896 * 896,)
        # NOTE(review): with the example shapes above X would have 30x more
        # rows than y has values -- confirm the real y shape before fitting.
        y_combined.append(y.reshape(-1))

    # Combine all the data
    return np.vstack(X_combined), np.hstack(y_combined)

In [None]:
# Load and normalize training data
X_train, y_train = load_and_normalize_data(x_train_files, y_train_files)

# Load and normalize testing data
X_test, y_test = load_and_normalize_data(x_test_files, y_test_files)

# Fail fast if features and labels are misaligned: the model needs exactly one
# label per feature row, and a mismatch here is cheaper to diagnose than an
# error raised from inside model fitting.
assert X_train.shape[0] == y_train.shape[0], (
    f"Train rows/labels mismatch: X_train has {X_train.shape[0]} rows, "
    f"y_train has {y_train.shape[0]} values"
)
assert X_test.shape[0] == y_test.shape[0], (
    f"Test rows/labels mismatch: X_test has {X_test.shape[0]} rows, "
    f"y_test has {y_test.shape[0]} values"
)

In [None]:
# Train the Random Forest model.
# n_jobs=-1 builds and queries the trees in parallel on all CPU cores; the
# fixed random_state keeps the run reproducible. Lower n_jobs on a shared
# machine.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Evaluate on the data the model was fitted on (optimistic by construction)
y_pred_train = rf_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred_train)
print(f"Training Mean Squared Error: {train_mse}")

# Evaluate on the held-out files
y_pred_test = rf_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
print(f"Testing Mean Squared Error: {test_mse}")

In [None]:
# load_and_normalize_data expects file *paths* (it calls np.load itself), so
# pass paths rather than pre-loaded arrays.
new_X_files = x_test_files[:1]  # Replace with actual new data file paths
new_X_combined, _ = load_and_normalize_data(new_X_files, new_X_files)  # Second parameter is dummy
new_predictions = rf_model.predict(new_X_combined)

# Reshape the predictions back to the original image shape: keep the input's
# spatial dimensions and collapse the channel axis to 1. mmap_mode='r'
# memory-maps the file so its shape can be read without loading the data again.
source_shape = np.load(new_X_files[0], mmap_mode='r').shape
new_predictions_reshaped = new_predictions.reshape(-1, *source_shape[1:-1], 1)
print(f"Predictions shape: {new_predictions_reshaped.shape}")