### Import Necessary Libraries

In [1]:
import numpy as np
import os
from collections import Counter
import random
from sklearn.model_selection import train_test_split

#### Comparing the number of NumPy array files for each label (stroke and non-stroke)

In [3]:
def count_labels_in_npz_files(folder_path):
    """
    Count how many .npz files in a folder carry each label value.

    Every file is expected to store exactly one label under the key 'Y' or,
    failing that, 'label'. The label may be a scalar, a 0-d array, or any array
    holding a single element (e.g. shape (1,)). Files that contain neither key
    are reported on stdout and skipped.

    Parameters:
    folder_path: str - Path to the folder containing .npz files

    Returns:
    label_counts: collections.Counter - Maps each integer label to the number of files carrying it

    Raises:
    ValueError - If a file stores a label that does not hold exactly one element
    """
    label_counts = Counter()
    files_with_missing_labels = []

    # Sorted so that the progress messages come out in a reproducible order
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".npz"):
            continue
        file_path = os.path.join(folder_path, filename)

        # np.load keeps the archive open; the context manager releases the handle
        with np.load(file_path) as data:
            # Prefer 'Y', fall back to 'label'
            if 'Y' in data:
                Y = data['Y']
            elif 'label' in data:
                Y = data['label']
            else:
                files_with_missing_labels.append(filename)
                print(f"File {filename} does not contain 'Y' or 'label'. Available keys: {list(data.files)}")
                continue

        # One code path for scalars, 0-d arrays and single-element arrays;
        # .item() avoids the deprecated int(<ndim>0 array>) conversion.
        label = np.asarray(Y)
        if label.size != 1:
            raise ValueError(
                f"File {filename}: expected a single label, got shape {label.shape}"
            )
        label_counts[int(label.item())] += 1

    print(f"Files missing 'Y' and 'label': {files_with_missing_labels}")
    return label_counts

# Example usage: count the labels of every .npz file in the subject folder
folder_path = r"D:\MRI_project\subject"  # Use your actual folder path
label_counts = count_labels_in_npz_files(folder_path)

# Show how many files carry each label value
print(f"Label counts: {label_counts}")


Files missing 'Y' and 'label': []
Label counts: Counter({1: 1456, 0: 259})


In [4]:
def identify_files_with_label_0(folder_path):
    """
    List the .npz files in a folder whose stored label equals 0.

    Every file is expected to store exactly one label under the key 'Y' or,
    failing that, 'label'. The label may be a scalar, a 0-d array, or any array
    holding a single element (e.g. shape (1,)). Files that contain neither key
    are reported on stdout and skipped.

    Parameters:
    folder_path: str - Path to the folder containing .npz files

    Returns:
    files_with_label_0: list - Filenames (sorted by name) where the label is 0

    Raises:
    ValueError - If a file stores a label that does not hold exactly one element
    """
    files_with_label_0 = []
    files_with_missing_labels = []

    # Sorted so the returned list does not depend on the OS directory order
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".npz"):
            continue
        file_path = os.path.join(folder_path, filename)

        # np.load keeps the archive open; the context manager releases the handle
        with np.load(file_path) as data:
            # Prefer 'Y', fall back to 'label'
            if 'Y' in data:
                Y = data['Y']
            elif 'label' in data:
                Y = data['label']
            else:
                files_with_missing_labels.append(filename)
                print(f"File {filename} does not contain 'Y' or 'label'. Available keys: {list(data.files)}")
                continue

        # One code path for scalars, 0-d arrays and single-element arrays;
        # .item() avoids the deprecated int(<ndim>0 array>) conversion.
        label = np.asarray(Y)
        if label.size != 1:
            raise ValueError(
                f"File {filename}: expected a single label, got shape {label.shape}"
            )

        if int(label.item()) == 0:
            files_with_label_0.append(filename)

    print(f"Files missing 'Y' and 'label': {files_with_missing_labels}")
    return files_with_label_0

# Example usage: collect the filenames of every sample whose label is 0
folder_path = r"D:\MRI_project\subject"  # Use your actual folder path
files_with_label_0 = identify_files_with_label_0(folder_path)

# Show the collected filenames
print(f"Files with label 0: {files_with_label_0}")


Files missing 'Y' and 'label': []
Files with label 0: ['sub-1002.npz', 'sub-1008.npz', 'sub-1023.npz', 'sub-1030.npz', 'sub-1031.npz', 'sub-1044.npz', 'sub-1048.npz', 'sub-1051.npz', 'sub-1059.npz', 'sub-1060.npz', 'sub-1074.npz', 'sub-1079.npz', 'sub-1080.npz', 'sub-109.npz', 'sub-1094.npz', 'sub-1097.npz', 'sub-1103.npz', 'sub-1114.npz', 'sub-1116.npz', 'sub-1123.npz', 'sub-1125.npz', 'sub-1135.npz', 'sub-1138.npz', 'sub-1139.npz', 'sub-1141.npz', 'sub-1161.npz', 'sub-1167.npz', 'sub-1170.npz', 'sub-1173.npz', 'sub-1174.npz', 'sub-1176.npz', 'sub-1194.npz', 'sub-1196.npz', 'sub-12.npz', 'sub-1231.npz', 'sub-1235.npz', 'sub-1241.npz', 'sub-125.npz', 'sub-1251.npz', 'sub-1254.npz', 'sub-1276.npz', 'sub-1284.npz', 'sub-1293.npz', 'sub-1308.npz', 'sub-1322.npz', 'sub-1329.npz', 'sub-133.npz', 'sub-1335.npz', 'sub-1337.npz', 'sub-1339.npz', 'sub-1340.npz', 'sub-1381.npz', 'sub-1384.npz', 'sub-1389.npz', 'sub-1390.npz', 'sub-1406.npz', 'sub-1419.npz', 'sub-1421.npz', 'sub-1428.npz', 'sub-1

In [5]:
# Display the list of label-0 filenames collected by the previous cell
files_with_label_0

['sub-1002.npz',
 'sub-1008.npz',
 'sub-1023.npz',
 'sub-1030.npz',
 'sub-1031.npz',
 'sub-1044.npz',
 'sub-1048.npz',
 'sub-1051.npz',
 'sub-1059.npz',
 'sub-1060.npz',
 'sub-1074.npz',
 'sub-1079.npz',
 'sub-1080.npz',
 'sub-109.npz',
 'sub-1094.npz',
 'sub-1097.npz',
 'sub-1103.npz',
 'sub-1114.npz',
 'sub-1116.npz',
 'sub-1123.npz',
 'sub-1125.npz',
 'sub-1135.npz',
 'sub-1138.npz',
 'sub-1139.npz',
 'sub-1141.npz',
 'sub-1161.npz',
 'sub-1167.npz',
 'sub-1170.npz',
 'sub-1173.npz',
 'sub-1174.npz',
 'sub-1176.npz',
 'sub-1194.npz',
 'sub-1196.npz',
 'sub-12.npz',
 'sub-1231.npz',
 'sub-1235.npz',
 'sub-1241.npz',
 'sub-125.npz',
 'sub-1251.npz',
 'sub-1254.npz',
 'sub-1276.npz',
 'sub-1284.npz',
 'sub-1293.npz',
 'sub-1308.npz',
 'sub-1322.npz',
 'sub-1329.npz',
 'sub-133.npz',
 'sub-1335.npz',
 'sub-1337.npz',
 'sub-1339.npz',
 'sub-1340.npz',
 'sub-1381.npz',
 'sub-1384.npz',
 'sub-1389.npz',
 'sub-1390.npz',
 'sub-1406.npz',
 'sub-1419.npz',
 'sub-1421.npz',
 'sub-1428.npz',
 '

From the above result, there are more individuals with stroke (label 1) than without stroke (label 0). Therefore, data augmentation is required for the non-stroke individuals so that the class-imbalance problem (a model biased toward the majority class) does not occur during model development.

#### Data Augmentation and Train/test split

In [6]:
import numpy as np
import os
import random
from scipy.ndimage import rotate
import shutil

def augment_data(X, noise_std=0.01):
    """
    Apply random augmentations to a 3D MRI scan.

    The augmentations are, in order: a rotation by 90, 180 or 270 degrees in the
    plane of axes (0, 1), an optional flip along one randomly chosen axis, and
    additive Gaussian noise. The result is clipped to [0, 1].

    Randomness comes from the global `random` and `np.random` generators; seed
    both beforehand for reproducible output. The input array is not modified.

    Parameters:
    X: numpy array - The 3D MRI scan to augment (assumed to be scaled to [0, 1],
       since the result is clipped to that range - TODO confirm against the
       preprocessing step)
    noise_std: float - Standard deviation of the Gaussian noise (default 0.01)

    Returns:
    X_augmented: numpy array - The augmented scan, same shape as X. Floating-point
       inputs keep their dtype; other dtypes are converted to float32.
    """
    volume = np.asarray(X)
    # Integer (or boolean) scans cannot absorb floating-point noise, so work in float
    if not np.issubdtype(volume.dtype, np.inexact):
        volume = volume.astype(np.float32)

    # Random rotation (90, 180, or 270 degrees); reshape=False keeps the input shape
    angle = random.choice([90, 180, 270])
    X_aug = rotate(volume, angle, axes=(0, 1), reshape=False)

    # Random flip (along x, y, or z axis), applied half of the time
    if random.choice([True, False]):
        X_aug = np.flip(X_aug, axis=random.choice([0, 1, 2]))

    # Add random Gaussian noise, then cast back so the working dtype is preserved
    noise = np.random.normal(0, noise_std, X_aug.shape)
    X_aug = (X_aug + noise).astype(X_aug.dtype)
    X_aug = np.clip(X_aug, 0, 1)  # Ensure voxel values are within [0, 1]

    return X_aug

def balance_dataset_with_augmentation(folder_path, label_0_files, target_count):
    """
    Create augmented copies of label-0 files until their number reaches target_count.

    For each missing sample a source file is picked at random from label_0_files,
    augmented with augment_data, and saved next to the originals in folder_path as
    '<source stem>_aug_<i>.npz' with the keys 'flair' and 'label'. If there are
    already at least target_count files, nothing is written.

    Parameters:
    folder_path: str - Path to the folder containing .npz files
    label_0_files: list - List of filenames with label 0
    target_count: int - Target number of samples for label 0 (to match label 1 count)

    Returns:
    None

    Raises:
    ValueError - If samples are needed but label_0_files is empty
    """
    current_count = len(label_0_files)
    # Clamp at zero so an already-balanced dataset is reported as "0 new samples"
    augment_count = max(0, target_count - current_count)

    if augment_count and not label_0_files:
        raise ValueError("Cannot augment: label_0_files is empty, there is nothing to copy from.")

    print(f"Augmenting {augment_count} new samples for label 0 to match {target_count} samples.")

    for i in range(augment_count):
        # Randomly select a file to augment
        selected_file = random.choice(label_0_files)
        file_path = os.path.join(folder_path, selected_file)

        # Read the arrays and close the archive right away; this loop may open
        # the same files many times
        with np.load(file_path) as data:
            X = data['flair'] if 'flair' in data else data['X']
            Y = data['label'] if 'label' in data else data['Y']

        # Apply augmentation
        X_aug = augment_data(X)

        # Save the augmented file with a new filename; the loop index keeps names unique
        new_filename = f"{os.path.splitext(selected_file)[0]}_aug_{i}.npz"
        new_file_path = os.path.join(folder_path, new_filename)
        np.savez_compressed(new_file_path, flair=X_aug, label=Y)

    print(f"Augmentation complete. {augment_count} new files added.")

# Example usage
folder_path = r"D:\MRI_project\subject"  # Use your actual folder path
label_0_files = identify_files_with_label_0(folder_path)  # Get files with label 0

# Target to match the count of label 1 (1456 samples)
target_count = 1456
# Pass the list computed in this cell, not a variable left over from an earlier cell,
# so the cell still works after a kernel restart
balance_dataset_with_augmentation(folder_path, label_0_files, target_count)


Files missing 'Y' and 'label': []
Augmenting 1197 new samples for label 0 to match 1456 samples.
Augmentation complete. 1197 new files added.


In [7]:
def normalize_label_format(Y):
    """
    Bring a label into the canonical form of a 1D numpy array with shape (1,).

    Parameters:
    Y: numpy array or scalar - The label to normalize

    Returns:
    Y_normalized: numpy array - The label with shape (1,); an array that already
        has this shape is returned unchanged

    Raises:
    ValueError - If Y is an array of any shape other than () or (1,)
    """
    # Arrays with at least one dimension: accept (1,) as-is, reject the rest
    if not np.isscalar(Y) and Y.shape != ():
        if isinstance(Y, np.ndarray) and Y.shape == (1,):
            return Y
        raise ValueError(f"Unexpected label shape: {Y.shape}")

    # Plain scalars and 0-dimensional arrays get wrapped into a 1D array
    return np.array([Y])

def load_and_concatenate_npz_files(folder_path, expected_shape=(64, 64, 64)):
    """
    Load all the .npz files in a folder and stack them into a single dataset.

    The scan is read from the key 'flair' (or 'X'), the label from 'label' (or 'Y').
    Files are visited in sorted filename order so the sample order, and therefore
    any seeded split performed on the result, is reproducible. Files that cannot
    be read, whose scan does not have expected_shape, or whose label cannot be
    normalized are reported on stdout and skipped.

    Parameters:
    folder_path: str - Path to the folder containing .npz files
    expected_shape: tuple - Required shape of every scan (default (64, 64, 64))

    Returns:
    X_data: numpy array - Stacked MRI data, shape (n_files, *expected_shape)
    Y_data: numpy array - Stacked labels, shape (n_files, 1)

    Raises:
    ValueError - If the folder yields no valid .npz file
    """
    expected_shape = tuple(expected_shape)
    X_list = []
    Y_list = []
    invalid_files = []

    # Loop through each file in the folder, in a deterministic order
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".npz"):
            continue
        file_path = os.path.join(folder_path, filename)

        # Best effort: one unreadable file must not abort the whole load,
        # so any failure is reported and the file is skipped.
        try:
            # The context manager closes the archive once the arrays are read
            with np.load(file_path) as data:
                X = data['flair'] if 'flair' in data else data['X']
                Y = data['label'] if 'label' in data else data['Y']

            # Ensure X has the expected shape
            if X.shape != expected_shape:
                print(f"Skipping file {filename}: Incorrect X shape {X.shape}")
                invalid_files.append(filename)
                continue

            # Normalize Y to ensure it is a 1D array with shape (1,)
            Y = normalize_label_format(Y)

            # Append to the lists
            X_list.append(X)
            Y_list.append(Y)
        except Exception as e:
            print(f"Error loading file {filename}: {e}")
            invalid_files.append(filename)

    print(f"Invalid files skipped: {invalid_files}")

    # np.stack fails with an unhelpful message on an empty list; fail clearly instead
    if not X_list:
        raise ValueError(f"No valid .npz files found in {folder_path!r}.")

    # Stack all the valid files along a new leading (sample) dimension
    X_data = np.stack(X_list, axis=0)
    Y_data = np.stack(Y_list, axis=0)

    return X_data, Y_data


def split_and_save_dataset(X_data, Y_data, test_size=0.2, output_folder='output', random_state=42):
    """
    Split the dataset into training and testing sets and save them as .npz files.

    Writes two compressed archives into output_folder:
    'train_data.npz' (keys 'X_train', 'Y_train') and
    'test_data.npz' (keys 'X_test', 'Y_test').

    Parameters:
    X_data: numpy array - The full feature dataset
    Y_data: numpy array - The full label dataset
    test_size: float - The proportion of data to allocate to the test set
    output_folder: str - Path to save the split datasets (created if missing)
    random_state: int - Seed passed to train_test_split so the split is reproducible (default 42)

    Returns:
    None
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(output_folder, exist_ok=True)

    # Split the data into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_data, Y_data, test_size=test_size, random_state=random_state
    )

    # Save the training data
    np.savez_compressed(os.path.join(output_folder, 'train_data.npz'), X_train=X_train, Y_train=Y_train)

    # Save the testing data
    np.savez_compressed(os.path.join(output_folder, 'test_data.npz'), X_test=X_test, Y_test=Y_test)

    print(f"Training and test datasets saved in '{output_folder}'.")


# Example usage: build the full dataset from the subject folder and write the train/test split
# NOTE(review): the augmentation step saves its '*_aug_*.npz' copies into this same folder,
# and every .npz in the folder is loaded here before a random per-sample split. An augmented
# copy can therefore end up in a different split than the scan it was derived from, which
# may inflate test scores - consider splitting by subject before augmenting.
folder_path = r"D:\MRI_project\subject"  # Path to your folder with .npz files

# Load and concatenate all .npz files
X_data, Y_data = load_and_concatenate_npz_files(folder_path)

# Hold out 20% of the samples for testing and save both splits to the output folder
split_and_save_dataset(X_data, Y_data, test_size=0.2, output_folder="D:/MRI_project/output")


Invalid files skipped: []
Training and test datasets saved in 'D:/MRI_project/output'.


In [8]:
# Sanity check: reopen the saved training archive and inspect the shape of the stacked scans
data = np.load(r"D:\MRI_project\output\train_data.npz")
# The leading axis is the number of training samples
data['X_train'].shape

(2329, 64, 64, 64)