In [19]:
# The following operations are performed on the dataset with a random seed of 1.
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import StandardScaler

# Module-level StandardScaler shared by read_and_normalize below.
# It is re-fit on every call (fit_transform), so each dataset is scaled with its own statistics.
scaler = StandardScaler()

# Function to read and normalize data
def read_and_normalize(file_path, save_path):
    """Read a CSV, standardize its feature columns, save the result and return it.

    Every column except 'Label' (when present) is passed through the module-level
    ``scaler`` with ``fit_transform``. The 'Label' column is left untouched and is
    placed first in the output, followed by the scaled features in their original
    order.

    Parameters
    ----------
    file_path : str
        CSV file to read.
    save_path : str
        Destination CSV for the normalized data (written without the index).

    Returns
    -------
    pandas.DataFrame
        The normalized frame that was written to ``save_path``.

    Notes
    -----
    NOTE(review): the scaler is fit on the whole file, and the train/test split is
    performed later in this notebook on the already-normalized data, so test rows
    influence the scaling statistics. Confirm this is acceptable.
    """
    df = pd.read_csv(file_path)

    # Pick the features by name instead of by position: the label is detected by
    # name, so it must also be excluded by name or a label that is not the first
    # column would be scaled (and the first feature lost).
    has_label = 'Label' in df.columns
    feature_columns = [col for col in df.columns if col != 'Label']

    # Build a new float frame rather than assigning back through iloc, which
    # depends on implicit upcasting when the source columns are integers.
    scaled_features = pd.DataFrame(
        scaler.fit_transform(df[feature_columns]),
        columns=feature_columns,
        index=df.index,
    )

    if has_label:
        df = pd.concat([df[['Label']], scaled_features], axis=1)
    else:
        df = scaled_features

    # Save normalized dataset
    df.to_csv(save_path, index=False)
    print(f"Normalized dataset saved to '{save_path}'")

    return df

# Input CSV for each dataset, keyed by dataset name.
# NOTE(review): these are absolute, machine-specific Windows paths; the notebook only runs where they exist.
file_paths = {
    'CT': r'C:\Users\37427\Desktop\github\preprocessing\RFs_CT_bin50\HighICC-CTbin50.csv',
    'PET1': r'C:\Users\37427\Desktop\github\preprocessing\RFs_PET1_bin0.25\HighICC-PET1bin0.25.csv',
    'PET2': r'C:\Users\37427\Desktop\github\preprocessing\RFs_PET2_bin0.05\HighICC-PET2bin0.05.csv',
    'Baseline1': r'C:\Users\37427\Desktop\github\preprocessing\Baseline\HighICC-Baseline1.csv',
    'Baseline2': r'C:\Users\37427\Desktop\github\preprocessing\Baseline\HighICC-Baseline2.csv'
}

# Output CSV for each normalized dataset. Keys are the `file_paths` keys with a '_norm' suffix,
# which is how the processing loop below looks them up (f'{key}_norm').
save_paths = {
    'CT_norm': r'C:\Users\37427\Desktop\github\preprocessing\RFs_CT_bin50\CTbin50-norm.csv',
    'PET1_norm': r'C:\Users\37427\Desktop\github\preprocessing\RFs_PET1_bin0.25\PET1bin0.25-norm.csv',
    'PET2_norm': r'C:\Users\37427\Desktop\github\preprocessing\RFs_PET2_bin0.05\PET2bin0.05-norm.csv',
    'Baseline1_norm': r'C:\Users\37427\Desktop\github\preprocessing\Baseline\Baseline1-norm.csv',
    'Baseline2_norm': r'C:\Users\37427\Desktop\github\preprocessing\Baseline\Baseline2-norm.csv'
}

# Normalize every input file and keep the resulting frames in memory, keyed by dataset name
normalized_datasets = {
    key: read_and_normalize(file_path, save_paths[f'{key}_norm'])
    for key, file_path in file_paths.items()
}

Normalized dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_CT_bin50\CTbin50-norm.csv'
Normalized dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_PET1_bin0.25\PET1bin0.25-norm.csv'
Normalized dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_PET2_bin0.05\PET2bin0.05-norm.csv'
Normalized dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\Baseline\Baseline1-norm.csv'
Normalized dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\Baseline\Baseline2-norm.csv'


In [23]:
# The following operations are performed on the dataset with a random seed of 1.
# Function to split dataset into training and testing sets, then save them in specified directories
def split_and_save_dataset(df, base_dir, train_ratio=0.6, random_state=1):
    """Randomly split ``df`` into train/test parts and write both as CSV files.

    The whole frame (label and features together) is split row-wise with
    ``train_test_split``; no stratification argument is passed. The parts are
    written to ``<base_dir>/train.csv`` and ``<base_dir>/test.csv`` without the
    index. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    base_dir : str
        Output directory; created if it does not exist.
    train_ratio : float, default 0.6
        Fraction of rows for the training part; the test part gets ``1 - train_ratio``.
    random_state : int, default 1
        Seed forwarded to ``train_test_split``.
    """
    os.makedirs(base_dir, exist_ok=True)

    holdout_fraction = 1 - train_ratio
    train_part, test_part = train_test_split(
        df,
        test_size=holdout_fraction,
        random_state=random_state,
    )

    train_path, test_path = (
        os.path.join(base_dir, file_name) for file_name in ('train.csv', 'test.csv')
    )

    train_part.to_csv(train_path, index=False)
    print(f"Training dataset saved to '{train_path}'")

    test_part.to_csv(test_path, index=False)
    print(f"Testing dataset saved to '{test_path}'")

# Output directory for each dataset's train/test split (and the files derived from it), keyed like `file_paths`.
# NOTE(review): the layout is not uniform; CT and PET2 write directly into '6-4ADASYN-1', while PET1 and
# Baseline1 use an 'original' sub-directory and Baseline2 uses 'standardized'. Confirm this is intentional.
new_dirs = {
    'CT': r'C:\Users\37427\Desktop\github\preprocessing\RFs_CT_bin50\6-4ADASYN-1',
    'PET1': r'C:\Users\37427\Desktop\github\preprocessing\RFs_PET1_bin0.25\6-4ADASYN-1\original',
    'PET2': r'C:\Users\37427\Desktop\github\preprocessing\RFs_PET2_bin0.05\6-4ADASYN-1',
    'Baseline1': r'C:\Users\37427\Desktop\github\preprocessing\Baseline\6-4ADASYN-1\original',
    'Baseline2': r'C:\Users\37427\Desktop\github\preprocessing\Baseline\6-4ADASYN-1\standardized'
}

# Write a train/test split for every normalized dataset into its own output directory
for key in normalized_datasets:
    split_and_save_dataset(normalized_datasets[key], new_dirs[key])

Training dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_CT_bin50\6-4ADASYN-1\train.csv'
Testing dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_CT_bin50\6-4ADASYN-1\test.csv'
Training dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_PET1_bin0.25\6-4ADASYN-1\original\train.csv'
Testing dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_PET1_bin0.25\6-4ADASYN-1\original\test.csv'
Training dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_PET2_bin0.05\6-4ADASYN-1\train.csv'
Testing dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_PET2_bin0.05\6-4ADASYN-1\test.csv'
Training dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\Baseline\6-4ADASYN-1\original\train.csv'
Testing dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\Baseline\6-4ADASYN-1\original\test.csv'
Training dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\Baseline\6-4ADASYN-1\standardized\tra

In [24]:
# The following operations are performed on the dataset with a random seed of 1.
# Function to apply ADASYN oversampling and save the balanced dataset
def apply_adasyn_and_save(train_file_path, save_balanced_train_path, random_state=1):
    """Oversample a training CSV with ADASYN, save the result and print a summary.

    The CSV is split into the 'Label' column and all remaining columns, ADASYN is
    run with its default sampling strategy, and the resampled data is written with
    'Label' as the first column followed by the features in their original order.
    Afterwards the original label distribution and the label-1 counts before and
    after resampling are printed. Nothing is returned.

    Parameters
    ----------
    train_file_path : str
        CSV with the training data; must contain a 'Label' column.
    save_balanced_train_path : str
        Destination CSV for the resampled training data (written without the index).
    random_state : int, default 1
        Seed passed to ADASYN. Exposed as a parameter to match
        ``split_and_save_dataset``; the default keeps the previous behavior.
    """
    # Load the training data and separate features from labels
    train_data = pd.read_csv(train_file_path)
    X_train = train_data.drop('Label', axis=1)
    y_train = train_data['Label']

    # sampling_strategy is left at its default, so ADASYN decides how many samples to generate
    adasyn = ADASYN(random_state=random_state)
    X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

    # Reassemble the balanced training set with the label as the first column
    train_df_balanced = pd.concat([
        pd.DataFrame(y_resampled, columns=['Label']),
        pd.DataFrame(X_resampled, columns=X_train.columns)
    ], axis=1)

    # Create the output directory only when the path has one; os.makedirs('') raises
    # for a bare filename in the current working directory.
    output_dir = os.path.dirname(save_balanced_train_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    train_df_balanced.to_csv(save_balanced_train_path, index=False)
    print(f"Balanced train dataset saved to '{save_balanced_train_path}'")

    # Label 1 is treated as the minority class; the difference between its count
    # after and before resampling is the number of synthetic rows that were added.
    label_counts = y_train.value_counts()
    original_minority_count = label_counts.get(1, 0)
    balanced_minority_count = y_resampled.value_counts().get(1, 0)
    synthetic_count = balanced_minority_count - original_minority_count

    print(label_counts)
    print(f"Original minority count: {original_minority_count}")
    print(f"Balanced minority count: {balanced_minority_count}")
    print(f"Synthetic samples count: {synthetic_count}")

# ADASYN is applied once per dataset by the loop near the end of this cell, which also records
# the per-file label-1 counts used for trimming. A separate pass here would only regenerate and
# overwrite the same train_ADASYN.csv files (the oversampler is seeded), so it is omitted.

# Function to count the number of synthetic samples in a balanced dataset
def count_synthetic_samples(file_path):
    """Return how many rows of the CSV at ``file_path`` have 'Label' equal to 1.

    Despite the name, every label-1 row in the file is counted; the code does not
    distinguish synthetic rows from original ones. Returns 0 when no row carries
    that label.
    """
    label_counts = pd.read_csv(file_path)['Label'].value_counts()
    return label_counts.get(1, 0)

# Function to trim excess synthetic samples from a dataset
def trim_excess_synthetic_samples(file_path, target_synthetic_count):
    """Drop trailing label-1 rows from a CSV until only the target number remain.

    The file is read, and if it holds more rows with 'Label' equal to 1 than
    ``target_synthetic_count``, the surplus is removed starting from the last
    label-1 row in file order. The file is then overwritten in place and a
    message is printed. If there is no surplus the file is left untouched.

    Parameters
    ----------
    file_path : str
        CSV to trim; it is overwritten when rows are removed.
    target_synthetic_count : int
        Number of label-1 rows to keep. Note that this is compared against the
        count of all label-1 rows, not only synthetic ones.

    Notes
    -----
    NOTE(review): this relies on the synthetic rows being the last label-1 rows
    in the file. If the surplus exceeds the number of synthetic rows, original
    minority rows are removed as well. Confirm against how the file was written.
    """
    df = pd.read_csv(file_path)

    minority_indices = df.index[(df['Label'] == 1).to_numpy()]
    excess_count = len(minority_indices) - target_synthetic_count
    if excess_count <= 0:
        return

    # Remove the last `excess_count` label-1 rows without mutating the loaded frame
    trimmed_df = df.drop(index=minority_indices[-excess_count:])

    # No makedirs here: the file was just read from this location, so its directory
    # exists, and os.makedirs('') would raise for a bare filename.
    trimmed_df.to_csv(file_path, index=False)
    print(f"Trimmed {excess_count} synthetic samples from '{file_path}'")

# Oversample every training set with ADASYN and record how many label-1 rows each balanced file holds
synthetic_sample_counts = {}
for key in file_paths:
    base_dir = new_dirs[key]
    balanced_train_path = os.path.join(base_dir, 'train_ADASYN.csv')

    apply_adasyn_and_save(os.path.join(base_dir, 'train.csv'), balanced_train_path)
    synthetic_sample_counts[balanced_train_path] = count_synthetic_samples(balanced_train_path)

# The smallest label-1 count across all balanced files is the common target
min_synthetic_count = min(synthetic_sample_counts.values())

# Cut every file that holds more label-1 rows than the target down to that target
files_over_target = [
    path for path, count in synthetic_sample_counts.items() if count > min_synthetic_count
]
for file_path in files_over_target:
    trim_excess_synthetic_samples(file_path, min_synthetic_count)

Balanced train dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_CT_bin50\6-4ADASYN-1\train_ADASYN.csv'
0    110
1     54
Name: Label, dtype: int64
Original minority count: 54
Balanced minority count: 97
Synthetic samples count: 43
Balanced train dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_PET1_bin0.25\6-4ADASYN-1\original\train_ADASYN.csv'
0    110
1     54
Name: Label, dtype: int64
Original minority count: 54
Balanced minority count: 102
Synthetic samples count: 48
Balanced train dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\RFs_PET2_bin0.05\6-4ADASYN-1\train_ADASYN.csv'
0    110
1     54
Name: Label, dtype: int64
Original minority count: 54
Balanced minority count: 104
Synthetic samples count: 50
Balanced train dataset saved to 'C:\Users\37427\Desktop\github\preprocessing\Baseline\6-4ADASYN-1\original\train_ADASYN.csv'
0    110
1     54
Name: Label, dtype: int64
Original minority count: 54
Balanced minority count: 108
Synthetic s

In [26]:
# The following operations are performed on the dataset with a random seed of 1.
# Function to reduce features based on Spearman correlation
def reduce_features_by_spearman(train_file_path, test_file_path, output_train_path, output_test_path,
                                threshold=0.7, max_iterations=100, label_column='Label'):
    """Remove mutually correlated features, keeping the one more related to the label.

    The Spearman correlation matrix of the training features is computed. For
    every feature pair whose correlation exceeds ``threshold``, the feature with
    the smaller absolute Spearman correlation to the label is marked for removal
    (on a tie, or when the comparison involves NaN, the first feature of the pair
    is removed). All marked features are dropped from both the training and the
    test data, and the procedure repeats until no pair exceeds the threshold,
    at most one feature is left, or ``max_iterations`` is reached. The reduced
    frames are written as CSV without the index. Nothing is returned.

    Parameters
    ----------
    train_file_path, test_file_path : str
        Input CSV files. Correlations are computed on the training file only;
        the test file just has the same columns removed.
    output_train_path, output_test_path : str
        Destination CSV files; parent directories are created when needed.
    threshold : float, default 0.7
        Feature pairs with a Spearman correlation above this value are treated as redundant.
    max_iterations : int, default 100
        Upper bound on the number of pruning rounds.
    label_column : str, default 'Label'
        Name of the label column; every other column is treated as a feature.

    Notes
    -----
    NOTE(review): the pair test uses the signed correlation, so strongly negative
    pairs (e.g. -0.9) are never treated as redundant. Confirm whether the absolute
    value was intended.
    """
    train = pd.read_csv(train_file_path)
    test = pd.read_csv(test_file_path)

    features = [col for col in train.columns if col != label_column]

    # Rows never change, so a feature's correlation with the label is constant.
    # Cache it instead of recomputing it for every pair the feature appears in.
    label_corr_cache = {}

    def _abs_label_corr(col):
        if col not in label_corr_cache:
            label_corr_cache[col] = abs(train[col].corr(train[label_column], method='spearman'))
        return label_corr_cache[col]

    for iteration in range(max_iterations):
        print(f"Iteration {iteration + 1}: {len(features)} features remaining")

        if len(features) <= 1:
            break

        # Spearman correlation among the remaining features; only the strict upper
        # triangle is inspected so each pair is visited once and the diagonal is skipped.
        corr_matrix = train[features].corr(method='spearman')
        upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        high_corr_pairs = list(zip(*np.where((upper_triangle > threshold).to_numpy())))

        if not high_corr_pairs:
            break

        # From each correlated pair, mark the feature less correlated with the label
        new_columns_to_drop = set()
        for idx1, idx2 in high_corr_pairs:
            col1, col2 = features[idx1], features[idx2]
            if _abs_label_corr(col1) > _abs_label_corr(col2):
                new_columns_to_drop.add(col2)
            else:
                new_columns_to_drop.add(col1)

        features = [col for col in features if col not in new_columns_to_drop]

        # Keep train and test column sets in sync
        dropped = list(new_columns_to_drop)
        train = train.drop(columns=dropped)
        test = test.drop(columns=dropped)

    print("Process completed.")

    # Create output directories only when the path has one; os.makedirs('') raises
    # for a bare filename in the current working directory.
    for output_path in (output_train_path, output_test_path):
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    train.to_csv(output_train_path, index=False)
    test.to_csv(output_test_path, index=False)

# Run the Spearman-based feature reduction for every dataset
for key in file_paths:
    base_dir = new_dirs[key]  # directory that already holds this dataset's split files

    # Inputs: ADASYN-balanced training set and the untouched test set.
    # Outputs: Spearman-reduced versions written next to them in the same directory.
    train_file_path, test_file_path, output_train_path, output_test_path = (
        os.path.join(base_dir, file_name)
        for file_name in ('train_ADASYN.csv', 'test.csv', 'train_Spearman.csv', 'test_Spearman.csv')
    )

    reduce_features_by_spearman(train_file_path, test_file_path, output_train_path, output_test_path)

Iteration 1: 908 features remaining
Iteration 2: 110 features remaining
Process completed.
Iteration 1: 1006 features remaining
Iteration 2: 77 features remaining
Process completed.
Iteration 1: 967 features remaining
Iteration 2: 102 features remaining
Process completed.
Iteration 1: 6 features remaining
Iteration 2: 4 features remaining
Process completed.
Iteration 1: 6 features remaining
Iteration 2: 4 features remaining
Process completed.
