In [1]:
import pandas as pd

# Read the training and test CSV files into module-level DataFrames.
# Both paths are relative, so the files must be in the notebook's working directory.
train_data = pd.read_csv(filepath_or_buffer='TrainDataset2024.csv')
test_data = pd.read_csv(filepath_or_buffer='TestDatasetExample.csv')


def displayDatasets(train_data, test_data):
    """Print a heading and the first rows (``.head()``) of each dataset.

    Args:
        train_data: Training dataset; must provide ``.head()``.
        test_data: Test dataset; must provide ``.head()``.
    """
    # The test heading carries a leading newline so a blank line separates the two previews.
    sections = (
        ("Training Dataset:", train_data),
        ("\nTest Dataset:", test_data),
    )
    for heading, frame in sections:
        print(heading)
        print(frame.head())

In [2]:
from sklearn.preprocessing import MinMaxScaler
from scipy.stats.mstats import winsorize

def windsorizeDataset(data, lower_percentile=0.05, upper_percentile=0.95):
    """Winsorize every float64/int64 column of ``data``.

    Args:
        data: DataFrame to process. It is modified in place and the same
            object is returned.
        lower_percentile: Fraction of the lowest values in each column that
            is replaced by the lowest remaining value.
        upper_percentile: Quantile above which values are replaced, i.e. the
            top ``1 - upper_percentile`` fraction of each column is replaced
            by the highest remaining value.

    Returns:
        ``data``, with its numeric columns winsorized.
    """
    # Select numeric columns
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

    # winsorize() truncates ``limit * n`` to an integer count. A raw
    # ``1 - upper_percentile`` carries float error (1 - 0.9 == 0.09999999999999998),
    # which can push that product just below a whole number and clip one value
    # fewer on the upper side than on the lower side. Rounding removes the noise.
    upper_limit = round(1 - upper_percentile, 12)

    # Apply windsorization column-wise
    for column in numeric_columns:
        data[column] = winsorize(data[column], limits=(lower_percentile, upper_limit))

    print(f"Windsorization complete. Applied {lower_percentile*100}% - {upper_percentile*100}% limits.")
    return data

def minMaxNormalisation(train_data, test_data):
    """Min-max scale the float64/int64 columns shared by both datasets.

    The scaler is fitted on the training columns only and then reused on the
    test columns. Both DataFrames are modified in place and nothing is
    returned. A confirmation message and the head of each dataset are printed.

    Args:
        train_data: Training DataFrame; used to fit the scaler.
        test_data: Test DataFrame; transformed with the fitted scaler.
    """
    numeric_dtypes = ['float64', 'int64']

    # Only columns that are numeric in BOTH frames are scaled.
    shared_columns = (
        train_data.select_dtypes(include=numeric_dtypes).columns
        .intersection(test_data.select_dtypes(include=numeric_dtypes).columns)
    )

    scaler = MinMaxScaler()

    # Fit on the training data, then apply the same fitted scaler to the test data.
    scaled_train = scaler.fit_transform(train_data[shared_columns])
    train_data[shared_columns] = scaled_train
    scaled_test = scaler.transform(test_data[shared_columns])
    test_data[shared_columns] = scaled_test

    # Saving the normalized frames is intentionally disabled; to persist them,
    # write each one out with DataFrame.to_csv(<path>, index=False).

    print("Normalization complete.")

    displayDatasets(train_data, test_data)
    
# Winsorize each dataset on its own values; the positional arguments are
# (lower_percentile, upper_percentile) = (0.05, 0.95).
# NOTE(review): this touches every float64/int64 column, and the recorded output
# shows 'RelapseFreeSurvival (outcome)' at 102.0 for several rows. It looks like
# the outcome columns are being clipped too; confirm that is intended.
train_data = windsorizeDataset(train_data, 0.05, 0.95)
test_data = windsorizeDataset(test_data, 0.05, 0.95)

# Min-max scale the numeric columns common to both datasets (modifies both in place).
minMaxNormalisation(train_data, test_data)


Windsorization complete. Applied 5.0% - 95.0% limits.
Windsorization complete. Applied 5.0% - 95.0% limits.
Training Dataset:
          ID  pCR (outcome)  RelapseFreeSurvival (outcome)       Age   ER  \
0  TRG002174              1                          102.0  0.190903  0.0   
1  TRG002178              0                          102.0  0.133917  1.0   
2  TRG002204              1                          102.0  0.000000  0.0   
3  TRG002206              0                           12.0  0.019945  0.0   
4  TRG002210              0                          102.0  0.760764  1.0   

   PgR  HER2  TrippleNegative  ChemoGrade  Proliferation  ...  \
0  0.0   0.0              1.0         1.0            1.0  ...   
1  1.0   0.0              0.0         1.0            1.0  ...   
2  0.0   0.0              1.0         0.0            0.0  ...   
3  0.0   0.0              1.0         1.0            1.0  ...   
4  0.0   0.0              0.0         0.0            0.0  ...   

   original_glszm_Sm