In [1]:
import pandas as pd
from seaborn import xkcd_palette
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import numpy as np

In [2]:
import pandas as pd

# Load data
data = pd.read_csv('mimic_flood_drivers_dataset.csv')

# Identify label columns (assuming they are named L1, L2, ..., Ln)
label_columns = [col for col in data.columns if col.startswith('L')]

# Define categories based on column prefixes
categories = {
    "Labels": label_columns,
    "Topography": [col for col in data.columns if col.startswith("T")],
    "Hydrological": [col for col in data.columns if col.startswith("H")],
    "Vegetation": [col for col in data.columns if col.startswith("V")],
    "Shape": [col for col in data.columns if col.startswith("S")],
    "Climate": [col for col in data.columns if col.startswith("C")]
}

# Create separate dataframes for each category
category_dataframes = {category: data[columns] for category, columns in categories.items()}

# Print each category and its columns
print("Categories and their columns:")
for category, columns in categories.items():
    print(f"\nCategory: {category}")
    print(f"Columns: {columns}")

# Preview one of the category-specific dataframes (e.g., Topography)
print("\nPreview of Topography DataFrame:")
print(category_dataframes["Topography"].head())
data.head()

Categories and their columns:

Category: Labels
Columns: ['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'L10', 'L11', 'L12']

Category: Topography
Columns: ['T1', 'T2', 'T3', 'T6', 'T4', 'T7', 'T8', 'T9', 'T10', 'T11', 'T5']

Category: Hydrological
Columns: ['H1', 'H6', 'H7', 'H8', 'H2', 'H3', 'H4', 'H5', 'H9', 'H10', 'H11', 'H12']

Category: Vegetation
Columns: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8']

Category: Shape
Columns: ['S11', 'S8', 'S9', 'S10', 'S2', 'S1', 'S3', 'S4', 'S5', 'S7', 'S6']

Category: Climate
Columns: ['C1', 'C2']

Preview of Topography DataFrame:
           T1          T2           T3         T6         T4        T7  \
0  396.614090  179.453893   673.961939   4.199028   3.063464  1.254181   
1  716.477570  716.242822  2560.734611  11.031872   7.909028  3.359525   
2  249.787542   36.917722   100.063143   1.169048   0.888382  0.342685   
3  888.418646  731.016700  2277.399087  22.827090  15.809505  6.899293   
4  175.355852  180.392630   422.334951

Unnamed: 0,L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,...,C2,H2,H3,H4,H5,T5,H9,H10,H11,H12
0,0.005477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0085,...,3.045662,8.579017,29.01883,3.646602,0.531858,34.238893,19.603891,0.05524,46445.299255,276.380196
1,0.01664,0.004072,0.069643,0.123688,0.0,0.007365,0.007276,0.0,0.0,0.0,...,0.990749,8.447919,27.682428,2.325901,0.189171,22.110378,115.931178,0.061129,29785.452246,290.797341
2,0.067981,0.0,0.013596,0.053496,0.0,0.0,0.0,0.008777,0.0,0.023665,...,0.979768,9.506858,24.936928,4.654782,0.756782,6.221572,16.012457,0.0,19598.888752,272.694758
3,0.546691,0.025162,0.115282,0.449221,1.041129,0.0,0.007867,0.048825,0.0,0.0,...,6.013421,7.254344,26.87619,2.464772,1.155325,16.030205,142.034012,0.0,10896.155792,287.459756
4,0.117176,0.059917,0.0,0.067867,0.080017,0.059768,0.037533,0.077548,0.044131,0.028936,...,1.039374,9.123762,32.668775,4.059705,2.213218,11.052639,38.298921,0.024917,29128.025386,271.329192


### Data profiling

In [8]:
from ydata_profiling import ProfileReport
# Loop through each category and generate profiling reports
for category, columns in categories.items():
    category_df = data[columns]
    profile = ProfileReport(category_df, title=f"{category} Profiling Report", explorative=True)
    profile.to_file(f"{category}_profiling_report.html")

print("Profiling reports have been generated for all categories.")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)
Summarize dataset: 100%|██████████| 165/165 [00:15<00:00, 10.89it/s, Completed]               
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.32s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.29s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 35.00it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)
Summarize dataset: 100%|██████████| 141/141 [00:13<00:00, 10.27it/s, Completed]               
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.22s/it]
Render HTM

Profiling reports have been generated for all categories.





### Pair plots


In [11]:
import seaborn as sns

# Loop through each category and save pair plots
for category, columns in categories.items():
    if len(columns) > 1:  # Ensure there are at least two columns for pair plots
        print(f"Generating and saving pair plot for {category}...")  # Inform the user
        pair_plot = sns.pairplot(data[columns])  # Generate pair plot
        pair_plot.fig.suptitle(f"{category} Pair Plot", y=1.02)  # Add title to the plot
        file_name = f"{category}_pair_plot.png"  # Define the file name
        pair_plot.savefig(file_name)  # Save the plot as a PNG file
        plt.close(pair_plot.fig)  # Close the plot to free memory
        print(f"Saved pair plot for {category} as {file_name}.")  # Confirm save
    else:
        print(f"Skipping {category} as it has less than 2 columns.")  # Skip if not enough columns


Generating and saving pair plot for Labels...
Saved pair plot for Labels as Labels_pair_plot.png.
Generating and saving pair plot for Topography...
Saved pair plot for Topography as Topography_pair_plot.png.
Generating and saving pair plot for Hydrological...
Saved pair plot for Hydrological as Hydrological_pair_plot.png.
Generating and saving pair plot for Vegetation...
Saved pair plot for Vegetation as Vegetation_pair_plot.png.
Generating and saving pair plot for Shape...
Saved pair plot for Shape as Shape_pair_plot.png.
Generating and saving pair plot for Climate...
Saved pair plot for Climate as Climate_pair_plot.png.


### Normalizing data 


In this script, I am preprocessing a dataset by dynamically scaling its feature columns based on their skewness and then generating pair plots for visualization. The scaling process involves identifying highly skewed features and applying a combination of RobustScaler and MinMaxScaler to normalize their distribution, while less skewed features are scaled using percentile clipping or standard Min-Max scaling. The scaled feature data is merged back with the original dataset, retaining identifier and label columns. Finally, pair plots are generated for each category of features, enabling you to visualize relationships and correlations within each category. The pair plots are saved as separate image files for further analysis. This process ensures the dataset is cleaned, normalized, and ready for exploratory data analysis.

In [3]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler

# Function to scale features based on skewness
def custom_scaling(df, features_columns):
    scaled_features = pd.DataFrame(index=df.index)  # Initialize with the original index

    for column in features_columns:
        # Drop NA values for skewness calculation and scaler fitting
        feature_data = df[column].dropna()

        # Calculate skewness
        skewness = feature_data.skew()

        if skewness > 2 or skewness < -2:
            # Step 1: Apply RobustScaler to handle skewness
            robust_scaler = RobustScaler()
            robust_scaled_column = robust_scaler.fit_transform(feature_data.values.reshape(-1, 1))

            # Step 2: Apply MinMaxScaler to scale to range [0, 1]
            min_max_scaler = MinMaxScaler(feature_range=(0, 1))
            scaled_column = min_max_scaler.fit_transform(robust_scaled_column)

            print(f"{column}: Using RobustScaler + MinMaxScaler (Skewness: {skewness:.2f})")

        elif 1 < skewness <= 2:
            # Define min and max using the 1st and 99th percentiles
            lower_bound = feature_data.quantile(0.01)
            upper_bound = feature_data.quantile(0.99)
            scaled_column = ((df[column].clip(lower=lower_bound, upper=upper_bound) - lower_bound) /
                             (upper_bound - lower_bound)).values.reshape(-1, 1)

            print(f"{column}: Using Percentile Clipping (Skewness: {skewness:.2f})")

        else:
            # Use Min-Max Scaling for less skewed data
            min_max_scaler = MinMaxScaler(feature_range=(0, 1))
            scaled_column = min_max_scaler.fit_transform(feature_data.values.reshape(-1, 1))

            print(f"{column}: Using Min-Max Scaling (Skewness: {skewness:.2f})")

        # Add scaled column to the result dataframe (flatten to convert to a 1D array)
        scaled_features[column] = scaled_column.flatten()

    return scaled_features

# Define feature columns (all columns except labels and IDs)
label_columns = [col for col in data.columns if col.startswith('L')]
features_columns = [col for col in data.columns if col not in label_columns]

# Apply the custom scaling function to the data's feature columns
scaled_features_df = custom_scaling(data, features_columns)

# Merge scaled features back with the original data (if needed)
data_scaled = pd.concat([data[label_columns], scaled_features_df], axis=1)

scaled_data_file = "scaled_flood_drivers_dataset.csv"
data_scaled.to_csv(scaled_data_file, index=False)

data_scaled.head()



S11: Using RobustScaler + MinMaxScaler (Skewness: 8.26)
S8: Using RobustScaler + MinMaxScaler (Skewness: -4.97)
T1: Using Percentile Clipping (Skewness: 1.34)
T2: Using Percentile Clipping (Skewness: 1.18)
T3: Using Percentile Clipping (Skewness: 1.82)
T6: Using Percentile Clipping (Skewness: 1.91)
S9: Using Percentile Clipping (Skewness: 1.78)
S10: Using Percentile Clipping (Skewness: 1.52)
T4: Using Percentile Clipping (Skewness: 1.75)
H1: Using Min-Max Scaling (Skewness: 0.07)
T7: Using Percentile Clipping (Skewness: 1.92)
T8: Using Percentile Clipping (Skewness: 1.84)
T9: Using Percentile Clipping (Skewness: 1.13)
T10: Using RobustScaler + MinMaxScaler (Skewness: 3.07)
V1: Using Min-Max Scaling (Skewness: -0.29)
V2: Using Percentile Clipping (Skewness: 1.28)
V3: Using RobustScaler + MinMaxScaler (Skewness: 2.96)
V4: Using RobustScaler + MinMaxScaler (Skewness: 2.82)
V5: Using RobustScaler + MinMaxScaler (Skewness: 3.52)
V6: Using RobustScaler + MinMaxScaler (Skewness: 7.17)
V7: Usi

Unnamed: 0,L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,...,C2,H2,H3,H4,H5,T5,H9,H10,H11,H12
0,0.005477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0085,...,0.060988,0.54763,0.612131,0.521886,0.037818,0.298928,0.039498,0.529595,0.685924,0.055321
1,0.01664,0.004072,0.069643,0.123688,0.0,0.007365,0.007276,0.0,0.0,0.0,...,0.019839,0.512374,0.526266,0.206557,0.005339,0.189567,0.32654,0.586057,0.43231,0.172894
2,0.067981,0.0,0.013596,0.053496,0.0,0.0,0.0,0.008777,0.0,0.023665,...,0.019619,0.797151,0.349864,0.762598,0.059135,0.046301,0.028796,0.0,0.27724,0.025265
3,0.546691,0.025162,0.115282,0.449221,1.041129,0.0,0.007867,0.048825,0.0,0.0,...,0.120415,0.19139,0.474464,0.239714,0.096908,0.134743,0.404323,0.0,0.144758,0.145675
4,0.117176,0.059917,0.0,0.067867,0.080017,0.059768,0.037533,0.077548,0.044131,0.028936,...,0.020813,0.694126,0.846645,0.620518,0.197171,0.089861,0.095207,0.238884,0.422302,0.014129


### Scaled Data Profiling

In [16]:
# Generate and save pair plots for each category
for category, columns in categories.items():
    if len(columns) > 1:  # Ensure there are at least two columns for pair plots
        print(f"Generating and saving pair plot for {category}...")  # Inform the user
        category_df = data_scaled[columns]  # Use scaled data for pair plots
        pair_plot = sns.pairplot(category_df)  # Generate pair plot
        pair_plot.fig.suptitle(f"{category} Pair Plot", y=1.02)  # Add title to the plot
        file_name = f"{category}_pair_plot.png"  # Define the file name
        pair_plot.savefig(file_name)  # Save the plot as a PNG file
        plt.close(pair_plot.fig)  # Close the plot to free memory
        print(f"Saved pair plot for {category} as {file_name}.")  # Confirm save
    else:
        print(f"Skipping {category} as it has less than 2 columns.")  # Skip if not enough columns


Generating and saving pair plot for Labels...
Saved pair plot for Labels as Labels_pair_plot.png.
Generating and saving pair plot for Topography...
Saved pair plot for Topography as Topography_pair_plot.png.
Generating and saving pair plot for Hydrological...
Saved pair plot for Hydrological as Hydrological_pair_plot.png.
Generating and saving pair plot for Vegetation...
Saved pair plot for Vegetation as Vegetation_pair_plot.png.
Generating and saving pair plot for Shape...
Saved pair plot for Shape as Shape_pair_plot.png.
Generating and saving pair plot for Climate...
Saved pair plot for Climate as Climate_pair_plot.png.
