# **export_datasets.ipynb**

In [1]:
%load_ext cudf.pandas

# Standard libraries
import numpy as np
import pandas as pd

# Scikit-learn for preprocessing, scaling, and evaluation
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from scipy.stats import boxcox

# PyTorch
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn, optim

In [2]:
# Set random seed for reproducibility
SEED = 287
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f5642f20c90>

In [3]:
# DATA_PATH = "./data/concat.csv"								# Large dataset
DATA_PATH = "./data/Tuesday-WorkingHours.pcap_ISCX.csv"		# Small dataset
df = pd.read_csv(DATA_PATH)

print("Dataset loaded successfully.")
print(df.info())

Dataset loaded successfully.
<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 445909 entries, 0 to 445908
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype
---  ------                        --------------   -----
 0    Destination Port             445909 non-null  int64
 1    Flow Duration                445909 non-null  int64
 2    Total Fwd Packets            445909 non-null  int64
 3    Total Backward Packets       445909 non-null  int64
 4   Total Length of Fwd Packets   445909 non-null  int64
 5    Total Length of Bwd Packets  445909 non-null  int64
 6    Fwd Packet Length Max        445909 non-null  int64
 7    Fwd Packet Length Min        445909 non-null  int64
 8    Fwd Packet Length Mean       445909 non-null  float64
 9    Fwd Packet Length Std        445909 non-null  float64
 10  Bwd Packet Length Max         445909 non-null  int64
 11   Bwd Packet Length Min        445909 non-null  int64
 12   Bwd Packet Length Mean       445909 

#### General Preprocessing

In [None]:
# Identify constant features (only one unique value) and drop them
constant_features = [col for col in df.columns if df[col].nunique() == 1]
df_filtered = df.drop(columns=constant_features)

# Display removed features
print("Removed constant features:", constant_features)

Removed constant features: [' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' CWE Flag Count', 'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']


## Outlier Handling Techniques

In [5]:
def get_percentile(series, percentile):
    """Computes percentile from a Pandas series"""
    return np.percentile(series, percentile)

### Defining different outlier-handling techniques

In [None]:
def winsorize_df(df, lower_percentile=1, upper_percentile=99):
    """Applies Winsorization, capping extreme values at given percentiles."""
    df_adj = df.copy()
    
    for col in df.select_dtypes(include=[np.number]).columns:
        lower_limit = get_percentile(df[col], lower_percentile)
        upper_limit = get_percentile(df[col], upper_percentile)
        df_adj[col] = df[col].clip(lower=lower_limit, upper=upper_limit)
    
    print("✅ (1/8) Winsorization complete.")
    return df_adj

def trim_outliers_df(df, lower_percentile=1, upper_percentile=99):
	"""Removes rows where numerical features exceed given percentiles."""
	df_adj = df.copy()
	initial_size = len(df_adj)
	
	for col in df.select_dtypes(include=[np.number]).columns:
		lower_limit = get_percentile(df[col], lower_percentile)
		upper_limit = get_percentile(df[col], upper_percentile)
		df_adj = df_adj[(df_adj[col] >= lower_limit) & (df_adj[col] <= upper_limit)]
	
	print("✅ (2/8) Outlier trimming complete. Kept {:.2f}% of the data.".format(len(df_adj) / initial_size * 100))
	return df_adj

def log_transform_df(df):
    """Applies log(1+x) transformation to numerical columns to reduce skew."""
    df_adj = df.copy()
    
    for col in df.select_dtypes(include=[np.number]).columns:
        df_adj[col] = np.log1p(df[col])
    
    print("✅ (3/8) Log transformation complete.")
    return df_adj

def boxcox_transform_df(df):
    """Applies Box-Cox transformation to positive numerical columns."""
    df_adj = df.copy()
    
    for col in df.select_dtypes(include=[np.number]).columns:
        if (df[col] > 0).all():  # Box-Cox requires strictly positive values
            df_adj[col], _ = boxcox(df[col] + 1e-5)  # Small shift to avoid zero
    
    print("✅ (4/8) Box-Cox transformation complete.")
    return df_adj

def robust_scaler_df(df):
    """Applies RobustScaler to normalize numerical features while being resistant to outliers."""
    df_adj = df.copy()
    scaler = RobustScaler()
    num_cols = df.select_dtypes(include=[np.number]).columns
    
    # Handle infinite values & NaNs before scaling
    df_adj[num_cols] = df_adj[num_cols].replace([np.inf, -np.inf], np.nan)
    df_adj[num_cols] = df_adj[num_cols].fillna(df_adj[num_cols].median())
    
    # Apply RobustScaler to numerical columns
    df_adj[num_cols] = scaler.fit_transform(df_adj[num_cols])
    
    print("✅ (5/8) Robust scaling complete.")
    return df_adj

def iqr_filter_df(df, threshold=1.5):
	"""Removes outliers using the IQR method with a relaxed threshold."""
	df_adj = df.copy()
	initial_size = len(df_adj)
	
	for col in df.select_dtypes(include=[np.number]).columns:
		Q1 = get_percentile(df[col], 25)
		Q3 = get_percentile(df[col], 75)
		IQR = Q3 - Q1
		lower_bound = Q1 - threshold * IQR
		upper_bound = Q3 + threshold * IQR
		df_adj = df_adj[(df_adj[col] >= lower_bound) & (df_adj[col] <= upper_bound)]
	
	print("✅ (6/8) IQR filtering complete. Kept {:.2f}% of the data.".format(len(df_adj) / initial_size * 100))
	return df_adj

def isolation_forest_df(df, contamination=0.01):
    """Removes outliers using an Isolation Forest model."""
    df_adj = df.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns
    iso_forest = IsolationForest(contamination=contamination, random_state=42)
    outliers = iso_forest.fit_predict(df[num_cols])
    df_adj = df_adj[outliers == 1]  # Keep only inliers
    
    print("✅ (7/8) Isolation Forest filtering complete.")
    return df_adj

def median_impute_outliers_df(df, lower_percentile=1, upper_percentile=99):
    """Replaces outliers beyond percentiles with the column median."""
    df_adj = df.copy()
    
    for col in df.select_dtypes(include=[np.number]).columns:
        lower_limit = get_percentile(df[col], lower_percentile)
        upper_limit = get_percentile(df[col], upper_percentile)
        median_value = df[col].median()
        df_adj[col] = df[col].where((df[col] >= lower_limit) & (df[col] <= upper_limit), median_value)
    
    print("✅ (8/8) Median imputation complete.")
    return df_adj


### Applying different outlier-handling techniques

In [None]:
df_winsorized = winsorize_df(df)
df_trimmed = trim_outliers_df(df)
df_log_transformed = log_transform_df(df)
df_boxcox_transformed = boxcox_transform_df(df)
df_robust_scaled = robust_scaler_df(df)
df_iqr_filtered = iqr_filter_df(df)
df_isolation_forest = isolation_forest_df(df)
df_median_imputed = median_impute_outliers_df(df)

✅ (1/8) Winsorization complete.
✅ (2/8) Outlier trimming complete. Kept 0.00% of the data.
✅ (3/8) Log transformation complete.
✅ (4/8) Box-Cox transformation complete.
✅ (5/8) Robust scaling complete.
✅ (6/8) IQR filtering complete. Kept 0.00% of the data.
✅ (7/8) Isolation Forest filtering complete.
✅ (8/8) Median imputation complete.


In [None]:
dataframe_dictionary = {
	"Windorized": df_winsorized,
	"Trimmed": df_trimmed,
	"Log-transformed": df_log_transformed,
	"Box-Cox transformed": df_boxcox_transformed,
	"Robust-scaled": df_robust_scaled,
	"IQR-filtered": df_iqr_filtered,
	"Isolation Forest": df_isolation_forest,
	"Median-imputed": df_median_imputed
}

for name, df in dataframe_dictionary.items():
	df.to_csv(f"./data/exported/Tuesday/{name}.csv", index=False)

# For some reason the `ÌQR-filtered` and `Trimmed` datasets are only 2 KB.
# Those functions got rid of all the data. 

# We will not use those datasets for the rest of the notebook.