# DATASET PREPROCESSING: STANDARDIZATION, ONE-HOT ENCODING AND RESAMPLING

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, MinMaxScaler
from imblearn.over_sampling import ADASYN

In [57]:
# Load the two pre-split CSV files (train and test) from the local
# ./dataset directory into DataFrames.
train_data = pd.read_csv(filepath_or_buffer="./dataset/train_dataset.csv")
test_data = pd.read_csv(filepath_or_buffer="./dataset/test_dataset.csv")

In [58]:
# Columns removed from the feature matrix. 'Label' is the prediction target
# (extracted separately below). The 'Unnamed: *' entries are presumably index
# columns left over from earlier CSV round-trips, and 'uid' / 'originh' /
# 'responh' / 'traffic_category' presumably identifiers or label-like fields
# -- TODO confirm against the dataset description.
drop_columns = [
    'Unnamed: 0.1',
    'Unnamed: 0',
    'uid',
    'originh',
    'responh',
    'traffic_category',
    'Label',
]

# For each split: take the target column, then everything that is left after
# dropping the non-feature columns. DataFrame.drop raises KeyError if any of
# the listed columns is missing from a split.
y_train, X_train = train_data["Label"], train_data.drop(columns=drop_columns)
y_test, X_test = test_data["Label"], test_data.drop(columns=drop_columns)

In [None]:
# Split the feature columns into three groups by dtype. The port columns are
# numeric, but they are listed explicitly and excluded from the generic
# numeric group so they can be handled on their own.
port_columns = ['originp', 'responp']

# Object-dtype columns are treated as categorical.
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Every numeric column except the ports (Index.difference also sorts the
# resulting column names by default).
# NOTE(review): columns whose dtype is neither numeric nor object (e.g. bool,
# category) end up in none of the groups -- confirm none exist in X_train.
all_numeric_cols = X_train.select_dtypes(include=['number']).columns
numerical_cols = all_numeric_cols.difference(port_columns)
# --- Per-group transformers --------------------------------------------------

# Numeric features: Yeo-Johnson power transform, then standard scaling.
# NOTE(review): PowerTransformer standardizes its output by default
# (standardize=True), so the StandardScaler step is probably redundant --
# confirm before removing.
numeric_steps = [
    ('yeo_johnson', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical features: one-hot encoding with the first level of each
# feature dropped.
# NOTE(review): with drop='first' together with handle_unknown='ignore', a
# category unseen during fit is encoded as all zeros, which is the same
# encoding as the dropped first level; older scikit-learn releases reject
# this combination outright -- confirm this is acceptable for the test set.
categorical_steps = [
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Port numbers: rescaled with MinMaxScaler, separately from the other
# numeric features.
port_transformer = Pipeline([('Rescale', MinMaxScaler())])

# --- Combined preprocessing --------------------------------------------------

# Route each column group to its transformer. Output columns follow the order
# of this list: numeric, then one-hot categorical, then ports.
transformer_specs = [
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
    ('port', port_transformer, port_columns),
]
preprocessor = ColumnTransformer(transformer_specs)

# Single-step pipeline wrapping the column transformer.
pipeline = Pipeline([('preprocessor', preprocessor)])
# Learn the preprocessing parameters from the training split only, then apply
# the already-fitted pipeline to the test split so that no test-set statistics
# leak into the transformation.
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# Oversample the training split with ADASYN; the test split is left untouched.
# Per the imbalanced-learn docs, a float sampling_strategy is the desired
# minority/majority ratio after resampling and is only accepted for binary
# targets.
# random_state is pinned so the synthetic samples -- and every result derived
# from them -- are reproducible across notebook runs.
adasyn = ADASYN(sampling_strategy=0.2, n_neighbors=5, random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(
    X_train_transformed, y_train
)