In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
#pip install git+https://github.com/KindXiaoming/pykan.git
import sys

In [69]:
# Disabled alternative loader, kept as a bare string literal: none of the code
# inside it runs, the cell merely echoes the text as its output.
# The snippet would read the CSV locations from the TEST_DATA_PATH and
# TRAIN_DATA_PATH environment variables, falling back to files under data/.
'''import os
import pandas as pd

test_data_path = os.getenv("TEST_DATA_PATH", "data/UNSW_NB15_testing-set.csv")
train_data_path = os.getenv("TRAIN_DATA_PATH", "data/UNSW_NB15_training-set.csv")

test_data = pd.read_csv(test_data_path)
train_data = pd.read_csv(train_data_path)'''

'import os\nimport pandas as pd\n\ntest_data_path = os.getenv("TEST_DATA_PATH", "data/UNSW_NB15_testing-set.csv")\ntrain_data_path = os.getenv("TRAIN_DATA_PATH", "data/UNSW_NB15_training-set.csv")\n\ntest_data = pd.read_csv(test_data_path)\ntrain_data = pd.read_csv(train_data_path)'

In [70]:
# Load both UNSW_NB15 CSV splits; the paths are relative, so the files are
# looked up in the kernel's current working directory.
train_data = pd.read_csv("UNSW_NB15_training-set.csv")
test_data = pd.read_csv("UNSW_NB15_testing-set.csv")

In [72]:
# Test feature matrix: every column except the two target columns.
X_test = test_data.drop(["attack_cat", "label"], axis=1)

In [73]:
# Training feature matrix: every column except the two target columns.
X_train = train_data.drop(["attack_cat", "label"], axis=1)

In [75]:
# Training target: the "attack_cat" column of the training frame.
y_train = train_data.loc[:, "attack_cat"]

In [76]:
# Test target: the "attack_cat" column of the test frame.
y_test = test_data.loc[:, "attack_cat"]

In [77]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierTreatmentTransformer(BaseEstimator, TransformerMixin):
    """
    Replace IQR outliers in numeric columns with the column mean learned in fit().

    A value is an outlier when it lies outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR],
    where the quartiles are computed on the data passed to fit(). The column
    'id' and numeric columns with fewer than 3 unique values are left untouched.

    Both the bounds and the replacement mean come from the fitted data, so any
    frame transformed later (e.g. a test split) is treated with training
    statistics only. Neither fit() nor transform() modifies its input frame.
    """

    def __init__(self):
        """
        Initialize the transformer with empty fitted state.
        """
        self.iqr_bounds = {}  # column -> (lower_bound, upper_bound), filled by fit()
        self.columns_numeric = []  # numeric columns selected for treatment by fit()
        self.replacement_means = {}  # column -> mean of the fitted data, filled by fit()

    def fit(self, X, y=None):
        """
        Compute IQR bounds and the replacement mean for each numeric column.

        Parameters:
            X (pd.DataFrame): The training data (feature matrix). Not modified.
            y (ignored): Not used, present for compatibility.

        Returns:
            self (OutlierTreatmentTransformer): The fitted transformer.
        """
        print("Fitting outlier treatment...")

        # Start from a clean slate so a re-fit never keeps statistics of columns
        # that only existed in a previously fitted frame.
        self.iqr_bounds = {}
        self.replacement_means = {}

        # Numeric columns, excluding 'id' and columns with fewer than 3 unique values
        numeric_frame = X.select_dtypes(include="number").drop(columns="id", errors="ignore")
        self.columns_numeric = [
            col for col in numeric_frame.columns
            if X[col].nunique() >= 3
        ]

        for col in self.columns_numeric:
            # Work on a float copy of the column; the caller's frame stays as it was.
            values = X[col].astype(float)

            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            self.iqr_bounds[col] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)

            # Remember the mean of the fitted data so transform() never derives the
            # replacement value from the frame it is transforming.
            self.replacement_means[col] = values.mean()

        print("Fitting completed.")
        return self

    def transform(self, X):
        """
        Treat outliers in the dataset using the statistics stored by fit().

        Parameters:
            X (pd.DataFrame): The data to transform (feature matrix). Not modified.

        Returns:
            pd.DataFrame: A copy of X in which the treated columns are cast to
            float and their outliers are replaced by the fitted column mean.

        Raises:
            ValueError: If a column selected during fit() is missing from X.
        """
        print("Transforming data...")

        # Validate up front so a missing column cannot leave a half-processed frame.
        for col in self.columns_numeric:
            if col not in X.columns:
                raise ValueError(f"Column '{col}' not found in the input data.")

        # Work on a copy: the caller's frame must not change as a side effect.
        X = X.copy()

        for col in self.columns_numeric:
            X[col] = X[col].astype(float)
            lower_bound, upper_bound = self.iqr_bounds[col]

            # Identify outliers
            outliers = (X[col] < lower_bound) | (X[col] > upper_bound)

            # Replace outliers with the mean learned during fit()
            X.loc[outliers, col] = self.replacement_means[col]

        print("Transformation completed.")
        return X

In [47]:
# Smoke test: learn the outlier statistics on the training frame, then treat
# the training and the test frame with that same fitted transformer.
OutlierTreater = OutlierTreatmentTransformer()
OutlierTreater.fit(train_data)

train_data = OutlierTreater.transform(train_data)
test_data = OutlierTreater.transform(test_data)

Fitting outlier treatment...
Fitting completed.
Transforming data...
Transformation completed.
Transforming data...
Transformation completed.


In [78]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class CategoryPruner(BaseEstimator, TransformerMixin):
    """
    Keep only the most frequent categories of high-cardinality categorical columns.

    For every non-numeric column with more than `threshold` distinct values in
    the fitted data, the `threshold` most frequent values are remembered;
    transform() replaces every other value in that column with '-'.
    Neither fit() nor transform() modifies its input frame.
    """

    def __init__(self, threshold=6, debug=False, target_column=None):
        """
        Parameters:
            threshold (int): The maximum number of categories to keep for each feature.
            debug (bool): Whether to print debug information.
            target_column (str or None): Name of a column that must never be pruned
                (typically the prediction target). None means no column is excluded.
        """
        self.threshold = threshold
        self.debug = debug
        self.target_column = target_column
        self.top_categories = {}  # Store top categories for each feature

    def fit(self, X, y=None):
        """
        Identify the top categories for each categorical feature in the training data.

        Parameters:
            X (pd.DataFrame): The training data (feature matrix). Not modified.
            y (ignored): Not used, present for compatibility.

        Returns:
            self (CategoryPruner): The fitted transformer.
        """
        # Reset fitted state so a re-fit never keeps categories of a previous frame.
        self.top_categories = {}

        # Select categorical columns
        categorical_columns = X.select_dtypes(exclude=[np.number]).columns

        for feature in categorical_columns:
            # The target keeps all of its classes, so it is never considered.
            if self.target_column is not None and feature == self.target_column:
                continue

            n_unique = X[feature].nunique()

            if self.debug:
                print(f"Processing feature: {feature}")
                print(f"Number of unique categories before reduction: {n_unique}")
                print('----------------------------------------------------')

            # Only columns exceeding the threshold need pruning
            if n_unique > self.threshold:
                # value_counts() sorts by frequency, so head() yields the top categories
                self.top_categories[feature] = X[feature].value_counts().head(self.threshold).index

        return self

    def transform(self, X):
        """
        Reduce categories in the dataset to the top categories identified during fitting.

        Parameters:
            X (pd.DataFrame): The data to transform (feature matrix). Not modified.

        Returns:
            pd.DataFrame: A copy of X with reduced categories. Columns that were
            not selected for pruning during fit() are returned unchanged.
        """
        # Work on a copy: the caller's frame must not change as a side effect.
        X = X.copy()

        # Select categorical columns
        categorical_columns = X.select_dtypes(exclude=[np.number]).columns

        for feature in categorical_columns:
            if feature in self.top_categories:
                # Reduce data to the top categories, replacing others with '-'
                X[feature] = np.where(X[feature].isin(self.top_categories[feature]), X[feature], '-')

        return X

In [26]:
# Fit the pruner on the feature columns only, so the target "attack_cat" is never
# considered for pruning and keeps all of its classes. transform() only touches
# columns that were selected during fit, so passing the full frames below is safe.
CategoryPruner_object = CategoryPruner().fit(train_data.drop(columns=["attack_cat"]))

train_data = CategoryPruner_object.transform(train_data)

test_data = CategoryPruner_object.transform(test_data)

In [48]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class SkewnessLogTransformer(BaseEstimator, TransformerMixin):
    """
    Apply log1p to numeric columns whose skewness exceeds a threshold.

    The set of columns is decided once, on the data passed to fit(), and the
    same columns are transformed in every later frame.
    Neither fit() nor transform() modifies its input frame.
    """

    def __init__(self, skew_threshold=1):
        """
        Parameters:
            skew_threshold (float): Threshold for identifying skewed columns.
        """
        self.skew_threshold = skew_threshold
        # Both names refer to the columns selected by fit(); `skewed_cols` is the one
        # fit()/transform() have always used, `skewed_columns` is kept as an alias.
        self.skewed_cols = []
        self.skewed_columns = []

    def fit(self, X, y=None):
        """
        Select the numeric columns whose absolute skewness exceeds the threshold.

        Parameters:
            X (pd.DataFrame): The training data (feature matrix). Not modified.
            y (ignored): Not used, present for compatibility.

        Returns:
            self (SkewnessLogTransformer): The fitted transformer.
        """
        numeric_cols = X.select_dtypes(include="number").columns

        # Calculate skewness for numeric columns
        skewness = X[numeric_cols].skew()

        # Identify columns with skewness above the threshold
        candidates = skewness[skewness.abs() > self.skew_threshold].index

        # log1p is only defined for values greater than -1; transforming a column
        # that holds smaller values would silently inject NaN / -inf.
        is_safe = np.array([X[col].min() > -1 for col in candidates], dtype=bool)
        skipped = candidates[~is_safe]
        if len(skipped) > 0:
            print("Skewed columns skipped because they contain values <= -1:", skipped)

        self.skewed_cols = candidates[is_safe]
        self.skewed_columns = list(self.skewed_cols)
        print("Skewed columns identified during fit:", self.skewed_cols)

        return self

    def transform(self, X):
        """
        Apply log1p to the columns selected during fit().

        Parameters:
            X (pd.DataFrame): The data to transform (feature matrix). Not modified.

        Returns:
            pd.DataFrame: A copy of X with the selected columns log1p-transformed.

        Raises:
            ValueError: If a column selected during fit() is missing from X.
        """
        # Validate up front so a missing column cannot leave a half-processed frame.
        for col in self.skewed_cols:
            if col not in X.columns:
                raise ValueError(f"Column '{col}' not found in the input data.")

        # Work on a copy: the caller's frame must not change as a side effect, and
        # running the cell twice must not apply the logarithm twice to the source.
        X = X.copy()

        for col in self.skewed_cols:
            X[col] = np.log1p(X[col])  # log1p to avoid log(0)

        return X

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#standardScaler
#minmaxScaler

In [58]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class CategoricalColumnSelector(BaseEstimator, TransformerMixin):
    """
    Select the object-dtype columns of a DataFrame.

    The column names are recorded in fit() and the matching sub-frame is
    returned by transform().
    """

    def __init__(self):
        # Filled by fit() with the names of the object-dtype columns.
        self.categorical_cols = None

    def fit(self, X, y=None):
        """Record which columns of X have object dtype; y is not used."""
        object_frame = X.select_dtypes(include="object")
        self.categorical_cols = object_frame.columns
        return self

    def transform(self, X):
        """Return the columns of X that were recorded during fit()."""
        selected = X[self.categorical_cols]
        return selected

In [60]:
# Imported in this cell because no other visible cell brings these names into scope.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the object-dtype feature columns. An empty column list would turn
# the encoder into a no-op and let string columns pass through unencoded.
categorical_cols = X_train.select_dtypes(include="object").columns.tolist()
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols)
    ],
    remainder='passthrough'  # Pass through non-categorical columns unchanged
)

In [61]:
from efficient_kan import KAN

# Build the KAN model from a list of five layer sizes.
# NOTE(review): presumably the list runs input -> output, i.e. 40 input features
# and 10 output classes; confirm 40 matches the width of the encoded feature
# matrix. No random seed is set in the visible cells, so the initial weights
# may differ between runs.
model = KAN(
    [40, 20, 6, 3, 10],
    grid_size=3,
    scale_noise=0.2,
    scale_base=0.2,
    scale_spline=0.2,
)