In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
#pip install git+https://github.com/KindXiaoming/pykan.git
import sys

In [44]:
'''import os
import pandas as pd

test_data_path = os.getenv("TEST_DATA_PATH", "data/UNSW_NB15_testing-set.csv")
train_data_path = os.getenv("TRAIN_DATA_PATH", "data/UNSW_NB15_training-set.csv")

test_data = pd.read_csv(test_data_path)
train_data = pd.read_csv(train_data_path)'''

'import os\nimport pandas as pd\n\ntest_data_path = os.getenv("TEST_DATA_PATH", "data/UNSW_NB15_testing-set.csv")\ntrain_data_path = os.getenv("TRAIN_DATA_PATH", "data/UNSW_NB15_training-set.csv")\n\ntest_data = pd.read_csv(test_data_path)\ntrain_data = pd.read_csv(train_data_path)'

In [46]:
test_data = pd.read_csv(r"UNSW_NB15_testing-set.csv")
train_data = pd.read_csv(r"UNSW_NB15_training-set.csv")

In [47]:
X_test = test_data.drop(columns = ["id","attack_cat","label"])

In [48]:
X_train = train_data.drop(columns = ["id","attack_cat","label"])

In [52]:
y_train = train_data["attack_cat"]

In [54]:
y_test = test_data["attack_cat"]

In [56]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierTreatmentTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        """
        Initialize the transformer.
        """
        self.iqr_bounds = {}  # Store IQR bounds for each numeric column
        self.columns_numeric = []  # Store numeric columns to process

    def fit(self, X, y=None):
        """
        Compute IQR bounds for each numeric column.

        Parameters:
            X (pd.DataFrame): The training data (feature matrix).
            y (ignored): Not used, present for compatibility.

        Returns:
            self (OutlierTreatmentTransformer): The fitted transformer.
        """
        print("Fitting outlier treatment...")

        # Identify numeric columns (excluding 'id' and columns with fewer than 3 unique values)
        self.columns_numeric = X.select_dtypes(include="number").drop(columns="id", errors="ignore").columns
        self.columns_numeric = [
            col for col in self.columns_numeric
            if X[col].nunique() >= 3  # Ensure column has at least 3 unique values
        ]

        # Ensure all numeric columns are cast to float
        for col in self.columns_numeric:
            if col in X.columns:
                X[col] = X[col].astype(float)
            else:
                raise ValueError(f"Column '{col}' not found in the input data.")

        # Process each numeric column
        for col in self.columns_numeric:
            # Calculate IQR bounds for the column
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            self.iqr_bounds[col] = (lower_bound, upper_bound)

        print("Fitting completed.")
        return self

    def transform(self, X):
        """
        Treat outliers in the dataset using the stored IQR bounds.

        Parameters:
            X (pd.DataFrame): The data to transform (feature matrix).

        Returns:
            pd.DataFrame: The transformed data with outliers treated.
        """
        print("Transforming data...")

        # Ensure all numeric columns are cast to float
        for col in self.columns_numeric:
            if col in X.columns:
                X[col] = X[col].astype(float)
            else:
                raise ValueError(f"Column '{col}' not found in the input data.")

        # Process each numeric column
        for col in self.columns_numeric:
            lower_bound, upper_bound = self.iqr_bounds[col]

            # Identify outliers
            outliers = (X[col] < lower_bound) | (X[col] > upper_bound)

            # Replace outliers with the column mean
            replacement_mean = X[col].mean()
            X.loc[outliers, col] = replacement_mean

        print("Transformation completed.")
        return X

In [58]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class CategoryPruner(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=6, debug=False):
        """
        Parameters:
            threshold (int): The maximum number of categories to keep for each feature.
            debug (bool): Whether to print debug information.
        """
        self.threshold = threshold
        self.debug = debug
        self.top_categories = {}  # Store top categories for each feature

    def fit(self, X, y=None):
        """
        Identify the top categories for each categorical feature in the training data.

        Parameters:
            X (pd.DataFrame): The training data (feature matrix).
            y (ignored): Not used, present for compatibility.

        Returns:
            self (CategoryPruner): The fitted transformer.
        """
        # Select categorical columns
        categorical_columns = X.select_dtypes(exclude=[np.number]).columns

        # Iterate through categorical columns
        for feature in categorical_columns:
            if self.debug:
                print(f"Processing feature: {feature}")
                print(f"Number of unique categories before reduction: {X[feature].nunique()}")
                print('----------------------------------------------------')

            # Check if the number of unique categories exceeds the threshold
            if X[feature].nunique() > self.threshold:
                # Identify the top categories in the training data
                self.top_categories[feature] = X[feature].value_counts().head(self.threshold).index

        return self

    def transform(self, X):
        """
        Reduce categories in the dataset to the top categories identified during fitting.

        Parameters:
            X (pd.DataFrame): The data to transform (feature matrix).

        Returns:
            pd.DataFrame: The transformed data with reduced categories.
        """
        # Select categorical columns
        categorical_columns = X.select_dtypes(exclude=[np.number]).columns

        # Iterate through categorical columns
        for feature in categorical_columns:
            if feature in self.top_categories:
                # Reduce data to the top categories, replacing others with '-'
                X[feature] = np.where(X[feature].isin(self.top_categories[feature]), X[feature], '-')

        return X

In [60]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class SkewnessLogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, skew_threshold=1):
        """
        Parameters:
            skew_threshold (float): Threshold for identifying skewed columns.
        """
        self.skew_threshold = skew_threshold
        self.skewed_columns = []

    def fit(self, X, y=None):

        numeric_cols = X.select_dtypes(include="number").columns

        # Calculate skewness for numeric columns
        skewness = X[numeric_cols].skew()

        # Identify columns with skewness above the threshold
        self.skewed_cols = skewness[skewness.abs() > self.skew_threshold].index
        print("Skewed columns identified during fit:", self.skewed_cols)

        return self

    def transform(self,X):

        for col in self.skewed_cols:
            if col in X.columns:
                X[col] = np.log1p(X[col])  # log1p to avoid log(0)
            else:
                raise ValueError(f"Column '{col}' not found in the input data.")

        return X

In [62]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

# Identify numeric columns
numeric_cols = X_train.select_dtypes(include=['number']).columns.tolist()

# Create a ColumnTransformer to apply both scalers
scaler_pipeline = ColumnTransformer(
    transformers=[
        ('standard', StandardScaler(), numeric_cols),
        ('minmax', MinMaxScaler(), numeric_cols)
    ],
    remainder='passthrough'  # Keep non-numeric columns if any
)


In [64]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder


class CategoricalColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.categorical_cols = None

    def fit(self, X, y=None):
        # Identify categorical columns dynamically
        self.categorical_cols = X.select_dtypes(include="object").columns
        return self

    def transform(self, X):
        # Return only the categorical columns
        return X[self.categorical_cols]

In [66]:
categorical_cols = []
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols)
    ],
    remainder='passthrough'  # Pass through non-categorical columns unchanged
)

In [68]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from efficient_kan import KAN

class KANWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, architecture=[40, 20, 6, 3, 10], grid_size=3, scale_noise=0.2, scale_base=0.2, scale_spline=0.2,
                 epochs=10, learning_rate=0.001):
        """
        Wrapper for KAN to make it compatible with scikit-learn.
        """
        self.architecture = architecture
        self.grid_size = grid_size
        self.scale_noise = scale_noise
        self.scale_base = scale_base
        self.scale_spline = scale_spline
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.model = None  # Placeholder for KAN model

    def fit(self, X, y):
        """
        Train the KAN model.
        """
        X_tensor = torch.tensor(X, dtype=torch.float32)
        y_tensor = torch.tensor(y, dtype=torch.long)  # Assuming classification task

        # Initialize KAN model
        self.model = KAN(self.architecture, grid_size=self.grid_size, scale_noise=self.scale_noise,
                         scale_base=self.scale_base, scale_spline=self.scale_spline)

        # Define loss function and optimizer
        criterion = nn.CrossEntropyLoss()  # Change this if it's a regression task
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # Training loop
        self.model.train()
        for epoch in range(self.epochs):
            optimizer.zero_grad()
            outputs = self.model(X_tensor)
            loss = criterion(outputs, y_tensor)
            loss.backward()
            optimizer.step()
            print(f"Epoch {epoch+1}/{self.epochs}, Loss: {loss.item()}")

        return self

    def predict(self, X):
        """
        Generate predictions using the trained KAN model.
        """
        if self.model is None:
            raise ValueError("The KAN model has not been trained yet.")
        
        X_tensor = torch.tensor(X, dtype=torch.float32)
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(X_tensor)
            predictions = torch.argmax(outputs, dim=1).numpy()

        return predictions

    def score(self, X, y):
        """
        Compute accuracy score.
        """
        predictions = self.predict(X)
        return np.mean(predictions == y)



In [70]:
from sklearn.model_selection import cross_val_score


numeric_cols = X_train.select_dtypes(include=['number']).columns.tolist()

# Define Preprocessing Pipeline
preprocessing = ColumnTransformer(
    transformers=[
        ('outlier_treatment', OutlierTreatmentTransformer(), numeric_cols),
        ('category_pruning', CategoryPruner(), X_train.select_dtypes(exclude=[np.number]).columns.tolist()),
        ('skewness_log', SkewnessLogTransformer(), numeric_cols),
        ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'), X_train.select_dtypes(exclude=[np.number]).columns.tolist()),
        ('scaling', Pipeline([
            ('standard', StandardScaler()),
            ('minmax', MinMaxScaler())
        ]), numeric_cols)
    ],
    remainder='passthrough'  # Keep non-transformed columns
)

# Create Full Pipeline with Model
full_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('model', KANWrapper())
])

# Perform Cross-Validation
cv_scores = cross_val_score(full_pipeline, X_train, y_train, cv=5, scoring="accuracy")

Fitting outlier treatment...
Fitting completed.
Transforming data...
Transformation completed.
Skewed columns identified during fit: Index(['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sload', 'dload',
       'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'stcpb', 'dtcpb',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports'],
      dtype='object')
Fitting outlier treatment...
Fitting completed.
Transforming data...
Transformation completed.
Skewed columns identified during fit: Index(['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sload', 'dload',
       'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'stcpb', 'dtcpb',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'r

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\TESTER\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\TESTER\anaconda3\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\TESTER\anaconda3\lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\TESTER\AppData\Local\Temp\ipykernel_26904\3885420240.py", line 27, in fit
    X_tensor = torch.tensor(X, dtype=torch.float32)
TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.
