In [7]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
import math
class MultivariateDiscretizer:
    def __init__(self, n_bins=10, min_support_diff=0.01):
        """
        Initializes the discretizer.
        
        Parameters:
        - n_bins: Number of initial bins to divide each continuous attribute.
        - min_support_diff: Minimum difference in support for merging intervals.
        """
        self.n_bins = n_bins
        self.min_support_diff = min_support_diff

    def fit(self, data):
        """
        Discretizes the continuous features in the dataset.
        
        Parameters:
        - data: A pandas DataFrame with continuous attributes to discretize.
        
        Returns:
        - A pandas DataFrame with discretized continuous variables.
        """
        discretized_data = data.copy()
        for col in data.columns:
            discretized_data[col] = self._discretize_column(data[col])
        return discretized_data

    def _discretize_column(self, series):
        # Step 1: Initial partitioning into equal-width bins
        bins = pd.cut(series, bins=self.n_bins, labels=False)
        bin_edges = np.linspace(series.min(), series.max(), self.n_bins + 1)

        # Step 2: Merge adjacent intervals based on distribution similarity
        while True:
            merged = False
            for i in range(len(bin_edges) - 2):
                interval_data = series[(bins == i) | (bins == i + 1)]
                if self._are_distributions_similar(interval_data):
                    # Merge bins i and i+1
                    bins[bins == i + 1] = i
                    bin_edges = np.delete(bin_edges, i + 1)
                    merged = True
                    break
            if not merged:
                break

        # Map each original value to the merged bins
        discretized_series = pd.cut(series, bins=bin_edges, labels=False)
        return discretized_series

    def _are_distributions_similar(self, data):
        """
        Tests if the distributions within two intervals are similar.
        
        Parameters:
        - data: Data for the two intervals being tested.
        
        Returns:
        - True if the intervals have similar distributions, otherwise False.
        """
        # Create histogram and add Laplace smoothing to avoid zero counts
        counts, _ = np.histogram(data, bins=2)
        counts = counts + 0.5  # Laplace smoothing

        # Perform chi-square test
        _, p_value, _, _ = chi2_contingency([counts])
        
        # Return True if distributions are similar (p-value > threshold)
        return p_value > self.min_support_diff

# Example usage:
data = pd.read_csv("D:/Projects/formal_project/concatenated_SWaT_Dataset.csv")

discretizer = MultivariateDiscretizer(n_bins = 100, min_support_diff=0.05)
discretized_data = discretizer.fit(data)

print(discretized_data.head())


   FIT101  LIT101  MV101  P101  P102  AIT201  AIT202  AIT203  FIT201  MV201  \
0     0.0     0.0    0.0   0.0   NaN     0.0     0.0     0.0     0.0    0.0   
1     0.0     0.0    0.0   0.0   NaN     0.0     0.0     0.0     0.0    0.0   
2     0.0     0.0    0.0   0.0   NaN     0.0     0.0     0.0     0.0    0.0   
3     0.0     0.0    0.0   0.0   NaN     0.0     0.0     0.0     0.0    0.0   
4     0.0     0.0    0.0   0.0   NaN     0.0     0.0     0.0     0.0    0.0   

   ...  P501  P502  PIT501  PIT502  PIT503  FIT601  P601  P602  P603  \
0  ...   NaN   NaN     0.0     NaN     0.0     0.0   NaN   NaN   NaN   
1  ...   NaN   NaN     0.0     NaN     0.0     0.0   NaN   NaN   NaN   
2  ...   NaN   NaN     0.0     NaN     0.0     0.0   NaN   NaN   NaN   
3  ...   NaN   NaN     0.0     NaN     0.0     0.0   NaN   NaN   NaN   
4  ...   NaN   NaN     0.0     NaN     0.0     0.0   NaN   NaN   NaN   

   Normal/Attack  
0            NaN  
1            NaN  
2            NaN  
3            NaN