# In[4]:
import numpy as np
import pandas as pd
from math import log2

class MDLDiscretizer:
    """Discretize numeric DataFrame columns into integer bin indices.

    Each column is split recursively. At every step the candidate cut
    (midpoint between two adjacent unique values) with the lowest MDL-style
    cost is chosen. The cost is built from histogram entropies of the
    feature values themselves; the target column is only excluded from
    discretization, it does not take part in the cost.

    Note: the cost is used only to rank candidate cuts. Splitting stops when
    a segment has fewer than ``min_samples`` values or a single unique value.
    """

    def __init__(self, min_samples=10, target_column=None):
        """
        Initialize MDL Discretizer

        Parameters:
        -----------
        min_samples : int
            Minimum number of samples required to consider a split
        target_column : str
            Name of the target column to exclude from discretization
        """
        self.min_samples = min_samples
        self.target_column = target_column
        # Maps column name -> ascending list of learned cut points.
        self.cut_points_dict = {}

    def _calculate_entropy(self, data):
        """Return the Shannon entropy (bits) of a Sturges histogram of data.

        An empty input has entropy 0.0 (avoids a 0/0 division).
        """
        if len(data) == 0:
            return 0.0
        hist, _ = np.histogram(data, bins='sturges')
        probs = hist / len(data)
        # Empty bins contribute nothing and would produce log2(0).
        probs = probs[probs > 0]
        return -np.sum(probs * np.log2(probs))

    def _mdl_cost(self, data, cut_point, entropy_before=None):
        """Calculate MDL cost for a potential cut point.

        Parameters:
        -----------
        data : ndarray
            Values of the segment being split
        cut_point : float
            Candidate threshold; values <= cut_point go to the left side
        entropy_before : float, optional
            Precomputed entropy of ``data``. It does not depend on the cut,
            so callers scanning many cuts can compute it once. When omitted
            it is computed here.

        Returns:
        --------
        float cost (lower is better), or inf if either side would be empty
        """
        left_data = data[data <= cut_point]
        right_data = data[data > cut_point]

        if len(left_data) == 0 or len(right_data) == 0:
            return float('inf')

        if entropy_before is None:
            entropy_before = self._calculate_entropy(data)

        n_total = len(data)
        # Size-weighted average entropy of the two sides.
        entropy_after = (len(left_data) * self._calculate_entropy(left_data) +
                         len(right_data) * self._calculate_entropy(right_data)) / n_total

        model_cost = log2(n_total - 1)
        data_cost = n_total * (entropy_after - entropy_before)

        return model_cost + data_cost

    def _find_best_cut_point(self, data):
        """Find the lowest-cost cut point for one segment.

        Returns None when the segment has fewer than ``min_samples`` values,
        has at most one unique value, or no candidate yields a finite cost.
        """
        if len(data) < self.min_samples:
            return None

        # np.unique already returns sorted values; no separate sort needed.
        unique_values = np.unique(data)

        if len(unique_values) <= 1:
            return None

        # Candidates are midpoints between adjacent distinct values.
        potential_cuts = (unique_values[:-1] + unique_values[1:]) / 2

        # Same for every candidate, so compute it once outside the loop.
        entropy_before = self._calculate_entropy(data)

        min_cost = float('inf')
        best_cut = None

        for cut in potential_cuts:
            cost = self._mdl_cost(data, cut, entropy_before)
            if cost < min_cost:
                min_cost = cost
                best_cut = cut

        return best_cut

    def _recursive_discretize(self, data, cut_points):
        """Collect cut points for ``data`` into the ``cut_points`` set.

        Implemented with an explicit stack instead of Python recursion so a
        long chain of unbalanced splits cannot hit the recursion limit. The
        resulting set is the same; only the visiting order differs.
        """
        pending = [data]

        while pending:
            segment = pending.pop()
            cut_point = self._find_best_cut_point(segment)

            if cut_point is None:
                continue

            cut_points.add(cut_point)

            left_mask = segment <= cut_point
            pending.append(segment[left_mask])
            pending.append(segment[~left_mask])

    def fit(self, data):
        """
        Fit the discretizer to the dataset

        Only columns with a numeric dtype are discretized; the target column
        and non-numeric columns are skipped. Missing and non-finite values
        are ignored while searching for cut points.

        Parameters:
        -----------
        data : DataFrame
            Dataset with continuous variables to be discretized

        Returns:
        --------
        self
        """
        if isinstance(data, pd.Series):
            data = data.to_frame()

        for column in data.columns:
            # Exclude target column from discretization
            if column == self.target_column:
                continue

            series = data[column]
            if not pd.api.types.is_numeric_dtype(series):
                # Midpoints/histograms are undefined for e.g. strings; make
                # sure no cut points from an earlier fit linger either.
                self.cut_points_dict.pop(column, None)
                continue

            values = series.dropna().to_numpy(dtype=float)
            # np.histogram rejects NaN/inf ranges, so keep finite values only.
            values = values[np.isfinite(values)]

            cut_points = set()
            self._recursive_discretize(values, cut_points)
            self.cut_points_dict[column] = sorted(cut_points)

        return self

    def transform(self, data):
        """
        Transform continuous data into discrete intervals

        Parameters:
        -----------
        data : DataFrame
            Dataset to be transformed

        Returns:
        --------
        DataFrame with discretized values and original target column if present.
        Columns without learned cut points entries are passed through unchanged;
        columns fitted with no cut points become a constant 0.
        """
        if isinstance(data, pd.Series):
            data = data.to_frame()

        # Start from the input index so a scalar assignment (the constant-0
        # case) fills every row instead of creating an empty column.
        result = pd.DataFrame(index=data.index)

        for column in data.columns:
            if column == self.target_column:
                # Keep target column unchanged
                cut_points = None
            else:
                cut_points = self.cut_points_dict.get(column)

            if cut_points is None:
                result[column] = data[column]
            elif not cut_points:
                result[column] = 0
            else:
                result[column] = np.searchsorted(cut_points, data[column])

        return result

    def fit_transform(self, data):
        """Fit and transform the data in one step."""
        return self.fit(data).transform(data)

# Demo: discretize the SWaT dataset when this file is run as a script.
if __name__ == "__main__":
    # Seed NumPy's global RNG. Neither the seed nor n_samples is used by the
    # code visible below; both are kept as-is.
    np.random.seed(42)
    n_samples = 1000

    # Load the dataset from a fixed local CSV path.
    df = pd.read_csv("D:/Projects/formal_project/concatenated_SWaT_Dataset.csv")

    # 'Normal/Attack' is the label column, so it is left out of discretization.
    discretizer = MDLDiscretizer(min_samples=5000, target_column='Normal/Attack')

    # Learn the cut points first, then apply them to the same frame.
    discretizer.fit(df)
    discretized_df = discretizer.transform(df)

    # Show a before/after preview of the first rows.
    print("Original data first 5 rows:", df.head(), sep="\n")
    print("\nDiscretized data first 5 rows:", discretized_df.head(), sep="\n")

    # List the learned cut points column by column.
    print("\nCut points for each feature:")
    for col, cuts in discretizer.cut_points_dict.items():
        print(f"{col}: {cuts}")

# Cell output: KeyboardInterrupt (the run was interrupted before it finished)