In [1]:
import os
import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.filters import Filter

# Set JAVA_HOME environment variable (ensure this path is correct)
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-21"

# Start the JVM
if not jvm.started:
    jvm.start()

try:
    # Load the dataset in ARFF format
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("converted.arff")
    
    # Ensure the class attribute is set correctly (MDLP requires supervised learning)
    data.class_is_last()

    # Apply MDLP-based discretization (information-theoretic measure)
    discretize = Filter(classname="weka.filters.supervised.attribute.Discretize")
    discretize.options = ["-M", "-R", "first-last"]  # -M for MDLP, -R for all attributes

    # Prepare the filter with the dataset structure
    discretize.inputformat(data)
    
    # Apply the filter
    discretized_data = discretize.filter(data)

    # Save the discretized dataset as a new ARFF file
    saver = Saver(classname="weka.core.converters.ArffSaver")
    saver.save_file(discretized_data, "discretized_dataset_mdlp.arff")

    print("Discretization complete with minimal data loss using MDLP.")

except Exception as e:
    print("An error occurred during discretization:", e)

finally:
    # Stop the JVM
    if jvm.started:
        jvm.stop()


DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['c:\\Users\\hp\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\weka\\lib\\arpack_combined.jar', 'c:\\Users\\hp\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\weka\\lib\\core.jar', 'c:\\Users\\hp\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\weka\\lib\\mtj.jar', 'c:\\Users\\hp\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\weka\\lib\\python-weka-wrapper.jar', 'c:\\Users\\hp\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\weka\\lib\\weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support disabled


An error occurred during discretization: java.lang.Exception: Illegal options: -M 


In [2]:
import arff
import pandas as pd
with open("discretized_dataset.arff", "r") as file:
    arff_data=arff.load(file)
df=pd.DataFrame(arff_data['data'], columns=[attr[0] for attr in arff_data['attributes']])
df.to_csv("discretized_dataset1.csv", index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'discretized_dataset.arff'

In [4]:
import pandas as pd
import numpy as np
from collections import Counter

# Function to calculate entropy
def entropy(y):
    counts = np.bincount(y)
    probabilities = counts / len(y)
    return -np.sum([p * np.log2(p) for p in probabilities if p > 0])

# Function to calculate information gain
def information_gain(y, y_left, y_right):
    parent_entropy = entropy(y)
    n = len(y)
    n_left, n_right = len(y_left), len(y_right)
    child_entropy = (n_left / n) * entropy(y_left) + (n_right / n) * entropy(y_right)
    return parent_entropy - child_entropy

# Function to find the best split point
def best_split(X, y):
    best_gain = -1
    best_split = None
    for split in np.unique(X):
        y_left = y[X <= split]
        y_right = y[X > split]
        if len(y_left) > 0 and len(y_right) > 0:
            gain = information_gain(y, y_left, y_right)
            if gain > best_gain:
                best_gain = gain
                best_split = split
    return best_split

# Recursive function to discretize based on information gain
def discretize_column(X, y, min_gain=0.01):
    split = best_split(X, y)
    if split is None:
        return [(-np.inf, np.inf)]
    y_left = y[X <= split]
    y_right = y[X > split]
    gain = information_gain(y, y_left, y_right)
    if gain < min_gain:
        return [(-np.inf, np.inf)]
    left_intervals = discretize_column(X[X <= split], y_left, min_gain)
    right_intervals = discretize_column(X[X > split], y_right, min_gain)
    return [(interval[0], split) for interval in left_intervals] + [(split, interval[1]) for interval in right_intervals]

# Load your dataset
data = pd.read_csv("concatenated_SWaT_Dataset.csv")

# Separate features and the target
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Discretize each column in X
intervals = {}
for column in X.columns:
    intervals[column] = discretize_column(X[column].values, y.values)

# Apply intervals to convert the continuous data to categorical data
X_discretized = X.copy()
for column, splits in intervals.items():
    for i, (low, high) in enumerate(splits):
        X_discretized[column] = np.where((X[column] > low) & (X[column] <= high), i, X_discretized[column])

# Save discretized data
discretized_data = pd.concat([X_discretized, y], axis=1)
discretized_data.to_csv("discretized_dataset_entropy.csv", index=False)

print("Discretization complete using information-theoretic measures.")


Discretization complete using information-theoretic measures.


In [8]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv("concatenated_SWaT_Dataset.csv")
target_column = 'Normal/Attack'

def discretize_feature_using_tree(X, y, max_leaf_nodes=5):
    # Train a decision tree classifier to find split points based on the target variable
    clf = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes)
    clf.fit(X.reshape(-1, 1), y)
    thresholds = clf.tree_.threshold[clf.tree_.threshold != -2]  # Get decision thresholds
    bins = [-np.inf] + sorted(thresholds.tolist()) + [np.inf]    # Define bins using thresholds
    return np.digitize(X, bins), bins                            # Digitize values into bins

def discretize_dataset(data, target_column, max_leaf_nodes=5):
    y = data[target_column].values  # Extract target variable
    discretized_data = data.copy()  # Copy the original data to add discretized columns

    for col in data.columns:
        if col != target_column:  # Skip the target column
            X = data[col].values
            discretized_values, bins = discretize_feature_using_tree(X, y, max_leaf_nodes)
            discretized_data[f'{col}_discretized'] = discretized_values
            print(f"Discretized '{col}' into bins: {bins}")

    return discretized_data


target_column = 'Normal/Attack'  # Set this to the name of your target column
discretized_data = discretize_dataset(data, target_column)
discretized_data.to_csv("discretized_dataset_tree.csv", index=False)


Discretized 'FIT101' into bins: [-inf, 0.0003202765074092895, 2.670627474784851, 2.696249485015869, 2.6967300176620483, inf]
Discretized 'LIT101' into bins: [-inf, 812.6890258789062, 813.1798095703125, 814.3966064453125, 858.6737670898438, inf]
Discretized 'MV101' into bins: [-inf, 0.5, 1.5, inf]
Discretized 'P101' into bins: [-inf, 1.5, inf]
Discretized 'P102' into bins: [-inf, 1.5, inf]
Discretized 'AIT201' into bins: [-inf, 188.26900482177734, 188.58944702148438, 192.78710174560547, 198.3946533203125, inf]
Discretized 'AIT202' into bins: [-inf, 8.51964282989502, 8.534862518310547, 8.565303802490234, 8.62826919555664, inf]
Discretized 'AIT203' into bins: [-inf, 339.15660095214844, 342.29685974121094, 353.1658477783203, 370.5075988769531, inf]
Discretized 'FIT201' into bins: [-inf, 6.40758516965434e-05, 2.4187999963760376, 2.435523509979248, 2.7764716148376465, inf]
Discretized 'MV201' into bins: [-inf, 0.5, 1.5, inf]
Discretized 'P201' into bins: [-inf, 1.5, inf]
Discretized 'P202' i