In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff

largeDataSet = './data/raw/KDDTrain.arff'
smallDataSet = './data/raw/KDDTrain_20Percent.arff'

largeTestData = './data/raw/KDDTest.arff'
smallTestData = './data/raw/KDDTest-21.arff'

# Loading the .arff data format into a pandas dataframe
data, meta = arff.loadarff(largeDataSet)
test_data, test_meta = arff.loadarff(largeTestData)

train = pd.DataFrame(data)
test = pd.DataFrame(test_data)

print('Shape of the training data: ', train.shape)
print('Shape of the test data: ', test.shape)

Shape of the training data:  (125973, 42)
Shape of the test data:  (22544, 42)


In [2]:
categorical_cols = train.select_dtypes(include=['object']).columns.tolist()

# for feature in categorical_cols:
#     plt.figure(figsize=(17, 5)) 
#     sns.countplot(data=df, x=feature, palette='Set3')
#     plt.title(f'Distribution of {feature}')
#     plt.xticks(rotation=85)
#     plt.show()

In [3]:

print([col for col in train if train[col].nunique() == 1])
print([col for col in test if test[col].nunique() == 1])
print(train.shape , test.shape)

['num_outbound_cmds']
['num_outbound_cmds']
(125973, 42) (22544, 42)


In [4]:

train = train.drop(columns='num_outbound_cmds')
test = test.drop(columns='num_outbound_cmds')
print(train.shape , test.shape)

(125973, 41) (22544, 41)


In [5]:
from scipy.stats import zscore

# A threshold value beyond which a data point is considered as an outlier
zscore_threshold = 3

# Calculate Z-scores for numeric columns (excluding categorical) in the training dataset
numeric_columns = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
z_scores_train = np.abs(zscore(train[numeric_columns]))

# Create an outlier mask indicating whether each row in the training dataset is an outlier or not
outlier_mask_train = np.any(z_scores_train > zscore_threshold, axis=1)

# Remove outliers from the training dataset
train_original = train.copy()
train = train_original[~outlier_mask_train]

# Calculate Z-scores for numeric columns (excluding categorical) in the test dataset
z_scores_test = np.abs(zscore(test[numeric_columns]))

# Create an outlier mask indicating whether each row in the test dataset is an outlier or not
outlier_mask_test = np.any(z_scores_test > zscore_threshold, axis=1)

# Remove outliers from the test dataset
test_original = test.copy()
test = test_original[~outlier_mask_test]

# Display the shape before and after removing outliers for both datasets
print("Training Dataset Shape before removing outliers:", train_original.shape)
print("Training Dataset Shape after removing outliers:", train.shape)
print("Test Dataset Shape before removing outliers:", test_original.shape)
print("Test Dataset Shape after removing outliers:", test.shape)


Training Dataset Shape before removing outliers: (125973, 41)
Training Dataset Shape after removing outliers: (103446, 41)
Test Dataset Shape before removing outliers: (22544, 41)
Test Dataset Shape after removing outliers: (15734, 41)


In [6]:
from sklearn.preprocessing import OneHotEncoder

train_categorical = train[train.select_dtypes(include=['object']).columns]
test_categorical = test[test.select_dtypes(include=['object']).columns]

train_categorical = train_categorical.drop(columns='class')
test_categorical = test_categorical.drop(columns='class')

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(train_categorical)

train_encoded = encoder.transform(train_categorical)
test_encoded = encoder.transform(test_categorical)

print(train_encoded.shape , test_encoded.shape)

(103446, 87) (15734, 87)


In [7]:
from sklearn.preprocessing import MinMaxScaler

train_numerical = train[train.select_dtypes(include=['float64']).columns]
test_numerical = test[test.select_dtypes(include=['float64']).columns]

scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_numerical)
test_scaled = scaler.transform(test_numerical)

print(train_scaled.shape , test_scaled.shape)

(103446, 33) (15734, 33)


In [8]:
# train_scaled = pd.DataFrame(train_scaled, columns=train_numerical.columns)
# test_scaled = pd.DataFrame(test_scaled, columns=test_numerical.columns)

# train_encoded = pd.DataFrame(train_encoded.todense())
# test_encoded = pd.DataFrame(test_encoded.todense())

# processed_train = pd.concat([train_scaled, train_encoded], axis=1)
# processed_test = pd.concat([test_scaled, test_encoded], axis=1)

# print(processed_train.shape, processed_test.shape)
# processed_train.info()


# Convert scaled arrays back to DataFrames with original column names
train_scaled = pd.DataFrame(train_scaled, columns=train_numerical.columns)
test_scaled = pd.DataFrame(test_scaled, columns=test_numerical.columns)

# Get the feature names for the encoded columns
encoded_columns = encoder.get_feature_names_out(input_features=train_categorical.columns)
train_encoded = pd.DataFrame(train_encoded.toarray(), columns=encoded_columns)
test_encoded = pd.DataFrame(test_encoded.toarray(), columns=encoded_columns)

# Concatenate the scaled and encoded DataFrames
processed_train = pd.concat([train_scaled, train_encoded], axis=1)
processed_test = pd.concat([test_scaled, test_encoded], axis=1)

print(processed_train.shape, processed_test.shape)



(103446, 120) (15734, 120)


In [9]:
# Calculate the correlation matrix for train dataset
corr_matrix_train = processed_train.corr(method='pearson')

print("Initial number of features in train dataset:", len(corr_matrix_train.columns))

# Calculate the correlation matrix for test dataset
corr_matrix_test = processed_test.corr(method='pearson')

print("Initial number of features in test dataset:", len(corr_matrix_test.columns))

# Define the correlation threshold
corr_threshold = 0.4

# Find highly correlated features for train dataset
high_corr_features_train = set()

# Iterate through upper triangle of the correlation matrix for train dataset
for i in range(len(corr_matrix_train.columns)):
    for j in range(i + 1, len(corr_matrix_train.columns)):
        if abs(corr_matrix_train.iloc[i, j]) > corr_threshold:
            high_corr_features_train.add(corr_matrix_train.columns[i])
            high_corr_features_train.add(corr_matrix_train.columns[j])

print("Number of features with high correlation in train dataset:", len(high_corr_features_train))

# Drop highly correlated features from both train and test datasets
processed_train = processed_train.drop(columns=high_corr_features_train)
processed_test = processed_test.drop(columns=high_corr_features_train)

print("Number of remaining features in train dataset:", len(processed_train.columns))
print("Number of remaining features in test dataset:", len(processed_test.columns))

print(processed_train.shape, processed_test.shape)

# plt.figure(figsize=(20, 20))
# sns.heatmap(processed_train.corr(method='pearson'), fmt='.1g', vmin=-1, vmax=1, center= 0, cmap= 'coolwarm', linewidths=3, linecolor='black')
# plt.show()


Initial number of features in train dataset: 120
Initial number of features in test dataset: 120
Number of features with high correlation in train dataset: 41
Number of remaining features in train dataset: 79
Number of remaining features in test dataset: 79
(103446, 79) (15734, 79)


In [10]:
X_train = processed_train.to_numpy()
y_train = train['class']

X_test = processed_test.to_numpy()
y_test = test['class']

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


print(np.unique(y_train))
print(np.unique(y_test))

(103446, 79) (15734, 79)
(103446,) (15734,)
[b'anomaly' b'normal']
[b'anomaly' b'normal']


In [11]:
from sklearn.preprocessing import LabelEncoder

# Initialize label encoder
label_encoder = LabelEncoder()

# Fit and transform labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train_encoded)

y_pred_encoded = knn_model.predict(X_test)
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print("Accuracy:", accuracy)


Accuracy: 0.8187364942163468


In [12]:
from sklearn.naive_bayes import GaussianNB

# Initialize Gaussian Naive Bayes model
nb_model = GaussianNB()

# Fit the model to the training data
nb_model.fit(X_train, y_train_encoded)

# Predict on the test data
y_pred_nb_encoded = nb_model.predict(X_test)

# Calculate accuracy
accuracy_nb = accuracy_score(y_test_encoded, y_pred_nb_encoded)
print("Naive Bayes Accuracy:", accuracy_nb)


Naive Bayes Accuracy: 0.6317528918266175


In [13]:
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree model
dt_model = DecisionTreeClassifier()

# Fit the model to the training data
dt_model.fit(X_train, y_train_encoded)

# Predict on the test data
y_pred_dt_encoded = dt_model.predict(X_test)

# Calculate accuracy
accuracy_dt = accuracy_score(y_test_encoded, y_pred_dt_encoded)
print("Decision Tree Accuracy:", accuracy_dt)


Decision Tree Accuracy: 0.8253463836278124


In [14]:
from sklearn.linear_model import LogisticRegression

# Initialize Logistic Regression model
logreg_model = LogisticRegression()

# Fit the model to the training data
logreg_model.fit(X_train, y_train_encoded)

# Predict on the test data
y_pred_logreg_encoded = logreg_model.predict(X_test)

# Calculate accuracy
accuracy_logreg = accuracy_score(y_test_encoded, y_pred_logreg_encoded)
print("Logistic Regression Accuracy:", accuracy_logreg)


Logistic Regression Accuracy: 0.6424939621202491


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
