In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.io import arff

In [None]:
# Paths to the full training set and its 20% subset (relative paths).
# Only smallDataSet is loaded in this cell; largeDataSet is defined but not used here.
largeDataSet = './data/raw/KDDTrain.arff'
smallDataSet = './data/raw/KDDTrain_20Percent.arff'

# Loading the .arff data format into a pandas dataframe.
# loadarff returns a (data, meta) pair; only `data` is used to build the frame.
# NOTE(review): scipy's loadarff appears to return nominal attributes as byte
# strings (b'tcp' rather than 'tcp') - confirm, and decode if later cells need str.
data, meta = arff.loadarff(smallDataSet)
df = pd.DataFrame(data)

# Review of the dataset: show the first three rows as the cell output
df.head(3)

In [11]:
# Dataset summary: prints the index range, column names, non-null counts and dtypes
df.info()
# TODO: also report how many features are categorical and how many are numerical

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25192 entries, 0 to 25191
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     25192 non-null  float64
 1   protocol_type                25192 non-null  object 
 2   service                      25192 non-null  object 
 3   flag                         25192 non-null  object 
 4   src_bytes                    25192 non-null  float64
 5   dst_bytes                    25192 non-null  float64
 6   land                         25192 non-null  object 
 7   wrong_fragment               25192 non-null  float64
 8   urgent                       25192 non-null  float64
 9   hot                          25192 non-null  float64
 10  num_failed_logins            25192 non-null  float64
 11  logged_in                    25192 non-null  object 
 12  num_compromised              25192 non-null  float64
 13  root_shell      

### Data Cleaning: Handling & Imputing Missing Values

In this stage we make sure there are no missing / null values in the dataset.

In [None]:
# Count the null entries in each column and report them per feature
missingValues = df.isna().sum()
print(f"Missing Values in every Column: \n{missingValues}")

In [None]:
# Checking for anomalies in the dataset via summary statistics.
# include='all' makes describe() cover the object-typed columns as well as the numeric ones.
print(f"Descriptive Statistics: \n{df.describe(include='all')}")


Since there are no missing values in the dataset, we don't need to handle / impute missing values.
Also, since every feature is complete, we don't need to drop any feature.

## Feature Scaling
Since the feature values are not on the same scale, we need to rescale them to a common range. We can choose between two methods: MinMaxScaler and StandardScaler.

To choose between them, let's first plot the distribution of each feature's values and check for outliers.

In [None]:
# Select only numeric columns for outlier visualization
numeric_columns = df.select_dtypes(include=['int64', 'float64'])

# Standardize every feature to Z-scores so that the axis label is truthful and
# features with very different magnitudes can share one axis. The original cell
# plotted the raw values while labelling the axis "Z-score".
feature_std = numeric_columns.std(ddof=0)
# Constant features have zero spread (their Z-score is undefined), so skip them.
varying_features = feature_std[feature_std > 0].index
numeric_zscores = (
    numeric_columns[varying_features] - numeric_columns[varying_features].mean()
) / feature_std[varying_features]

# Create a boxplot to visualize outliers using Z-scores
plt.figure(figsize=(12, 6))
sns.boxplot(data=numeric_zscores, orient='h', palette='Set2')
plt.title('Boxplot of Numeric Features with Outliers')
plt.xlabel('Z-score')
plt.show()


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Name of the label column; it must not be encoded as an input feature.
target_column = 'class'

# Separate categorical and numeric columns.
# The label is excluded by name rather than by position, so the split stays
# correct even if the column order of the frame changes.
categorical_columns = [
    column
    for column in df.select_dtypes(include=['object']).columns
    if column != target_column
]
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# One-hot encode categorical columns (drop_first drops one level per feature)
encoded_columns = pd.get_dummies(df[categorical_columns], drop_first=True)

# Scale numeric columns using MinMaxScaler
scaler = MinMaxScaler()
scaled_columns = scaler.fit_transform(df[numeric_columns])

# Convert the scaled numeric columns back to a DataFrame with appropriate column names.
# Reusing df's index keeps the rows aligned with encoded_columns in the concat below,
# even when df does not carry a default 0..n-1 index.
scaled_df = pd.DataFrame(scaled_columns, columns=numeric_columns, index=df.index)

# Combine the scaled numeric and encoded categorical features
processed_data = pd.concat([scaled_df, encoded_columns], axis=1)

In [None]:
# Pairwise correlations of all processed features, drawn on an explicit Axes
correlation_matrix = processed_data.corr()

fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(correlation_matrix, cmap='YlGnBu', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# PCA Algorithm for Dimensionality Reduction

In [None]:
from pandas.plotting import scatter_matrix

# Candidate features for the pairwise plot. scatter_matrix only draws numeric
# columns, so the numeric subset is selected explicitly instead of relying on
# the categorical ones being dropped silently.
candidate_features = ["protocol_type", "service", "flag", "src_bytes", "dst_bytes",
                      "land", "wrong_fragment", "urgent", "hot"]
plot_features = df[candidate_features].select_dtypes(include='number')

# matplotlib needs numbers (or valid colors) for `c`; the raw class labels are
# neither, so map each distinct label to an integer code.
class_codes = pd.factorize(df["class"])[0]

# Specify the alpha parameter to control transparency of data points
scatter_matrix(plot_features, c=class_codes, alpha=0.5, figsize=(15, 15))
plt.show()
