In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from scipy.io import arff

In [2]:
largeDataSet = './data/raw/KDDTrain.arff'
smallDataSet = './data/raw/KDDTrain_20Percent.arff'
testDataSet = './data/raw/KDDTest.arff'

# Loading the .arff data format into a pandas dataframe
data, meta = arff.loadarff(largeDataSet)
df = pd.DataFrame(data)

test_data, test_deta = arff.loadarff(testDataSet)
test_df = pd.DataFrame(test_data)

pd.set_option('display.max_columns', None)

# Review of the dataset
df.head()
test_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,b'tcp',b'private',b'REJ',0.0,0.0,b'0',0.0,0.0,0.0,0.0,b'0',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0',b'0',229.0,10.0,0.0,0.0,1.0,1.0,0.04,0.06,0.0,255.0,10.0,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,b'anomaly'
1,0.0,b'tcp',b'private',b'REJ',0.0,0.0,b'0',0.0,0.0,0.0,0.0,b'0',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0',b'0',136.0,1.0,0.0,0.0,1.0,1.0,0.01,0.06,0.0,255.0,1.0,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,b'anomaly'
2,2.0,b'tcp',b'ftp_data',b'SF',12983.0,0.0,b'0',0.0,0.0,0.0,0.0,b'0',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0',b'0',1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,134.0,86.0,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,b'normal'
3,0.0,b'icmp',b'eco_i',b'SF',20.0,0.0,b'0',0.0,0.0,0.0,0.0,b'0',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0',b'0',1.0,65.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3.0,57.0,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,b'anomaly'
4,1.0,b'tcp',b'telnet',b'RSTO',0.0,15.0,b'0',0.0,0.0,0.0,0.0,b'0',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0',b'0',1.0,8.0,0.0,0.12,1.0,0.5,1.0,0.0,0.75,29.0,86.0,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,b'anomaly'


In [3]:
# Print meta data
print(meta)

Dataset: 'KDDTrain'
	duration's type is numeric
	protocol_type's type is nominal, range is ('tcp', 'udp', 'icmp')
	service's type is nominal, range is ('aol', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf', 'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i', 'ecr_i', 'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher', 'harvest', 'hostnames', 'http', 'http_2784', 'http_443', 'http_8001', 'imap4', 'IRC', 'iso_tsap', 'klogin', 'kshell', 'ldap', 'link', 'login', 'mtp', 'name', 'netbios_dgm', 'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp', 'nntp', 'ntp_u', 'other', 'pm_dump', 'pop_2', 'pop_3', 'printer', 'private', 'red_i', 'remote_job', 'rje', 'shell', 'smtp', 'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat', 'telnet', 'tftp_u', 'tim_i', 'time', 'urh_i', 'urp_i', 'uucp', 'uucp_path', 'vmnet', 'whois', 'X11', 'Z39_50')
	flag's type is nominal, range is ('OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3', 'SF', 'SH')
	src_bytes's type is numeric
	dst_bytes's type is numeric

In [4]:
# List all unique classes
print(df['class'].unique())

# So we have two mutual exclusive labels: 'normal' and 'anomaly'. This what we want to predict. 
# In other words, this is the range of the labelling function and thus also the range of the predictor function.

[b'normal' b'anomaly']


In [5]:
# Manually Checking if data include missing values
missingValues = df.isnull().sum()
print(f"Missing Values in every Column: \n{missingValues}")

# No missing values. Imputation not required.

Missing Values in every Column: 
duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_r

In [6]:
# Checking Anomolies in the dataset
print(f"Descriptive Statistics: \n{df.describe(include='all')}")

Descriptive Statistics: 
            duration protocol_type  service    flag     src_bytes  \
count   125973.00000        125973   125973  125973  1.259730e+05   
unique           NaN             3       70      11           NaN   
top              NaN        b'tcp'  b'http'   b'SF'           NaN   
freq             NaN        102689    40338   74945           NaN   
mean       287.14465           NaN      NaN     NaN  4.556674e+04   
std       2604.51531           NaN      NaN     NaN  5.870331e+06   
min          0.00000           NaN      NaN     NaN  0.000000e+00   
25%          0.00000           NaN      NaN     NaN  0.000000e+00   
50%          0.00000           NaN      NaN     NaN  4.400000e+01   
75%          0.00000           NaN      NaN     NaN  2.760000e+02   
max      42908.00000           NaN      NaN     NaN  1.379964e+09   

           dst_bytes    land  wrong_fragment         urgent            hot  \
count   1.259730e+05  125973   125973.000000  125973.000000  125973.

In [7]:
# Dataset summary
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  float64
 1   protocol_type                125973 non-null  object 
 2   service                      125973 non-null  object 
 3   flag                         125973 non-null  object 
 4   src_bytes                    125973 non-null  float64
 5   dst_bytes                    125973 non-null  float64
 6   land                         125973 non-null  object 
 7   wrong_fragment               125973 non-null  float64
 8   urgent                       125973 non-null  float64
 9   hot                          125973 non-null  float64
 10  num_failed_logins            125973 non-null  float64
 11  logged_in                    125973 non-null  object 
 12  num_compromised              125973 non-null  float64
 13 

# Data preprocessing

### Encoding nominal features

In [8]:
# # protocol_type
# print(df['protocol_type'].unique()) # Only 3 unique values. We are going to use LabelEncoder
# enc = LabelEncoder()
# df['protocol_type_enc'] = enc.fit_transform(df['protocol_type'])

# df[['protocol_type_enc', 'protocol_type']].head(400)
# # 0: icmp, 1: tcp, 2: udp

In [9]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Step 1: One-Hot Encoding
# Concatenate training and test data to ensure consistent encoding
combined_df = pd.concat([df, test_df], axis=0, ignore_index=True)
encoded_df = pd.get_dummies(combined_df, columns=list(df.select_dtypes(include=['object']).columns))

# Split the encoded data back into training and test sets
train_encoded = encoded_df[:len(df)]
test_encoded = encoded_df[len(df):]

# Step 2: Fit a KNN Classifier
X_train = train_encoded.drop(columns=["class_b'normal'", "class_b'anomaly'"])
y_train = train_encoded[["class_b'normal'", "class_b'anomaly'"]]

k = 10  # You can choose an appropriate value for k
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, y_train)

# Step 3: Prediction
X_test = test_encoded.drop(columns=["class_b'normal'", "class_b'anomaly'"])
y_test = test_encoded[["class_b'normal'", "class_b'anomaly'"]]
y_pred = knn_classifier.predict(X_test)

# Step 4: Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)



Accuracy: 77.013839602555


In [18]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Combine training and test data for consistent one-hot encoding and scaling
combined_df = pd.concat([df, test_df], axis=0, ignore_index=True)

# Identify numerical and categorical columns
numerical_columns = combined_df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = combined_df.select_dtypes(include=['object']).columns

# Perform one-hot encoding on the combined data for categorical features
encoded_df = pd.get_dummies(combined_df, columns=categorical_columns)

# Split the encoded data back into training and test sets
train_encoded = encoded_df[:len(df)]
test_encoded = encoded_df[len(df):]

# # Identify new columns created for previously unseen labels
# new_columns = set(test_encoded.columns) - set(train_encoded.columns)

# # Set these new columns to all zeros in the test data
# for col in new_columns:
#     test_encoded[col] = 0

# Use MinMaxScaler to scale the numerical features
scaler = MinMaxScaler()
train_encoded[numerical_columns] = scaler.fit_transform(train_encoded[numerical_columns])
test_encoded[numerical_columns] = scaler.transform(test_encoded[numerical_columns])

# Prepare the data for training and testing
X_train = train_encoded.drop(columns=["class_b'normal'", "class_b'anomaly'"])
y_train = train_encoded[["class_b'normal'", "class_b'anomaly'"]]
X_test = test_encoded.drop(columns=["class_b'normal'", "class_b'anomaly'"])
y_test = test_encoded[["class_b'normal'", "class_b'anomaly'"]]

k = 3  # You can choose an appropriate value for k
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn_classifier.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded[numerical_columns] = scaler.fit_transform(train_encoded[numerical_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_encoded[numerical_columns] = scaler.transform(test_encoded[numerical_columns])


Accuracy: 77.18683463449256


In [27]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Combine training and test data for consistent one-hot encoding and scaling
combined_df = df

# Identify numerical and categorical columns
numerical_columns = combined_df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = combined_df.select_dtypes(include=['object']).columns

# Perform one-hot encoding on the combined data for categorical features
encoded_df = pd.get_dummies(combined_df, columns=categorical_columns)
encoded_df.head()


Y_values = encoded_df[["class_b'normal'", "class_b'anomaly'"]]
X_train, X_test, y_train, y_test = train_test_split(encoded_df, Y_values, stratify=Y_values, test_size=0.25, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Use MinMaxScaler to scale the numerical features
scaler = MinMaxScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

k = 3  # You can choose an appropriate value for k
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn_classifier.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

(94479, 128) (31494, 128) (94479, 2) (31494, 2)
Accuracy: 99.98094875214326
