# Imports

In [None]:
import kagglehub
import pandas as pd
import numpy as np
import os

# Loading Data

In [None]:
# Download the "divyansh1010/kdd-nids" dataset from Kaggle via kagglehub and
# keep the value it returns in `path` (used below to locate the CSV file)
path = kagglehub.dataset_download("divyansh1010/kdd-nids")

# Show where the dataset files are located
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/kdd-nids


In [None]:
# Build the full path to KDD.csv inside the downloaded dataset location
# NOTE(review): `train` holds a file path, not data — the name is kept as-is
# in case later cells refer to it
train = os.path.join(path, "KDD.csv")

# Read the CSV into the working DataFrame `df` used by every cell below
df = pd.read_csv(train)

# Getting Info About Data

In [None]:
# Display the first 5 rows of the dataset (head() default) to get a basic idea
# of the columns and the kind of values they hold
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty_score
0,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
3,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21


In [None]:
# Display all column names; wide frames are elided with "..." in head(),
# so this shows the complete list
df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'label', 'difficulty_score'],
      dtype='object')

In [None]:
# Print a summary of the frame: the dtype of each column and how many
# non-null values it has, plus the index range and total column count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148515 entries, 0 to 148514
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     148515 non-null  int64  
 1   protocol_type                148515 non-null  object 
 2   service                      148515 non-null  object 
 3   flag                         148515 non-null  object 
 4   src_bytes                    148515 non-null  int64  
 5   dst_bytes                    148515 non-null  int64  
 6   land                         148515 non-null  int64  
 7   wrong_fragment               148515 non-null  int64  
 8   urgent                       148515 non-null  int64  
 9   hot                          148515 non-null  int64  
 10  num_failed_logins            148515 non-null  int64  
 11  logged_in                    148515 non-null  int64  
 12  num_compromised              148515 non-null  int64  
 13 

In [None]:
# Shape of the whole dataset as (rows, columns), before any cleaning
df.shape

(148515, 43)

# Handling Missing Values

In [None]:
# Count the missing entries in each column (isna is the canonical spelling of
# isnull); a column of zeros means there is nothing to impute or drop
print(df.isna().sum())

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [None]:
# Discard every row that contains at least one null value, with the default
# axis/how spelled out explicitly (in this case there are no nulls, so no
# rows are removed)
df = df.dropna(axis=0, how="any")

In [None]:
# Display the (rows, columns) shape again to confirm the effect of dropna()
df.shape

(148515, 43)

The shape didn't change because there were no null values.

# Handling Duplicate Values

In [None]:
# Count duplicated rows: duplicated() flags each row that repeats an earlier
# one, and summing the boolean flags gives the number of such rows
df.duplicated().sum()

np.int64(610)

In [None]:
# Drop all duplicate rows, keeping the first occurrence of each (the
# drop_duplicates default). The result is reassigned instead of using
# inplace=True: in-place mutation gives no benefit, prevents method chaining,
# and makes it harder to see which cell produced the current `df`.
# The original index labels are kept (no reset).
df = df.drop_duplicates()

In [None]:
# Display the (rows, columns) shape again to confirm how many rows were
# removed by dropping duplicates
df.shape

(147905, 43)

The row count dropped by 610 because there were 610 duplicate rows.

# Data Description

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns, transposed so that each feature is shown as one row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,147905.0,277.92456,2465.704,0.0,0.0,0.0,0.0,57715.0
src_bytes,147905.0,40387.340874,5420791.0,0.0,0.0,44.0,278.0,1379964000.0
dst_bytes,147905.0,17158.582955,3711179.0,0.0,0.0,0.0,580.0,1309937000.0
land,147905.0,0.000196,0.01400124,0.0,0.0,0.0,0.0,1.0
wrong_fragment,147905.0,0.020432,0.240042,0.0,0.0,0.0,0.0,3.0
urgent,147905.0,0.000203,0.01945721,0.0,0.0,0.0,0.0,3.0
hot,147905.0,0.18996,2.017202,0.0,0.0,0.0,0.0,101.0
num_failed_logins,147905.0,0.004341,0.07239701,0.0,0.0,0.0,0.0,5.0
logged_in,147905.0,0.404307,0.490759,0.0,0.0,0.0,1.0,1.0
num_compromised,147905.0,0.256016,22.27731,0.0,0.0,0.0,0.0,7479.0


In [None]:
# Counting occurrences of each unique value in the target column ('label'),
# shown in descending order of frequency
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
normal,76966
neptune,45715
satan,4360
ipsweep,3643
smurf,3108
portsweep,3070
nmap,1566
back,1300
guess_passwd,1284
mscan,996


# Converting to Binary Labels

In [None]:
# Add a new column 'binary_label' that collapses the multi-class 'label' into
# two classes: rows labelled 'normal' stay 'normal', every other label becomes
# 'attack'. np.where evaluates the comparison on the whole column at once
# instead of calling a Python lambda once per row.
df['binary_label'] = np.where(df['label'] == 'normal', 'normal', 'attack')

# Exporting for Further Use

In [None]:
# Export the cleaned dataset (nulls and duplicates dropped, 'binary_label'
# added) for further use; the relative file name resolves against the
# current working directory.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
df.to_pickle("cleaned_dataset.pkl")