In [7]:
# Importing libraries for data analysis and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'darkgrid' style to the plots created in this notebook
sns.set(style='darkgrid')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

In [8]:
# Column names for the raw UNSW-NB15 CSV files (passed to read_csv via `names=`)
columns = [
    'srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes', 'dbytes',
    'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload', 'Spkts', 'Dpkts',
    'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len',
    'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack',
    'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
    'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'label'
]

# Importing the first two parts of the UNSW-NB15 dataset
# NOTE(review): skiprows=1 discards the first line of each file. That is only
# correct if the CSVs start with a header row; if the part files in ../Data have
# no header, one real record per file is silently dropped -- confirm against the
# files and switch to `header=None` (without skiprows) if so.
df1 = pd.read_csv('../Data/UNSW-NB15_1.csv', names=columns, skiprows=1, low_memory=False)
df2 = pd.read_csv('../Data/UNSW-NB15_2.csv', names=columns, skiprows=1, low_memory=False)

# Concatenating the two datasets into a single DataFrame with a fresh 0..n-1 index
df = pd.concat([df1, df2], ignore_index=True)

# Displaying basic information about the DataFrame (info() prints directly)
df.info()

# A notebook cell only renders its *last* expression, so the intermediate
# summaries below are shown explicitly with display() (a built-in in
# IPython/Jupyter kernels); as bare expressions they produced no output.

# First few rows of the DataFrame
display(df.head())

# Missing values per column
display(df.isnull().sum())

# All column names
display(df.columns.tolist())

# Distribution of the values in the 'label' column
display(df['label'].value_counts())

# Dropping unnecessary columns from the DataFrame which are not needed for machine learning.
# Reassigning (instead of inplace=True) keeps the step explicit and chain-friendly.
df = df.drop(columns=['srcip', 'sport', 'dstip', 'dsport', 'attack_cat'])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400000 entries, 0 to 1399999
Data columns (total 49 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   srcip             1400000 non-null  object 
 1   sport             1400000 non-null  object 
 2   dstip             1400000 non-null  object 
 3   dsport            1400000 non-null  object 
 4   proto             1400000 non-null  object 
 5   state             1400000 non-null  object 
 6   dur               1400000 non-null  float64
 7   sbytes            1400000 non-null  int64  
 8   dbytes            1400000 non-null  int64  
 9   sttl              1400000 non-null  int64  
 10  dttl              1400000 non-null  int64  
 11  sloss             1400000 non-null  int64  
 12  dloss             1400000 non-null  int64  
 13  service           1400000 non-null  object 
 14  Sload             1400000 non-null  float64
 15  Dload             1400000 non-null  float64
 16  

In [9]:
from sklearn.preprocessing import LabelEncoder

# Columns with string values that need to be encoded
cat_columns = ['proto', 'service', 'state']

# One fitted encoder per column, kept so the integer codes can be mapped back later
label_encoders = {}

# NOTE: this loop overwrites df[col] in place. Re-running this cell without
# re-running the loading cell re-fits the encoders on the integer codes, so the
# original string classes stored in `label_encoders` are lost.
for col in cat_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Storing the encoder for later use

# Dtype summary after encoding. Only the last expression of a cell is rendered,
# so display() is used explicitly. Any column still reported as 'object' holds
# non-numeric values and needs converting before modelling.
display(df.dtypes.value_counts())

# Checking class balance (relative frequency of each label)
display(df['label'].value_counts(normalize=True))

# Check for missing values (total across all columns)
print("Missing values:", df.isnull().sum().sum())

# Confirm dataset shape
print("Shape of dataset:", df.shape)



Missing values: 592294
Shape of dataset: (1400000, 44)


In [10]:
# Splitting the dataset into features and target variable
from sklearn.model_selection import train_test_split

# Features (X) are every column except the target; the target (y) is 'label'
X = df.drop(columns=['label'])
y = df['label']

# Splitting the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 1: Converting 'ct_ftp_cmd' to numeric; blanks (' ') and anything else
# non-numeric become NaN. assign() returns a new frame, which avoids the
# SettingWithCopyWarning raised by assigning a column on a split result.
X_train = X_train.assign(
    ct_ftp_cmd=pd.to_numeric(X_train['ct_ftp_cmd'].replace(' ', np.nan), errors='coerce')
)
X_test = X_test.assign(
    ct_ftp_cmd=pd.to_numeric(X_test['ct_ftp_cmd'].replace(' ', np.nan), errors='coerce')
)

# Step 2: Fill NaNs in 'ct_ftp_cmd' with the TRAINING median.
# The statistic is computed on the training set only and reused for the test
# set, so no information from the test data leaks into preprocessing.
# (Series.fillna(..., inplace=True) on a selected column is chained assignment:
# it warns on recent pandas and does nothing under Copy-on-Write.)
ftp_cmd_median = X_train['ct_ftp_cmd'].median()
X_train = X_train.fillna({'ct_ftp_cmd': ftp_cmd_median})
X_test = X_test.fillna({'ct_ftp_cmd': ftp_cmd_median})

# Displaying the shapes of the training and testing sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

# Checking for NaN values in the training and testing sets
print("NaNs in X_train:", X_train.isna().sum().sum())
print("NaNs in X_test:", X_test.isna().sum().sum())

# Fill all remaining missing values with the per-column medians of the
# training set (same medians applied to both splits)
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Checking for NaN values again after filling
print("✅ NaNs in X_train after clean-up:", X_train.isnull().sum().sum())
print("✅ NaNs in X_test after clean-up:", X_test.isnull().sum().sum())



Shape of X_train: (1120000, 43)
Shape of X_test: (280000, 43)
NaNs in X_train: 473472
NaNs in X_test: 118822
✅ NaNs in X_train after clean-up: 0
✅ NaNs in X_test after clean-up: 0


In [11]:
# Model training and evaluation

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Create the model with class_weight='balanced' to handle any imbalance.
# n_jobs=-1 builds the trees on all available CPU cores; with a fixed
# random_state the fit remains reproducible.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)

# Step 2: Training the model
rf.fit(X_train, y_train)

# Step 3: Making predictions on the test set
y_pred = rf.predict(X_test)

# Step 4: Evaluating the model
# Rows of the confusion matrix are true labels, columns are predicted labels
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[264542    366]
 [   408  14684]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    264908
           1       0.98      0.97      0.97     15092

    accuracy                           1.00    280000
   macro avg       0.99      0.99      0.99    280000
weighted avg       1.00      1.00      1.00    280000

