# Import All Python Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')
import time
import seaborn as sns 
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.feature_selection import f_classif, SelectKBest, VarianceThreshold
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE 

# Loading the Datasets

In [2]:
df = pd.read_csv('IoT Network Intrusion Dataset.csv')
df

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,...,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label,Cat,Sub_Cat
0,192.168.0.13-192.168.0.16-10000-10101-17,192.168.0.13,10000,192.168.0.16,10101,17,25/07/2019 03:25:53 AM,75,1,1,...,0.0,0.0,0.0,75.0,0.000000,75.0,75.0,Anomaly,Mirai,Mirai-Ackflooding
1,192.168.0.13-222.160.179.132-554-2179-6,222.160.179.132,2179,192.168.0.13,554,6,26/05/2019 10:11:06 PM,5310,1,2,...,0.0,0.0,0.0,2655.0,2261.327486,4254.0,1056.0,Anomaly,DoS,DoS-Synflooding
2,192.168.0.13-192.168.0.16-9020-52727-6,192.168.0.16,52727,192.168.0.13,9020,6,11/07/2019 01:24:48 AM,141,0,3,...,0.0,0.0,0.0,70.5,0.707107,71.0,70.0,Anomaly,Scan,Scan Port OS
3,192.168.0.13-192.168.0.16-9020-52964-6,192.168.0.16,52964,192.168.0.13,9020,6,04/09/2019 03:58:17 AM,151,0,2,...,0.0,0.0,0.0,151.0,0.000000,151.0,151.0,Anomaly,Mirai,Mirai-Hostbruteforceg
4,192.168.0.1-239.255.255.250-36763-1900-17,192.168.0.1,36763,239.255.255.250,1900,17,10/09/2019 01:41:18 AM,153,2,1,...,0.0,0.0,0.0,76.5,0.707107,77.0,76.0,Anomaly,Mirai,Mirai-Hostbruteforceg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625778,192.168.0.24-210.89.164.90-56112-8043-17,192.168.0.24,56112,210.89.164.90,8043,17,25/07/2019 03:25:00 AM,277,1,1,...,0.0,0.0,0.0,277.0,0.000000,277.0,277.0,Anomaly,Mirai,Mirai-UDP Flooding
625779,192.168.0.13-222.131.171.244-554-4570-6,222.131.171.244,4570,192.168.0.13,554,6,26/05/2019 10:06:51 PM,1658,0,2,...,0.0,0.0,0.0,1658.0,0.000000,1658.0,1658.0,Anomaly,DoS,DoS-Synflooding
625780,192.168.0.13-192.168.0.16-9020-52739-6,192.168.0.16,52739,192.168.0.13,9020,6,11/07/2019 01:29:09 AM,77,1,1,...,0.0,0.0,0.0,77.0,0.000000,77.0,77.0,Anomaly,Scan,Scan Port OS
625781,192.168.0.13-192.168.0.16-9020-49784-6,192.168.0.13,9020,192.168.0.16,49784,6,20/05/2019 05:00:29 AM,240,2,1,...,0.0,0.0,0.0,120.0,7.071068,125.0,115.0,Normal,Normal,Normal


# Some Basic Cleaning and EDA

In [3]:
# There are some infinite values in this column, which prevent saling. So, replacing them by max value of float64
Max_Val_Float_64 = np.finfo(np.float64).max
df['Flow_Byts/s'] = df['Flow_Byts/s'].replace([np.inf, -np.inf], Max_Val_Float_64)
df['Flow_Pkts/s'] = df['Flow_Pkts/s'].replace([np.inf, -np.inf], Max_Val_Float_64)

# Data Encoding and Data Normalization

In [4]:
Categorical_Columns = df.select_dtypes(include=['object']).columns
Numerical_Columns = df.select_dtypes(include=['int64','float64']).columns

print("Categorical Columns: ", Categorical_Columns)
print("Numerical Columns: ", Numerical_Columns)

Categorical Columns:  Index(['Flow_ID', 'Src_IP', 'Dst_IP', 'Timestamp', 'Label', 'Cat', 'Sub_Cat'], dtype='object')
Numerical Columns:  Index(['Src_Port', 'Dst_Port', 'Protocol', 'Flow_Duration', 'Tot_Fwd_Pkts',
       'Tot_Bwd_Pkts', 'TotLen_Fwd_Pkts', 'TotLen_Bwd_Pkts', 'Fwd_Pkt_Len_Max',
       'Fwd_Pkt_Len_Min', 'Fwd_Pkt_Len_Mean', 'Fwd_Pkt_Len_Std',
       'Bwd_Pkt_Len_Max', 'Bwd_Pkt_Len_Min', 'Bwd_Pkt_Len_Mean',
       'Bwd_Pkt_Len_Std', 'Flow_Byts/s', 'Flow_Pkts/s', 'Flow_IAT_Mean',
       'Flow_IAT_Std', 'Flow_IAT_Max', 'Flow_IAT_Min', 'Fwd_IAT_Tot',
       'Fwd_IAT_Mean', 'Fwd_IAT_Std', 'Fwd_IAT_Max', 'Fwd_IAT_Min',
       'Bwd_IAT_Tot', 'Bwd_IAT_Mean', 'Bwd_IAT_Std', 'Bwd_IAT_Max',
       'Bwd_IAT_Min', 'Fwd_PSH_Flags', 'Bwd_PSH_Flags', 'Fwd_URG_Flags',
       'Bwd_URG_Flags', 'Fwd_Header_Len', 'Bwd_Header_Len', 'Fwd_Pkts/s',
       'Bwd_Pkts/s', 'Pkt_Len_Min', 'Pkt_Len_Max', 'Pkt_Len_Mean',
       'Pkt_Len_Std', 'Pkt_Len_Var', 'FIN_Flag_Cnt', 'SYN_Flag_Cnt',
       'RST_Fla

# Replace Missing Value

In [8]:
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()
#Drop duplicate rows
df = df.drop_duplicates()

# Counting Labels 

In [5]:
# Count the number of samples for each class in the 'Label' column
label_counts = df['Label'].value_counts()

label_counts

Label
Anomaly    585710
Normal      40073
Name: count, dtype: int64

In [6]:
# Count the number of samples for each class in the 'Label' column
cat_counts = df['Cat'].value_counts()

cat_counts

Cat
Mirai                415677
Scan                  75265
DoS                   59391
Normal                40073
MITM ARP Spoofing     35377
Name: count, dtype: int64

In [7]:
# Count the number of samples for each sub-category in the 'Sub_Cat' column
sub_cat_counts = df['Sub_Cat'].value_counts()

sub_cat_counts

Sub_Cat
Mirai-UDP Flooding       183554
Mirai-Hostbruteforceg    121181
DoS-Synflooding           59391
Mirai-HTTP Flooding       55818
Mirai-Ackflooding         55124
Scan Port OS              53073
Normal                    40073
MITM ARP Spoofing         35377
Scan Hostport             22192
Name: count, dtype: int64

# Data Scaling and Encoding

In [9]:
for col in Numerical_Columns:
    Scaler = MinMaxScaler()
    df[col] = Scaler.fit_transform(df[col].astype(str).astype('float64').values.reshape(-1,1))

for col in Categorical_Columns:
    if col != 'Label':
        LE = OrdinalEncoder()
        df[col] = LE.fit_transform(df[col].values.reshape(-1,1))
df

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,...,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label,Cat,Sub_Cat
0,12446.0,25883.0,0.152672,203.0,0.154518,1.000000,3496.0,0.000750,0.005376,0.000000,...,0.0,0.0,0.0,0.000750,0.000000,0.000750,0.000750,Anomaly,2.0,2.0
1,22760.0,34617.0,0.033267,200.0,0.008475,0.352941,3664.0,0.053108,0.005376,0.001789,...,0.0,0.0,0.0,0.026557,0.033715,0.042551,0.010563,Anomaly,0.0,0.0
2,12691.0,25886.0,0.804992,200.0,0.137982,0.352941,2082.0,0.001410,0.000000,0.003578,...,0.0,0.0,0.0,0.000705,0.000011,0.000710,0.000700,Anomaly,4.0,8.0
3,12704.0,25886.0,0.808611,200.0,0.137982,0.352941,791.0,0.001510,0.000000,0.001789,...,0.0,0.0,0.0,0.001510,0.000000,0.001510,0.001510,Anomaly,2.0,4.0
4,611.0,25881.0,0.561267,317.0,0.029065,1.000000,1040.0,0.001530,0.010753,0.000000,...,0.0,0.0,0.0,0.000765,0.000011,0.000770,0.000760,Anomaly,2.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625773,62439.0,25889.0,0.918550,233.0,0.136131,1.000000,3245.0,0.000290,0.026882,0.000000,...,0.0,0.0,0.0,0.000058,0.000050,0.000110,0.000030,Anomaly,2.0,5.0
625776,58871.0,21034.0,0.133420,205.0,0.299888,0.352941,535.0,0.010922,0.000000,0.001789,...,0.0,0.0,0.0,0.010923,0.000000,0.010923,0.010923,Anomaly,0.0,0.0
625778,62081.0,25889.0,0.856672,233.0,0.123036,1.000000,3443.0,0.002770,0.005376,0.000000,...,0.0,0.0,0.0,0.002771,0.000000,0.002771,0.002771,Anomaly,2.0,5.0
625779,18760.0,30623.0,0.069771,200.0,0.008475,0.352941,3637.0,0.016583,0.000000,0.001789,...,0.0,0.0,0.0,0.016584,0.000000,0.016584,0.016584,Anomaly,0.0,0.0


# Transform the Labels

In [10]:
str2idx_Label = {label:idx for idx, label in enumerate(df.Label.unique())}
str2idx_Label

{'Anomaly': 0, 'Normal': 1}

In [11]:
df.Label = df.Label.map(str2idx_Label)

In [13]:
from sklearn.ensemble import IsolationForest
clf = IsolationForest(n_estimators=3, n_jobs=-1).fit(df)
clf.predict(df)
threshold = 0
df = df[clf.predict(df) == 1]

In [14]:
# Count the number of samples for each class in the 'Label' column
label_counts = df['Label'].value_counts()

label_counts


Label
0    349408
1     19314
Name: count, dtype: int64

In [15]:
# Count the number of samples for each class in the 'Label' column
cat_counts = df['Cat'].value_counts()

cat_counts

Cat
2.0    240132
4.0     49959
0.0     44088
3.0     19314
1.0     15229
Name: count, dtype: int64

In [16]:
# Count the number of samples for each sub-category in the 'Sub_Cat' column
sub_cat_counts = df['Sub_Cat'].value_counts()

sub_cat_counts


Sub_Cat
5.0    111884
4.0     80632
0.0     44088
8.0     35628
3.0     24149
2.0     23467
6.0     19314
1.0     15229
7.0     14331
Name: count, dtype: int64

# Data Splitting 

In [16]:
x = df.drop('Label',axis=1)
y = df.Label

In [17]:
df

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,...,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label,Cat,Sub_Cat
0,12446.0,25883.0,0.152672,203.0,0.154518,1.000000,3496.0,0.000750,0.005376,0.000000,...,0.0,0.0,0.0,0.000750,0.000000,0.000750,0.000750,0,2.0,2.0
1,22760.0,34617.0,0.033267,200.0,0.008475,0.352941,3664.0,0.053108,0.005376,0.001789,...,0.0,0.0,0.0,0.026557,0.033715,0.042551,0.010563,0,0.0,0.0
2,12691.0,25886.0,0.804992,200.0,0.137982,0.352941,2082.0,0.001410,0.000000,0.003578,...,0.0,0.0,0.0,0.000705,0.000011,0.000710,0.000700,0,4.0,8.0
3,12704.0,25886.0,0.808611,200.0,0.137982,0.352941,791.0,0.001510,0.000000,0.001789,...,0.0,0.0,0.0,0.001510,0.000000,0.001510,0.001510,0,2.0,4.0
5,47142.0,25889.0,0.640916,8.0,0.006777,0.352941,927.0,0.001570,0.010753,0.000000,...,0.0,0.0,0.0,0.000785,0.000095,0.000830,0.000740,0,2.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625772,12838.0,25883.0,0.086473,205.0,0.000352,0.352941,958.0,0.000280,0.005376,0.000000,...,0.0,0.0,0.0,0.000280,0.000000,0.000280,0.000280,0,2.0,4.0
625773,62439.0,25889.0,0.918550,233.0,0.136131,1.000000,3245.0,0.000290,0.026882,0.000000,...,0.0,0.0,0.0,0.000058,0.000050,0.000110,0.000030,0,2.0,5.0
625776,58871.0,21034.0,0.133420,205.0,0.299888,0.352941,535.0,0.010922,0.000000,0.001789,...,0.0,0.0,0.0,0.010923,0.000000,0.010923,0.010923,0,0.0,0.0
625778,62081.0,25889.0,0.856672,233.0,0.123036,1.000000,3443.0,0.002770,0.005376,0.000000,...,0.0,0.0,0.0,0.002771,0.000000,0.002771,0.002771,0,2.0,5.0


# Technique for Balancing Datasets

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Apply SMOTE to address class imbalance
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)


In [20]:
# Print the number of samples after applying SMOTE
print("Number of samples after applying SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

Number of samples after applying SMOTE:
Label
0    308365
1    308365
Name: count, dtype: int64


In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print("Train: ", x_train.shape, y_train.shape)
print("Test: ", x_test.shape, y_test.shape)

Train:  (309984, 85) (309984,)
Test:  (77497, 85) (77497,)


# Feature Selection

In [22]:
from sklearn.feature_selection import RFE
def recursive_feature_elimination(xtrain, ytrain, n_features_to_select=10):
    '''Function to select features based on Recursive Feature Elimination (RFE)'''
    estimator = RandomForestClassifier()  # can use a different estimator if needed
    rfe = RFE(estimator, n_features_to_select=n_features_to_select)
    rfe.fit(xtrain, ytrain)

    # Get the selected features
    selected_features = np.where(rfe.support_)[0]

    return xtrain.columns[selected_features]

# Apply recursive feature elimination separately to training, testing, and validation sets
selected_features_train = recursive_feature_elimination(x_train, y_train)
selected_features_test = recursive_feature_elimination(x_test, y_test)
# selected_features_val = recursive_feature_elimination(x_val, y_val)  # Uncomment if a have a validation set

# Ensure all datasets have the same columns after feature selection
common_selected_features = list(set(selected_features_train) & set(selected_features_test))

# Subset the datasets with common selected features
x_train_selected = x_train[common_selected_features]
x_test_selected = x_test[common_selected_features]

print("Train: ", x_train_selected.shape, y_train.shape)
print("Test: ", x_test_selected.shape, y_test.shape)

Train:  (309984, 8) (309984,)
Test:  (77497, 8) (77497,)


In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import RFE
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# ... (previous code)

def recursive_feature_elimination(xtrain, ytrain, n_features_to_select=50):
    '''Function to select features based on Recursive Feature Elimination (RFE)'''
    estimator = RandomForestClassifier()  # You can use a different estimator if needed
    rfe = RFE(estimator, n_features_to_select=n_features_to_select)
    rfe.fit(xtrain, ytrain)

    # Get the selected features
    selected_features = np.where(rfe.support_)[0]

    return xtrain.columns[selected_features]

# ... (previous code)

best_features = list(recursive_feature_elimination(x_train, y_train))
print('The best features are: ', best_features)

# Drop multiple features from x_test and x_train
features_to_drop = ['Src_IP', 'Src_Port', 'Dst_IP', 'Dst_Port', 'Protocol', 'Timestamp']  # Replace with your feature names
xtest = x_test.drop(features_to_drop, axis=1)
xtrain = x_train.drop(features_to_drop, axis=1)

# ... (continue with the rest of your code)

# Example usage of Train_Model with recursive feature selection
# Acc, Prec, Rec, F1, FPR, Train_Time, Test_Time = Train_Model(your_model, xtrain, y_train, xtest, y_test)


The best features are:  ['Flow_ID', 'Src_IP', 'Src_Port', 'Dst_IP', 'Dst_Port', 'Timestamp', 'Flow_Duration', 'TotLen_Fwd_Pkts', 'TotLen_Bwd_Pkts', 'Fwd_Pkt_Len_Max', 'Fwd_Pkt_Len_Min', 'Fwd_Pkt_Len_Mean', 'Bwd_Pkt_Len_Max', 'Bwd_Pkt_Len_Min', 'Bwd_Pkt_Len_Mean', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max', 'Flow_IAT_Min', 'Bwd_IAT_Tot', 'Bwd_IAT_Mean', 'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Bwd_PSH_Flags', 'Fwd_Header_Len', 'Bwd_Header_Len', 'Fwd_Pkts/s', 'Bwd_Pkts/s', 'Pkt_Len_Min', 'Pkt_Len_Max', 'Pkt_Len_Mean', 'Pkt_Len_Std', 'Pkt_Len_Var', 'SYN_Flag_Cnt', 'PSH_Flag_Cnt', 'ACK_Flag_Cnt', 'Down/Up_Ratio', 'Pkt_Size_Avg', 'Fwd_Seg_Size_Avg', 'Bwd_Seg_Size_Avg', 'Subflow_Fwd_Byts', 'Subflow_Bwd_Byts', 'Init_Bwd_Win_Byts', 'Idle_Mean', 'Idle_Std', 'Idle_Max', 'Idle_Min', 'Cat', 'Sub_Cat']


# Model Training and Evaluation

In [24]:
def Train_Model(Model,xtrain,ytrain,xtest,ytest):
    start = time.time()
    Model.fit(xtrain,ytrain)
    end = time.time()
    Train_Time_Taken = end-start
    start = time.time()
    ypred = Model.predict(xtest)
    end = time.time()
    Test_Time_Taken = end-start
    Acc = accuracy_score(ytest,ypred)*100
    Prec = precision_score(ytest,ypred,average='macro')*100
    Rec = recall_score(ytest,ypred,average='macro')*100
    F1 = f1_score(ytest,ypred,average='macro')*100
    print(classification_report(ytest,ypred))
    print(f"Accuracy: {Acc}")
    print(f"Precision: {Prec}")
    print(f"Recall: {Rec}")
    print(f"F1: {F1}")
    print(f"Training Time Taken: {Train_Time_Taken}")
    print(f"Testing Time Taken: {Test_Time_Taken}")
    sns.heatmap(confusion_matrix(ytest,ypred),annot=True,fmt='d')
    plt.show()
    print()
    return Acc,Prec,Rec,F1,Train_Time_Taken,Test_Time_Taken

from sklearn.metrics import confusion_matrix

def Train_Model(Model, xtrain, ytrain, xtest, ytest):
    start = time.time()
    Model.fit(xtrain, ytrain)
    end = time.time()
    Train_Time_Taken = end - start
    
    start = time.time()
    ypred = Model.predict(xtest)
    end = time.time()
    Test_Time_Taken = end - start
    
    Acc = accuracy_score(ytest, ypred) * 100
    Prec = precision_score(ytest, ypred, average='macro') * 100
    Rec = recall_score(ytest, ypred, average='macro') * 100
    F1 = f1_score(ytest, ypred, average='macro') * 100
    
    # Calculate the confusion matrix
    cm = confusion_matrix(ytest, ypred)
    
    # Extract values from the confusion matrix
    TN, FP, FN, TP = cm.ravel()
    
    # Calculate the false positive rate (FPR)
    FPR = FP / (FP + TN)
    
    print(classification_report(ytest, ypred))
    
    return Acc, Prec, Rec, F1, FPR, Train_Time_Taken, Test_Time_Taken

# Example usage:
# Acc, Prec, Rec, F1, FPR, Train_Time, Test_Time = Train_Model(your_model, x_train, y_train, x_test, y_test)


# Evaluating Models for Resource Utilization 

In [26]:
import time
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# Add any other models you want to use

Models = {
    'Logistic Regression': LogisticRegression(),
    'AdaBoost': AdaBoostClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier()
    # Add other models here if needed
}

def Train_Model(Model, xtrain, ytrain, xtest, ytest):
    start_total_time = time.time()  # Start total time
    
    # Training
    start_train_time = time.time()
    Model.fit(xtrain, ytrain)
    end_train_time = time.time()
    
    # Testing (Prediction)
    start_test_time = time.time()
    predictions = Model.predict(xtest)
    end_test_time = time.time()

    end_total_time = time.time()  # End total time
    
    # Calculating metrics
    Acc = accuracy_score(ytest, predictions)
    Prec = precision_score(ytest, predictions)
    Rec = recall_score(ytest, predictions)
    F1 = f1_score(ytest, predictions)
    tn, fp, fn, tp = confusion_matrix(ytest, predictions).ravel()
    FPR = fp / (fp + tn)
    
    return Acc, Prec, Rec, F1, end_train_time - start_train_time, end_test_time - start_test_time, FPR, end_total_time - start_total_time

Result_DF = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'Train Time', 'Test Time', 'False Positive Rate', 'Total Time'])

for idxModel, Model in Models.items():
    print(f'Model being trained: {idxModel}')
    Acc, Prec, Rec, F1, Train_Time, Test_Time, FPR, Total_Time = Train_Model(Model, x_train, y_train, x_test, y_test)
    Result_DF = Result_DF._append({
        'Model': idxModel, 
        'Accuracy': Acc, 
        'Precision': Prec, 
        'Recall': Rec, 
        'F1-Score': F1, 
        'Train Time': Train_Time, 
        'Test Time': Test_Time, 
        'False Positive Rate': FPR, 
        'Total Time': Total_Time
    }, ignore_index=True)

print(Result_DF)


Model being trained: Logistic Regression
Model being trained: AdaBoost
Model being trained: Random Forest
Model being trained: Decision Tree
Model being trained: KNN
                 Model  Accuracy  Precision    Recall  F1-Score  Train Time  \
0  Logistic Regression  0.994245   0.434783  0.022573  0.042918    8.336743   
1             AdaBoost  0.999987   1.000000  0.997743  0.998870   33.712445   
2        Random Forest  1.000000   1.000000  1.000000  1.000000   19.474362   
3        Decision Tree  0.999974   0.995506  1.000000  0.997748    1.239755   
4                  KNN  0.999794   0.993072  0.970655  0.981735    0.345298   

   Test Time  False Positive Rate  Total Time  
0   0.048350             0.000169    8.385094  
1   0.797617             0.000000   34.510063  
2   0.342802             0.000000   19.817164  
3   0.053973             0.000026    1.293745  
4  93.883322             0.000039   94.228622  


In [28]:
import time
import joblib  # Import joblib library for model size calculation
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Add any other models you want to use

Models = {
    'Logistic Regression': LogisticRegression(),
    'AdaBoost': AdaBoostClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier()
    # Add other models here if needed
}

def Train_Model(Model, xtrain, ytrain, xtest, ytest):
    start_total_time = time.time()  # Start total time
    
    # Training
    start_train_time = time.time()
    Model.fit(xtrain, ytrain)
    end_train_time = time.time()
    
    # Save the trained model to a temporary file to estimate memory usage
    temp_file_path = 'temp_model_dump.joblib'
    joblib.dump(Model, temp_file_path)
    
    # Get the estimated memory usage of the trained model
    model_memory = joblib.os.path.getsize(temp_file_path) / (1024 * 1024)  # Size in MB
    print(f"Estimated memory usage of the trained {type(Model).__name__} model: {model_memory:.2f} MB")
    
    # Remove the temporary file
    joblib.os.remove(temp_file_path)
    
    # Testing (Prediction)
    start_test_time = time.time()
    predictions = Model.predict(xtest)
    end_test_time = time.time()

    end_total_time = time.time()  # End total time
    
    # Calculating metrics
    Acc = accuracy_score(ytest, predictions)
    Prec = precision_score(ytest, predictions)
    Rec = recall_score(ytest, predictions)
    F1 = f1_score(ytest, predictions)
    tn, fp, fn, tp = confusion_matrix(ytest, predictions).ravel()
    FPR = fp / (fp + tn)
    
    return Acc, Prec, Rec, F1, end_train_time - start_train_time, end_test_time - start_test_time, FPR, end_total_time - start_total_time

Result_DF = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'Train Time', 'Test Time', 'False Positive Rate', 'Total Time'])

for idxModel, Model in Models.items():
    print(f'Model being trained: {idxModel}')
    Acc, Prec, Rec, F1, Train_Time, Test_Time, FPR, Total_Time = Train_Model(Model, x_train, y_train, x_test, y_test)
    Result_DF = Result_DF._append({
        'Model': idxModel, 
        'Accuracy': Acc, 
        'Precision': Prec, 
        'Recall': Rec, 
        'F1-Score': F1, 
        'Train Time': Train_Time, 
        'Test Time': Test_Time, 
        'False Positive Rate': FPR, 
        'Total Time': Total_Time
    }, ignore_index=True)

print(Result_DF)


Model being trained: Logistic Regression
Estimated memory usage of the trained LogisticRegression model: 0.00 MB
Model being trained: AdaBoost
Estimated memory usage of the trained AdaBoostClassifier model: 0.03 MB
Model being trained: Random Forest
Estimated memory usage of the trained RandomForestClassifier model: 1.04 MB
Model being trained: Decision Tree
Estimated memory usage of the trained DecisionTreeClassifier model: 0.01 MB
Model being trained: KNN
Estimated memory usage of the trained KNeighborsClassifier model: 203.39 MB
                 Model  Accuracy  Precision    Recall  F1-Score  Train Time  \
0  Logistic Regression  0.994245   0.434783  0.022573  0.042918    6.815871   
1             AdaBoost  0.999987   1.000000  0.997743  0.998870   32.587292   
2        Random Forest  1.000000   1.000000  1.000000  1.000000   19.717564   
3        Decision Tree  0.999961   0.993274  1.000000  0.996625    1.329988   
4                  KNN  0.999794   0.993072  0.970655  0.981735    

In [30]:
import time
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sys import getsizeof

# Add any other models you want to use
Models = {
    'Logistic Regression': LogisticRegression(),
    'AdaBoost': AdaBoostClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier()
    # Add other models here if needed
}

def Train_Model(Model, xtrain, ytrain, xtest, ytest):
    start_total_time = time.time()  # Start total time
    
    # Training
    start_train_time = time.time()
    Model.fit(xtrain, ytrain)
    end_train_time = time.time()
    
    # Save the trained model to a temporary file to estimate memory usage
    temp_model_file = 'temp_model_dump.joblib'
    joblib.dump(Model, temp_model_file)
    
    # Get the estimated memory usage of the trained model
    model_memory = getsizeof(joblib.load(temp_model_file)) / (1024 * 1024)  # Size in MB
    print(f"Estimated memory usage of the trained {type(Model).__name__} model: {model_memory:.2f} KB")
    
    # Remove the temporary model file
    joblib.os.remove(temp_model_file)
    
    # Testing (Prediction)
    start_test_time = time.time()
    predictions = Model.predict(xtest)
    end_test_time = time.time()

    end_total_time = time.time()  # End total time
    
    # Calculating metrics
    Acc = accuracy_score(ytest, predictions)
    Prec = precision_score(ytest, predictions)
    Rec = recall_score(ytest, predictions)
    F1 = f1_score(ytest, predictions)
    tn, fp, fn, tp = confusion_matrix(ytest, predictions).ravel()
    FPR = fp / (fp + tn)
    
    return Acc, Prec, Rec, F1, end_train_time - start_train_time, end_test_time - start_test_time, FPR, end_total_time - start_total_time

Result_DF = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'Train Time', 'Test Time', 'False Positive Rate', 'Total Time'])

for idxModel, Model in Models.items():
    print(f'Model being trained: {idxModel}')
    Acc, Prec, Rec, F1, Train_Time, Test_Time, FPR, Total_Time = Train_Model(Model, x_train, y_train, x_test, y_test)
    Result_DF = Result_DF._append({
        'Model': idxModel, 
        'Accuracy': Acc, 
        'Precision': Prec, 
        'Recall': Rec, 
        'F1-Score': F1, 
        'Train Time': Train_Time, 
        'Test Time': Test_Time, 
        'False Positive Rate': FPR, 
        'Total Time': Total_Time
    }, ignore_index=True)

print(Result_DF)


Model being trained: Logistic Regression
Estimated memory usage of the trained LogisticRegression model: 0.00 KB
Model being trained: AdaBoost
Estimated memory usage of the trained AdaBoostClassifier model: 0.00 KB
Model being trained: Random Forest
Estimated memory usage of the trained RandomForestClassifier model: 0.00 KB
Model being trained: Decision Tree
Estimated memory usage of the trained DecisionTreeClassifier model: 0.00 KB
Model being trained: KNN
Estimated memory usage of the trained KNeighborsClassifier model: 0.00 KB
                 Model  Accuracy  Precision    Recall  F1-Score  Train Time  \
0  Logistic Regression  0.994245   0.434783  0.022573  0.042918    7.181755   
1             AdaBoost  0.999987   1.000000  0.997743  0.998870   31.783342   
2        Random Forest  1.000000   1.000000  1.000000  1.000000   19.497181   
3        Decision Tree  0.999961   0.993274  1.000000  0.996625    1.281933   
4                  KNN  0.999794   0.993072  0.970655  0.981735    0.