In [158]:
import pandas as pd 
from sklearn import *
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [160]:
#Load Dataframe
df = pd.read_csv('IOT_cleaned.csv')
df.head()

Unnamed: 0,id_orig_p,id_resp_p,proto,service,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,...,idle_min,idle_max,idle_tot,idle_avg,idle_std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,Attack_type,Attack
0,38667,1883,tcp,mqtt,32.011598,9,5,3,3,0.281148,...,29729180.0,29729180.0,29729180.0,29729180.0,0.0,64240,26847,502,MQTT_Publish,Normal
1,51143,1883,tcp,mqtt,31.883584,9,5,3,3,0.282277,...,29855280.0,29855280.0,29855280.0,29855280.0,0.0,64240,26847,502,MQTT_Publish,Normal
2,44761,1883,tcp,mqtt,32.124053,9,5,3,3,0.280164,...,29842150.0,29842150.0,29842150.0,29842150.0,0.0,64240,26847,502,MQTT_Publish,Normal
3,60893,1883,tcp,mqtt,31.961063,9,5,3,3,0.281593,...,29913770.0,29913770.0,29913770.0,29913770.0,0.0,64240,26847,502,MQTT_Publish,Normal
4,51087,1883,tcp,mqtt,31.902362,9,5,3,3,0.282111,...,29814700.0,29814700.0,29814700.0,29814700.0,0.0,64240,26847,502,MQTT_Publish,Normal


## Encode Dataframe

In [162]:
from sklearn.preprocessing import LabelEncoder

In [163]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder to categorical columns
for column in df.columns:
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        df[column] = label_encoder.fit_transform(df[column]) 

In [165]:
df['proto'].nunique() , df['Attack_type'].nunique() , df['service'].nunique()

(3, 12, 10)

### Train Test Split

In [170]:
features = df.drop(columns=['Attack_type', 'Attack'])
target = df["Attack_type"]

In [172]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)

#Normalize Data after Train Split
normalizer = MinMaxScaler() 

normalizer.fit(X_train)

X_train_norm = normalizer.transform(X_train) # Normalize 80% training Data
X_test_norm = normalizer.transform(X_test) # Normalize 20% Testing Data

#Apply to test and training data
X_train_norm = pd.DataFrame(X_train_norm, columns = X_train.columns)
X_test_norm = pd.DataFrame(X_test_norm, columns = X_test.columns)

In [213]:
# Create a DataFrame from the normalized X_test data
test_data = pd.DataFrame(X_test, columns=X_test.columns)

# Save the X_test DataFrame to a CSV file
test_data.to_csv('test_data.csv', index=False)

### Applying Smote and Undersampling on Dataframe to balance weaker class of "Normal" Attack_types

##### Undersampling

In [177]:
from imblearn.under_sampling import RandomUnderSampler

# Apply undersampling using RandomUnderSampler
undersampler = RandomUnderSampler(random_state=0)
X_under, y_under = undersampler.fit_resample(X_train, y_train)

y_under.value_counts()

Attack_type
0     27
1     27
2     27
3     27
4     27
5     27
6     27
7     27
8     27
9     27
10    27
11    27
Name: count, dtype: int64

##### Smote

In [180]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE
smote = SMOTE(random_state=0)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

y_smote.value_counts()

Attack_type
2     75705
4     75705
6     75705
0     75705
9     75705
3     75705
7     75705
10    75705
8     75705
1     75705
11    75705
5     75705
Name: count, dtype: int64

## Using Kbest to reduce number of features with Smote, Undersampling and Basedata

I know I said I wont be dropping any features in wrangling, but I tried multiple variations of features before, because the models accuracy was always too good.

In [181]:
from sklearn.feature_selection import SelectKBest, f_classif

#### Kbest for Undersampling

In [183]:
# Apply SelectKBest to the resampled training data
selector = SelectKBest(score_func=f_classif, k=50)  
X_train_kbest_under = selector.fit_transform(X_under, y_under)  

# Transform the original test set using the fitted selector
X_test_selected = selector.transform(X_test)  

# Get selected feature names
selected_features = X_under.columns[selector.get_support()]  
print("Selected features:", selected_features)

# Apply MinMaxScaler 
scaler = MinMaxScaler()
X_train_kbest_scaled = scaler.fit_transform(X_train_kbest_under)  
X_test_selected_scaled = scaler.transform(X_test_selected)  

# Convert scaled arrays back to DataFrames for better readability
X_train_kbest_scaled_under = pd.DataFrame(X_train_kbest_scaled, columns=selected_features)
X_test_selected_scaled_under = pd.DataFrame(X_test_selected_scaled, columns=selected_features)


Selected features: Index(['id_orig_p', 'proto', 'service', 'bwd_pkts_tot', 'bwd_data_pkts_tot',
       'fwd_pkts_per_sec', 'bwd_pkts_per_sec', 'flow_pkts_per_sec',
       'down_up_ratio', 'fwd_header_size_min', 'fwd_header_size_max',
       'bwd_header_size_tot', 'bwd_header_size_min', 'bwd_header_size_max',
       'flow_FIN_flag_count', 'flow_SYN_flag_count', 'flow_RST_flag_count',
       'fwd_PSH_flag_count', 'bwd_PSH_flag_count', 'flow_ACK_flag_count',
       'fwd_URG_flag_count', 'fwd_pkts_payload_min', 'fwd_pkts_payload_max',
       'fwd_pkts_payload_avg', 'fwd_pkts_payload_std', 'bwd_pkts_payload_max',
       'bwd_pkts_payload_avg', 'bwd_pkts_payload_std', 'flow_pkts_payload_max',
       'flow_pkts_payload_avg', 'flow_pkts_payload_std', 'fwd_iat_max',
       'fwd_iat_avg', 'fwd_iat_std', 'flow_iat_max', 'flow_iat_avg',
       'flow_iat_std', 'payload_bytes_per_second', 'fwd_subflow_pkts',
       'bwd_subflow_pkts', 'fwd_subflow_bytes', 'active_min', 'active_max',
       'active_a

  f = msb / msw


##### Kbest for Smote

In [188]:
# Apply SelectKBest to the resampled (SMOTE) training data
selector = SelectKBest(score_func=f_classif, k=50)  
X_kbest_smote = selector.fit_transform(X_smote, y_smote)  

# Transform the original test set using the fitted selector
X_test_selected = selector.transform(X_test)  

# Get selected feature names
selected_features = X_smote.columns[selector.get_support()] 
print("Selected features:", selected_features)

# Apply MinMaxScaler 
scaler = MinMaxScaler()
X_kbest_smote_scaled = scaler.fit_transform(X_kbest_smote) 
X_test_selected_scaled = scaler.transform(X_test_selected)  

# Convert scaled arrays back to DataFrames for better readability
X_kbest_smote_scaled = pd.DataFrame(X_kbest_smote_scaled, columns=selected_features)
X_test_selected_scaled_smote = pd.DataFrame(X_test_selected_scaled, columns=selected_features)

Selected features: Index(['id_orig_p', 'id_resp_p', 'proto', 'service', 'fwd_pkts_per_sec',
       'bwd_pkts_per_sec', 'flow_pkts_per_sec', 'down_up_ratio',
       'fwd_header_size_min', 'fwd_header_size_max', 'bwd_header_size_min',
       'bwd_header_size_max', 'flow_FIN_flag_count', 'flow_SYN_flag_count',
       'flow_RST_flag_count', 'fwd_URG_flag_count', 'fwd_pkts_payload_min',
       'fwd_pkts_payload_max', 'fwd_pkts_payload_avg', 'fwd_pkts_payload_std',
       'bwd_pkts_payload_min', 'bwd_pkts_payload_max', 'bwd_pkts_payload_avg',
       'bwd_pkts_payload_std', 'flow_pkts_payload_min',
       'flow_pkts_payload_max', 'flow_pkts_payload_avg',
       'flow_pkts_payload_std', 'fwd_iat_max', 'fwd_iat_avg', 'fwd_iat_std',
       'bwd_iat_max', 'bwd_iat_avg', 'bwd_iat_std', 'flow_iat_max',
       'flow_iat_avg', 'flow_iat_std', 'payload_bytes_per_second',
       'fwd_subflow_pkts', 'fwd_subflow_bytes', 'active_min', 'active_max',
       'active_avg', 'active_std', 'idle_min', 'idle_max

#### Kbest for Imbalanced Data (Starting Data)

In [189]:
# Apply SelectKBest to the original training data
selector = SelectKBest(score_func = f_classif, k=50)  # Adjust 'k' as needed
X_kbest_normal = selector.fit_transform(X_train, y_train)  

# Transform the original test set using the fitted selector
X_test_selected = selector.transform(X_test) 

# Get selected feature names
selected_features = X_train.columns[selector.get_support()]  
print("Selected features:", selected_features)

# Apply MinMaxScaler 
scaler = MinMaxScaler()
X_kbest_normal_scaled = scaler.fit_transform(X_kbest_normal) 
X_test_selected_scaled = scaler.transform(X_test_selected)  

# Convert scaled arrays back to DataFrames for better readability
X_kbest_normal_scaled = pd.DataFrame(X_kbest_normal_scaled, columns=selected_features)
X_test_selected_scaled_normal = pd.DataFrame(X_test_selected_scaled, columns=selected_features)

Selected features: Index(['id_orig_p', 'id_resp_p', 'proto', 'service', 'fwd_pkts_per_sec',
       'bwd_pkts_per_sec', 'flow_pkts_per_sec', 'down_up_ratio',
       'fwd_header_size_min', 'fwd_header_size_max', 'bwd_header_size_min',
       'bwd_header_size_max', 'flow_FIN_flag_count', 'flow_SYN_flag_count',
       'flow_RST_flag_count', 'fwd_PSH_flag_count', 'fwd_URG_flag_count',
       'fwd_pkts_payload_min', 'fwd_pkts_payload_max', 'fwd_pkts_payload_avg',
       'fwd_pkts_payload_std', 'bwd_pkts_payload_min', 'bwd_pkts_payload_max',
       'bwd_pkts_payload_avg', 'bwd_pkts_payload_std', 'flow_pkts_payload_max',
       'flow_pkts_payload_avg', 'flow_pkts_payload_std', 'fwd_iat_max',
       'fwd_iat_tot', 'fwd_iat_avg', 'fwd_iat_std', 'bwd_iat_max',
       'bwd_iat_avg', 'bwd_iat_std', 'flow_iat_max', 'flow_iat_tot',
       'flow_iat_avg', 'flow_iat_std', 'payload_bytes_per_second',
       'fwd_subflow_pkts', 'active_min', 'active_max', 'active_avg',
       'idle_min', 'idle_max', 'idl

In [193]:
# Import the function from functions.py
from functions import check_data_integrity

# Normal (original) data check
check_data_integrity(X_kbest_normal_scaled, X_test_selected_scaled_normal, y_train, y_test, "Normal")

# SMOTE data check
check_data_integrity(X_kbest_smote_scaled, X_test_selected_scaled_smote, y_train, y_test, "SMOTE")

# Undersampled data check
check_data_integrity(X_train_kbest_scaled_under, X_test_selected_scaled_under, y_train, y_test, "Undersampled")



--- Data Integrity Check for Normal ---
Column consistency:
✓ Selected columns are consistent between X_train and X_test

Checking for NaNs:
✓ X_train has no missing values
✓ X_test has no missing values
✓ y_train has no missing values
✓ y_test has no missing values

Shape of data:
X_train shape: (98493, 50)
X_test shape: (24624, 50)
y_train shape: (98493,)
y_test shape: (24624,)

--- Data Integrity Check for SMOTE ---
Column consistency:
✓ Selected columns are consistent between X_train and X_test

Checking for NaNs:
✓ X_train has no missing values
✓ X_test has no missing values
✓ y_train has no missing values
✓ y_test has no missing values

Shape of data:
X_train shape: (908460, 50)
X_test shape: (24624, 50)
y_train shape: (98493,)
y_test shape: (24624,)

--- Data Integrity Check for Undersampled ---
Column consistency:
✓ Selected columns are consistent between X_train and X_test

Checking for NaNs:
✓ X_train has no missing values
✓ X_test has no missing values
✓ y_train has no miss

# ML Models with Normal Dataset

#### Normal Logistic Regression

In [195]:
# Initialize the Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Fit the model using the normalized training data
log_reg.fit(X_train_norm, y_train)

# Predict on the test set
y_pred_norm = log_reg.predict(X_test_norm)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_norm)
print(f"Accuracy: {accuracy:.4f}")

# Calculate precision
precision = precision_score(y_test, y_pred_norm, average='weighted')  
print(f"Precision: {precision:.4f}")

# Calculate recall
recall = recall_score(y_test, y_pred_norm, average='weighted')  
print(f"Recall: {recall:.4f}")

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_norm,))# target_names=['Class 0', 'Class 1'])) 

Accuracy: 0.9788
Precision: 0.9793
Recall: 0.9788

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.76      0.82      1507
           1       0.72      0.88      0.79       111
           2       1.00      1.00      1.00     18954
           3       1.00      1.00      1.00       849
           4       0.00      0.00      0.00        10
           5       0.00      0.00      0.00         1
           6       0.98      1.00      0.99       384
           7       1.00      0.99      1.00       171
           8       0.96      0.98      0.97       524
           9       1.00      1.00      1.00       420
          10       0.82      0.94      0.88      1631
          11       0.97      0.61      0.75        62

    accuracy                           0.98     24624
   macro avg       0.78      0.76      0.77     24624
weighted avg       0.98      0.98      0.98     24624



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Normal Random Forest

In [38]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit the model using the normalized training data
rf_model.fit(X_train_norm, y_train)

# Predict on the test set
y_pred_norm_rf = rf_model.predict(X_test_norm)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_norm_rf)
print(f"Accuracy: {accuracy_rf:.4f}")

# Calculate precision
precision_rf = precision_score(y_test, y_pred_norm_rf, average='weighted') 
print(f"Precision: {precision_rf:.4f}")

# Calculate recall
recall_rf = recall_score(y_test, y_pred_norm_rf, average='weighted') 
print(f"Recall: {recall_rf:.4f}")

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_norm_rf,))# target_names=['Class 0', 'Class 1']))  

Accuracy: 0.9989
Precision: 0.9989
Recall: 0.9989

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1507
           1       0.99      1.00      1.00       111
           2       1.00      1.00      1.00     18954
           3       1.00      1.00      1.00       849
           4       1.00      1.00      1.00        10
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00       384
           7       1.00      1.00      1.00       171
           8       0.99      1.00      1.00       524
           9       1.00      1.00      1.00       420
          10       0.99      0.99      0.99      1631
          11       1.00      0.94      0.97        62

    accuracy                           1.00     24624
   macro avg       1.00      0.99      1.00     24624
weighted avg       1.00      1.00      1.00     24624



# ML Models with Smote and Kbest Features

##### Smote Logistic Regression

In [201]:
# Initialize the Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Fit the model using the SMOTE-resampled and scaled training data
log_reg.fit(X_kbest_smote_scaled, y_smote)

# Predict on the test set
y_pred_smote = log_reg.predict(X_test_selected_scaled_smote)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_smote)
print(f"Accuracy: {accuracy:.4f}")

# Calculate precision
precision = precision_score(y_test, y_pred_smote, average='micro')  
print(f"Precision: {precision:.4f}")

# Calculate recall
recall = recall_score(y_test, y_pred_smote, average='micro')  
print(f"Recall: {recall:.4f}")

# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_smote)
print("\nConfusion Matrix:")
print(conf_matrix)

# Print a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_smote))


Accuracy: 0.9825
Precision: 0.9825
Recall: 0.9825

Confusion Matrix:
[[ 1253    29     2     2    35     2     3     0     0     0   152    29]
 [    0   111     0     0     0     0     0     0     0     0     0     0]
 [    0     0 18954     0     0     0     0     0     0     0     0     0]
 [    3     0     0   845     0     1     0     0     0     0     0     0]
 [    0     0     0     0    10     0     0     0     0     0     0     0]
 [    0     0     0     0     0     1     0     0     0     0     0     0]
 [    0     0     0     0     0     0   384     0     0     0     0     0]
 [    0     0     0     0     0     0     0   171     0     0     0     0]
 [    3    33     0     0     4     0     0     0   483     0     0     1]
 [    2     0     0     0     0     0     0     0     0   418     0     0]
 [   79     6     0     0    14     1     0     0     3     0  1507    21]
 [    2     0     1     0     0     0     1     0     0     0     1    57]]

Classification Report:
      

##### Smote Random Forest

In [143]:
df["Attack_type"].unique()

array([ 3, 10, 11,  0,  1,  2,  4,  5,  6,  7,  8,  9])

In [144]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit the model using the SMOTE-resampled and scaled training data
rf_model.fit(X_kbest_smote_scaled, y_smote)

# Predict on the test set
y_pred_smote_rf = rf_model.predict(X_test_selected_scaled_smote)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_smote_rf)
print(f"Accuracy: {accuracy_rf:.4f}")

# Calculate precision
precision_rf = precision_score(y_test, y_pred_smote_rf, average='micro')
print(f"Precision: {precision_rf:.4f}")

# Calculate recall
recall_rf = recall_score(y_test, y_pred_smote_rf, average='micro')
print(f"Recall: {recall_rf:.4f}")

# Manually define the label mapping
label_mapping = {
    0: 'ARP_poisioning',
    1: 'DDOS_Slowloris',
    2: 'DOS_SYN_Hping',
    3: 'MQTT_Publish',
    4: 'Metasploit_Brute_Force_SSH',
    5: 'NMAP_FIN_SCAN',
    6: 'NMAP_OS_DETECTION',
    7: 'NMAP_TCP_scan',
    8: 'NMAP_UDP_SCAN',
    9: 'NMAP_XMAS_TREE_SCAN',
    10: 'Thing_Speak',
    11: 'Wipro_bulb',}

# Convert y_test and y_pred to their original class names using the manual mapping
y_test_original = pd.Series(y_test).map(label_mapping)
y_pred_original = pd.Series(y_pred_smote_rf).map(label_mapping)

# Print the classification report with manually defined class names
print("\nClassification Report:")
print(classification_report(y_test_original, y_pred_original))

# Generate the confusion matrix
conf_matrix_rf = confusion_matrix(y_test_original, y_pred_original)

# Plot the confusion matrix with manually defined class names
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_rf, annot=True, fmt='d', cmap='Blues', xticklabels=list(label_mapping.values()), yticklabels=list(label_mapping.values()))
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Random Forest with Manually Defined Class Names')
plt.show()


KeyboardInterrupt: 

# ML Models with Undersampling and Kbest Features

#### Under Logistic Regression

In [None]:
# Initialize the Logistic Regression model
log_reg_under = LogisticRegression(random_state=42, max_iter=1000)

# Fit the model using the scaled, feature-selected, undersampled training data
log_reg_under.fit(X_train_kbest_scaled_under, y_under)

# Predict on the test set
y_pred_under = log_reg_under.predict(X_test_selected_scaled_under)

# Calculate accuracy
accuracy_under = accuracy_score(y_test, y_pred_under)
print(f"Accuracy: {accuracy_under:.4f}")

# Calculate precision
precision_under = precision_score(y_test, y_pred_under, average='weighted')  
print(f"Precision: {precision_under:.4f}")

# Calculate recall
recall_under = recall_score(y_test, y_pred_under, average='weighted')  
print(f"Recall: {recall_under:.4f}")

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_under,))# target_names=['Class 0', 'Class 1'])) 

#### Under Random Forest

In [None]:
# Initialize the Random Forest model
rf_model_under = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit the model using the scaled, feature-selected, undersampled training data
rf_model_under.fit(X_train_kbest_scaled_under, y_under)

# Predict on the test set
y_pred_under_rf = rf_model_under.predict(X_test_selected_scaled_under)

# Calculate accuracy
accuracy_under_rf = accuracy_score(y_test, y_pred_under_rf)
print(f"Accuracy: {accuracy_under_rf:.4f}")

# Calculate precision
precision_under_rf = precision_score(y_test, y_pred_under_rf, average='weighted')  
print(f"Precision: {precision_under_rf:.4f}")

# Calculate recall
recall_under_rf = recall_score(y_test, y_pred_under_rf, average='weighted') 
print(f"Recall: {recall_under_rf:.4f}")

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_under_rf, ))#target_names=['Class 0', 'Class 1'])) 

#### Under SVM

In [None]:
from sklearn.svm import SVC

# Initialize the SVM model
svm_model_under = SVC(random_state=42, kernel='rbf')  # You can change 'kernel' to 'linear', 'poly', or 'sigmoid' as needed

# Fit the model using the scaled, feature-selected, undersampled training data
svm_model_under.fit(X_train_kbest_scaled_under, y_under)

# Predict on the test set
y_pred_under_svm = svm_model_under.predict(X_test_selected_scaled_under)

# Calculate accuracy
accuracy_under_svm = accuracy_score(y_test, y_pred_under_svm)
print(f"Accuracy: {accuracy_under_svm:.4f}")

# Calculate precision
precision_under_svm = precision_score(y_test, y_pred_under_svm, average='weighted')  # Adjust 'binary' for multiclass
print(f"Precision: {precision_under_svm:.4f}")

# Calculate recall
recall_under_svm = recall_score(y_test, y_pred_under_svm, average='weighted')  # Adjust 'binary' for multiclass
print(f"Recall: {recall_under_svm:.4f}")

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_under_svm,))# target_names=['Class 0', 'Class 1']))

We are going to go with the Smote Random forest model, as that seems to be the best one. We dont need to enhance it with Random Search or Gridsearch or Hypertuning as it is alredy good enough.

In [31]:
import joblib

# Save the model to a file
joblib.dump(rf_model, 'random_forest_model_smote.joblib')
# Save the scaler to a .joblib file
joblib.dump(scaler, 'scaler.joblib')



NameError: name 'rf_model' is not defined

# Simulating IoT Datafeed

In [109]:
def get_column_names(X_test_selected_scaled_smote):
    # Get the column names from the DataFrame or array
    if hasattr(X_test_selected_scaled_smote, 'columns'):
        # If it's a DataFrame, return column names
        return X_test_selected_scaled_smote.columns.tolist()
    else:
        # If it's a numpy array, you may need to access column names if available
        return ["Column_" + str(i) for i in range(X_test_selected_scaled_smote.shape[1])]

# Get column names from the DataFrame or array
column_names = get_column_names(X_test_selected_scaled_smote)

# Convert the column names list into a DataFrame (with one row to view)
column_names_df = pd.DataFrame([column_names], columns=column_names)

# Save column names used during training
feature_names = column_names_df.columns.tolist()  
joblib.dump(feature_names, 'feature_names.joblib')

['feature_names.joblib']

In [118]:
column_names_df

Unnamed: 0,id_orig_p,id_resp_p,proto,service,fwd_pkts_per_sec,bwd_pkts_per_sec,flow_pkts_per_sec,down_up_ratio,fwd_header_size_min,fwd_header_size_max,...,active_min,active_max,active_avg,active_std,idle_min,idle_max,idle_avg,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size
0,id_orig_p,id_resp_p,proto,service,fwd_pkts_per_sec,bwd_pkts_per_sec,flow_pkts_per_sec,down_up_ratio,fwd_header_size_min,fwd_header_size_max,...,active_min,active_max,active_avg,active_std,idle_min,idle_max,idle_avg,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size


In [33]:
# Load the trained model
rf_model = joblib.load('random_forest_model_smote.joblib')
# Load the scaler from the .joblib file
scaler = joblib.load('scaler.joblib')

In [47]:
import time

# Adjusted real-time prediction function 
def real_time_prediction_from_df(row):
    import time
    # Convert the row to a DataFrame format to maintain compatibility
    new_data_df = pd.DataFrame([row], columns=X_test_selected_scaled_smote.columns)
    
    # Normalize the new data using the existing MinMaxScaler
    new_data_scaled = scaler.transform(new_data_df)  # Ensure consistent scaling
    
    # Predict using the trained RandomForest model
    prediction = rf_model.predict(new_data_scaled)
    
    # Return the prediction label
    return prediction


In [49]:
# Function to shuffle and simulate real-time data feeding with row and probability printing
def simulate_real_time_feed_from_df(df, delay=1):
    # Shuffle the DataFrame to randomize row order
    shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    for _, row in shuffled_df.iterrows():
        # Drop the 'Attack_type' or any target columns for prediction
        new_data_row = row.drop(['Attack_type', 'Attack'], errors='ignore')

        # Align with the feature columns used in training (X_test_selected_scaled_smote.columns)
        new_data_row = pd.DataFrame([new_data_row], columns= column_names_df.columns) #X_test_selected_scaled_smote

        # Initialize the LabelEncoder
        label_encoder = LabelEncoder()

        # Apply LabelEncoder to each categorical column
        for column in new_data_row.columns:
            if new_data_row[column].dtype == 'object' or new_data_row[column].dtype.name == 'category':
                new_data_row[column] = label_encoder.fit_transform(new_data_row[column]) 

        
        # Print the row's feature values before scaling and encoding
        print("\nFeature Values (Raw):")
        print(new_data_row)

        # Apply scaling to the row
        new_data_scaled = scaler.transform(new_data_row)
        new_data_scaled_df = pd.DataFrame(new_data_scaled, columns= column_names_df.columns) #X_test_selected_scaled_smote

        # Print the row's feature values after scaling
        #print("\nFeature Values (Scaled):")
        #print(new_data_scaled_df)

        # Get the prediction and probabilities
        prediction = rf_model.predict(new_data_scaled_df)
        prediction_proba = rf_model.predict_proba(new_data_scaled_df)

        # Convert numeric prediction to original class name using the manual label mapping
        prediction_label = label_mapping.get(prediction[0], "Unknown")

        # Print the prediction result and probabilities
        print(f"\nPredicted Attack Type: {prediction_label}")
        print(f"Prediction Probabilities: {prediction_proba[0]}")
        
        # Simulate real-time delay
        time.sleep(delay)

In [325]:
df['Attack_type'].value_counts()

Attack_type
2     94659
10     8108
0      7750
3      4146
8      2590
9      2010
6      2000
7      1002
1       534
11      253
4        37
5        28
Name: count, dtype: int64

In [43]:
label_mapping = {
    0: 'ARP_poisioning',
    1: 'DDOS_Slowloris',
    2: 'DOS_SYN_Hping',
    3: 'MQTT_Publish',
    4: 'Metasploit_Brute_Force_SSH',
    5: 'NMAP_FIN_SCAN',
    6: 'NMAP_OS_DETECTION',
    7: 'NMAP_TCP_scan',
    8: 'NMAP_UDP_SCAN',
    9: 'NMAP_XMAS_TREE_SCAN',
    10: 'Thing_Speak',
    11: 'Wipro_bulb',}

In [84]:
# Call the function with your DataFrame
simulate_real_time_feed_from_df(df, delay=2)



Feature Values (Raw):
   id_orig_p  id_resp_p  proto  service  fwd_pkts_per_sec  bwd_pkts_per_sec  \
0     7315.0       21.0    1.0      0.0          838860.8          838860.8   

   flow_pkts_per_sec  down_up_ratio  fwd_header_size_min  fwd_header_size_max  \
0          1677721.6            1.0                 20.0                 20.0   

   ...  active_min  active_max  active_avg  active_std  idle_min  idle_max  \
0  ...    1.192093    1.192093    1.192093         0.0       0.0       0.0   

   idle_avg  fwd_init_window_size  bwd_init_window_size  fwd_last_window_size  
0       0.0                  64.0                   0.0                  64.0  

[1 rows x 50 columns]

Predicted Attack Type: DOS_SYN_Hping
Prediction Probabilities: [0.02 0.   0.85 0.01 0.   0.   0.03 0.   0.02 0.   0.04 0.03]





Feature Values (Raw):
   id_orig_p  id_resp_p  proto  service  fwd_pkts_per_sec  bwd_pkts_per_sec  \
1    18609.0       21.0    1.0      0.0          262144.0          262144.0   

   flow_pkts_per_sec  down_up_ratio  fwd_header_size_min  fwd_header_size_max  \
1           524288.0            1.0                 20.0                 20.0   

   ...  active_min  active_max  active_avg  active_std  idle_min  idle_max  \
1  ...    3.814697    3.814697    3.814697         0.0       0.0       0.0   

   idle_avg  fwd_init_window_size  bwd_init_window_size  fwd_last_window_size  
1       0.0                  64.0                   0.0                  64.0  

[1 rows x 50 columns]

Predicted Attack Type: DOS_SYN_Hping
Prediction Probabilities: [0.02 0.   0.84 0.01 0.   0.   0.03 0.   0.02 0.   0.04 0.04]





Feature Values (Raw):
   id_orig_p  id_resp_p  proto  service  fwd_pkts_per_sec  bwd_pkts_per_sec  \
2    14663.0       21.0    1.0      0.0     466033.777778     466033.777778   

   flow_pkts_per_sec  down_up_ratio  fwd_header_size_min  fwd_header_size_max  \
2      932067.555556            1.0                 20.0                 20.0   

   ...  active_min  active_max  active_avg  active_std  idle_min  idle_max  \
2  ...    2.145767    2.145767    2.145767         0.0       0.0       0.0   

   idle_avg  fwd_init_window_size  bwd_init_window_size  fwd_last_window_size  
2       0.0                  64.0                   0.0                  64.0  

[1 rows x 50 columns]

Predicted Attack Type: DOS_SYN_Hping
Prediction Probabilities: [0.02 0.   0.84 0.01 0.   0.   0.03 0.   0.02 0.   0.04 0.04]





Feature Values (Raw):
   id_orig_p  id_resp_p  proto  service  fwd_pkts_per_sec  bwd_pkts_per_sec  \
3    37827.0     1883.0    1.0      5.0          0.145039          0.080577   

   flow_pkts_per_sec  down_up_ratio  fwd_header_size_min  fwd_header_size_max  \
3           0.225616       0.555556                 32.0                 40.0   

   ...    active_min    active_max    active_avg  active_std      idle_min  \
3  ...  2.233347e+06  2.233347e+06  2.233347e+06         0.0  5.981909e+07   

       idle_max      idle_avg  fwd_init_window_size  bwd_init_window_size  \
3  5.981909e+07  5.981909e+07               64240.0               26847.0   

   fwd_last_window_size  
3                 502.0  

[1 rows x 50 columns]

Predicted Attack Type: MQTT_Publish
Prediction Probabilities: [0.08 0.02 0.   0.76 0.   0.   0.   0.   0.01 0.   0.11 0.02]





Feature Values (Raw):
   id_orig_p  id_resp_p  proto  service  fwd_pkts_per_sec  bwd_pkts_per_sec  \
4    22348.0       21.0    1.0      0.0               0.0               0.0   

   flow_pkts_per_sec  down_up_ratio  fwd_header_size_min  fwd_header_size_max  \
4                0.0            0.0                 20.0                 20.0   

   ...  active_min  active_max  active_avg  active_std  idle_min  idle_max  \
4  ...         0.0         0.0         0.0         0.0       0.0       0.0   

   idle_avg  fwd_init_window_size  bwd_init_window_size  fwd_last_window_size  
4       0.0                  64.0                   0.0                  64.0  

[1 rows x 50 columns]

Predicted Attack Type: DOS_SYN_Hping
Prediction Probabilities: [0.05 0.01 0.72 0.   0.   0.04 0.   0.   0.02 0.03 0.08 0.05]





Feature Values (Raw):
   id_orig_p  id_resp_p  proto  service  fwd_pkts_per_sec  bwd_pkts_per_sec  \
5    37150.0       21.0    1.0      0.0     199728.761905     199728.761905   

   flow_pkts_per_sec  down_up_ratio  fwd_header_size_min  fwd_header_size_max  \
5       399457.52381            1.0                 20.0                 20.0   

   ...  active_min  active_max  active_avg  active_std  idle_min  idle_max  \
5  ...     5.00679     5.00679     5.00679         0.0       0.0       0.0   

   idle_avg  fwd_init_window_size  bwd_init_window_size  fwd_last_window_size  
5       0.0                  64.0                   0.0                  64.0  

[1 rows x 50 columns]

Predicted Attack Type: DOS_SYN_Hping
Prediction Probabilities: [0.02 0.   0.84 0.02 0.   0.   0.02 0.   0.02 0.   0.04 0.04]





Feature Values (Raw):
   id_orig_p  id_resp_p  proto  service  fwd_pkts_per_sec  bwd_pkts_per_sec  \
6    36115.0       21.0    1.0      0.0     199728.761905     199728.761905   

   flow_pkts_per_sec  down_up_ratio  fwd_header_size_min  fwd_header_size_max  \
6       399457.52381            1.0                 20.0                 20.0   

   ...  active_min  active_max  active_avg  active_std  idle_min  idle_max  \
6  ...     5.00679     5.00679     5.00679         0.0       0.0       0.0   

   idle_avg  fwd_init_window_size  bwd_init_window_size  fwd_last_window_size  
6       0.0                  64.0                   0.0                  64.0  

[1 rows x 50 columns]

Predicted Attack Type: DOS_SYN_Hping
Prediction Probabilities: [0.02 0.   0.84 0.02 0.   0.   0.02 0.   0.02 0.   0.04 0.04]





Feature Values (Raw):
   id_orig_p  id_resp_p  proto  service  fwd_pkts_per_sec  bwd_pkts_per_sec  \
7    42253.0       21.0    1.0      0.0     246723.764706     246723.764706   

   flow_pkts_per_sec  down_up_ratio  fwd_header_size_min  fwd_header_size_max  \
7      493447.529412            1.0                 20.0                 20.0   

   ...  active_min  active_max  active_avg  active_std  idle_min  idle_max  \
7  ...    4.053116    4.053116    4.053116         0.0       0.0       0.0   

   idle_avg  fwd_init_window_size  bwd_init_window_size  fwd_last_window_size  
7       0.0                  64.0                   0.0                  64.0  

[1 rows x 50 columns]

Predicted Attack Type: DOS_SYN_Hping
Prediction Probabilities: [0.01 0.   0.84 0.02 0.   0.   0.03 0.   0.02 0.   0.04 0.04]





Feature Values (Raw):
   id_orig_p  id_resp_p  proto  service  fwd_pkts_per_sec  bwd_pkts_per_sec  \
8    36242.0     1334.0    1.0      0.0     110376.421053     110376.421053   

   flow_pkts_per_sec  down_up_ratio  fwd_header_size_min  fwd_header_size_max  \
8      220752.842105            1.0                 20.0                 20.0   

   ...  active_min  active_max  active_avg  active_std  idle_min  idle_max  \
8  ...    9.059906    9.059906    9.059906         0.0       0.0       0.0   

   idle_avg  fwd_init_window_size  bwd_init_window_size  fwd_last_window_size  
8       0.0                1024.0                   0.0                1024.0  

[1 rows x 50 columns]

Predicted Attack Type: NMAP_OS_DETECTION
Prediction Probabilities: [0.03       0.         0.         0.03       0.         0.
 0.55155702 0.         0.         0.35844298 0.         0.03      ]





Feature Values (Raw):
   id_orig_p  id_resp_p  proto  service  fwd_pkts_per_sec  bwd_pkts_per_sec  \
9    49275.0       53.0    2.0      2.0         93.601964         93.601964   

   flow_pkts_per_sec  down_up_ratio  fwd_header_size_min  fwd_header_size_max  \
9         187.203928            1.0                  8.0                  8.0   

   ...   active_min   active_max   active_avg  active_std  idle_min  idle_max  \
9  ...  21367.07306  21367.07306  21367.07306         0.0       0.0       0.0   

   idle_avg  fwd_init_window_size  bwd_init_window_size  fwd_last_window_size  
9       0.0                   0.0                   0.0                   0.0  

[1 rows x 50 columns]

Predicted Attack Type: Thing_Speak
Prediction Probabilities: [0.23 0.01 0.05 0.01 0.04 0.01 0.   0.   0.1  0.   0.46 0.09]




KeyboardInterrupt: 

In [120]:
pip install shap

Collecting shap
  Downloading shap-0.46.0-cp312-cp312-macosx_10_9_x86_64.whl.metadata (24 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp312-cp312-macosx_10_9_x86_64.whl (459 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m459.3/459.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8
Note: you may need to restart the kernel to use updated packages.
