In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # Example model
from sklearn.metrics import accuracy_score, classification_report # For evaluation
import joblib # For saving the trained model
import os # For path manipulation

In [2]:
# Root folder that holds the processed dataset.
# The location can be overridden per machine through the NETSHIELD_DATA_DIR
# environment variable; when that variable is unset or empty, the original
# Windows path is used so existing setups keep working unchanged.
PROJECT_ROOT_DIR = os.environ.get("NETSHIELD_DATA_DIR") or R"D:\intel\processed_data"

In [3]:
# Name of the Parquet file to load; it sits directly inside the project root folder.
DATASET_FILENAME = "final_netshield_cleaned_scaled_dataset.parquet"

In [4]:

# Full path to the dataset: <project root>/<dataset filename>.
DATASET_PATH = os.path.join(
    PROJECT_ROOT_DIR,
    DATASET_FILENAME,
)

In [5]:

# Destination for the trained model: a 'trained_model' subfolder under the
# project root, with the model file stored inside it.
MODEL_SAVE_DIR = os.path.join(PROJECT_ROOT_DIR, "trained_model")
MODEL_SAVE_PATH = os.path.join(
    MODEL_SAVE_DIR,
    "network_intrusion_model.pkl",
)

In [6]:
# Load the cleaned dataset into `df` and print a short summary of it.
#
# On failure a human-readable explanation is printed and the exception is then
# re-raised. Swallowing it would let the following cells run with `df`
# undefined -- or, worse, with a stale `df` left over from an earlier kernel
# session -- so the notebook must stop here instead.
print("--- Step 3: Loading the Cleaned Dataset ---")
print(f"Attempting to load dataset from: {DATASET_PATH}")
try:
    # Read the Parquet file into a pandas DataFrame using the pyarrow engine.
    df = pd.read_parquet(DATASET_PATH, engine="pyarrow")
except FileNotFoundError:
    print(f"ERROR: Dataset file not found at: {DATASET_PATH}")
    print("Please double-check that the file name and the 'PROJECT_ROOT_DIR' are correct.")
    raise  # stop the run; later cells cannot work without `df`
except Exception as e:
    print("ERROR: An unexpected problem occurred while loading the dataset:")
    print(f"Error type: {type(e).__name__}")
    print(f"Error message: {e}")
    raise  # the notebook shows the full traceback for the re-raised error
else:
    # Reporting runs only after a successful load.
    print(f"Dataset loaded successfully from: {DATASET_PATH}")
    print(f"Dataset shape: {df.shape}") # (number of rows, number of columns)
    print("First 5 rows of the dataset:")
    print(df.head())

    print("\nDataset information (data types, non-null values):")
    df.info() # Column-by-column summary of dtypes and non-null counts
finally:
    # The separator is printed whether or not the load succeeded.
    print("-" * 50)

--- Step 3: Loading the Cleaned Dataset ---
Attempting to load dataset from: D:\intel\processed_data\final_netshield_cleaned_scaled_dataset.parquet
Dataset loaded successfully from: D:\intel\processed_data\final_netshield_cleaned_scaled_dataset.parquet
Dataset shape: (2596603, 79)
First 5 rows of the dataset:
   protocol  flow_duration  total_fwd_packets  total_backward_packets  \
0         6      -0.485030          -0.011998               -0.010310   
1         6      -0.485033          -0.011998               -0.009399   
2        17      -0.482243          -0.011998               -0.010310   
3        17      -0.484001          -0.011998               -0.010310   
4         0       2.619704           0.164207               -0.011221   

   fwd_packets_length_total  bwd_packets_length_total  fwd_packet_length_max  \
0                 -0.062106                 -0.007681              -0.330554   
1                 -0.062106                 -0.007681              -0.330554   
2         

In [7]:
print("--- Starting Step 5: Dividing Data into Training and Test Sets ---")

# Every column other than the two label columns is used as a model input;
# 'label_encoded' is the prediction target.
features_to_exclude = ['label', 'label_encoded']
all_columns = list(df.columns)
X_columns = [name for name in all_columns if name not in features_to_exclude]

X = df.loc[:, X_columns]
y = df['label_encoded']

print(f"\nShape of Features (X):{X.shape}")
print(f"Shape of Target (y):{y.shape}")

# Preview: first five rows of the first five feature columns.
print("\nFirst 5 columns of X (Features):")
print(X.head().iloc[:, :5])

--- Starting Step 5: Dividing Data into Training and Test Sets ---

Shape of Features (X):(2596603, 77)
Shape of Target (y):(2596603,)

First 5 columns of X (Features):
   protocol  flow_duration  total_fwd_packets  total_backward_packets  \
0         6      -0.485030          -0.011998               -0.010310   
1         6      -0.485033          -0.011998               -0.009399   
2        17      -0.482243          -0.011998               -0.010310   
3        17      -0.484001          -0.011998               -0.010310   
4         0       2.619704           0.164207               -0.011221   

   fwd_packets_length_total  
0                 -0.062106  
1                 -0.062106  
2                 -0.057573  
3                 -0.057573  
4                 -0.062106  


In [8]:
# Hold out 20% of the rows as the test set. Stratifying on y keeps the class
# proportions approximately the same in both parts, and the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

print(f"\nShape of X_train (Training Features): {X_train.shape}")
print(f"Shape of X_test (Test Features): {X_test.shape}")
print(f"Shape of y_train (Training Target): {y_train.shape}")
print(f"Shape of y_test (Test Target): {y_test.shape}")

# Show the normalized class frequencies for the full target and for each split.
for heading, target in (
    ("\nClass distribution in original dataset:", y),
    ("\nClass distribution in y_train:", y_train),
    ("\nClass distribution in y_test:", y_test),
):
    print(heading)
    print(target.value_counts(normalize=True))

print("-" * 50)
print("Data successfully split into training and test sets!")


Shape of X_train (Training Features): (2077282, 77)
Shape of X_test (Test Features): (519321, 77)
Shape of y_train (Training Target): (2077282,)
Shape of y_test (Test Target): (519321,)

Class distribution in original dataset:
label_encoded
8     0.870964
2     0.066566
0     0.049301
1     0.003961
5     0.002284
4     0.002074
3     0.002013
10    0.001240
9     0.000753
11    0.000566
13    0.000251
7     0.000014
12    0.000008
6     0.000004
Name: proportion, dtype: float64

Class distribution in y_train:
label_encoded
8     0.870964
2     0.066566
0     0.049300
1     0.003961
5     0.002284
4     0.002074
3     0.002013
10    0.001240
9     0.000753
11    0.000566
13    0.000251
7     0.000014
12    0.000008
6     0.000004
Name: proportion, dtype: float64

Class distribution in y_test:
label_encoded
8     0.870964
2     0.066566
0     0.049301
1     0.003961
5     0.002284
4     0.002074
3     0.002014
10    0.001240
9     0.000753
11    0.000566
13    0.000252
7     0.000013
1