In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
print(os.getcwd())

C:\Users\HP


In [2]:
DATASET_DIR="C:\\Users\\HP\\Downloads\\archive (1)"

In [3]:
parquet_files=['Infiltration-Thursday-no-metadata.parquet','Portscan-Friday-no-metadata.parquet','WebAttacks-Thursday-no-metadata.parquet','DoS-Wednesday-no-metadata.parquet','DDoS-Friday-no-metadata.parquet','Bruteforce-Tuesday-no-metadata.parquet','Benign-Monday-no-metadata.parquet','Benign-Monday-no-metadata.parquet']

In [4]:
OUTPUT_DIR='processed_data'

In [5]:
os.makedirs(OUTPUT_DIR,exist_ok=True)

In [6]:
print("---Starting Data Loading from Parquet files---")

---Starting Data Loading from Parquet files---


In [7]:
all_dfs=[]

In [8]:
for file_name in parquet_files:
    file_path=os.path.join(DATASET_DIR,file_name)
    if os.path.exists(file_path):
        print(f'Loading {file_name}....')
        try:
            df_temp=pd.read_parquet(file_path)
            all_dfs.append(df_temp)
            print(f'Loaded {len(df_temp)} rows.')
        except Exception as e:
            print(f'Error loading {file_name}:{e}')
            print(f' Skipping {file_name}.')
    else:
        print(f'WARNING:File not found - {file_path}. Skipping.')
if not all_dfs:
    print("\nERROR: No Parquet files were loaded.Please double-check your DATASET_DIR path and the 'parquet_files' list.")
    print("  Ensure the file names in 'parquet_files' exactly match the names in your folder.")
    print("  Also,verify the full path to the folder containing the Parquet files.")
    df=pd.DataFrame()
else:
    df=pd.concat(all_dfs,ignore_index=True)
    print(f'\n---Data Loading Complete---')
    print(f'All datasets combined.Total rows:{len(df)},Total columns:{len(df.columns)}')
        
        

Loading Infiltration-Thursday-no-metadata.parquet....
Loaded 207630 rows.
Loading Portscan-Friday-no-metadata.parquet....
Loaded 119522 rows.
Loading WebAttacks-Thursday-no-metadata.parquet....
Loaded 155820 rows.
Loading DoS-Wednesday-no-metadata.parquet....
Loaded 584991 rows.
Loading DDoS-Friday-no-metadata.parquet....
Loaded 221264 rows.
Loading Bruteforce-Tuesday-no-metadata.parquet....
Loaded 389714 rows.
Loading Benign-Monday-no-metadata.parquet....
Loaded 458831 rows.
Loading Benign-Monday-no-metadata.parquet....
Loaded 458831 rows.

---Data Loading Complete---
All datasets combined.Total rows:2596603,Total columns:78


In [9]:
print("---Starting Data Exploration---")
print('\n First 5 rows of the DataFrame (df.head()):')
print(df.head())
print("-"*50)

---Starting Data Exploration---

 First 5 rows of the DataFrame (df.head()):
   Protocol  Flow Duration  Total Fwd Packets  Total Backward Packets  \
0         6            166                  1                       1   
1         6             83                  1                       2   
2        17          99947                  1                       1   
3        17          37017                  1                       1   
4         0      111161336                147                       0   

   Fwd Packets Length Total  Bwd Packets Length Total  Fwd Packet Length Max  \
0                         0                         0                      0   
1                         0                         0                      0   
2                        48                        48                     48   
3                        48                        48                     48   
4                         0                         0                      0   

   

In [10]:
print('\n2. DataFrame Shape (rows,coloumns-df.shape):')
print(df.shape)
print("-"*50)


2. DataFrame Shape (rows,coloumns-df.shape):
(2596603, 78)
--------------------------------------------------


In [11]:
print("\n3. DataFrame info (columns,non-null counts,data types-df.info():")
df.info()
print("-"*50)


3. DataFrame info (columns,non-null counts,data types-df.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2596603 entries, 0 to 2596602
Data columns (total 78 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Protocol                  int8   
 1   Flow Duration             int32  
 2   Total Fwd Packets         int32  
 3   Total Backward Packets    int32  
 4   Fwd Packets Length Total  int32  
 5   Bwd Packets Length Total  int32  
 6   Fwd Packet Length Max     int16  
 7   Fwd Packet Length Min     int16  
 8   Fwd Packet Length Mean    float32
 9   Fwd Packet Length Std     float32
 10  Bwd Packet Length Max     int16  
 11  Bwd Packet Length Min     int16  
 12  Bwd Packet Length Mean    float32
 13  Bwd Packet Length Std     float32
 14  Flow Bytes/s              float64
 15  Flow Packets/s            float64
 16  Flow IAT Mean             float32
 17  Flow IAT Std              float32
 18  Flow IAT Max              int32  
 1

In [12]:
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')
print('\n4. Missing values per column (df.isnull().sum()):')
print(df.isnull().sum())
print("-"*50)


4. Missing values per column (df.isnull().sum()):
Protocol                    0
Flow Duration               0
Total Fwd Packets           0
Total Backward Packets      0
Fwd Packets Length Total    0
                           ..
Idle Mean                   0
Idle Std                    0
Idle Max                    0
Idle Min                    0
Label                       0
Length: 78, dtype: int64
--------------------------------------------------


In [13]:
print('\n5. All Coloumn Names (df.columns.tolist()):')
print(df.columns.tolist())
print("-"*50)


5. All Coloumn Names (df.columns.tolist()):
['Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Fwd Packets Length Total', 'Bwd Packets Length Total', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'C

In [14]:
if 'Label' in df.columns:
    print("\n6. Distribution of 'Label' column (df['Label'l.value_counts()):")
    print(df['Label'].value_counts())
    print("\nPercentage distribution of 'Label' column:")
    print(df['Label'].value_counts(normalize=True)*100)
else:
    print("\n6. 'Label' column not found.Please verify the exact coloumn name for your traffic labels.")
    
    


6. Distribution of 'Label' column (df['Label'l.value_counts()):
Label
Benign                        2261548
DoS Hulk                       172846
DDoS                           128014
DoS GoldenEye                   10286
FTP-Patator                      5931
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
PortScan                         1956
Web Attack � Brute Force         1470
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64

Percentage distribution of 'Label' column:
Label
Benign                        87.096410
DoS Hulk                       6.656620
DDoS                           4.930057
DoS GoldenEye                  0.396133
FTP-Patator                    0.228414
DoS slowloris                  0.207386
DoS Slowhttptest               0.201340
SSH-Patator                    0.123970
P

In [15]:
print("---Starting Data Cleaning: Fixing Coloumn Names---")
df.columns=df.columns.str.strip().str.lower().str.replace(' ','_').str.replace('[^a-z0-9_]','',regex=True)
print("Cleaned Column Names:")
print(df.columns.tolist())
print("-"*50)

---Starting Data Cleaning: Fixing Coloumn Names---
Cleaned Column Names:
['protocol', 'flow_duration', 'total_fwd_packets', 'total_backward_packets', 'fwd_packets_length_total', 'bwd_packets_length_total', 'fwd_packet_length_max', 'fwd_packet_length_min', 'fwd_packet_length_mean', 'fwd_packet_length_std', 'bwd_packet_length_max', 'bwd_packet_length_min', 'bwd_packet_length_mean', 'bwd_packet_length_std', 'flow_bytess', 'flow_packetss', 'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags', 'bwd_urg_flags', 'fwd_header_length', 'bwd_header_length', 'fwd_packetss', 'bwd_packetss', 'packet_length_min', 'packet_length_max', 'packet_length_mean', 'packet_length_std', 'packet_length_variance', 'fin_flag_count', 'syn_flag_count', 'rst_flag_count', 'psh_flag_count', 'ack_flag_coun

In [16]:
print("---Starting Data Cleaning: Handling Infinite Values---")
df.replace([np.inf,-np.inf],np.nan,inplace=True)
nan_after_inf_check=df.isnull().sum()
if nan_after_inf_check.sum()>0:
    print("NaN values introduced after replacing infinities:")
    print(nan-after_inf_check[nan_after_inf_check>0])
    print("\nDeciding how to handle these NaNs:")
    df.fillna(0,inplace=True)
    print("Filled all introduced NaNs with 0.")
else:
    print("No infinite values found and no NaNs introduces.")
print("-"*50)    
    

---Starting Data Cleaning: Handling Infinite Values---
No infinite values found and no NaNs introduces.
--------------------------------------------------


In [17]:
print("---Starting Data Cleaning: Cleaning 'Label' Columns---")
if 'label' in df.columns:
    df['label']=df['label'].astype(str).str.strip().str.upper()
    df['label']=df['label'].replace('BENIGN','NORMAL')
    print("Cleaned 'label' column distribution:")
    print(df['label'].value_counts())
else:
    print("No 'Label' column to clean.Please ensure your target column is correctly named.")
print("-"*50)    
    

---Starting Data Cleaning: Cleaning 'Label' Columns---
Cleaned 'label' column distribution:
label
NORMAL                        2261548
DOS HULK                       172846
DDOS                           128014
DOS GOLDENEYE                   10286
FTP-PATATOR                      5931
DOS SLOWLORIS                    5385
DOS SLOWHTTPTEST                 5228
SSH-PATATOR                      3219
PORTSCAN                         1956
WEB ATTACK � BRUTE FORCE         1470
WEB ATTACK � XSS                  652
INFILTRATION                       36
WEB ATTACK � SQL INJECTION         21
HEARTBLEED                         11
Name: count, dtype: int64
--------------------------------------------------


In [18]:
print("---Preprocesing: Encoding Target 'Label' Column---")
if 'label' in df.columns and df['label'].dtype=='object':
    le=LabelEncoder()
    df['label_encoded']=le.fit_transform(df['label'])
    print(f"Original Lable Value Counts:\n{df['label'].value_counts()}")
    print(f"\nEncoded Label Value Counts:\n{df['label_encoded'].value_counts()}")
    print("\nMapping of original labels to encoded numbers:")
    for i,label in enumerate(le.classes_):
        print(f"{label}->{i}")
elif "label" in df.columns and df['label'].dtype!='object':
    print("'Label' column is already numerical.Skipping Label Encoding.")
    df['label_encoded']=df['label']
else:
     print("Warning: 'Label' column not found for encoding. Please verify the name of your target column.")
print("-"*50)    

---Preprocesing: Encoding Target 'Label' Column---
Original Lable Value Counts:
label
NORMAL                        2261548
DOS HULK                       172846
DDOS                           128014
DOS GOLDENEYE                   10286
FTP-PATATOR                      5931
DOS SLOWLORIS                    5385
DOS SLOWHTTPTEST                 5228
SSH-PATATOR                      3219
PORTSCAN                         1956
WEB ATTACK � BRUTE FORCE         1470
WEB ATTACK � XSS                  652
INFILTRATION                       36
WEB ATTACK � SQL INJECTION         21
HEARTBLEED                         11
Name: count, dtype: int64

Encoded Label Value Counts:
label_encoded
8     2261548
2      172846
0      128014
1       10286
5        5931
4        5385
3        5228
10       3219
9        1956
11       1470
13        652
7          36
12         21
6          11
Name: count, dtype: int64

Mapping of original labels to encoded numbers:
DDOS->0
DOS GOLDENEYE->1
DOS HULK->2
DOS SL

In [19]:
print("\n--- Skipping One-Hot Encoding for Feature Columns ---")
print("All feature columns are already numerical (not 'object' dtype), so no One-Hot Encoding is needed.")
print("-" * 50)


--- Skipping One-Hot Encoding for Feature Columns ---
All feature columns are already numerical (not 'object' dtype), so no One-Hot Encoding is needed.
--------------------------------------------------


In [20]:
print("\n---Starting Preprocessing: Scaling Numerical Features---")
numerical_features=df.select_dtypes(include=['int8','int16','int32','int64','float32','float64'])
columns_to_exclude_from_scaling=['label',
    'label_encoded',
    'protocol',
    'fwd_psh_flags',
    'bwd_psh_flags',
    'fwd_urg_flags',
    'bwd_urg_flags',
    'fin_flag_count',
    'syn_flag_count',
    'rst_flag_count',
    'psh_flag_count',
    'ack_flag_count',
    'urg_flag_count',
    'cwe_flag_count',
    'ece_flag_count',
    'downup_ratio',
    'fwd_avg_bytesbulk',
    'fwd_avg_packetsbulk',
    'fwd_avg_bulk_rate',
    'bwd_avg_bytesbulk',
    'bwd_avg_packetsbulk',
    'bwd_avg_bulk_rate']
numerical_features_to_scale=[col for col in numerical_features if col not in columns_to_exclude_from_scaling]
if not numerical_features_to_scale:
    print("No numerical features found for scaling (after excluding labels/IDs).")
else:
    print(f"\nScaling the following numerical features: {numerical_features_to_scale[:5]}... (showing first 5 of {len(numerical_features_to_scale)} columns)")
    scaler=StandardScaler()
    df[numerical_features_to_scale]=scaler.fit_transform(df[numerical_features_to_scale])
    print("\nFirst 5 rows of DataFrame after scaling (check numerical features):")
    pd.set_option('display.max_columns', None)
    print(df.head())
    pd.reset_option('display.max_columns') 
    print(f"\nDataFrame shape after scaling: {df.shape}")


---Starting Preprocessing: Scaling Numerical Features---

Scaling the following numerical features: ['flow_duration', 'total_fwd_packets', 'total_backward_packets', 'fwd_packets_length_total', 'bwd_packets_length_total']... (showing first 5 of 57 columns)

First 5 rows of DataFrame after scaling (check numerical features):
   protocol  flow_duration  total_fwd_packets  total_backward_packets  \
0         6      -0.485030          -0.011998               -0.010310   
1         6      -0.485033          -0.011998               -0.009399   
2        17      -0.482243          -0.011998               -0.010310   
3        17      -0.484001          -0.011998               -0.010310   
4         0       2.619704           0.164207               -0.011221   

   fwd_packets_length_total  bwd_packets_length_total  fwd_packet_length_max  \
0                 -0.062106                 -0.007681              -0.330554   
1                 -0.062106                 -0.007681              -0.33055

In [25]:
print("\n---Saving Final Cleaned and Scaled Dataset to Parquet file---")
output_parquet_filepath = os.path.join(OUTPUT_DIR, 'final_netshield_cleaned_scaled_dataset.parquet')
try:
    # Save the DataFrame (df) to the specified Parquet file path
    # index=False ensures pandas doesn't write the DataFrame's index as a column
    df.to_parquet(output_parquet_filepath, index=False)
    print(f"Cleaned and scaled dataset successfully saved to: {output_parquet_filepath}")
    print(f"Shape of saved dataset: {df.shape}")
except Exception as e:
    print(f"ERROR: Could not save cleaned and scaled dataset to Parquet: {e}")


---Saving Final Cleaned and Scaled Dataset to Parquet file---
Cleaned and scaled dataset successfully saved to: processed_data\final_netshield_cleaned_scaled_dataset.parquet
Shape of saved dataset: (2596603, 79)


In [22]:
print("--- Starting Step 5: Dividing Data into Training and Test Sets ---")
all_columns=df.columns.tolist()
features_to_exclude=['label','label_encoded']
X_columns=[col for col in all_columns if col not in features_to_exclude]
X=df[X_columns]
y=df['label_encoded']
print(f"\nShape of Features (X):{X.shape}")
print(f"Shape of Target (y):{y.shape}")
print("\nFirst 5 columns of X (Features):")
print(X.iloc[:, :5].head())

--- Starting Step 5: Dividing Data into Training and Test Sets ---

Shape of Features (X):(2596603, 77)
Shape of Target (y):(2596603,)

First 5 columns of X (Features):
   protocol  flow_duration  total_fwd_packets  total_backward_packets  \
0         6      -0.485030          -0.011998               -0.010310   
1         6      -0.485033          -0.011998               -0.009399   
2        17      -0.482243          -0.011998               -0.010310   
3        17      -0.484001          -0.011998               -0.010310   
4         0       2.619704           0.164207               -0.011221   

   fwd_packets_length_total  
0                 -0.062106  
1                 -0.062106  
2                 -0.057573  
3                 -0.057573  
4                 -0.062106  


In [23]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42,stratify=y)
print(f"\nShape of X_train (Training Features): {X_train.shape}")
print(f"Shape of X_test (Test Features): {X_test.shape}")
print(f"Shape of y_train (Training Target): {y_train.shape}")
print(f"Shape of y_test (Test Target): {y_test.shape}")
print("\nClass distribution in original dataset:")
print(y.value_counts(normalize=True))
print("\nClass distribution in y_train:")
print(y_train.value_counts(normalize=True))
print("\nClass distribution in y_test:")
print(y_test.value_counts(normalize=True))
print("-"*50)
print("Data successfully split into training and test sets!")





Shape of X_train (Training Features): (2077282, 77)
Shape of X_test (Test Features): (519321, 77)
Shape of y_train (Training Target): (2077282,)
Shape of y_test (Test Target): (519321,)

Class distribution in original dataset:
label_encoded
8     0.870964
2     0.066566
0     0.049301
1     0.003961
5     0.002284
4     0.002074
3     0.002013
10    0.001240
9     0.000753
11    0.000566
13    0.000251
7     0.000014
12    0.000008
6     0.000004
Name: proportion, dtype: float64

Class distribution in y_train:
label_encoded
8     0.870964
2     0.066566
0     0.049300
1     0.003961
5     0.002284
4     0.002074
3     0.002013
10    0.001240
9     0.000753
11    0.000566
13    0.000251
7     0.000014
12    0.000008
6     0.000004
Name: proportion, dtype: float64

Class distribution in y_test:
label_encoded
8     0.870964
2     0.066566
0     0.049301
1     0.003961
5     0.002284
4     0.002074
3     0.002014
10    0.001240
9     0.000753
11    0.000566
13    0.000252
7     0.000013
1