## Data Preprocessing
#### - Preprocessing

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set(style='darkgrid')
import matplotlib.pyplot as plt

In [11]:
# Load the intermediate CSV into a DataFrame; the path is relative to the
# notebook's working directory. (Judging by the file name this is the output
# of the EDA step -- confirm against the notebook that writes it.)
data = pd.read_csv('intermediate_data/Data_after_EDA.csv')

In [12]:
# Reduce the DataFrame's memory footprint (and the risk of memory-related
# errors in later steps) by narrowing 64-bit numeric columns to 32-bit.
#
# Notes:
# - Only plain NumPy float/integer columns wider than 32 bits are touched.
#   Columns that are already 32-bit or narrower are left alone so they are
#   never widened, and non-NumPy (extension) dtypes are skipped entirely.
# - A column is converted only if its observed min and max both fit in the
#   target type (bounds inclusive). Columns whose min/max are NaN or +/-inf
#   fail the comparison and keep their original dtype.
# - float64 -> float32 keeps the value range but reduces precision to
#   roughly 7 significant digits.
old_memory_usage = data.memory_usage().sum() / 1024 ** 2
print(f'Initial memory usage: {old_memory_usage:.2f} MB')

float32_limits = np.finfo(np.float32)
int32_limits = np.iinfo(np.int32)

for col in data.columns:
    col_type = data[col].dtype

    # Skip extension dtypes and anything that would not shrink at 32 bits.
    if not isinstance(col_type, np.dtype) or col_type.itemsize <= 4:
        continue

    if np.issubdtype(col_type, np.floating):
        target_type, limits = np.float32, float32_limits
    elif np.issubdtype(col_type, np.integer):
        target_type, limits = np.int32, int32_limits
    else:
        # object, datetime, complex, ... -- nothing to downcast.
        continue

    c_min = data[col].min()
    c_max = data[col].max()

    # Inclusive bounds: values exactly at the target type's limits still fit.
    if limits.min <= c_min and c_max <= limits.max:
        data[col] = data[col].astype(target_type)

new_memory_usage = data.memory_usage().sum() / 1024 ** 2
print(f"Final memory usage: {new_memory_usage:.2f} MB")

Initial memory usage: 1520.28 MB
Final memory usage: 789.01 MB


In [13]:
# Fraction of the initial footprint that the downcast saved, rendered as a
# percentage with two decimals.
saved_fraction = 1 - new_memory_usage / old_memory_usage
print(f'Reduced memory usage: {saved_fraction:.2%}')

Reduced memory usage: 48.10%


In [14]:
# Print the frame's summary (index range, column names and dtypes) so the
# effect of the earlier downcasting cell on each column's dtype can be checked.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2522362 entries, 0 to 2522361
Data columns (total 79 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Destination Port             int32  
 1   Flow Duration                int32  
 2   Total Fwd Packets            int32  
 3   Total Backward Packets       int32  
 4   Total Length of Fwd Packets  int32  
 5   Total Length of Bwd Packets  int32  
 6   Fwd Packet Length Max        int32  
 7   Fwd Packet Length Min        int32  
 8   Fwd Packet Length Mean       float32
 9   Fwd Packet Length Std        float32
 10  Bwd Packet Length Max        int32  
 11  Bwd Packet Length Min        int32  
 12  Bwd Packet Length Mean       float32
 13  Bwd Packet Length Std        float32
 14  Flow Bytes/s                 float32
 15  Flow Packets/s               float32
 16  Flow IAT Mean                float32
 17  Flow IAT Std                 float32
 18  Flow IAT Max                 int32  
 19  

In [15]:
# Allow up to 80 rows to be rendered so the transposed summary below (one row
# per numeric column) is displayed without being truncated.
pd.set_option('display.max_rows', 80)

# Descriptive statistics, flipped so each column of `data` becomes a row.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Destination Port,2522362.0,8704.762,19025.07,0.0,53.0,80.0,443.0,65535.0
Flow Duration,2522362.0,16581320.0,35224260.0,-13.0,208.0,50577.0,5329717.0,120000000.0
Total Fwd Packets,2522362.0,10.27627,794.1738,1.0,2.0,2.0,6.0,219759.0
Total Backward Packets,2522362.0,11.56596,1056.594,0.0,1.0,2.0,5.0,291922.0
Total Length of Fwd Packets,2522362.0,611.5751,10584.99,0.0,12.0,66.0,332.0,12900000.0
Total Length of Bwd Packets,2522362.0,18133.15,2397434.0,0.0,6.0,155.0,988.0,655453000.0
Fwd Packet Length Max,2522362.0,231.0918,756.1625,0.0,6.0,40.0,202.0,24820.0
Fwd Packet Length Min,2522362.0,19.19464,60.79447,0.0,0.0,2.0,37.0,2325.0
Fwd Packet Length Mean,2522362.0,63.47011,195.5015,0.0,6.0,36.111111,52.0,5940.857
Fwd Packet Length Std,2522362.0,77.27759,296.7953,0.0,0.0,0.0,74.13914,7125.597
