In [1]:
import numpy as np
import pandas as pd
import os
import warnings

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Libraries
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

# XGBoost Classifier
from xgboost import XGBClassifier, plot_importance
import xgboost as xgb

import joblib

# Suppress warnings
warnings.filterwarnings('ignore')

import pandas as pd
import plotly.graph_objects as go


In [2]:
def load_data(project_path, days=None):
    """
    Load network intrusion detection datasets from multiple days.

    For every requested day the file
    ``<project_path>/data/cicids2017/<day>.parquet`` is read with
    ``pd.read_parquet`` and the per-day frames are concatenated into a single
    DataFrame whose index is renumbered from 0.

    Parameters
    ----------
    project_path : str
        Root directory of the project (the parent of the ``data`` folder).
    days : str or iterable of str, optional
        Day names used as parquet file stems. Defaults to the five weekday
        files ``monday`` .. ``friday``. A single string is treated as one day.

    Returns
    -------
    pandas.DataFrame
        All requested days stacked row-wise, in the order given.

    Raises
    ------
    ValueError
        If ``days`` is an empty collection.
    """
    print("1. Loading Data...")
    if days is None:
        days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']
    elif isinstance(days, str):
        # A bare string would otherwise be iterated character by character.
        days = [days]
    else:
        days = list(days)

    if not days:
        raise ValueError("load_data requires at least one day to load")

    dfs = []
    for day in days:
        file_path = os.path.join(project_path, 'data', 'cicids2017', f'{day}.parquet')
        print(f"   Loading data for {day}: {file_path}")
        dfs.append(pd.read_parquet(file_path))

    # ignore_index=True discards the per-file indexes so row labels are unique.
    full_df = pd.concat(dfs, ignore_index=True)
    print(f"   Total records loaded: {len(full_df)}")
    print(f"   Columns in dataset: {list(full_df.columns)}\n")

    return full_df

In [3]:
# Project root. Set the CICIDS_PROJECT_PATH environment variable to run the
# notebook on another machine; otherwise the original location is used.
project_path = os.environ.get(
    'CICIDS_PROJECT_PATH',
    '/home/tan/Code/finalproject/xgboost-cicids2017',
)
# Load Data
full_df = load_data(project_path)


1. Loading Data...
   Loading data for monday: /home/tan/Code/finalproject/xgboost-cicids2017/data/cicids2017/monday.parquet
   Loading data for tuesday: /home/tan/Code/finalproject/xgboost-cicids2017/data/cicids2017/tuesday.parquet
   Loading data for wednesday: /home/tan/Code/finalproject/xgboost-cicids2017/data/cicids2017/wednesday.parquet
   Loading data for thursday: /home/tan/Code/finalproject/xgboost-cicids2017/data/cicids2017/thursday.parquet
   Loading data for friday: /home/tan/Code/finalproject/xgboost-cicids2017/data/cicids2017/friday.parquet
   Total records loaded: 2099976
   Columns in dataset: ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets', 'Total Length of Fwd Packet', 'Total Length of Bwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', '

In [4]:
# --- Share of traffic per destination port (donut chart) ---

# Cast 'Dst Port' to an Arrow-backed string column.
# Note: this overwrites the column on full_df itself.
full_df['Dst Port'] = full_df['Dst Port'].astype('string[pyarrow]')

# How many ports get a slice of their own
top_n = 6

# Occurrences of every destination port
dst_port_counts = full_df['Dst Port'].value_counts()

# The most frequent ports, and the combined count of all the others
top_ports = dst_port_counts.nlargest(top_n)
other_count = dst_port_counts.iloc[top_n:].sum()

# Only add an 'Other' slice when something is actually left over
if other_count > 0:
    other_series = pd.Series({'Other': other_count})
    top_ports = pd.concat([top_ports, other_series])

# Slice names and slice sizes, in matching order
labels = top_ports.index.tolist()
sizes = top_ports.values.tolist()

# Palette; only as many entries as there are slices are passed to the chart
colors = [
    'red',
    'green',
    'blue',
    'yellow',
    'orange',
    'purple',
    'cyan',
    'magenta',
    'lime',
    'pink',
    'grey',
]

# Pie trace with a hole in the middle; sort=False keeps the slice order above
donut_trace = go.Pie(
    labels=labels,
    values=sizes,
    hole=0.4,
    marker={'colors': colors[:len(labels)]},
    hoverinfo='label+percent+value',
    textinfo='percent',
    sort=False,
)
fig = go.Figure(data=[donut_trace])

# Title, a caption centred in the hole, and a visible legend
centre_caption = {'text': 'Dst Port', 'x': 0.5, 'y': 0.5, 'font_size': 20, 'showarrow': False}
fig.update_layout(
    title_text='Destination Port Distribution',
    annotations=[centre_caption],
    showlegend=True,
)

# Render the figure
fig.show()

In [7]:
# Port Scan attack filter
port_scan_df = full_df[full_df['Label'] == 'Portscan']
port_scan_df

Unnamed: 0,id,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,...,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label,Attempted Category
1622046,69628,172.16.0.1-192.168.10.50-33830-4848-6,172.16.0.1,33830,192.168.10.50,4848,6,2017-07-07 17:52:58.173454,44,1,...,0,0.0,0.0,0,0,-1,-1,44,Portscan,-1
1622047,69629,172.16.0.1-192.168.10.50-33830-1272-6,172.16.0.1,33830,192.168.10.50,1272,6,2017-07-07 17:52:58.178547,43,1,...,0,0.0,0.0,0,0,-1,-1,43,Portscan,-1
1622048,69630,172.16.0.1-192.168.10.50-39492-5862-6,172.16.0.1,39492,192.168.10.50,5862,6,2017-07-07 17:52:33.479484,54,1,...,0,0.0,0.0,0,0,-1,-1,54,Portscan,-1
1622049,69631,172.16.0.1-192.168.10.50-33830-2605-6,172.16.0.1,33830,192.168.10.50,2605,6,2017-07-07 17:52:58.179078,33,1,...,0,0.0,0.0,0,0,-1,-1,33,Portscan,-1
1622051,69633,172.16.0.1-192.168.10.50-39070-17988-6,172.16.0.1,39070,192.168.10.50,17988,6,2017-07-07 17:52:11.570780,50,1,...,0,0.0,0.0,0,0,-1,-1,50,Portscan,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2099966,547548,172.16.0.1-192.168.10.50-44198-32776-6,172.16.0.1,44198,192.168.10.50,32776,6,2017-07-07 17:55:44.538828,43,1,...,0,0.0,0.0,0,0,-1,-1,43,Portscan,-1
2099968,547550,172.16.0.1-192.168.10.50-62393-20000-6,172.16.0.1,62393,192.168.10.50,20000,6,2017-07-07 17:52:08.822650,52,1,...,0,0.0,0.0,0,0,-1,-1,52,Portscan,-1
2099971,547553,172.16.0.1-192.168.10.50-64318-2222-6,172.16.0.1,64318,192.168.10.50,2222,6,2017-07-07 18:09:11.499555,64,1,...,0,0.0,0.0,0,0,-1,-1,64,Portscan,-1
2099972,547554,172.16.0.1-192.168.10.50-33248-9040-6,172.16.0.1,33248,192.168.10.50,9040,6,2017-07-07 17:54:38.857717,76,1,...,0,0.0,0.0,0,0,-1,-1,76,Portscan,-1


In [10]:
# Ten destination ports with the most Portscan rows (count of non-null Labels per port)
(
    port_scan_df[['Dst Port', 'Label']]
    .groupby('Dst Port')
    .count()
    .sort_values(by='Label', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Label
Dst Port,Unnamed: 1_level_1
80,303
21,244
22,243
443,240
444,210
139,197
445,179
8045,160
2009,160
2006,160


In [11]:
# Summarise the Portscan subset: index range, column dtypes and non-null counts.
port_scan_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159066 entries, 1622046 to 2099973
Data columns (total 91 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   id                          159066 non-null  int64         
 1   Flow ID                     159066 non-null  object        
 2   Src IP                      159066 non-null  object        
 3   Src Port                    159066 non-null  int64         
 4   Dst IP                      159066 non-null  object        
 5   Dst Port                    159066 non-null  string        
 6   Protocol                    159066 non-null  int64         
 7   Timestamp                   159066 non-null  datetime64[ns]
 8   Flow Duration               159066 non-null  int64         
 9   Total Fwd Packet            159066 non-null  int64         
 10  Total Bwd packets           159066 non-null  int64         
 11  Total Length of Fwd Packet  159066 no

In [None]:
import plotly.express as px

# Number of Portscan rows per destination port, as a two-column frame
port_scan_dest_ports = port_scan_df['Dst Port'].value_counts().reset_index()
port_scan_dest_ports.columns = ['Dst Port', 'Request Count']

# Bar chart of the first 20 rows (value_counts orders ports by descending count)
fig = px.bar(
    port_scan_dest_ports.head(20),
    x='Dst Port',
    y='Request Count',
    title='Top 20 Destination Ports in Portscan Attacks',
    labels={
        'Dst Port': 'Destination Port',
        'Request Count': 'Number of Requests',
    },
)
fig.show()

In [16]:
# Group data by timestamp to count requests
port_scan_time_series = port_scan_df.groupby(port_scan_df['Timestamp'].dt.floor('T')).size().reset_index(name='Request Count')

# Plot the time-series chart
fig = px.line(port_scan_time_series, x='Timestamp', y='Request Count',
              title='Request Count Over Time (Portscan)',
              labels={'Timestamp': 'Timestamp', 'Request Count': 'Number of Requests'})
fig.show()


In [17]:
# Plot a random subset so the scatter stays responsive.
# The fixed random_state makes the figure reproducible across re-runs, and
# capping n at the frame length stops sample() from raising when the Portscan
# subset has fewer than 5000 rows.
scatter_sample_size = min(5000, len(port_scan_df))
port_scan_sample = port_scan_df.sample(n=scatter_sample_size, random_state=42)

fig = px.scatter(port_scan_sample, x='Src IP', y='Dst Port',
                 title='Scatter Plot of Source IP vs Destination Port (Portscan)',
                 labels={'Src IP': 'Source IP', 'Dst Port': 'Destination Port'},
                 color='Protocol')
fig.show()


In [19]:
# Number of distinct source IP values across the full dataset
full_df['Src IP'].nunique()

174

In [21]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Aggregate counts by destination port
port_scan_dest_ports = port_scan_df['Dst Port'].value_counts().reset_index()
port_scan_dest_ports.columns = ['Dst Port', 'Request Count']

# Group data by minute to count requests.
# 'min' replaces the 'T' alias, which pandas deprecated in 2.2; the result is the same.
port_scan_time_series = port_scan_df.groupby(port_scan_df['Timestamp'].dt.floor('min')).size().reset_index(name='Request Count')

# Create subplots: 2 rows, 1 column
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=("Top 20 Destination Ports in Portscan Attacks", "Request Count Over Time (Portscan)")
)

# Add bar chart to the first row
fig.add_trace(
    go.Bar(x=port_scan_dest_ports['Dst Port'].head(20), y=port_scan_dest_ports['Request Count'].head(20), name='Destination Ports'),
    row=1, col=1
)

# Add line chart to the second row
fig.add_trace(
    go.Scatter(x=port_scan_time_series['Timestamp'], y=port_scan_time_series['Request Count'], mode='lines', name='Request Count Over Time'),
    row=2, col=1
)

# Update layout. This call is deliberately the cell's last expression: its
# return value is what the notebook renders, since there is no fig.show().
fig.update_layout(
    height=800,  # Set the figure height
    title_text="Portscan Analysis: Destination Ports and Request Counts Over Time",
    showlegend=False
)

In [22]:
import plotly.graph_objects as go

# Aggregate counts by destination port
port_scan_dest_ports = port_scan_df['Dst Port'].value_counts().reset_index()
port_scan_dest_ports.columns = ['Dst Port', 'Request Count']

# Group data by minute to count requests.
# 'min' replaces the 'T' alias, which pandas deprecated in 2.2; the result is the same.
port_scan_time_series = port_scan_df.groupby(port_scan_df['Timestamp'].dt.floor('min')).size().reset_index(name='Request Count')

# Create a combined figure
fig = go.Figure()

# Add time-series line chart for request frequency (bottom x-axis, left y-axis)
fig.add_trace(
    go.Scatter(
        x=port_scan_time_series['Timestamp'],
        y=port_scan_time_series['Request Count'],
        mode='lines',
        name='Request Count Over Time',
        yaxis='y1'
    )
)

# Add bar chart for top destination ports (top x-axis, right y-axis).
# The bars get their own x-axis: port labels are categories, not dates, so
# they must not share the timestamp axis used by the line trace.
fig.add_trace(
    go.Bar(
        x=port_scan_dest_ports['Dst Port'].head(20),
        y=port_scan_dest_ports['Request Count'].head(20),
        name='Top Destination Ports',
        xaxis='x2',
        yaxis='y2'
    )
)

# Update layout with two x-axes and two y-axes.
# Axis title fonts use title=dict(text=..., font=...) because the legacy
# `titlefont` attribute is deprecated and no longer accepted by plotly.py 6.
fig.update_layout(
    title="Portscan Analysis: Request Frequency and Destination Ports",
    xaxis=dict(title="Timestamp"),
    xaxis2=dict(
        title="Destination Port",
        type="category",
        overlaying="x",
        side="top"
    ),
    yaxis=dict(
        title=dict(text="Request Count Over Time", font=dict(color="blue")),
        tickfont=dict(color="blue"),
    ),
    yaxis2=dict(
        title=dict(text="Top Destination Ports", font=dict(color="orange")),
        tickfont=dict(color="orange"),
        anchor="free",
        overlaying="y",
        side="right",
        position=1.0
    ),
    legend=dict(x=0.1, y=1.1, orientation="h")
)

# Show plot
fig.show()
