In [1]:
import numpy as np
import pandas as pd
import os
import warnings

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Libraries
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

# XGBoost Classifier
from xgboost import XGBClassifier, plot_importance
import xgboost as xgb

import joblib

# Suppress warnings
warnings.filterwarnings('ignore')

import pandas as pd
import plotly.graph_objects as go


In [2]:
def load_data(project_path, days=('monday', 'tuesday', 'wednesday', 'thursday', 'friday')):
    """
    Load network intrusion detection datasets from multiple days.

    Reads ``<project_path>/data/cicids2017/<day>.csv`` for every entry of
    ``days`` and stacks the resulting frames row-wise, in the order given.

    Parameters
    ----------
    project_path : str or os.PathLike
        Directory that contains the ``data/cicids2017`` folder.
    days : iterable of str, or str, optional
        File stems (without ``.csv``) to load. Defaults to the five weekday
        files. A single string is treated as one day.

    Returns
    -------
    pandas.DataFrame
        All rows of all requested files with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If ``days`` is empty.
    FileNotFoundError
        If any requested CSV does not exist. Raised before any file is
        parsed, and the message lists every missing path.
    """
    print("1. Loading Data...")

    # A bare string would otherwise be iterated character by character.
    if isinstance(days, str):
        days = [days]
    days = list(days)
    if not days:
        raise ValueError("load_data needs at least one day to load")

    data_dir = os.path.join(project_path, 'data', 'cicids2017')
    day_files = [(day, os.path.join(data_dir, f'{day}.csv')) for day in days]

    # Fail fast: check all paths first so a missing file is reported before
    # time is spent parsing the files that precede it.
    missing = [file_path for _, file_path in day_files if not os.path.isfile(file_path)]
    if missing:
        raise FileNotFoundError(f"Missing dataset file(s): {', '.join(missing)}")

    dfs = []
    for day, file_path in day_files:
        print(f"   Loading data for {day}: {file_path}")
        dfs.append(pd.read_csv(file_path))

    # ignore_index=True discards each file's own row numbers.
    full_df = pd.concat(dfs, ignore_index=True)
    print(f"   Total records loaded: {len(full_df)}")
    print(f"   Columns in dataset: {list(full_df.columns)}\n")

    return full_df

In [3]:
# Root directory of the project; load_data() reads the CSVs from
# <project_path>/data/cicids2017/. Set the NETVISER_PROJECT_PATH environment
# variable to run this notebook on another machine without editing the cell;
# when it is unset the original location is used.
project_path = os.environ.get('NETVISER_PROJECT_PATH', '/Users/supakrit-a/Code/netviser')

# Load Data
full_df = load_data(project_path)


1. Loading Data...
   Loading data for monday: /Users/supakrit-a/Code/netviser/data/cicids2017/monday.csv
   Loading data for tuesday: /Users/supakrit-a/Code/netviser/data/cicids2017/tuesday.csv
   Loading data for wednesday: /Users/supakrit-a/Code/netviser/data/cicids2017/wednesday.csv
   Loading data for thursday: /Users/supakrit-a/Code/netviser/data/cicids2017/thursday.csv
   Loading data for friday: /Users/supakrit-a/Code/netviser/data/cicids2017/friday.csv
   Total records loaded: 2099976
   Columns in dataset: ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets', 'Total Length of Fwd Packet', 'Total Length of Bwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flo

In [5]:
# --- Destination-port share of all flows, drawn as a donut chart ---

# Number of individual ports that get their own slice; everything else is pooled.
top_n = 6

# Slice colours in slice order. The list is longer than needed and is trimmed
# to the number of slices when the trace is built.
colors = 'red green blue yellow orange purple cyan magenta lime pink grey'.split()

# Treat ports as labels rather than numbers.
# NOTE(review): this overwrites the column on the shared `full_df`, so every
# cell that runs afterwards sees 'Dst Port' as strings - confirm that is intended.
full_df['Dst Port'] = full_df['Dst Port'].astype('str')

# Occurrences per port, most frequent first.
dst_port_counts = full_df['Dst Port'].value_counts()

# The top_n busiest ports, and the combined count of all remaining ports.
top_ports = dst_port_counts.nlargest(top_n)
other_count = dst_port_counts.sum() - top_ports.sum()

# Add the pooled slice only when there is something left to pool.
if other_count > 0:
    other_series = pd.Series([other_count], index=['Other'])
    top_ports = pd.concat([top_ports, other_series])

# Plain Python lists for plotly.
labels, sizes = top_ports.index.tolist(), top_ports.tolist()

# Pie trace with a hole in the middle; sort=False keeps the slices in the
# order built above instead of letting plotly reorder them.
donut_trace = go.Pie(
    labels=labels,
    values=sizes,
    hole=0.4,
    marker={'colors': colors[:len(labels)]},
    hoverinfo='label+percent+value',
    textinfo='percent',
    sort=False,
)
fig = go.Figure(data=[donut_trace])

# Title, legend, and a caption centred inside the hole.
center_caption = {
    'text': 'Dst Port',
    'x': 0.5,
    'y': 0.5,
    'font_size': 20,
    'showarrow': False,
}
fig.update_layout(
    title_text='Destination Port Distribution',
    annotations=[center_caption],
    showlegend=True,
)

# Render the figure.
fig.show()