# Background
Dataset: https://www.takakura.com/Kyoto_data/new_data201704/

In [1]:
# Necessary imports
import pandas as pd
import os
import warnings
from shutil import rmtree
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import ipaddress
import numpy as np
from scipy.sparse import csr_matrix
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Pre-processing

### Converting the data from txt format to CSV format to make the data easier to work with

In [2]:
# Since our data has mixed attribute types, we suppress the pandas warning regarding it
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)

# From the data documentation
columns = [
    "Duration (seconds)",
    "Service Type",
    "Source bytes",
    "Destination bytes",
    "Count",
    "Same srv rate",
    "Serror rate",
    "Srv serror rate",
    "Dst host count",
    "Dst host srv count",
    "Dst host same src port rate",
    "Dst host serror rate",
    "Dst host srv serror rate",
    "Flag",
    "IDS detection",
    "Malware detection",
    "Ashula detection",
    "Label",
    "Source IP Address",
    "Source Port Number",
    "Destination IP Address",
    "Destination Port Number",
    "Start Time",
    "Protocol"
]

month_encoder = {
    1: "January",
    2: "February",
    3: "March",
    4: "April",
    5: "May",
    6: "June",
    7: "July",
    8: "August",
    9: "September",
    10: "October",
    11: "November",
    12: "December"
}

# Delete the CSVs folder if it already exists
rmtree("Data/CSVs/", ignore_errors = True)
# Make it again
os.mkdir("Data/CSVs/")

# Since we're iterating through all months in the year
for month in range(1, 2): #note: just start with first month (for testing), instead of (1,13)
    print(f"Creating {month_encoder[month]}.csv...")
    output_file = f"Data/CSVs/{month_encoder[month]}.csv"
    if month < 10:
        str_month = "0"+str(month)    # Since months are 2 digits in the file structure
    else:
        str_month = str(month)
    
    for data_file in os.listdir(f"Data/{str(str_month)}"):
        print(f"Processing File: {data_file}")
        file_path = os.path.join(f"Data/{str(str_month)}", data_file)
        # Read the TXT using tab delimiters since that's how it's structured
        day_data = pd.read_csv(file_path, delimiter="\t", names=columns, index_col=False)
        # If [month].csv doesn't exist yet, create it
        if not os.path.exists(output_file):
            day_data.to_csv(output_file, mode='w', header=True, index=False)
        # Or else append to it
        else:
            day_data.to_csv(output_file, mode='a', header=False, index=False)
        break #process only one file for testing

Creating January.csv...
Processing File: 20150101.txt


## Inspecting the data

In [None]:
def inspect_data(month):
    print("Inspection of data for " + month + ":")
    
    # Load the Data
    file_path = f"Data/CSVs/{month}.csv"
    data = pd.read_csv(file_path, index_col = False)

    # Print the table attributes
    print("Data Attributes:\n")
    attributes = data.columns.tolist()
    for i in range(0, len(attributes), 6):
        print("\t|\t".join(attributes[i:i+6]))
    #print("Attributes: " + ''.join(data.columns.tolist()))
    #print(data.columns.tolist())

    # Initial Data Summary
    print("\n\nFirst five rows of the dataset:")
    with pd.option_context('display.max_columns', 10):
        print(data.head(5)) 

    # Print statistocs of numerical features
    for column in data.select_dtypes(include='number').columns:
        print(f"\nStatistics for Numerical Feature {column}:")
        print(data[column].describe())

    # Print the number of missing values in each column
    for column in data.columns:
        missing_count = data[column].isnull().sum()
        print(f"Column '{column}' has {missing_count} missing values")

    # Print the unique values of categorical features
    for column in data.select_dtypes(include='object').columns:
        unique_values = data[column].nunique()
        print(f"Unique values in {column}: {unique_values}")

    # Correlations matrix between numerical features
    print("\nCorrelation Matrix:")
    correlation_matrix = data.select_dtypes(include='number').corr()
    sb.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Feature Correlation Heatmap")
    plt.show()

    """# Visualize Categorical Data Distributions
    for column in data.select_dtypes(include='object').columns:
        plt.figure()
        plt.title(f"Count Plot of {column}")
        sb.countplot(y=column, data=data)
        plt.show()"""

    """# Visualize Relationships Between Features
    # Example: Scatter Plot of Two Important Features
    if 'Duration' in data.columns and 'Label' in data.columns:
        plt.figure()
        plt.scatter(data['Duration'], data['Label'], alpha=0.5)
        plt.title('Duration vs. Label')
        plt.xlabel('Duration')
        plt.ylabel('Label')
        plt.show()"""

inspect_data("January")

### Encode The Data Before Split

In [3]:
# load the data
data = pd.read_csv("Data/CSVs/January.csv", index_col = False) #load only January dataset for initial testing

In [4]:
# network prefix grouping encoding
# note: categorize the data based on the network address
def extract_ipv6_net(ip_addr, net_len=64):
    ip = ipaddress.IPv6Address(ip_addr)
    ip_int = int(ip)

    mask = (1 << (128-net_len)) - 1 # mask = 111...1111 (64)
    prefixBits = ip_int & ~mask # keep the network portion (clearing subnet)
    return ipaddress.IPv6Address(prefixBits)   

#data["Source IP Address"] = data["Source IP Address"].apply(lambda x: extract_ipv6_net(x))
#print(data[["Source IP Address", "Destination IP Address"]].head())

# categorizing the Source/Destination IP addresses (vectorize the function, faster than .apply)
vectorized_extract = np.vectorize(extract_ipv6_net)
data["Source IP Address"] = vectorized_extract(data["Source IP Address"])
data["Destination IP Address"] = vectorized_extract(data["Destination IP Address"])

print(data["Source IP Address"].head())
#print(extract_ipv6_net("fd95:ec1e:6a61:f55c:1fa3:15ee:2e7a:0044"))

0    fd95:ec1e:6a61:f55c::
1    fd95:ec1e:6a61:f55c::
2    fd95:ec1e:6a61:f55c::
3    fd95:ec1e:6a61:f55c::
4    fd95:ec1e:6a61:5cd4::
Name: Source IP Address, dtype: object


In [8]:
# Label: indicates whether the session was attack or not; ‘1’ means the session was normal
# note: when categorizing if the data is attack data, 1 = attack, 0 = not attack
data["is_attack"] = data["Label"].apply(lambda x: 1 if x != 1 else 0)

# Count occurrences of 0 in "is_attack"
nonAttackCount = (data["is_attack"] == 0).sum()
attackCount = (data["is_attack"] == 1).sum()
print(data.shape) # 1.24m records, 24 attributes
print(f"Attack records: {attackCount}, Non attack records: {nonAttackCount}")

# encodes the non numerical attributes
# note: numerical attributes still need to be normalized
def encode_data(data):
    attributes = ["Service Type", "Flag", "Protocol"] #"Source IP Address","Destination IP Address", "Protocol"]#"Start Time", "Protocol"]
    df = pd.DataFrame(data)
    
    encoder = OneHotEncoder(sparse_output=True, drop="first")
    encoded_attr = encoder.fit_transform(df[attributes])
    encoded_columns = encoder.get_feature_names_out(attributes)

    # drop original categorical columns
    df = df.drop(columns=attributes).reset_index(drop=True)
    encoded_df = pd.DataFrame.sparse.from_spmatrix(encoded_attr, columns=encoded_columns)
    df = pd.concat([df, encoded_df], axis=1)
    return df

enc_data = encode_data(data)
print(enc_data.head())

(381105, 25)
Attack records: 339610, Non attack records: 41495
   Duration (seconds)  Source bytes  Destination bytes  Count  Same srv rate  \
0                 0.0             0                  0      0            0.0   
1                 0.0             0                  0      0            0.0   
2                 0.0             0                  0      0            0.0   
3                 0.0             0                  0      0            0.0   
4                 0.0             0                  0      0            0.0   

   Serror rate  Srv serror rate  Dst host count  Dst host srv count  \
0          0.0              0.0               0                   0   
1          0.0              0.0               0                   0   
2          0.0              0.0               0                   0   
3          0.0              0.0               0                   0   
4          0.0              0.0               0                   0   

   Dst host same src port rat

In [9]:
print(enc_data.columns)

Index(['Duration (seconds)', 'Source bytes', 'Destination bytes', 'Count',
       'Same srv rate', 'Serror rate', 'Srv serror rate', 'Dst host count',
       'Dst host srv count', 'Dst host same src port rate',
       'Dst host serror rate', 'Dst host srv serror rate', 'IDS detection',
       'Malware detection', 'Ashula detection', 'Label', 'Source IP Address',
       'Source Port Number', 'Destination IP Address',
       'Destination Port Number', 'Start Time', 'is_attack',
       'Service Type_http', 'Service Type_other', 'Service Type_rdp',
       'Service Type_sip', 'Service Type_smtp', 'Service Type_smtp,ssl',
       'Service Type_snmp', 'Service Type_ssh', 'Service Type_ssl', 'Flag_REJ',
       'Flag_RSTO', 'Flag_RSTOS0', 'Flag_RSTR', 'Flag_RSTRH', 'Flag_S0',
       'Flag_S1', 'Flag_S2', 'Flag_SF', 'Flag_SH', 'Flag_SHR', 'Protocol_tcp',
       'Protocol_udp'],
      dtype='object')


### Train/Test Split

In [12]:
# remove non unique columns
for col in enc_data:
    if len(enc_data[col].unique()) <= 1:
        print(f"Dropped non-unique: {col}")
        enc_data.drop(col)

# separating attack and labeling data for split
y = enc_data["is_attack"]
x = enc_data.drop(columns=['is_attack', 'Label','Start Time','IDS detection',
                            'Malware detection', 'Ashula detection', 'Source IP Address', 'Destination IP Address'])

# 30/70 split
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state = 83, test_size=0.3, shuffle=True)
X_train.head()

Unnamed: 0,Duration (seconds),Source bytes,Destination bytes,Count,Same srv rate,Serror rate,Srv serror rate,Dst host count,Dst host srv count,Dst host same src port rate,...,Flag_RSTR,Flag_RSTRH,Flag_S0,Flag_S1,Flag_S2,Flag_SF,Flag_SH,Flag_SHR,Protocol_tcp,Protocol_udp
362210,0.000542,44,104,12,1.0,0.0,0.0,83,99,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
123619,0.0,0,0,0,0.0,0.0,1.0,0,1,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
232834,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
41258,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
110704,0.0,0,0,3,1.0,0.33,0.23,22,22,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
for col in X_train.columns:
    print(col, X_train[col][1])

Duration (seconds) 0.0
Source bytes 0
Destination bytes 0
Count 0
Same srv rate 0.0
Serror rate 0.0
Srv serror rate 0.0
Dst host count 0
Dst host srv count 0
Dst host same src port rate 0.0
Dst host serror rate 0.0
Dst host srv serror rate 0.0
Source Port Number 47904
Destination Port Number 23
Service Type_http 0
Service Type_other 1.0
Service Type_rdp 0
Service Type_sip 0
Service Type_smtp 0
Service Type_smtp,ssl 0
Service Type_snmp 0
Service Type_ssh 0
Service Type_ssl 0
Flag_REJ 0
Flag_RSTO 0
Flag_RSTOS0 0
Flag_RSTR 0
Flag_RSTRH 0
Flag_S0 1.0
Flag_S1 0
Flag_S2 0
Flag_SF 0
Flag_SH 0
Flag_SHR 0
Protocol_tcp 1.0
Protocol_udp 0


# Modeling

### Random Forest Modeling

In [15]:
rf_classifier = RandomForestClassifier(max_depth = None).fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test) #predict based on the test data



In [17]:
# compare results from the predicted y values (if it is an attack or not), to the actual attack category (y_test)
confusion_mat_RFA = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = confusion_mat_RFA.ravel()
accuracy = (tp + tn)/(tn + fp + fn + tp)
precision = tp/(tp + fp)
recall = tp/(tp + fn)
score = 2 * (precision * recall)/(precision + recall)

print("Random Forest Evaluation:")
print(f"accuracy: {accuracy}")
print(f"precision: {precision}")
print(f"recall: {recall}")
print(f"F-1 score: {score}")

Random Forest Evaluation:
accuracy: 0.9979445824441101
precision: 0.9983528119852537
recall: 0.9993424345624246
F-1 score: 0.9988473781531565


# Post-processing

# Analysis + Accuracy