In [55]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import regularizers
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pyshark

import nest_asyncio
#nest_asyncio.apply()

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')



In [32]:
# Preprocess log to csv
with open('conn.log.labeled', 'r') as f:
    lines = f.readlines()
    
    with open('conn.log.labeled.csv', 'w') as ff:
        for line in lines[8:]:
            line = line.replace('\t', ',').replace('   ', ',')
            ff.write(line)        

In [34]:
# Load training data
#data_train = pd.read_csv('KDDTrain+.txt')

data_train = pd.read_csv('conn.log.labeled.csv')

data_train.head()


Unnamed: 0,1525879831.015811,CUmrqr4svHuSXJy5z7,192.168.100.103,51524,65.127.233.163,23,tcp,-,2.999051,0,0.1,S0,-.1,-.2,0.2,S,3,180,0.3,0.4,(empty),Malicious,PartOfAHorizontalPortScan
0,1525879831.025055,CH98aB3s1kJeq6SFOc,192.168.100.103,56305.0,63.150.16.171,23.0,tcp,-,-,-,-,S0,-,-,0.0,S,1.0,60.0,0.0,0.0,(empty),Malicious,PartOfAHorizontalPortScan
1,1525879831.045045,C3GBTkINvXNjVGtN5,192.168.100.103,41101.0,111.40.23.49,23.0,tcp,-,-,-,-,S0,-,-,0.0,S,1.0,60.0,0.0,0.0,(empty),Malicious,PartOfAHorizontalPortScan
2,1525879832.01624,CDe43c1PtgynajGI6,192.168.100.103,60905.0,131.174.215.147,23.0,tcp,-,2.998796,0,0,S0,-,-,0.0,S,3.0,180.0,0.0,0.0,(empty),Malicious,PartOfAHorizontalPortScan
3,1525879832.024985,CJaDcG3MZzvf1YVYI4,192.168.100.103,44301.0,91.42.47.63,23.0,tcp,-,-,-,-,S0,-,-,0.0,S,1.0,60.0,0.0,0.0,(empty),Malicious,PartOfAHorizontalPortScan
4,1525879832.044975,CMBrup3BLXivSp4Avc,192.168.100.103,50244.0,120.210.108.200,23.0,tcp,-,-,-,-,S0,-,-,0.0,S,1.0,60.0,0.0,0.0,(empty),Malicious,PartOfAHorizontalPortScan


In [38]:
columns = (["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto", "service", "duration", "orig_bytes", "resp_bytes", "conn_state", "local_orig", "local_resp", "missed_bytes", "history", "orig_pkts", "orig_ip_bytes", "resp_pkts", "resp_ip_bytes", "tunnel_parents", "label", "detailed-label"])

data_train.columns = columns
data_train.loc[data_train['label'] == "Benign", "label"] = 'normal'
data_train.loc[data_train['label'] != 'Benign', "label"] = 'attack'

data_train.head()


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
0,1525879831.025055,CH98aB3s1kJeq6SFOc,192.168.100.103,56305.0,63.150.16.171,23.0,tcp,-,-,-,-,S0,-,-,0.0,S,1.0,60.0,0.0,0.0,(empty),attack,PartOfAHorizontalPortScan
1,1525879831.045045,C3GBTkINvXNjVGtN5,192.168.100.103,41101.0,111.40.23.49,23.0,tcp,-,-,-,-,S0,-,-,0.0,S,1.0,60.0,0.0,0.0,(empty),attack,PartOfAHorizontalPortScan
2,1525879832.01624,CDe43c1PtgynajGI6,192.168.100.103,60905.0,131.174.215.147,23.0,tcp,-,2.998796,0,0,S0,-,-,0.0,S,3.0,180.0,0.0,0.0,(empty),attack,PartOfAHorizontalPortScan
3,1525879832.024985,CJaDcG3MZzvf1YVYI4,192.168.100.103,44301.0,91.42.47.63,23.0,tcp,-,-,-,-,S0,-,-,0.0,S,1.0,60.0,0.0,0.0,(empty),attack,PartOfAHorizontalPortScan
4,1525879832.044975,CMBrup3BLXivSp4Avc,192.168.100.103,50244.0,120.210.108.200,23.0,tcp,-,-,-,-,S0,-,-,0.0,S,1.0,60.0,0.0,0.0,(empty),attack,PartOfAHorizontalPortScan


In [61]:
def Scaling(df_num, cols):
    std_scaler = RobustScaler()
    std_scaler_temp = std_scaler.fit_transform(df_num)
    std_df = pd.DataFrame(std_scaler_temp, columns=cols)
    return std_df

def preprocess_with_padding(dataframe, original_columns):
    # Preprocess the data as usual
    dataframe['protocol_type'] = dataframe['protocol_type'].astype('category').cat.codes
    df_num = dataframe
    num_cols = df_num.columns
    scaled_df = Scaling(df_num, num_cols)
    dataframe.drop(labels=num_cols, axis="columns", inplace=True)
    dataframe[num_cols] = scaled_df[num_cols]

    # Convert 'outcome' to binary labels
    dataframe.loc[dataframe['outcome'] == "normal", "outcome"] = 0
    dataframe.loc[dataframe['outcome'] != 0, "outcome"] = 1

    dataframe = pd.get_dummies(dataframe, columns=['protocol_type', 'service', 'flag'], drop_first=True)

    # Add missing columns from the original training data
    for col in original_columns:
        if col not in dataframe.columns:
            dataframe[col] = 0

    # Ensure the columns are in the same order
    dataframe = dataframe[original_columns]
    return dataframe

# Preprocess training data
def preprocess(dataframe):
    df_num = dataframe
    num_cols = df_num.columns
    scaled_df = Scaling(df_num, num_cols)
    dataframe.drop(labels=num_cols, axis="columns", inplace=True)
    dataframe[num_cols] = scaled_df[num_cols]

    dataframe.loc[dataframe['outcome'] == "normal", "outcome"] = 0
    dataframe.loc[dataframe['outcome'] != 0, "outcome"] = 1

    dataframe = pd.get_dummies(dataframe, columns=['protocol_type', 'service', 'flag'])
    return dataframe

In [63]:
scaled_train = preprocess(data_train)
x = scaled_train.drop(['label', 'level'], axis=1).values.astype('float32')
y = scaled_train['outcome'].values.astype('int')
y_reg = scaled_train['level'].values

pca = PCA(n_components=20)
pca = pca.fit(x)
x_reduced = pca.transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

ValueError: could not convert string to float: '#close'

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=64, activation='relu', input_shape=(x_train.shape[1:]),
                          kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
                          bias_regularizer=regularizers.L2(1e-4),
                          activity_regularizer=regularizers.L2(1e-5)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(units=128, activation='relu',
                          kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
                          bias_regularizer=regularizers.L2(1e-4),
                          activity_regularizer=regularizers.L2(1e-5)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(units=512, activation='relu',
                          kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
                          bias_regularizer=regularizers.L2(1e-4),
                          activity_regularizer=regularizers.L2(1e-5)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(units=128, activation='relu',
                          kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
                          bias_regularizer=regularizers.L2(1e-4),
                          activity_regularizer=regularizers.L2(1e-5)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=1, verbose=1)

In [None]:
# Load the pcap file
capture = pyshark.FileCapture('2018-05-09-192.168.100.103.pcap')

In [None]:
# Process packets
packet_data = []
for i, packet in enumerate(capture):
    if i > 1000:
        break
    
    
    try:
        packet_info = {
            'duration': float(packet.sniff_time.timestamp()),
            'protocol_type': packet.highest_layer,
            'service': packet.transport_layer if hasattr(packet, 'transport_layer') else 'unknown',
            'flag': packet.tcp.flags if hasattr(packet, 'tcp') else 0,
            'src_bytes': int(packet.length),
            'dst_bytes': 0,
            'land': 1 if packet.ip.src == packet.ip.dst else 0,
            # Add other fields as needed
            'outcome': 0,
            'level': 0
        }
        packet_data.append(packet_info)
    except AttributeError:
        continue

In [None]:
# Convert to DataFrame and preprocess
df = pd.DataFrame(packet_data)
original_feature_columns = list(scaled_train.drop(['outcome', 'level'], axis=1).columns)
preprocessed_new_data = preprocess_with_padding(df, original_feature_columns)
x_new = preprocessed_new_data.drop(['outcome', 'level'], axis=1, errors='ignore').values.astype('float32')

# Transform with PCA and make predictions
x_new_reduced = pca.transform(x_new)
predictions = model.predict(x_new_reduced)
predicted_outcomes = (predictions > 0.5).astype('int')

# Save predictions
df['predicted_outcome'] = predicted_outcomes
df.to_csv('data/predicted_new_data.csv', index=False)

print("Predictions saved to 'data/predicted_new_data.csv'")