In [1]:
import json
import pandas as pd
import numpy as np
import bnlearn as bn
from sklearn.mixture import BayesianGaussianMixture

In [2]:
# Location of the training CSV read by the next cell.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
path = "/home/aschoen/my_storage/aschoen/dataset/flow_chronicle_dataset/xp2/CIDDS_xp2_train.csv"

## Load Training Data

In [3]:
# Read the training CSV into a DataFrame; `df` is kept unmodified and reused
# later for the public-IP list, the origin date and the output column order.
df = pd.read_csv(path)

In [4]:
# Display the loaded frame as a quick visual sanity check.
df

Unnamed: 0,Date first seen,Proto,Src IP Addr,Dst IP Addr,Dst Pt,In Byte,Out Byte
0,2017-04-05 00:00:00.266,TCP,192.168.220.15,192.168.100.5,445.0,743149900.0,547287300.0
1,2017-04-05 00:00:01.264,UDP,192.168.200.4,192.168.200.255,137.0,0.0,0.0
2,2017-04-05 00:00:10.534,TCP,192.168.220.4,192.168.100.5,445.0,9727132.0,522285.0
3,2017-04-05 00:00:11.275,UDP,192.168.200.5,192.168.200.255,138.0,0.0,0.0
4,2017-04-05 00:00:15.394,TCP,192.168.220.12,192.168.100.5,445.0,787059800.0,138529000.0
5,2017-04-05 00:00:15.497,TCP,192.168.220.7,192.168.100.5,445.0,559940800.0,422107700.0
6,2017-04-05 00:00:17.805,TCP,192.168.220.9,192.168.100.5,445.0,746642800.0,19818660.0
7,2017-04-05 00:00:19.845,TCP,192.168.220.13,192.168.100.5,445.0,1263817000.0,2915943000.0
8,2017-04-05 00:00:23.082,TCP,192.168.220.5,192.168.100.5,445.0,241165800.0,344115400.0
9,2017-04-05 00:00:25.054,UDP,192.168.220.8,192.168.220.255,137.0,0.0,0.0


In [82]:
# Work on a copy so the raw frame `df` stays untouched for later cells.
train = df.copy()

In [83]:
# Replace the raw timestamp with two derived features:
#   Day  - weekday index as returned by `.dt.dayofweek`
#   Time - seconds elapsed since midnight
# NOTE: this cell mutates `train` in place and drops the source column, so it
# cannot be re-run without first re-running the `train = df.copy()` cell.
train["Date first seen"] = pd.to_datetime(train["Date first seen"])
train["Day"] = train["Date first seen"].dt.dayofweek
train["Time"] = pd.to_timedelta(
    train["Date first seen"].dt.time.astype(str)
).dt.total_seconds()
train.drop(columns="Date first seen", inplace=True)

In [84]:
# Destination addresses whose label contains "_" but not "EXT"; these are
# collapsed into a single placeholder before training and restored afterwards.
ip_pubs = list(
    filter(
        lambda ip: "_" in ip and "EXT" not in ip,
        df["Dst IP Addr"].unique(),
    )
)

In [85]:
# Collapse every destination address listed in `ip_pubs` into the single
# placeholder value "IP_PUB".
train["Dst IP Addr"] = train["Dst IP Addr"].where(
    ~train["Dst IP Addr"].isin(ip_pubs), "IP_PUB"
)

## Preprocess Training Data

In [86]:
def preprocess(df, n, name, v, max_iter):
    """Discretise the continuous columns with per-column Bayesian Gaussian mixtures.

    For each of "In Byte", "Out Byte" and "Time":
      * values up to the column's 90th percentile are fitted with a
        ``BayesianGaussianMixture`` and replaced by the predicted component
        index (0 .. n-1);
      * values above the 90th percentile are replaced by the extra code ``n``.

    The per-column statistics needed to invert the encoding are written to the
    JSON file ``name`` under the keys "edge" (90th percentile), "max",
    "weights", "means" and "std".  Note that "std" holds the mixture
    *covariances* (variances), not standard deviations; ``reconstruct`` takes
    the square root when sampling.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    n : int
        Number of mixture components (also used as the tail code).
    name : str
        Path of the JSON statistics file to write.
    v : int
        Verbosity forwarded to ``BayesianGaussianMixture``.
    max_iter : int
        Iteration cap forwarded to ``BayesianGaussianMixture``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three columns encoded and every column cast to
        the ``category`` dtype.
    """
    class NumpyEncoder(json.JSONEncoder):
        """JSON encoder that converts numpy scalars and arrays to native types."""
        def default(self, obj):
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return super().default(obj)

    data = df.copy()
    continuous = data.loc[:, ["In Byte", "Out Byte", "Time"]]
    dic = {}
    for col in continuous.columns:
        # Ask for the 90th percentile directly instead of indexing the output
        # of describe() by position, which depends on describe()'s row layout.
        m = continuous[col].quantile(0.9)
        # Masks come from the untouched `data[col]`, so they stay valid while
        # `continuous[col]` is being overwritten below.
        body = data[col] <= m
        tail = data[col] > m
        temp = continuous.loc[body, col].to_numpy().reshape(-1, 1)
        gm = BayesianGaussianMixture(
            n_components=n, random_state=0, verbose=v, max_iter=max_iter
        ).fit(temp)
        # ravel() (not squeeze()) keeps the stats one-dimensional even when
        # n == 1, so they are always serialised as JSON lists.
        dic[col] = {
            "edge": m,
            "max": continuous[col].max(),
            "weights": gm.weights_.ravel(),
            "means": gm.means_.ravel(),
            "std": gm.covariances_.ravel(),
        }
        continuous.loc[body, col] = gm.predict(temp)
        continuous.loc[tail, col] = n
        data[col] = continuous[col]
    # The context manager closes the file; no explicit close() is needed.
    with open(name, "w") as output:
        json.dump(dic, output, cls=NumpyEncoder)
    return data.astype("category")

In [87]:
def reconstruct(df, dic):
    """Turn discretised "In Byte" / "Out Byte" / "Time" codes back into numbers.

    For each of the three columns, with ``n = len(dic[col]["weights"])``:
      * code ``n`` (the tail bucket) is replaced by a draw from
        ``Uniform(dic[col]["edge"], dic[col]["max"])``;
      * codes ``< n`` are replaced by a draw from the normal distribution of
        the matching mixture component.  ``dic[col]["std"]`` stores variances,
        hence the square root;
      * codes ``> n`` are left as they are;
      * results are clipped at zero, and every column except "Time" is rounded
        to one decimal.

    Sampling uses the global ``np.random`` state (uniform draw first, then the
    normal draw, per column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose three columns hold values castable to ``int``.
    dic : dict
        Per-column statistics as written by ``preprocess``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three columns replaced by float values.
    """
    data = df.copy()
    for col in ["In Byte", "Out Byte", "Time"]:
        stats = dic[col]
        # atleast_1d tolerates stats files where a one-component fit was
        # stored as bare scalars instead of one-element lists.
        means = np.atleast_1d(np.asarray(stats["means"], dtype=float))
        variances = np.atleast_1d(np.asarray(stats["std"], dtype=float))
        n = len(np.atleast_1d(stats["weights"]))

        # Both masks are taken from the ORIGINAL codes before anything is
        # overwritten; otherwise a tail draw that happens to be smaller than
        # n would be mistaken for a component code and re-sampled.
        codes = data[col].astype(int).to_numpy()
        is_tail = codes == n
        is_component = codes < n

        # Build the result in a float buffer rather than writing floats into
        # an integer column.
        values = codes.astype(float)
        values[is_tail] = np.random.uniform(
            stats["edge"], stats["max"], int(is_tail.sum())
        )
        component = codes[is_component]
        values[is_component] = np.random.normal(
            means[component], np.sqrt(variances[component])
        )

        values = np.maximum(values, 0.0)
        if col != "Time":
            values = values.round(1)
        data[col] = values
    return data

In [88]:
# Encode the continuous columns with 40 mixture components each and persist
# the fitted statistics next to the notebook.
train_p = preprocess(
    train,
    n=40,
    name="xp2_bn_stats.json",
    v=2,
    max_iter=50,
)

Initialization 0
Initialization converged: True	 time lapse 0.10795s	 ll -1395.93864
Initialization 0
Initialization converged: True	 time lapse 0.09648s	 ll -1301.39874
Initialization 0
  Iteration 10	 time lapse 0.11227s	 ll change 6.59869
  Iteration 20	 time lapse 0.09240s	 ll change 0.97066
Initialization converged: True	 time lapse 0.22713s	 ll -206.31776


In [89]:
def build_time_dep(df, steps=5):
    """Pack every ``steps`` consecutive rows of ``df`` into one wide row.

    Columns of the result are named ``<original>_<k>`` for k = 1 .. steps,
    where block k holds the row located k-1 positions after the window start.
    Windows start at every ``steps``-th row; any window containing a missing
    value (including the incomplete window at the end) is dropped.
    """
    shifted_blocks = []
    labels = []
    for lag in range(steps):
        shifted_blocks.append(df.shift(-lag))
        labels.extend(f'{col}_{lag+1}' for col in df.columns)

    wide = pd.concat(shifted_blocks, axis=1)
    wide.columns = labels
    # Keep only the first row of each non-overlapping window.
    windows = wide.iloc[::steps, :]
    return windows.dropna()

In [90]:
# Wide training frame: each row holds 5 consecutive encoded flows.
train_5step = build_time_dep(train_p, steps=5)

In [91]:
def regroup(df, steps):
    """Undo ``build_time_dep``: unfold a wide frame back into one row per step.

    Every column named ``<base>_<k>`` contributes its values to output column
    ``<base>`` at index ``original_index + (k - 1)``.  The input index is
    therefore expected to be numeric and spaced so that the shifted indices do
    not collide.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame whose column names end in ``_<k>`` with integer k >= 1.
        It is not modified.
    steps : int
        Unused; the step of each column is read from its name suffix.  Kept
        so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One column per base name, in order of first appearance in ``df``,
        indexed by the sorted shifted index.
    """
    # Group columns by their exact base name (text before the last "_").
    # A substring test would mix e.g. "Byte_1" with "In Byte_1", and a set
    # would make the output column order vary between runs.
    groups = {}
    for col in df.columns:
        base, _, suffix = col.rpartition('_')
        groups.setdefault(base, []).append((col, int(suffix) - 1))

    res = []
    for base, members in groups.items():
        pieces = []
        for col, offset in members:
            # Copy before relabelling so the caller's frame is left untouched.
            piece = df[col].copy()
            piece.name = base
            piece.index = df.index + offset
            pieces.append(piece)
        res.append(pd.concat(pieces, axis=0).sort_index())
    return pd.concat(res, axis=1)

In [92]:
# Round-trip check: unfolding the wide frame should give back one row per flow.
regroup(train_5step, steps=5)

Unnamed: 0,Src IP Addr,Day,In Byte,Dst IP Addr,Proto,Dst Pt,Out Byte,Time
0,192.168.220.15,2,1.0,192.168.100.5,TCP,445.0,40.0,1.0
1,192.168.200.4,2,0.0,192.168.200.255,UDP,137.0,0.0,1.0
2,192.168.220.4,2,0.0,192.168.100.5,TCP,445.0,0.0,1.0
3,192.168.200.5,2,0.0,192.168.200.255,UDP,138.0,0.0,1.0
4,192.168.220.12,2,40.0,192.168.100.5,TCP,445.0,1.0,1.0
5,192.168.220.7,2,1.0,192.168.100.5,TCP,445.0,40.0,1.0
6,192.168.220.9,2,40.0,192.168.100.5,TCP,445.0,0.0,1.0
7,192.168.220.13,2,40.0,192.168.100.5,TCP,445.0,40.0,1.0
8,192.168.220.5,2,1.0,192.168.100.5,TCP,445.0,40.0,1.0
9,192.168.220.8,2,0.0,192.168.220.255,UDP,137.0,0.0,1.0


## Train Network Structures

In [93]:
# Learn a network structure over the per-flow columns, using bnlearn's 'hc'
# search method scored with 'bic'.
model_indep  = bn.structure_learning.fit(train_p, methodtype='hc', scoretype='bic')

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Compute structure scores for model comparison (higher is better).


In [94]:
# Same structure search on the wide 5-step frame, so edges may link columns
# belonging to different steps of a window.
model_timedep = bn.structure_learning.fit(train_5step, methodtype='hc', scoretype='bic')

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Compute structure scores for model comparison (higher is better).


## Train Network Parameters

In [95]:
# Fit the parameters of the per-flow structure with methodtype='bayes'.
# The name `model_indep` is rebound from the structure-only result to the
# fitted model.
model_indep = bn.parameter_learning.fit(model_indep, train_p, methodtype='bayes')

[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >Converting [<class 'pgmpy.base.DAG.DAG'>] to BayesianNetwork model.
[bnlearn] >Converting adjmat to BayesianNetwork.
[bnlearn] >CPD of Proto:
+--------------+----------+
| Proto(TCP  ) | 0.495247 |
+--------------+----------+
| Proto(UDP  ) | 0.504753 |
+--------------+----------+
[bnlearn] >CPD of Dst IP Addr:
+------------------------------+---------------------+---------------------+
| Proto                        | Proto(TCP  )        | Proto(UDP  )        |
+------------------------------+---------------------+---------------------+
| Dst IP Addr(192.168.100.255) | 0.1199616122840691  | 0.1271186440677966  |
+------------------------------+---------------------+---------------------+
| Dst IP Addr(192.168.100.5)   | 0.14875239923224567 | 0.11770244821092278 |
+------------------------------+---------------------+---------------------+
| Dst IP Addr(192.168.200.255) | 0.1199616122840691  | 0.12523540489642

In [96]:
# Fit the parameters of the 5-step structure with methodtype='bayes'.
# The name `model_timedep` is rebound from the structure-only result to the
# fitted model.
model_timedep = bn.parameter_learning.fit(model_timedep, train_5step, methodtype='bayes')

[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >Converting [<class 'pgmpy.base.DAG.DAG'>] to BayesianNetwork model.
[bnlearn] >Converting adjmat to BayesianNetwork.
[bnlearn] >CPD of Proto_1:
+----------------+-----+---------------------+
| Dst Pt_1       | ... | Dst Pt_1(1900.0)    |
+----------------+-----+---------------------+
| Proto_1(TCP  ) | ... | 0.49800796812749004 |
+----------------+-----+---------------------+
| Proto_1(UDP  ) | ... | 0.50199203187251    |
+----------------+-----+---------------------+
[bnlearn] >CPD of Dst IP Addr_1:
+--------------------------------+---------------------+---------------------+
| Proto_1                        | Proto_1(TCP  )      | Proto_1(UDP  )      |
+--------------------------------+---------------------+---------------------+
| Dst IP Addr_1(192.168.100.255) | 0.24801587301587302 | 0.2490118577075099  |
+--------------------------------+---------------------+---------------------+
| Dst IP Addr_1(192.16

## Sample New Data

In [216]:
# Draw as many synthetic rows as there are training rows.
# NOTE(review): no random seed is set in the visible cells, so the sample
# differs between runs.
new_data_indep = bn.sampling(model_indep,len(train_p))

In [217]:
# Draw as many synthetic 5-step windows as there are training windows.
# NOTE(review): no random seed is set in the visible cells, so the sample
# differs between runs.
new_data_timedep = bn.sampling(model_timedep,len(train_5step))

## Post-process Generated Data

In [218]:
def complete(df, u):
    """Add to ``df`` every column of ``u`` it lacks, filled with random draws.

    Each missing column is filled with ``len(df)`` values sampled from the
    corresponding column of ``u`` (without replacement when ``u`` has enough
    rows, with replacement otherwise).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to complete.  It is modified in place and also returned.
    u : pandas.DataFrame
        Reference frame supplying the missing columns' values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now containing every column of ``u``.
    """
    n_rows = len(df)
    for col in u.columns:
        if col in df.columns:
            continue
        drawn = u[col].sample(n_rows, replace=n_rows > len(u))
        # Series.sample keeps the source index labels, and column assignment
        # aligns on labels: without relabelling, each row would simply receive
        # u's value at the same label (or NaN), undoing the random draw.
        drawn.index = df.index
        df[col] = drawn
    return df

In [219]:
# Number of flows packed into each wide row (wide column count divided by the
# per-flow column count).
step = train_5step.shape[1]//train_p.shape[1]
# Space the sampled windows `step` apart so `regroup` can interleave their
# flows.  The index is rebuilt from the row count rather than multiplied in
# place, so re-running this cell does not compound the spacing.
# NOTE(review): assumes the sampled frame carries a default 0..N-1 index.
new_data_timedep.index = pd.RangeIndex(len(new_data_timedep)) * step

In [220]:
# Add any training column that is absent from the sampled frames, filling it
# from the corresponding training frame.
new_data_indep = complete(new_data_indep, train_p)
new_data_timedep = complete(new_data_timedep, train_5step)

In [221]:
# Unfold the sampled windows back into one row per flow.
# NOTE: rebinds `new_data_timedep`; re-running this cell on the already
# unfolded frame will fail because its columns no longer end in "_<k>".
new_data_timedep = regroup(new_data_timedep, step)

In [222]:
# Put the synthetic columns in the same order as the training frame.
new_data_indep = new_data_indep.loc[:, train.columns]
new_data_timedep = new_data_timedep.loc[:, train.columns]

In [223]:
# Reload the mixture statistics written by `preprocess`; after the JSON round
# trip the arrays come back as plain Python lists.
with open("xp2_bn_stats.json", "r") as file:
    dic = json.load(file)

In [224]:
# Replace the discrete codes of "In Byte", "Out Byte" and "Time" by sampled
# numeric values.
new_data_indep = reconstruct(new_data_indep, dic)
new_data_timedep = reconstruct(new_data_timedep, dic)

In [225]:
# Display the reconstructed time-dependent sample for a visual check.
new_data_timedep

Unnamed: 0,Proto,Src IP Addr,Dst IP Addr,Dst Pt,In Byte,Out Byte,Day,Time
0,UDP,192.168.220.15,192.168.220.255,137.0,39995980.0,125960400.0,2,193.90349
1,TCP,192.168.220.6,192.168.200.255,138.0,13410000.0,1908642.0,2,151.514731
2,TCP,192.168.220.4,192.168.220.255,137.0,927493300.0,1347492000.0,2,253.868359
3,UDP,192.168.200.5,IP_PUB,138.0,1347490000.0,136664800.0,2,50.234224
4,UDP,192.168.220.12,192.168.100.5,445.0,1054394000.0,2291024000.0,2,101.790547
5,UDP,192.168.220.7,192.168.100.255,445.0,33216340.0,2740854000.0,2,15.144135
6,TCP,192.168.220.16,192.168.100.5,138.0,1864222000.0,9542722.0,2,281.316479
7,UDP,192.168.220.13,192.168.100.5,137.0,890621000.0,946864600.0,2,55.651578
8,TCP,192.168.220.5,192.168.220.255,443.0,246222900.0,494794400.0,2,0.0
9,TCP,192.168.220.8,192.168.200.255,137.0,1265498000.0,2475617000.0,2,345.673391


In [226]:
# Swap each "IP_PUB" placeholder for an address picked at random from
# `ip_pubs` (independent-flow frame first, then the time-dependent one).
for _frame in (new_data_indep, new_data_timedep):
    _is_pub = _frame["Dst IP Addr"] == "IP_PUB"
    _frame.loc[_is_pub, "Dst IP Addr"] = np.random.choice(ip_pubs, _is_pub.sum())

In [227]:
# Midnight of the earliest calendar date present in the training data; used
# as the origin when synthetic timestamps are rebuilt.
origTimestamp = pd.to_datetime(min(pd.to_datetime(df["Date first seen"]).dt.date))

In [228]:
# Constant 6-minute offset added to every synthetic timestamp.
# NOTE(review): the reason for this shift is not documented here — confirm.
minute_shift = pd.to_timedelta(6,unit='m')

In [229]:
# Make sure the weekday index is an integer before doing date arithmetic.
new_data_indep["Day"] = new_data_indep["Day"].astype(int)
new_data_timedep["Day"] = new_data_timedep["Day"].astype(int)

In [230]:
def _to_flow_records(frame):
    """Rebuild "Date first seen" from Day/Time and return rows in `df`'s layout.

    The timestamp is: Monday of the first observed week + `Day` days + `Time`
    seconds + `minute_shift`.  The helper columns "Day" and "Time" are dropped,
    the columns are put in the order of the raw frame `df`, and the rows are
    sorted chronologically.  `frame` itself is not modified.
    """
    # `Day` is a weekday index (Monday = 0), so it has to be added to the
    # Monday of the week containing the first observation.  Adding it to the
    # first observed date itself would move, e.g., a Wednesday flow (Day = 2)
    # two days past a Wednesday origin, onto a Friday.
    week_start = origTimestamp - pd.to_timedelta(origTimestamp.dayofweek, unit='D')
    offset = pd.to_timedelta(frame["Time"] + frame["Day"] * 24 * 3600, unit='s')

    stamped = frame.drop(columns=["Day", "Time"])
    stamped["Date first seen"] = week_start + offset + minute_shift
    # Non-inplace sort: sorting a column-sliced frame in place can trigger
    # pandas' SettingWithCopy warning.
    return stamped[df.columns].sort_values("Date first seen")


new_data_indep = _to_flow_records(new_data_indep)
new_data_timedep = _to_flow_records(new_data_timedep)

## Save Generated Data

In [235]:
# Write both synthetic datasets as CSV without the index column.
# NOTE(review): absolute, user-specific output paths — adjust before running elsewhere.
new_data_indep.to_csv("/home/aschoen/my_storage/aschoen/dataset/flow_chronicle_dataset/xp2/BN_indep.csv",index=False)
new_data_timedep.to_csv("/home/aschoen/my_storage/aschoen/dataset/flow_chronicle_dataset/xp2/BN_timedep.csv",index=False)