In [148]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [149]:
# Load the training set (path is relative to the notebook's working directory)
# and preview the first rows.
trainframe = pd.read_csv('Data/train_data.csv')
trainframe.head()

Unnamed: 0,packet_id,packet_duration,second_frame,src_ip,dest_ip,protocol,length,info,target
0,PI_741404,0.047,SF_630318,fe80::212:7412:12:1212,fe80::212:7418:18:1818,ICMPv6,76,RPL Control (Destination Advertisement Object),0
1,PI_237888,0.001,SF_367074,fe80::212:7443:43:4343,fe80::212:7411:11:1111,ICMPv6,76,RPL Control (Destination Advertisement Object),1
2,PI_408771,0.009,SF_526833,fe80::212:7418:18:1818,fe80::212:7455:55:5555,ICMPv6,76,RPL Control (Destination Advertisement Object),0
3,PI_101539,0.43,SF_935887,fe80::212:740b:b:b0b,fe80::212:7421:21:2121,ICMPv6,102,RPL Control (DODAG Information Object),1
4,PI_199855,0.188,SF_272413,fe80::212:743a:3a:3a3a,fe80::212:7408:8:808,ICMPv6,76,RPL Control (Destination Advertisement Object),0


In [150]:
# Summarise column dtypes and non-null counts.
trainframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11663 entries, 0 to 11662
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   packet_id        11663 non-null  object 
 1   packet_duration  11663 non-null  float64
 2   second_frame     11663 non-null  object 
 3   src_ip           11576 non-null  object 
 4   dest_ip          11576 non-null  object 
 5   protocol         11663 non-null  object 
 6   length           11663 non-null  int64  
 7   info             11663 non-null  object 
 8   target           11663 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 820.2+ KB


In [151]:
# Per-column count of missing values.
trainframe.isna().sum()

packet_id           0
packet_duration     0
second_frame        0
src_ip             87
dest_ip            87
protocol            0
length              0
info                0
target              0
dtype: int64

In [152]:
# Frequency of each distinct `dest_ip` value (missing values are not counted).
dest_ip_counts = trainframe['dest_ip'].value_counts()
dest_ip_counts

dest_ip
ff02::1a                  1690
2002:db8::ff:fe00:1       1300
fe80::212:742b:2b:2b2b     367
fe80::212:741c:1c:1c1c     323
fe80::212:7434:34:3434     309
                          ... 
fe80::212:7426:26:2626      13
fe80::212:7419:19:1919      12
fe80::212:7430:30:3030      11
fe80::212:745a:5a:5a5a      10
fe80::212:7443:43:4343       2
Name: count, Length: 83, dtype: int64

In [153]:
# Frequency of each distinct `src_ip` value (missing values are not counted).
src_ip_counts = trainframe['src_ip'].value_counts()
src_ip_counts

src_ip
fe80::212:7420:20:2020        239
fe80::212:7425:25:2525        207
fe80::212:7415:15:1515        206
fe80::212:7405:5:505          196
fe80::212:743e:3e:3e3e        195
                             ... 
2002:db8::212:740d:d:d0d        5
2002:db8::212:7422:22:2222      5
2002:db8::212:7412:12:1212      5
2002:db8::212:7407:7:707        5
2002:db8::212:7419:19:1919      3
Name: count, Length: 153, dtype: int64

In [154]:
def preprocessing(df):
    """Remove the ``packet_id`` and ``second_frame`` columns from ``df`` in place.

    The frame passed in is mutated directly and nothing is returned, so
    callers that invoke this purely for its side effect keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; modified by this call.
    """
    # errors='ignore' makes the call idempotent: re-running the notebook cell
    # after the columns have already been dropped no longer raises KeyError.
    df.drop(columns=['packet_id', 'second_frame'], errors='ignore', inplace=True)


In [155]:
# Drop the `packet_id` and `second_frame` columns; this mutates `trainframe` in place.
preprocessing(trainframe)

In [156]:
# Preview the first five rows after the column drop.
trainframe.head(n=5)

Unnamed: 0,packet_duration,src_ip,dest_ip,protocol,length,info,target
0,0.047,fe80::212:7412:12:1212,fe80::212:7418:18:1818,ICMPv6,76,RPL Control (Destination Advertisement Object),0
1,0.001,fe80::212:7443:43:4343,fe80::212:7411:11:1111,ICMPv6,76,RPL Control (Destination Advertisement Object),1
2,0.009,fe80::212:7418:18:1818,fe80::212:7455:55:5555,ICMPv6,76,RPL Control (Destination Advertisement Object),0
3,0.43,fe80::212:740b:b:b0b,fe80::212:7421:21:2121,ICMPv6,102,RPL Control (DODAG Information Object),1
4,0.188,fe80::212:743a:3a:3a3a,fe80::212:7408:8:808,ICMPv6,76,RPL Control (Destination Advertisement Object),0


In [157]:
def split(df):
    """Split a labelled frame into feature columns and the label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``target`` column.

    Returns
    -------
    tuple
        ``(x_train, y_train)`` where ``x_train`` is ``df`` without the
        ``target`` column and ``y_train`` is the ``target`` column itself.

    Raises
    ------
    KeyError
        If ``df`` has no ``target`` column.
    """
    # Operate on the argument rather than the notebook-global `trainframe`,
    # so the function gives correct results for whatever frame is passed in.
    x_train = df.drop(columns='target')
    y_train = df['target']
    return x_train, y_train

In [158]:
# Separate the feature columns from the `target` labels.
x_train, y_train = split(trainframe)

In [159]:
# Frequency of each distinct `info` value (missing values are not counted).
info_counts = trainframe['info'].value_counts()
info_counts

info
RPL Control (Destination Advertisement Object)         8302
RPL Control (DODAG Information Object)                 1922
Source port: ultraseek-http  Destination port: rrac    1300
Ack                                                      87
RPL Control (DODAG Information Solicitation)             43
Unknown (17)                                              9
Name: count, dtype: int64

In [160]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# One-hot encode the categorical `info` column; every other column is passed
# through unchanged. Categories not seen during fit encode as all zeros.
# NOTE(review): the transformer is fitted on the full `trainframe`, so `target`
# is among the passthrough columns — confirm this is intended.
info_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
column_transformer = ColumnTransformer(
    transformers=[('info', info_encoder, ['info'])],
    remainder='passthrough',
)
df_encoded = column_transformer.fit_transform(trainframe)

In [161]:
# Wrap the transformer output in a DataFrame labelled with the feature names
# the transformer generated. Note that `df_encoded` is rebound here from the
# raw `fit_transform` result to the labelled DataFrame.
encoded_columns = column_transformer.get_feature_names_out()
df_encoded = pd.DataFrame(df_encoded, columns=encoded_columns)
df_encoded

Unnamed: 0,info__info_Ack,info__info_RPL Control (DODAG Information Object),info__info_RPL Control (DODAG Information Solicitation),info__info_RPL Control (Destination Advertisement Object),info__info_Source port: ultraseek-http Destination port: rrac,info__info_Unknown (17),remainder__packet_duration,remainder__src_ip,remainder__dest_ip,remainder__protocol,remainder__length,remainder__target
0,0.0,0.0,0.0,1.0,0.0,0.0,0.047,fe80::212:7412:12:1212,fe80::212:7418:18:1818,ICMPv6,76,0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.001,fe80::212:7443:43:4343,fe80::212:7411:11:1111,ICMPv6,76,1
2,0.0,0.0,0.0,1.0,0.0,0.0,0.009,fe80::212:7418:18:1818,fe80::212:7455:55:5555,ICMPv6,76,0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.43,fe80::212:740b:b:b0b,fe80::212:7421:21:2121,ICMPv6,102,1
4,0.0,0.0,0.0,1.0,0.0,0.0,0.188,fe80::212:743a:3a:3a3a,fe80::212:7408:8:808,ICMPv6,76,0
...,...,...,...,...,...,...,...,...,...,...,...,...
11658,0.0,1.0,0.0,0.0,0.0,0.0,0.009,fe80::212:7434:34:3434,fe80::212:742f:2f:2f2f,ICMPv6,102,0
11659,0.0,0.0,0.0,1.0,0.0,0.0,0.079,fe80::212:7435:35:3535,fe80::212:7459:59:5959,ICMPv6,76,0
11660,0.0,0.0,0.0,1.0,0.0,0.0,0.001,fe80::212:7414:14:1414,fe80::212:741a:1a:1a1a,ICMPv6,76,0
11661,1.0,0.0,0.0,0.0,0.0,0.0,0.026,,,IEEE 802.15.4,5,1
