In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

%matplotlib notebook

In [2]:
data = pd.read_csv('RDC_samples/train.csv')

In [3]:
data.head()

Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info
0,1,0.0,10.0.151.43,10.0.16.14,TCP,1026,"8000 > 63774 [PSH, ACK] Seq=1 Ack=1 Win=229 ..."
1,2,4e-06,10.0.151.43,10.0.16.14,TCP,1026,"[TCP Retransmission] 8000 > 63774 [PSH, ACK]..."
2,3,1.9e-05,10.0.4.66,10.0.17.129,TCP,70,445 > 36384 [ACK] Seq=1 Ack=1 Win=2573 Len=0...
3,4,2.1e-05,10.0.4.66,10.0.17.129,TCP,70,[TCP Dup ACK 3#1] 445 > 36384 [ACK] Seq=1 Ac...
4,5,5.5e-05,10.0.16.14,10.0.151.43,TCP,64,63774 > 8000 [ACK] Seq=1 Ack=1 Win=4096 Len=0


In [4]:
data.Protocol.count()

269306

In [5]:
data.Protocol.loc[data.Protocol == 'TCP'].count()

269306

In [6]:
data.Info.apply(lambda x: x.find('Win') != -1).count()

269306

In [7]:
def parse_data(par_list):
    par_list = par_list.split()

    if not par_list[0].isdigit():
        counter = 0
        while not par_list[counter].isdigit():
            counter+=1        
        par_list = par_list[counter:]

    source_port = int(par_list[0])
    destination_port = int(par_list[2])
    windows_size_val = 0

    for it in par_list:
        if it.find('Win')!=-1:
            windows_size_val = int(it[4:])
            break
    return [source_port, destination_port, windows_size_val]

In [8]:
def data_init(file_name='train.csv'):
    data = pd.read_csv('RDC_samples/'+file_name)
    data['params'] = data.Info.apply(parse_data)
    data[['source_port', 'destination_port', 'windows_size_val']] = pd.DataFrame(data.params.tolist(), index=data.index)
    data = data.drop(columns=['params', 'Info', 'Protocol', 'Time', 'No.'])
    data.Source = data.Source.apply(lambda x: int(x.replace('.', '')))
    data.Destination = data.Destination.apply(lambda x: int(x.replace('.', '')))
    return data

In [9]:
data = data_init('train.csv')

In [10]:
data.head()

Unnamed: 0,Source,Destination,Length,source_port,destination_port,windows_size_val
0,10015143,1001614,1026,8000,63774,229
1,10015143,1001614,1026,8000,63774,229
2,100466,10017129,70,445,36384,2573
3,100466,10017129,70,445,36384,2573
4,1001614,10015143,64,63774,8000,4096


In [11]:
data.nunique()

Source               31
Destination          40
Length              497
source_port         162
destination_port     90
windows_size_val    607
dtype: int64

In [14]:
data.hist(figsize=(10, 9))

<IPython.core.display.Javascript object>

array([[<AxesSubplot:title={'center':'Source'}>,
        <AxesSubplot:title={'center':'Destination'}>],
       [<AxesSubplot:title={'center':'Length'}>,
        <AxesSubplot:title={'center':'source_port'}>],
       [<AxesSubplot:title={'center':'destination_port'}>,
        <AxesSubplot:title={'center':'windows_size_val'}>]], dtype=object)

In [15]:
plt.figure()
pd.DataFrame(data.Source).boxplot(figsize=(12, 9))

<IPython.core.display.Javascript object>

<AxesSubplot:>

In [54]:
nums = data.groupby('destination_port').agg({'destination_port':'count'})\
.rename(columns = {'destination_port' : 'destination_port_count'})\
.sort_values(by='destination_port_count', ascending=False)
plt.figure()
plt.plot(nums.values[10:])

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x2278c0d71c0>]

Заметим, что примерно на 50 элементах (по ординате) происходит перегиб, поэтому все уникальные значение количество которых меньше 50 не будем включать, чтобы не переобучить модель на аномальных значениях.

In [59]:
data = data.loc[data.destination_port > 50]

In [62]:
data.hist(figsize=(10,10)
        )

<IPython.core.display.Javascript object>

array([[<AxesSubplot:title={'center':'Source'}>,
        <AxesSubplot:title={'center':'Destination'}>],
       [<AxesSubplot:title={'center':'Length'}>,
        <AxesSubplot:title={'center':'source_port'}>],
       [<AxesSubplot:title={'center':'destination_port'}>,
        <AxesSubplot:title={'center':'windows_size_val'}>]], dtype=object)

In [59]:
for col in list(data)[1:]:
    count = data.groupby([col]).count().query('Source < 20').Source.count()
    print('Count of unusual', col, 'elements', count)

Count of unusual Destination elements 22
Count of unusual Length elements 424
Count of unusual source_port elements 99
Count of unusual destination_port elements 53
Count of unusual windows_size_val elements 171


In [50]:
query('Source < 50').Source.count()

['Destination',
 'Length',
 'source_port',
 'destination_port',
 'windows_size_val']

In [12]:
import torch

In [185]:
X = torch.LongTensor(np.asarray(data))

In [188]:
train_data_size = int(0.8*X.shape[0])
X_train, X_val = X[:train_data_size], X[train_data_size:]

In [189]:
X_train.shape

torch.Size([215444, 6])

In [190]:
X_val.shape

torch.Size([53862, 6])