In [160]:
import pandas as pd
import numpy as np
import re
import json
from datetime import datetime

In [2]:
pd.set_option('display.max_columns', None)

In [80]:
with open('packets.json', 'r') as file:
    packets = json.load(file)

In [194]:
df = pd.read_json('packets.json')

In [195]:
len(df)

49569

In [197]:
def make_remove_prefix(prefix):
    def remove_prefix(s):
        if s.startswith(prefix):
            return s[1:]
        return s
    return remove_prefix

In [198]:
df['source_address'] = df['source_address'].map(make_remove_prefix('/'))
df['destination_address'] = df['destination_address'].map(make_remove_prefix('/'))

In [199]:
def make_get_flag(pos):
    def get_flag(s):
        return [i == 'true' for i in s.split('=')[1].strip('() ').split(', ')][pos]
    return get_flag

In [200]:
df['flags_reserved'] = df['flags'].map(make_get_flag(0))
df['flags_dont_fragment'] = df['flags'].map(make_get_flag(1))
df['flags_more_fragment'] = df['flags'].map(make_get_flag(2))

In [201]:
df = df.drop('flags', axis=1)

In [202]:
option_re = re.compile(r'\[Kind: (\d+) .*')
option_bytes_re = re.compile(r'.* \[Length: (\d+) bytes.*')

In [203]:
def match_re(r, default=None):
    def find(s):
        def match_or_default():
            l = r.findall(s)
            if l:
                return l[0]
            return default
        
        try:
            if np.isnan(s):
                return default
            else:
                return match_or_default()
        except:
            return match_or_default()

    return find

In [233]:
df['window'] = df['window'].map(int)
df['ttl'] = df['ttl'].map(int)
df['identification'] = df['identification'].map(int)
df['sequence_number'] = df['sequence_number'].map(int)
df['acknowledgment_number'] = df['acknowledgment_number'].map(int)
df['reserved'] = df['reserved'].map(int)

In [205]:
for i in range(1, 7):
    df[f'option{i}_length'] = df[f'option{i}'].map(match_re(option_bytes_re, default=0))
    df[f'option{i}_length'] = df[f'option{i}_length'].map(int)
    df[f'option{i}'] = df[f'option{i}'].map(match_re(option_re, default=np.nan))

In [206]:
df = df.drop(['protocol', 'urg', 'version', 'ihl', 'fragment_offset', 'destination_port', 'tos',
              'urgent_pointer', 'type'], axis=1)
# every row has: protocol = '6 (TCP)'; urg = 'false'; version = '4 (IPv4)'; ihl = '5 (20 [bytes])';
# fragment_offset = '0 (0 [bytes])'; destination_port = '8070 (unknown)';
# tos = '[precedence: 0 (Routine)] [tos: 0 (Default)] [mbz: 0]'; urgent_pointer = '0';
# type = '0x0800 (IPv4)'

In [236]:
df['rst'] = df['rst'].map(lambda d: d == 'true')
df['psh'] = df['psh'].map(lambda d: d == 'true')
df['fin'] = df['fin'].map(lambda d: d == 'true')
df['syn'] = df['syn'].map(lambda d: d == 'true')
df['ack'] = df['ack'].map(lambda d: d == 'true')

In [209]:
data_offset_re = re.compile(r'^\d+ \((\d+) \[bytes.*')

In [210]:
df['data_offset'] = df['data_offset'].map(match_re(data_offset_re, 0))

In [211]:
df['captured_at'] = df['captured_at'].map(lambda d: datetime.fromisoformat(d.split('Z')[0]))

In [223]:
length_re = re.compile(r'^(\d+) \[?bytes\]?')

In [227]:
df['total_length'] = df['total_length'].map(match_re(length_re, 0))
df['original_length'] = df['original_length'].map(match_re(length_re, 0))

In [239]:
port_re = re.compile(r'^(\d+) \(unknown\)')

In [243]:
df['source_port'] = df['source_port'].map(match_re(port_re))

In [247]:
df

Unnamed: 0,source_address,header_checksum,destination_address,option3,window,option4,option1,option2,checksum,ttl,rst,identification,data_offset,source_addressmac,captured_at,total_length,sequence_number,psh,acknowledgment_number,fin,original_length,syn,reserved,ack,source_port,destination_addressmac,hex_stream,zzz,option5,option6,flags_reserved,flags_dont_fragment,flags_more_fragment,option1_length,option2_length,option3_length,option4_length,option5_length,option6_length
0,170.231.187.126,0x4bd8,10.128.0.2,1,2144,4,2,1,0xd0b7,238,False,4104,28,42:01:0a:80:00:01,2020-11-15 11:08:56.401955,48,18896120,False,0,False,62,True,0,False,50973,42:01:0a:80:00:02,,,,,False,False,False,4,0,0,2,0,0
1,170.231.187.126,0x4bdf,10.128.0.2,,2144,,,,0x90ae,238,False,4105,20,42:01:0a:80:00:01,2020-11-15 11:08:56.550260,40,18896121,False,3366166653,False,54,False,0,True,50973,42:01:0a:80:00:02,,,,,False,False,False,0,0,0,0,0,0
2,170.231.187.126,0x4afa,10.128.0.2,,2144,,,,0xc415,238,False,4106,20,42:01:0a:80:00:01,2020-11-15 11:08:56.557755,268,18896121,True,3366166653,False,282,False,0,True,50973,42:01:0a:80:00:02,50 4f 53 54 20 2f 61 70 69 2f 76 31 2f 61 75 7...,POST /api/v1/auth/device/signin HTTP/1.1\r\nHo...,,,False,False,False,0,0,0,0,0,0
3,170.231.187.126,0x4b8d,10.128.0.2,,2144,,,,0x9463,238,False,4107,20,42:01:0a:80:00:01,2020-11-15 11:08:56.557921,120,18896349,True,3366166653,False,134,False,0,True,50973,42:01:0a:80:00:02,7b 22 75 73 65 72 6e 61 6d 65 22 3a 22 33 38 3...,"{""username"":""3841bf2d-6481-406a-9f41-6dba77cda...",,,False,False,False,0,0,0,0,0,0
4,170.231.187.126,0x4bdc,10.128.0.2,,1889,,,,0x8f79,238,False,4108,20,42:01:0a:80:00:01,2020-11-15 11:08:56.803207,40,18896429,False,3366166908,True,54,False,0,True,50973,42:01:0a:80:00:02,,,,,False,False,False,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49564,138.94.53.1,0xfa73,10.128.0.2,,2144,,,,0xa906,238,False,2171,20,42:01:0a:80:00:01,2020-11-22 19:09:39.116731,40,3398835,False,2911016762,False,54,False,0,True,62306,42:01:0a:80:00:02,,,,,False,False,False,0,0,0,0,0,0
49565,138.94.53.1,0xf8c0,10.128.0.2,,2144,,,,0xbf54,238,False,2172,20,42:01:0a:80:00:01,2020-11-22 19:09:39.123390,474,3398835,True,2911016762,False,488,False,0,True,62306,42:01:0a:80:00:02,50 4f 53 54 20 2f 61 70 69 2f 76 31 2f 6d 65 7...,POST /api/v1/message HTTP/1.1\r\nHost: 34.68.1...,,,False,False,False,0,0,0,0,0,0
49566,138.94.53.1,0xfa5c,10.128.0.2,,2144,,,,0x8c2e,238,False,2173,20,42:01:0a:80:00:01,2020-11-22 19:09:39.123430,61,3399269,True,2911016762,False,75,False,0,True,62306,42:01:0a:80:00:02,7b 22 74 65 6d 70 65 72 61 74 75 72 61 22 3a 3...,"{""temperatura"":23.64}",,,False,False,False,0,0,0,0,0,0
49567,138.94.53.1,0xfa70,10.128.0.2,,1889,,,,0xa73e,238,False,2174,20,42:01:0a:80:00:01,2020-11-22 19:09:39.319831,40,3399290,False,2911017017,True,54,False,0,True,62306,42:01:0a:80:00:02,,,,,False,False,False,0,0,0,0,0,0


In [255]:
df = df.set_index('captured_at').sort_index()

In [256]:
df

Unnamed: 0_level_0,source_address,header_checksum,destination_address,option3,window,option4,option1,option2,checksum,ttl,rst,identification,data_offset,source_addressmac,total_length,sequence_number,psh,acknowledgment_number,fin,original_length,syn,reserved,ack,source_port,destination_addressmac,hex_stream,zzz,option5,option6,flags_reserved,flags_dont_fragment,flags_more_fragment,option1_length,option2_length,option3_length,option4_length,option5_length,option6_length
captured_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
2020-11-15 11:08:56.401955,170.231.187.126,0x4bd8,10.128.0.2,1,2144,4,2,1,0xd0b7,238,False,4104,28,42:01:0a:80:00:01,48,18896120,False,0,False,62,True,0,False,50973,42:01:0a:80:00:02,,,,,False,False,False,4,0,0,2,0,0
2020-11-15 11:08:56.550260,170.231.187.126,0x4bdf,10.128.0.2,,2144,,,,0x90ae,238,False,4105,20,42:01:0a:80:00:01,40,18896121,False,3366166653,False,54,False,0,True,50973,42:01:0a:80:00:02,,,,,False,False,False,0,0,0,0,0,0
2020-11-15 11:08:56.557755,170.231.187.126,0x4afa,10.128.0.2,,2144,,,,0xc415,238,False,4106,20,42:01:0a:80:00:01,268,18896121,True,3366166653,False,282,False,0,True,50973,42:01:0a:80:00:02,50 4f 53 54 20 2f 61 70 69 2f 76 31 2f 61 75 7...,POST /api/v1/auth/device/signin HTTP/1.1\r\nHo...,,,False,False,False,0,0,0,0,0,0
2020-11-15 11:08:56.557921,170.231.187.126,0x4b8d,10.128.0.2,,2144,,,,0x9463,238,False,4107,20,42:01:0a:80:00:01,120,18896349,True,3366166653,False,134,False,0,True,50973,42:01:0a:80:00:02,7b 22 75 73 65 72 6e 61 6d 65 22 3a 22 33 38 3...,"{""username"":""3841bf2d-6481-406a-9f41-6dba77cda...",,,False,False,False,0,0,0,0,0,0
2020-11-15 11:08:56.803207,170.231.187.126,0x4bdc,10.128.0.2,,1889,,,,0x8f79,238,False,4108,20,42:01:0a:80:00:01,40,18896429,False,3366166908,True,54,False,0,True,50973,42:01:0a:80:00:02,,,,,False,False,False,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-22 19:09:39.116731,138.94.53.1,0xfa73,10.128.0.2,,2144,,,,0xa906,238,False,2171,20,42:01:0a:80:00:01,40,3398835,False,2911016762,False,54,False,0,True,62306,42:01:0a:80:00:02,,,,,False,False,False,0,0,0,0,0,0
2020-11-22 19:09:39.123390,138.94.53.1,0xf8c0,10.128.0.2,,2144,,,,0xbf54,238,False,2172,20,42:01:0a:80:00:01,474,3398835,True,2911016762,False,488,False,0,True,62306,42:01:0a:80:00:02,50 4f 53 54 20 2f 61 70 69 2f 76 31 2f 6d 65 7...,POST /api/v1/message HTTP/1.1\r\nHost: 34.68.1...,,,False,False,False,0,0,0,0,0,0
2020-11-22 19:09:39.123430,138.94.53.1,0xfa5c,10.128.0.2,,2144,,,,0x8c2e,238,False,2173,20,42:01:0a:80:00:01,61,3399269,True,2911016762,False,75,False,0,True,62306,42:01:0a:80:00:02,7b 22 74 65 6d 70 65 72 61 74 75 72 61 22 3a 3...,"{""temperatura"":23.64}",,,False,False,False,0,0,0,0,0,0
2020-11-22 19:09:39.319831,138.94.53.1,0xfa70,10.128.0.2,,1889,,,,0xa73e,238,False,2174,20,42:01:0a:80:00:01,40,3399290,False,2911017017,True,54,False,0,True,62306,42:01:0a:80:00:02,,,,,False,False,False,0,0,0,0,0,0


In [249]:
df.to_csv('tcp_data.csv')

---

## Filtering and encoding

In [370]:
aux = df[~df['zzz'].isna()]
aux = aux.drop(['source_address', 'header_checksum', 'destination_address', 'checksum',
                'source_addressmac', 'destination_addressmac', 'hex_stream'], axis=1)

In [371]:
aux['method'] = aux['zzz'].map(lambda s: np.nan if s.startswith('{') else 'POST')
aux['header'] = aux['zzz'].map(lambda s: np.nan if s.startswith('{') else s)
aux['payload'] = aux['zzz'].map(lambda s: np.nan if not s.startswith('{') else s)

In [372]:
aux['header'] = aux['header'].map(lambda s: s if not isinstance(s, str) else s[5:].split('\r\n'))

In [373]:
def payload_caster(j):
    # sometimes "temperatura" comes empty, which breaks json loading
    try:
        return j if not isinstance(j, str) else json.loads(j)
    except Exception as e:
        return {'temperatura': np.nan}

In [374]:
aux['payload'] = aux['payload'].map(payload_caster)

In [375]:
aux['header_endpoint'] = aux['header'].map(lambda s: s if not isinstance(s, list) else s[0].split()[0])

In [376]:
def get_user_agent(l):
    try:
        return l if not isinstance(l, list) else l[2].split()[1]
    except Exception as e:
        msg = f'error with {l}: {e}'
        print(msg)
        return msg

In [377]:
aux['header_user_agent'] = aux['header'].map(get_user_agent)

error with ['�\x00\x00\x00\x00\x00Cookie: mstshash=Administr', '\x01\x00\x08\x00\x03\x00\x00\x00']: list index out of range
error with ['\x01\x00\x00w\x03\x035S��&�R�v�UB��E�\x1dX�cȌ�B.5��w͢l\x00\x00\x1a�/�+�\x11�\x07�\x13�\t�\x14�\n\x00\x05\x00/\x005�\x12\x00\n\x01\x00\x004\x00\x05\x00\x05\x01\x00\x00\x00\x00\x00\n\x00\x08\x00\x06\x00\x17\x00\x18\x00\x19\x00\x0b\x00\x02\x01\x00\x00\r\x00\x10\x00\x0e\x04\x01\x04\x03\x02\x01\x02\x03\x04\x01\x05\x01\x06\x01�\x01\x00\x01\x00']: list index out of range
error with [' HTTP/1.1', 'Host: 34.68.158.238:8070']: list index out of range
error with [' HTTP/1.1', 'Host: 34.68.158.238:8070']: list index out of range
error with [' HTTP/1.1', 'Host: 34.68.158.238:8070']: list index out of range
error with ['\x01\x00\x00w\x03\x03yHmns\x14R\x0e��{z�\x12\x03M�*�^�Mt�\x07�\u07bbQ]\x05�\x00\x00\x1a�/�+�\x11�\x07�\x13�\t�\x14�\n\x00\x05\x00/\x005�\x12\x00\n\x01\x00\x004\x00\x05\x00\x05\x01\x00\x00\x00\x00\x00\n\x00\x08\x00\x06\x00\x17\x00\x18\x00\x19\x00\x0b

In [378]:
aux['header'][0]

['/api/v1/auth/device/signin HTTP/1.1',
 'Host: 34.68.158.238:8070',
 'User-Agent: ESP8266HTTPClient',
 'Accept-Encoding: identity;q=1,chunked;q=0.1,*;q=0',
 'Connection: keep-alive',
 'Content-Type: application/json',
 'Content-Length: 80']

In [379]:
aux

Unnamed: 0_level_0,option3,window,option4,option1,option2,ttl,rst,identification,data_offset,total_length,sequence_number,psh,acknowledgment_number,fin,original_length,syn,reserved,ack,source_port,zzz,option5,option6,flags_reserved,flags_dont_fragment,flags_more_fragment,option1_length,option2_length,option3_length,option4_length,option5_length,option6_length,method,header,payload,header_endpoint,header_user_agent
captured_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
2020-11-15 11:08:56.557755,,2144,,,,238,False,4106,20,268,18896121,True,3366166653,False,282,False,0,True,50973,POST /api/v1/auth/device/signin HTTP/1.1\r\nHo...,,,False,False,False,0,0,0,0,0,0,POST,"[/api/v1/auth/device/signin HTTP/1.1, Host: 34...",,/api/v1/auth/device/signin,ESP8266HTTPClient
2020-11-15 11:08:56.557921,,2144,,,,238,False,4107,20,120,18896349,True,3366166653,False,134,False,0,True,50973,"{""username"":""3841bf2d-6481-406a-9f41-6dba77cda...",,,False,False,False,0,0,0,0,0,0,,,{'username': '3841bf2d-6481-406a-9f41-6dba77cd...,,
2020-11-15 11:10:32.960528,,2144,,,,238,False,4114,20,268,18981399,True,1200034468,False,282,False,0,True,50209,POST /api/v1/auth/device/signin HTTP/1.1\r\nHo...,,,False,False,False,0,0,0,0,0,0,POST,"[/api/v1/auth/device/signin HTTP/1.1, Host: 34...",,/api/v1/auth/device/signin,ESP8266HTTPClient
2020-11-15 11:10:32.960651,,2144,,,,238,False,4115,20,120,18981627,True,1200034468,False,134,False,0,True,50209,"{""username"":""3841bf2d-6481-406a-9f41-6dba77cda...",,,False,False,False,0,0,0,0,0,0,,,{'username': '3841bf2d-6481-406a-9f41-6dba77cd...,,
2020-11-15 11:12:09.364875,,2144,,,,238,False,4124,20,268,19066870,True,2455090346,False,282,False,0,True,50555,POST /api/v1/auth/device/signin HTTP/1.1\r\nHo...,,,False,False,False,0,0,0,0,0,0,POST,"[/api/v1/auth/device/signin HTTP/1.1, Host: 34...",,/api/v1/auth/device/signin,ESP8266HTTPClient
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-22 19:06:25.088245,,2144,,,,238,False,2141,20,61,3297622,True,1475074318,False,75,False,0,True,62211,"{""temperatura"":23.64}",,,False,False,False,0,0,0,0,0,0,,,{'temperatura': 23.64},,
2020-11-22 19:08:02.100206,,2144,,,,238,False,2157,20,474,3347818,True,134865517,False,488,False,0,True,62718,POST /api/v1/message HTTP/1.1\r\nHost: 34.68.1...,,,False,False,False,0,0,0,0,0,0,POST,"[/api/v1/message HTTP/1.1, Host: 34.68.158.238...",,/api/v1/message,ESP8266HTTPClient
2020-11-22 19:08:02.106091,,2144,,,,238,False,2158,20,61,3348252,True,134865517,False,75,False,0,True,62718,"{""temperatura"":23.64}",,,False,False,False,0,0,0,0,0,0,,,{'temperatura': 23.64},,
2020-11-22 19:09:39.123390,,2144,,,,238,False,2172,20,474,3398835,True,2911016762,False,488,False,0,True,62306,POST /api/v1/message HTTP/1.1\r\nHost: 34.68.1...,,,False,False,False,0,0,0,0,0,0,POST,"[/api/v1/message HTTP/1.1, Host: 34.68.158.238...",,/api/v1/message,ESP8266HTTPClient


In [381]:
aux2 = aux[~aux['header_user_agent'].isna()]
# aux2['header_user_agent'].str.contains('error')
aux3 = aux2[aux2['header_user_agent'].str.contains('error')]

In [382]:
aux3.index

DatetimeIndex(['2020-11-17 15:52:49.931261', '2020-11-18 16:27:01.000473',
               '2020-11-18 16:27:02.047974', '2020-11-19 11:05:10.665845',
               '2020-11-22 01:29:16.193657', '2020-11-22 06:25:19.989631',
               '2020-11-22 06:25:21.037477', '2020-11-22 18:12:37.599326'],
              dtype='datetime64[ns]', name='captured_at', freq=None)

### Removing access attempts from port scanners (this could be useful as 0s in final dataset, but will remove for now) 

In [383]:
aux = aux.drop(aux3.index)

In [385]:
set(aux.header_user_agent)

{'ESP8266HTTPClient', 'Mozilla/5.0', nan}

In [386]:
aux4 = aux[~aux['header_user_agent'].isna()]
aux5 = aux4[aux4['header_user_agent'].str.contains('Mozilla')]
aux5

Unnamed: 0_level_0,option3,window,option4,option1,option2,ttl,rst,identification,data_offset,total_length,sequence_number,psh,acknowledgment_number,fin,original_length,syn,reserved,ack,source_port,zzz,option5,option6,flags_reserved,flags_dont_fragment,flags_more_fragment,option1_length,option2_length,option3_length,option4_length,option5_length,option6_length,method,header,payload,header_endpoint,header_user_agent
captured_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
2020-11-18 16:27:02.095408,8,83,,1,1,52,False,37411,32,216,3870818338,True,2869199830,False,230,False,0,True,46798,GET / HTTP/1.1\r\nHost: 34.68.158.238:8070\r\n...,,,False,True,False,0,0,10,0,0,0,POST,"[ HTTP/1.1, Host: 34.68.158.238:8070, User-Age...",,HTTP/1.1,Mozilla/5.0
2020-11-22 06:25:21.084621,8,83,,1,1,52,False,1773,32,216,539967287,True,156568047,False,230,False,0,True,39136,GET / HTTP/1.1\r\nHost: 34.68.158.238:8070\r\n...,,,False,True,False,0,0,10,0,0,0,POST,"[ HTTP/1.1, Host: 34.68.158.238:8070, User-Age...",,HTTP/1.1,Mozilla/5.0
2020-11-22 18:12:40.265770,8,502,,1,1,49,False,57555,32,308,4068939284,True,3734124480,False,322,False,0,True,41006,GET /system_api.php HTTP/1.1\r\nHost: 34.68.15...,,,False,True,False,0,0,10,0,0,0,POST,"[system_api.php HTTP/1.1, Host: 34.68.158.238:...",,system_api.php,Mozilla/5.0
2020-11-22 18:12:40.470273,8,502,,1,1,50,False,7945,32,306,1505196243,True,3382655699,False,320,False,0,True,41088,GET /c/version.js HTTP/1.1\r\nHost: 34.68.158....,,,False,True,False,0,0,10,0,0,0,POST,"[c/version.js HTTP/1.1, Host: 34.68.158.238:80...",,c/version.js,Mozilla/5.0
2020-11-22 18:12:40.672341,8,502,,1,1,48,False,28515,32,320,78865799,True,2850881899,False,334,False,0,True,41168,GET /streaming/clients_live.php HTTP/1.1\r\nHo...,,,False,True,False,0,0,10,0,0,0,POST,"[streaming/clients_live.php HTTP/1.1, Host: 34...",,streaming/clients_live.php,Mozilla/5.0
2020-11-22 18:12:40.871296,8,502,,1,1,49,False,48338,32,321,2978325916,True,3712898023,False,335,False,0,True,41252,GET /stalker_portal/c/version.js HTTP/1.1\r\nH...,,,False,True,False,0,0,10,0,0,0,POST,"[stalker_portal/c/version.js HTTP/1.1, Host: 3...",,stalker_portal/c/version.js,Mozilla/5.0
2020-11-22 18:12:41.074299,8,502,,1,1,49,False,39541,32,306,2081527296,True,1657702958,False,320,False,0,True,41338,GET /client_area/ HTTP/1.1\r\nHost: 34.68.158....,,,False,True,False,0,0,10,0,0,0,POST,"[client_area/ HTTP/1.1, Host: 34.68.158.238:80...",,client_area/,Mozilla/5.0
2020-11-22 18:12:41.273560,8,502,,1,1,50,False,6228,32,311,744829284,True,3248873241,False,325,False,0,True,41422,GET /stalker_portal/c/ HTTP/1.1\r\nHost: 34.68...,,,False,True,False,0,0,10,0,0,0,POST,"[stalker_portal/c/ HTTP/1.1, Host: 34.68.158.2...",,stalker_portal/c/,Mozilla/5.0


In [387]:
aux = aux.drop(aux5.index)

In [390]:
set(aux.header_user_agent)

{'ESP8266HTTPClient', nan}

In [389]:
set(aux.header_endpoint)

{'/api/v1/auth/device/signin', '/api/v1/message', nan}

In [404]:
def define_is_post(method):
    try:
        if np.isnan(method):
            return False
    except:
        pass
    
    return method == 'POST'

In [406]:
aux['is_post'] = aux['method'].map(define_is_post)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [409]:
aux = aux.drop(['zzz', 'method'], axis=1)

In [411]:
aux.loc[aux.index[0], 'header']

['/api/v1/auth/device/signin HTTP/1.1',
 'Host: 34.68.158.238:8070',
 'User-Agent: ESP8266HTTPClient',
 'Accept-Encoding: identity;q=1,chunked;q=0.1,*;q=0',
 'Connection: keep-alive',
 'Content-Type: application/json',
 'Content-Length: 80']

In [418]:
def make_get_from_header(pos, key_name):
    def get_from_header(header):
        try:
            if np.isnan(header):
                return np.nan
            else:
                print('weird', header)
        except:
            pass

        header_data = header[pos]
        if not header_data.startswith(key_name):
            print('weird', header)

        return header_data.split(': ')[1]
    return get_from_header

In [416]:
aux['header_accept_encoding'] = aux['header'].map(make_get_from_header(3, 'Accept-Encoding'))

In [420]:
aux['header_connection'] = aux['header'].map(make_get_from_header(4, 'Connection'))

In [423]:
aux['header_content_type'] = aux['header'].map(make_get_from_header(5, 'Content-Type'))

In [433]:
def get_header(key, caster=str):
    def get_from_header(header):
        try:
            if np.isnan(header):
                return np.nan
            else:
                print('weird', header)
        except:
            pass

        header = {i.split(': ')[0]: i.split(': ')[1] for i in header[1:]}
        try:
            return caster(header[key])
        except KeyError:
            return np.nan

    return get_from_header

In [428]:
aux['header_content_length'] = aux['header'].map(get_header('Content-Length', int))

In [435]:
aux['header_authorization'] = aux['header'].map(get_header('Authorization'))

In [438]:
aux = aux.drop('header', axis=1)

In [440]:
aux2 = aux[~aux['payload'].isna()]

In [446]:
keys = {j for i in aux2['payload'] for j in i}

In [447]:
keys

{'password', 'speed', 'temperatura', 'username'}

In [450]:
def get_from_payload(key, caster=str):
    def get(payload):
        try:
            if np.isnan(payload):
                return np.nan
            else:
                print('weird', payload)
        except:
            pass
        
        try:
            return caster(payload[key])
        except KeyError:
            return np.nan

    return get

In [462]:
aux['payload_password'] = aux['payload'].map(get_from_payload('password'))

In [463]:
aux['payload_speed'] = aux['payload'].map(get_from_payload('speed', float))

In [466]:
aux['payload_temperatura'] = aux['payload'].map(get_from_payload('temperatura', float))

In [468]:
aux['payload_username'] = aux['payload'].map(get_from_payload('username'))

In [470]:
aux = aux.drop(['payload', 'payload_username', 'payload_password', 'header_authorization'], axis=1)
# model should not have access to any password information, so this should be useless

In [475]:
set(aux['header_endpoint'])

{'/api/v1/auth/device/signin', '/api/v1/message', nan}

In [483]:
def make_define_endpoint(desired):
    def define_endpoint(endpoint):
        try:
            if np.isnan(endpoint):
                return False
            else:
                print('weird', endpoint)
        except:
            pass

        return endpoint.endswith(desired)
    return define_endpoint

In [484]:
aux['is_endpoint_signin'] = aux['header_endpoint'].map(make_define_endpoint('signin'))

In [485]:
aux['is_endpoint_message'] = aux['header_endpoint'].map(make_define_endpoint('message'))

In [487]:
aux = aux.drop('header_endpoint', axis=1)

In [489]:
set(aux['header_user_agent'])

{'ESP8266HTTPClient', nan}

In [490]:
def define_esp_user_agent(user_agent):
    def check():
        return user_agent == 'ESP8266HTTPClient'
    
    try:
        if np.isnan(user_agent):
            return False
        else:
            return check()
    except:
        return check()

In [492]:
aux['is_esp8266_user_agent'] = aux['header_user_agent'].map(define_esp_user_agent)

In [493]:
aux = aux.drop('header_user_agent', axis=1)

In [500]:
def make_encoder(default_value):
    def encoder(val):
        def check():
            return val == default_value
        
        try:
            if np.isnan(val):
                return False
            else:
                return check()
        except:
            return check()
    return encoder

In [503]:
aux['is_keep_alive'] = aux['header_connection'].map(make_encoder('keep-alive'))

In [505]:
aux['is_json'] = aux['header_content_type'].map(make_encoder('application/json'))

In [506]:
aux = aux.drop(['header_accept_encoding', 'header_connection', 'header_content_type'], axis=1)

In [509]:
aux['header_content_length'] = aux['header_content_length'].map(lambda d: 0 if np.isnan(d) else d)

In [516]:
aux['payload_speed'] = aux['payload_speed'].map(lambda d: -1 if np.isnan(d) else d)

In [518]:
aux['payload_temperatura'] = aux['payload_temperatura'].map(lambda d: 0 if np.isnan(d) else d)

In [520]:
set(aux.option3)

{nan}

In [521]:
set(aux.window)

{2144}

In [522]:
set(aux.option4)

{nan}

In [523]:
set(aux.option1)

{nan}

In [524]:
set(aux.option2)

{nan}

In [526]:
aux = aux.drop(['option3', 'window', 'option4', 'option1', 'option2'], axis=1) # no variation in columns

In [528]:
set(aux.rst)

{False}

In [530]:
set(aux.data_offset)

{'20'}

In [532]:
set(aux.psh)

{True}

In [533]:
set(aux.fin)

{False}

In [534]:
set(aux.syn)

{False}

In [535]:
set(aux.reserved)

{0}

In [536]:
set(aux.ack)

{True}

In [537]:
aux = aux.drop(['rst', 'data_offset', 'psh', 'fin', 'syn', 'reserved', 'ack'], axis=1) # no variation

In [539]:
set(aux.option5)

{nan}

In [540]:
set(aux.option6)

{nan}

In [541]:
set(aux.flags_reserved)

{False}

In [542]:
set(aux.flags_dont_fragment)

{False}

In [543]:
set(aux.flags_more_fragment)

{False}

In [544]:
aux = aux.drop(['option5', 'option6', 'flags_reserved', 'flags_dont_fragment', 'flags_more_fragment'], axis=1)

In [546]:
set(aux.option1_length)

{0}

In [547]:
set(aux.option2_length)

{0}

In [548]:
set(aux.option3_length)

{0}

In [549]:
set(aux.option4_length)

{0}

In [550]:
set(aux.option5_length)

{0}

In [551]:
set(aux.option6_length)

{0}

In [552]:
aux = aux.drop(['option1_length', 'option2_length', 'option3_length',
                'option4_length', 'option5_length', 'option6_length'], axis=1)

In [554]:
def encode_booleans(val):
    return 1 if val else 0

In [558]:
aux['is_post'] = aux['is_post'].map(encode_booleans)
aux['is_endpoint_signin'] = aux['is_endpoint_signin'].map(encode_booleans)
aux['is_endpoint_message'] = aux['is_endpoint_message'].map(encode_booleans)
aux['is_esp8266_user_agent'] = aux['is_esp8266_user_agent'].map(encode_booleans)
aux['is_keep_alive'] = aux['is_keep_alive'].map(encode_booleans)
aux['is_json'] = aux['is_json'].map(encode_booleans)

In [559]:
aux

Unnamed: 0_level_0,ttl,identification,total_length,sequence_number,acknowledgment_number,original_length,source_port,is_post,header_content_length,payload_speed,payload_temperatura,is_endpoint_signin,is_endpoint_message,is_esp8266_user_agent,is_keep_alive,is_json
captured_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-11-15 11:08:56.557755,238,4106,268,18896121,3366166653,282,50973,1,80.0,-1.0,0.00,1,0,1,1,1
2020-11-15 11:08:56.557921,238,4107,120,18896349,3366166653,134,50973,0,0.0,-1.0,0.00,0,0,0,0,0
2020-11-15 11:10:32.960528,238,4114,268,18981399,1200034468,282,50209,1,80.0,-1.0,0.00,1,0,1,1,1
2020-11-15 11:10:32.960651,238,4115,120,18981627,1200034468,134,50209,0,0.0,-1.0,0.00,0,0,0,0,0
2020-11-15 11:12:09.364875,238,4124,268,19066870,2455090346,282,50555,1,80.0,-1.0,0.00,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-22 19:06:25.088245,238,2141,61,3297622,1475074318,75,62211,0,0.0,-1.0,23.64,0,0,0,0,0
2020-11-22 19:08:02.100206,238,2157,474,3347818,134865517,488,62718,1,21.0,-1.0,0.00,0,1,1,1,1
2020-11-22 19:08:02.106091,238,2158,61,3348252,134865517,75,62718,0,0.0,-1.0,23.64,0,0,0,0,0
2020-11-22 19:09:39.123390,238,2172,474,3398835,2911016762,488,62306,1,21.0,-1.0,0.00,0,1,1,1,1


In [560]:
!ls

LICENSE         README.md       data_prep.ipynb


In [561]:
aux.to_parquet('preped_dataset.parquet')