In [22]:
import os
from datetime import datetime
import pandas as pd
from progressbar import progressbar
import subprocess
from multiprocessing.pool import ThreadPool

In [23]:
def num_lines(file):
    r = 0
    with open(file, 'r') as file:
        for l in file:
            r += 1
            
    return r

In [24]:
def epoch_to_datetime(d):
    return datetime.fromtimestamp(d)

In [25]:
base_folder = 'dataset/iot_23_datasets_full/opt/Malware-Project/BigDataset/IoTScenarios/'
destination_folder = 'dataset/parsed/'

In [26]:
def count_parsed_files():
    return len(os.listdir(destination_folder))

In [27]:
folders = [base_folder + i for i in os.listdir(base_folder)]

In [28]:
files = [i + '/bro/' + os.listdir(i + '/bro')[0] for i in folders if 'Honeypot-Capture-7-1' not in i] + ['dataset/iot_23_datasets_full/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-7-1/Somfy-01/bro/conn.log.labeled']

In [29]:
class FileParser:
    def __init__(self, file_path, batch_size=100000, verbose=False):
        self.file_path = file_path
        self.num_lines = num_lines(file_path)
        self.batch_size = batch_size
        self.aux_df = pd.DataFrame()
        self.files_generated = 0
        self.verbose = verbose

    def _parse_line(self, line):
        return line.replace('\t', ' ').split()

    def parse_fields(self, fields_line):
        self.columns = self._parse_line(fields_line)[1:]

    def parse_types(self, types_line):
        self.types = self._parse_line(types_line)[1:]
    
    def _append_line(self, line):
        aux = pd.DataFrame(self._parse_line(line)).T
        aux.columns = self.columns
        self.aux_df = self.aux_df.append(aux)
        if len(self.aux_df) > self.batch_size:
            self._rotate_df()
    
    def _rotate_df(self):
        part_name = destination_folder + f'part_{count_parsed_files() + 1}'
        if self.verbose:
            print('Rotating df; file saved =', part_name)
        
        self.aux_df.to_parquet(part_name)
        self.files_generated += 1
        self.aux_df = pd.DataFrame()
    
    def process(self):
        if self.verbose:
            print('starting processing of file:', self.file_path)
            
        with open(self.file_path, 'r') as file:
            for line in progressbar(file, max_value=self.num_lines, redirect_stdout=True):
                if line.startswith('#fields'):
                    self.parse_fields(line)
                elif line.startswith('#types'):
                    self.parse_types(line)
                elif not line.startswith('#'):
                    self._append_line(line)
        
        if len(self.aux_df) > 0:
            self._rotate_df()
        
        return self.files_generated

In [30]:
def process_file(file):
    parser = FileParser(file, verbose=True)
    return parser.process()

In [33]:
lines = []
columns = []
with open(files[0], 'r') as file:
    for i in range(50):
        l = file.readline()
        if l.startswith('#fields'):
            columns = l.replace('\t', ' ').split()[1:]
        elif not l.startswith('#'):
            lines.append(l.replace('\t', ' ').split())

In [36]:
pd.DataFrame(lines, columns=columns)

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
0,1540469302.53864,CGm6jB4dXK71ZDWUDh,192.168.1.132,58687,216.239.35.4,123,udp,-,0.114184,48,...,-,0,Dd,1,76,1,76,-,benign,-
1,1540469197.400159,CnaDAG3n5r8eiG4su2,192.168.1.132,1900,239.255.255.250,1900,udp,-,160.367579,7536,...,-,0,D,24,8208,0,0,-,benign,-
2,1540469385.734089,CUrxU238nt0m6yTgKf,192.168.1.132,32893,216.239.35.8,123,udp,-,0.016986,48,...,-,0,Dd,1,76,1,76,-,benign,-
3,1540469831.302625,CGQf8t1kjdxB5PHXL4,192.168.1.132,53395,2.16.60.82,443,tcp,-,0.003497,0,...,-,0,ShAFf,5,212,3,144,-,benign,-
4,1540469831.265405,CUo9DH2QDnCaBIGjkg,192.168.1.132,52801,192.168.1.1,53,udp,dns,0.036724,34,...,-,0,Dd,1,62,1,339,-,benign,-
5,1540469418.379528,CAvXOZ3htimWEtglii,192.168.1.132,1900,239.255.255.250,1900,udp,-,384.518261,15072,...,-,0,D,48,16416,0,0,-,benign,-
6,1540470081.850824,CfJsUD2NGQvnK2p7Vd,192.168.1.132,58124,216.239.35.12,123,udp,-,0.270332,48,...,-,0,Dd,1,76,1,76,-,benign,-
7,1540470187.222098,CVwKZS98dRvk1jeH2,192.168.1.132,35313,216.239.35.0,123,udp,-,0.111429,48,...,-,0,Dd,1,76,1,76,-,benign,-
8,1540470355.430009,CzbHG4aoHRooWvyMg,192.168.1.132,46064,216.239.35.4,123,udp,-,0.114433,48,...,-,0,Dd,1,76,1,76,-,benign,-
9,1540470419.608808,CbTB0B2ZnnDWLAIml3,192.168.1.132,45230,216.239.35.8,123,udp,-,0.016984,48,...,-,0,Dd,1,76,1,76,-,benign,-


In [31]:
for file in files:
    r = process_file(file)
    print(f'parts created for file: {r}')

starting processing of file: dataset/iot_23_datasets_full/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-4-1/bro/conn.log.labeled


100% (461 of 461) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


Rotating df; file saved = dataset/parsed/part_1
parts created for file: 1
starting processing of file: dataset/iot_23_datasets_full/opt/Malware-Project/BigDataset/IoTScenarios/CTU-Honeypot-Capture-5-1/bro/conn.log.labeled


100% (1383 of 1383) |####################| Elapsed Time: 0:00:01 Time:  0:00:01


Rotating df; file saved = dataset/parsed/part_2
parts created for file: 1
starting processing of file: dataset/iot_23_datasets_full/opt/Malware-Project/BigDataset/IoTScenarios/CTU-IoT-Malware-Capture-1-1/bro/conn.log.labeled


  2% (26451 of 1008757) |                | Elapsed Time: 0:02:18 ETA:   3:01:43

KeyboardInterrupt: 