In [1]:
import os

import numpy as np
import pandas as pd
from scipy.io import arff


def read_arff(filename):
    """Load an ARFF file into a DataFrame together with its metadata.

    Parameters
    ----------
    filename : str or file-like
        Passed unchanged to ``scipy.io.arff.loadarff``.

    Returns
    -------
    dict
        ``{'data': pandas.DataFrame, 'meta': MetaData}``. ``data`` has one
        column per ARFF attribute, in declaration order, with every row that
        contains a missing value removed. ``meta`` is the metadata object
        returned by ``loadarff``.
    """
    # read arff data files
    records, meta = arff.loadarff(filename)

    # Build the frame straight from the structured array so that every
    # attribute keeps its own dtype. Flattening the records into one 2-D
    # array coerces numeric columns to text (NaN becomes the string 'nan',
    # which dropna cannot detect) and breaks the bytes decoding step when
    # the file has no nominal attribute at all.
    frame = pd.DataFrame(records, columns=meta.names())

    # nominal attributes are loaded as bytes: decode them to str
    for name, kind in zip(meta.names(), meta.types()):
        if kind != 'nominal':
            continue
        decoded = frame[name].str.decode('utf-8')
        # '?' is the ARFF missing-value marker; loadarff keeps it verbatim
        # for nominal attributes, so turn it into a real missing value.
        frame[name] = decoded.mask(decoded == '?')

    # drop rows with missing values and keep a contiguous index
    frame = frame.dropna().reset_index(drop=True)

    return {'data': frame, 'meta': meta}


In [2]:
# dataset = {
#     'test': {
#        'data': pandas.DataFrame
#        'meta': scipy.io.arff._arffread.MetaData
#     },
#     'train': {
#        'data': pandas.DataFrame
#        'meta': scipy.io.arff._arffread.MetaData
#     }
# }

# Load both splits of each dataset with read_arff; every entry is the
# {'data': ..., 'meta': ...} dict that read_arff returns.
kdd_dataset = {
    split: read_arff(path)
    for split, path in (
        ('test', '../data/kdd/test_dataset_with_label.csv.arff'),
        ('train', '../data/kdd/train_dataset_with_label.csv.arff'),
    )
}

unsw_dataset = {
    split: read_arff(path)
    for split, path in (
        ('test', '../data/unsw/UNSWNB15Testing.arff'),
        ('train', '../data/unsw/UNSWNB15Training.arff'),
    )
}


In [3]:
# dummy encoding and save to csv
import matplotlib.pyplot as plt
import pandas as pd

# KDD dataset
# One-hot encode the 'Class' column of each split, store the encoded frame
# back into kdd_dataset, and export it as CSV.
# NOTE(review): train and test are encoded independently, so their dummy
# columns can differ if a class value is absent from one split - confirm
# that downstream code handles this.

# to_csv does not create missing directories, so make sure the target exists
os.makedirs('../artefacts/3', exist_ok=True)

for split, csv_path in (
    ('train', '../artefacts/3/kdd_train.csv'),
    ('test', '../artefacts/3/kdd_test.csv'),
):
    kdd_dataset[split]['data'] = pd.get_dummies(
        kdd_dataset[split]['data'], columns=['Class'], prefix='Class')
    kdd_dataset[split]['data'].to_csv(csv_path, index=False)


In [4]:
# UNSW Dataset
# One-hot encode the 'label' column of each split, store the encoded frame
# back into unsw_dataset, and export it as CSV.
# NOTE(review): train and test are encoded independently, so their dummy
# columns can differ if a label value is absent from one split - confirm
# that downstream code handles this.

# to_csv does not create missing directories, so make sure the target exists
os.makedirs('../artefacts/3', exist_ok=True)

for split, csv_path in (
    ('train', '../artefacts/3/unsw_train.csv'),
    ('test', '../artefacts/3/unsw_test.csv'),
):
    unsw_dataset[split]['data'] = pd.get_dummies(
        unsw_dataset[split]['data'], columns=['label'], prefix='label')
    unsw_dataset[split]['data'].to_csv(csv_path, index=False)
