In [1]:
import torch
import pandas as pd
import os

In [2]:
# Input locations. They are assembled with os.path.join so the notebook is not
# tied to Windows path separators; on Windows the resulting strings are
# identical to the previous hard-coded '.\\io_test\\...' literals.
path = os.path.join('.', 'io_test', 'fair_cut.csv')    # full CSV that gets split into shards
shards_path = os.path.join('.', 'io_test', 'shards')   # directory that holds the shard_<n>.csv files

In [3]:
# Peek at the first ten rows only: that is enough to learn the column names
# without loading the whole CSV into memory.
head = pd.read_csv(path, nrows=10)
cols = list(head.columns)
print(len(cols))

80


In [6]:
#DISSECT DATASET
def dissect(path_of_jumbo_ds, mem_limit, where_to_store):
    """Split a large CSV file into consecutive, smaller CSV shards.

    The source file is streamed in chunks of ``mem_limit`` rows, so it is never
    loaded into memory as a whole. Each chunk is written, with its header and
    without the index, to ``<where_to_store>/shard_<n>.csv`` where ``n`` starts
    at 0. Existing shard files with the same name are overwritten.

    Args:
        path_of_jumbo_ds: Path of the CSV file to split.
        mem_limit: Maximum number of rows per shard (the pandas ``chunksize``).
        where_to_store: Directory that receives the shard files; it is created
            when it does not exist yet.
    """
    if where_to_store:
        os.makedirs(where_to_store, exist_ok=True)

    # Read from the path given by the caller (not from a notebook-level global).
    with pd.read_csv(path_of_jumbo_ds, chunksize=mem_limit) as reader:
        for count, chunk in enumerate(reader):
            shard_file = os.path.join(where_to_store, 'shard_{0}.csv'.format(count))
            chunk.to_csv(shard_file, mode='w', header=True, index=False)

In [7]:
# Split the full dataset into shards of at most 100_000 rows each.
dissect(path_of_jumbo_ds=path, mem_limit=100_000, where_to_store=shards_path)

In [5]:
# Only the column order of a shard is needed here, so a single row of the
# first shard is read. The shard location is derived from shards_path instead
# of repeating a hard-coded path.
row = pd.read_csv(os.path.join(shards_path, 'shard_0.csv'),
                  skiprows=0,
                  nrows=1)

# Names of the feature columns fed to the model (presumably the features
# retained by an ANOVA-based selection done elsewhere -- confirm).
keep_after_anova = [
    'Src Port',
    'Dst Port',
    'Protocol',
    'Flow Duration',
    'Fwd Pkt Len Max',
    'Fwd Pkt Len Min',
    'Fwd Pkt Len Mean',
    'Fwd Pkt Len Std',
    'Bwd Pkt Len Min',
    'Bwd Pkt Len Mean',
    'Fwd IAT Tot',
    'Pkt Len Min',
    'RST Flag Cnt',
    'PSH Flag Cnt',
    'ACK Flag Cnt',
    'CWE Flag Count',
    'Fwd Seg Size Avg',
    'Bwd Seg Size Avg',
    'Init Fwd Win Byts',
    'Fwd Seg Size Min',
]

shard_columns = row.columns.tolist()

# Positional indices (in shard column order) of the features and of the label.
relevant_x_idx = [i for i, c in enumerate(shard_columns) if c in keep_after_anova]
relevant_y_idx = [i for i, c in enumerate(shard_columns) if c == 'Label']

# Fail early if the shard layout does not match what the datasets and the
# model expect, instead of failing later while iterating.
missing_features = [c for c in keep_after_anova if c not in shard_columns]
assert not missing_features, "feature columns missing from shard: {0}".format(missing_features)
assert len(relevant_y_idx) == 1, "expected exactly one 'Label' column, found {0}".format(len(relevant_y_idx))

print(relevant_x_idx, relevant_y_idx)

[0, 1, 2, 3, 8, 9, 10, 11, 13, 14, 22, 40, 47, 48, 49, 51, 55, 56, 67, 70] [79]


In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
%store -r scaler2
scaler = scaler2
#scaler.mean_ = scaler_data2[0]
#scaler.var_ = scaler_data2[1]
#scaler.scale_ = scaler_data2[2]
#print(scaler_data2)



In [10]:
from torch.utils.data import Dataset, DataLoader

class CustomDatasetShard(Dataset):
    """Map-style dataset over one CSV shard that reads a single row per access.

    Every ``__getitem__`` call parses exactly one data row of the shard file,
    so memory usage stays minimal at the cost of one file read per sample.

    Args:
        shard_path: Path of the CSV shard (first line is the header).
        features_cols: List of positional column indices used as features.
        target: List holding the positional index of the label column.
        onehot: Mapping from a label value found in the shard to the target
            tensor returned for it.
        scaler: Object with a ``transform(2d_array)`` method applied to the
            feature row.
        shard_length: Number of data rows in the shard. When ``None`` the
            shard is read once to count them.
    """

    def __init__(self, shard_path, features_cols, target, onehot, scaler, shard_length = None):
        self.x_cols, self.y_cols = features_cols, target
        self.path = shard_path
        self.onehot = onehot
        self.scaler = scaler
        self.len = shard_length if shard_length is not None else self.__get_shard_len__()

    def __getitem__(self, index):
        """Return ``(features, target)`` for data row ``index`` of the shard.

        ``features`` is a 1-D float32 tensor with one entry per feature column;
        ``target`` is the ``onehot`` entry of the row's label.

        Raises:
            IndexError: if ``index`` is outside the shard.
        """
        if index < 0:
            index += self.len
        if not 0 <= index < self.len:
            raise IndexError("index {0} is out of range for a shard of {1} rows".format(index, self.len))

        # Skip the header line plus the `index` preceding data rows and parse the
        # wanted row without a header, so no data row is consumed as column names.
        row = pd.read_csv(self.path,
                          header=None,
                          skiprows=index + 1,
                          nrows=1)
        row_x = row.iloc[:, self.x_cols]
        # .iloc[0] is explicitly positional; Series[0] would be a label lookup.
        label = row.iloc[0, self.y_cols].iloc[0]
        tensor_label = self.onehot[label]

        # Use the scaler handed to this dataset, not a notebook-level global.
        scaled = self.scaler.transform(row_x.to_numpy(dtype=float).reshape(1, -1))
        # reshape(-1) flattens to however many feature columns were selected.
        return torch.tensor(scaled.reshape(-1), dtype=torch.float32), tensor_label

    def __get_shard_len__(self):
        """Count the data rows of the shard (only the first column is parsed)."""
        return len(pd.read_csv(self.path, usecols=[0]))

    def __len__(self):
        return self.len

In [7]:
class Cache():
    """Single-slot cache that remembers the most recently loaded shard.

    Attributes are set to ``None`` until a shard has been stored.
    """

    def __init__(self) -> None:
        self.currently_loaded = None   # path of the shard held in the cache
        self.dataframe_loaded = None   # content stored for that path

    def cache_shard(self, path, df):
        """Store ``df`` as the cached content of ``path``, replacing any previous entry."""
        self.dataframe_loaded, self.currently_loaded = df, path

    def is_cached(self, path):
        """Tell whether ``path`` is the shard currently held in the cache."""
        loaded = self.currently_loaded
        return False if loaded is None else path == loaded


# Notebook-wide cache instance.
cache = Cache()

In [8]:
from torch.utils.data import Dataset, DataLoader

class CachedCustomDatasetShard(Dataset):
    """Map-style dataset over one CSV shard, optionally backed by a shard cache.

    Without a cache every access parses a single row from the file. With a
    cache the whole shard is loaded once (when it is not the shard currently
    held by the cache) and rows are then served from the cached DataFrame.

    Args:
        shard_path: Path of the CSV shard (first line is the header).
        features_cols: List of positional column indices used as features.
        target: List holding the positional index of the label column.
        onehot: Mapping from a label value found in the shard to the target
            tensor returned for it.
        scaler: Object with a ``transform(2d_array)`` method applied to the
            feature row.
        cache: Optional object offering ``is_cached(path)``,
            ``cache_shard(path, df)`` and a ``dataframe_loaded`` attribute.
            ``None`` disables caching.
        shard_length: Number of data rows in the shard. When ``None`` the
            shard is read once to count them.
    """

    def __init__(self, shard_path, features_cols, target, onehot, scaler, cache: "Cache | None" = None, shard_length = None):
        self.cache = cache
        self.x_cols, self.y_cols = features_cols, target
        self.path = shard_path
        self.onehot = onehot
        self.scaler = scaler
        self.len = shard_length if shard_length is not None else self.__get_shard_len__()

    def __getitem__(self, index):
        """Return ``(features, target)`` for data row ``index`` of the shard.

        ``features`` is a 1-D float32 tensor with one entry per feature column;
        ``target`` is the ``onehot`` entry of the row's label.

        Raises:
            IndexError: if ``index`` is outside the shard.
        """
        if index < 0:
            index += self.len
        if not 0 <= index < self.len:
            raise IndexError("index {0} is out of range for a shard of {1} rows".format(index, self.len))

        if self.cache is None:
            # Skip the header line plus the `index` preceding data rows and parse
            # the wanted row without a header, so no data row becomes column names.
            row = pd.read_csv(self.path,
                              header=None,
                              skiprows=index + 1,
                              nrows=1)
            return self._to_sample(row.iloc[0, self.x_cols], row.iloc[0, self.y_cols])

        # A cache is available: make sure it holds this shard, then read from it.
        if not self.cache.is_cached(self.path):
            self.cache.cache_shard(self.path, pd.read_csv(self.path))

        shard = self.cache.dataframe_loaded
        return self._to_sample(shard.iloc[index, self.x_cols], shard.iloc[index, self.y_cols])

    def _to_sample(self, row_x, row_y):
        """Turn one row's feature values and label entry into ``(features, target)``."""
        # .iloc[0] is explicitly positional; Series[0] would be a label lookup.
        tensor_label = self.onehot[row_y.iloc[0]]
        # A row taken across mixed-type columns can come back with object dtype,
        # hence the explicit float conversion before scaling.
        features = row_x.to_numpy(dtype=float).reshape(1, -1)
        # Use the scaler handed to this dataset, not a notebook-level global.
        scaled = self.scaler.transform(features)
        # reshape(-1) flattens to however many feature columns were selected.
        return torch.tensor(scaled.reshape(-1), dtype=torch.float32), tensor_label

    def __get_shard_len__(self):
        """Count the data rows of the shard (only the first column is parsed)."""
        return len(pd.read_csv(self.path, usecols=[0]))

    def __len__(self):
        return self.len

In [9]:
# Collect every shard CSV below shards_path. os.walk yields directory entries
# in arbitrary order, so the paths are sorted to get the same dataset order on
# every run; files that are not CSVs are ignored.
shards = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(shards_path)
    for filename in filenames
    if filename.lower().endswith('.csv')
)
if not shards:
    raise FileNotFoundError("no shard CSV files found under {0!r}".format(shards_path))

# Target tensor returned for each value of the 'Label' column.
dict_ = {
    'ddos': torch.tensor([1, 0], dtype=torch.float32),
    'Benign': torch.tensor([0, 1], dtype=torch.float32),
}

# One dataset per shard. All of them share the single notebook-wide cache,
# which keeps one shard at a time, so it only pays off when the samples are
# read shard after shard.
dss = [
    CachedCustomDatasetShard(sh, relevant_x_idx, relevant_y_idx, dict_, scaler, cache=cache, shard_length=None)
    for sh in shards
]

chain_ds = torch.utils.data.ConcatDataset(dss)

In [10]:
# Unshuffled batches of 200 samples over the concatenated shards.
loader = DataLoader(dataset=chain_ds, batch_size=200, shuffle=False)

In [None]:
# Sanity check: walk through the whole loader and print the shape of the
# feature and target tensors of every batch.
for features, targets in loader:
    print(features.shape, targets.shape)

In [11]:
class AttackNet(torch.nn.Module):
    """Fully connected classifier mapping 20 input features to 2 log-probabilities.

    The hidden widths are 512, 256, 128 and 64; the first two hidden layers use
    ReLU, the last two use Tanh, and the output goes through LogSoftmax over
    dimension 1.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn
        layers = [
            nn.Linear(20, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.Tanh(),
            nn.Linear(128, 64),
            nn.Tanh(),
            nn.Linear(64, 2),
            nn.LogSoftmax(dim=1),
        ]
        # The attribute name and the layer positions define the state_dict keys
        # ("model.0.weight", ...), so both are kept as they are.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` (a batch with 20 features per row) through the network."""
        return self.model(x)

In [12]:
model = AttackNet()

# map_location='cpu' lets a checkpoint that was saved from a GPU be restored on
# a machine without CUDA; the model is moved to the evaluation device later.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
state_dict = torch.load(
    os.path.join('.', 'models', '02-10-2023_11-38-49__anova_binary_opt_sched_on_fair.model'),
    map_location='cpu',
)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [13]:
# Prefer the first CUDA device when one is available, otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
device

'cuda:0'

\# Unfortunately, `print(..., flush=True)` does not render well in a Jupyter notebook: the last output (the per-class accuracy) is the first one shown...

In [14]:
def test(model, loader, n_classes):
    model=model.to(device)
    model.eval()

    seen_of_class = [0]*n_classes
    acc = [0]*n_classes
    #translator={0 : 'ddos', 1 : 'Benign'}

    model.eval()

    with torch.no_grad():
        for inputs, ground_truth in loader:
            pred = torch.argmax(model(inputs.to(device)), dim=1)
            targ = torch.argmax(ground_truth, dim=1)

            targs = targ.tolist()
            preds = pred.detach().tolist()

            for p,t in zip(preds, targs):
                seen_of_class[t] += 1
                if p == t:
                    acc[p] += 1

            running_acc = [acc[i]/seen_of_class[i] if seen_of_class[i] != 0 else 0 for i in range(n_classes)]
            print("Running accuracy:", running_acc, ". Seen", sum(seen_of_class), "samples.", end='\r', flush=True)

# Evaluate the restored model on all shards (two classes).
test(model, loader, n_classes=2)

Running accuracy: [0.997919107205187, 0.9962137186555163] . Seen 12548460 samples..[0.9960423157514734, 0.9962236249982255] . Seen 6775000 samples. [0.9960423157514734, 0.9962232928460997] . Seen 6778600 samples. [0.9960423157514734, 0.9962268972167785] . Seen 7049000 samples. [0.9960423157514734, 0.9962197530544067] . Seen 7176200 samples. [0.9960423157514734, 0.9962178749024225] . Seen 7185800 samples. [0.9960423157514734, 0.9962138350526312] . Seen 7490400 samples. [0.9960423157514734, 0.9962129248950148] . Seen 7544400 samples. [0.9960423157514734, 0.9962173398707479] . Seen 7806400 samples. [0.9960423157514734, 0.9962178155396962] . Seen 7814200 samples. [0.9960423157514734, 0.9962176461905862] . Seen 7833000 samples. [0.9960423157514734, 0.9962182496712092] . Seen 7886200 samples. [0.9960423157514734, 0.9962190526967406] . Seen 8137800 samples. [0.9960423157514734, 0.9962190024480895] . Seen 8161000 samples. [0.9960423157514734, 0.9962239658552996] . Seen 8434000 samples. [0.9960