# CIC-Darknet2020 Dataset Statistics

Here we load data from the [CIC-Darknet2020](https://www.unb.ca/cic/datasets/darknet2020.html) dataset and process it for our experiments.

First we import all relevant libraries, set a random seed, and print python and library versions for reproducability

In [1]:
import datetime, os, platform, pprint, sys
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

seed: int = 14

# set up pretty printer for easier data evaluation
pretty = pprint.PrettyPrinter(indent=4, width=30).pprint

# set up pandas display options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print(
    f'''
    Last Execution: {datetime.datetime.now()}
    python:\t{platform.python_version()}

    \tmatplotlib:\t{mpl.__version__}
    \tnumpy:\t\t{np.__version__}
    \tpandas:\t\t{pd.__version__}
    '''
)


    Last Execution: 2022-02-13 02:15:38.795827
    python:	3.7.10

    	matplotlib:	3.3.4
    	numpy:		1.20.3
    	pandas:		1.2.5
    


Next we prepare some helper functions to help process the data

In [2]:
def get_file_path(directory: str):
    '''
        Closure that will return a function. 
        Function will return the filepath to the directory given to the closure
    '''

    def func(file: str) -> str:
        return os.path.join(directory, file)

    return func



def load_data(filePath):
    '''
        Loads the Dataset from the given filepath and caches it for quick access in the future
        Function will only work when filepath is a .csv file
    '''

    # slice off the ./CSV/ from the filePath
    if filePath[0] == '.' and filePath[1] == '/':
        filePathClean: str = filePath[11::]
        pickleDump: str = f'./cache/{filePathClean}.pickle'
    else:
        pickleDump: str = f'./cache/{filePath}.pickle'
    
    print(f'Loading Dataset: {filePath}')
    print(f'\tTo Dataset Cache: {pickleDump}\n')
    
    # check if data already exists within cache
    if os.path.exists(pickleDump):
        df = pd.read_pickle(pickleDump)
        
    # if not, load data and clean it before caching it
    else:
        df = pd.read_csv(filePath, low_memory=True)
        df.to_pickle(pickleDump)
    
    return df



def features_with_bad_values(df: pd.DataFrame, datasetName: str) -> pd.DataFrame:
    '''
        Function will scan the dataframe for features with Inf, NaN, or Zero values.
        Returns a new dataframe describing the distribution of these values in the original dataframe
    '''

    # Inf and NaN values can take different forms so we screen for every one of them
    invalid_values: list = [ np.inf, np.nan, 'Infinity', 'inf', 'NaN', 'nan', 0 ]
    infs          : list = [ np.inf, 'Infinity', 'inf' ]
    NaNs          : list = [ np.nan, 'NaN', 'nan' ]

    # We will collect stats on the dataset, specifically how many instances of Infs, NaNs, and 0s are present.
    # using a dictionary that will be converted into a (3, 2+88) dataframe
    stats: dict = {
        'Dataset':[ datasetName, datasetName, datasetName ],
        'Value'  :['Inf', 'NaN', 'Zero']
    }

    i = 0
    for col in df.columns:
        
        i += 1
        feature = np.zeros(3)
        
        for value in invalid_values:
            if value in infs:
                j = 0
            elif value in NaNs:
                j = 1
            else:
                j = 2
            indexNames = df[df[col] == value].index
            if not indexNames.empty:
                feature[j] += len(indexNames)
                
        stats[col] = feature

    return pd.DataFrame(stats)



Before we do any processing on the data, we need to list out all their filepaths. If trying to reproduce the process carried out here, place files in the same location relative to the notebook.

In [3]:
# This code is used to scale to processing numerous datasets, even though we currently are only looking at one now
data_path_1: str = './original/'   
data_set_1: list = [
    'Darknet.csv',
]

data_set: list  = data_set_1
file_path_1      = get_file_path(data_path_1)
file_set: list   = list(map(file_path_1, data_set_1))
current_job: int = 0

Some more helper functions that process the data using the file and dataset information above

In [4]:
def examine_dataset(job_id: int) -> dict({'File': str, 'Dataset': pd.DataFrame, 'Feature_stats': pd.DataFrame, 'Data_composition': pd.DataFrame}):
    '''
        Function will return a dictionary containing dataframe of the job_id passed in as well as that dataframe's
        feature stats, data composition, and file name.
    '''

    job_id = job_id - 1  # adjusts for indexing while enumerating jobs from 1
    print(f'Dataset {job_id+1}/{len(data_set)}: We now look at {file_set[job_id]}\n\n')

    # Load the dataset
    df: pd.DataFrame = load_data(file_set[job_id])
    # df = clean_data(df)
 

    # print the data composition
    print(f'''
        File:\t\t\t\t{file_set[job_id]}  
        Job Number:\t\t\t{job_id+1}
        Shape:\t\t\t\t{df.shape}
        Samples:\t\t\t{df.shape[0]} 
        Features:\t\t\t{df.shape[1]}
    ''')
    

    # return the dataframe and the feature stats
    data_summary: dict =  {
        'File':             file_set[job_id],
        'Dataset':          df,
        'Feature_stats':    features_with_bad_values(df, file_set[job_id]), 
    }
    
    return data_summary



def check_infs(data_summary: dict) -> pd.DataFrame:
    '''
        Function will return a dataframe of features with a value of Inf.
    '''

    
    vals: pd.DataFrame = data_summary['Feature_stats']
    inf_df = vals[vals['Value'] == 'Inf'].T

    return inf_df[inf_df[0] != 0]



def check_nans(data_summary: dict) -> pd.DataFrame:
    '''
        Function will return a dataframe of features with a value of NaN.
    '''

    vals: pd.DataFrame = data_summary['Feature_stats']
    nan_df = vals[vals['Value'] == 'NaN'].T

    return nan_df[nan_df[1] != 0]



def check_zeros(data_summary: dict) -> pd.DataFrame:
    '''
        Function will return a dataframe of features with a value of 0.
    '''

    vals: pd.DataFrame = data_summary['Feature_stats']
    zero_df = vals[vals['Value'] == 'Zero'].T

    return zero_df[zero_df[2] != 0]



def check_zeros_over_threshold(data_summary: dict, threshold: int) -> pd.DataFrame:
    '''
        Function will return a dataframe of features with a value of 0.
    '''

    vals: pd.DataFrame = data_summary['Feature_stats']
    zero_df = vals[vals['Value'] == 'Zero'].T
    zero_df_bottom = zero_df[2:]

    return zero_df_bottom[zero_df_bottom[2] > threshold]



def check_zeros_over_threshold_percentage(data_summary: dict, threshold: float) -> pd.DataFrame:
    '''
        Function will return a dataframe of features with all features with
        a frequency of 0 values greater than the threshold
    '''

    vals: pd.DataFrame = data_summary['Feature_stats']
    size: int = data_summary['Dataset'].shape[0]
    zero_df = vals[vals['Value'] == 'Zero'].T
    zero_df_bottom = zero_df[2:]

    return zero_df_bottom[zero_df_bottom[2] > threshold*size]



def create_new_prune_candidates(zeros_df: pd.DataFrame) -> list:
    '''
        Function creates a list of prune candidates from a dataframe of features with a high frequency of 0 values
    '''

    return list(zeros_df.T.columns)



def intersection_of_prune_candidates(pruneCandidates: list, newPruneCandidates: list) -> list:
    '''
        Function will return a list of features that are in both pruneCandidates and newPruneCandidates
    '''

    return list(set(pruneCandidates).intersection(newPruneCandidates))


This gives us a set of file locations. Lets look at the set of files that make up our experiments.

In [5]:
print(f'We will be cleaning {len(file_set)} files:')
pretty(file_set)

We will be cleaning 1 files:
['./original/Darknet.csv']


## The Original CIC-Darknet2020 Dataset

In [6]:
dataset_1: dict = examine_dataset(1)
dataset_1['Dataset']['Label1'] = dataset_1['Dataset']['Label1'].str.lower()
dataset_1['Dataset'].Label1.unique()

print(
f"""
    Labels in the first layer:
{dataset_1['Dataset'].groupby('Label').size()}

    Labels in the second layer:
 {dataset_1['Dataset'].groupby('Label1').size()}
"""
)

Dataset 1/1: We now look at ./original/Darknet.csv


Loading Dataset: ./original/Darknet.csv
	To Dataset Cache: ./cache/Darknet.csv.pickle


        File:				./original/Darknet.csv  
        Job Number:			1
        Shape:				(141530, 85)
        Samples:			141530 
        Features:			85
    

    Labels in the first layer:
Label
Non-Tor    93356
NonVPN     23863
Tor         1392
VPN        22919
dtype: int64

    Labels in the second layer:
 Label1
audio-streaming    18064
browsing           32808
chat               11478
email               6145
file-transfer      11182
p2p                48520
video-streaming     9767
voip                3566
dtype: int64



### Feature Breakdown


Now that we have a dataset loaded, let's explore the features.

In [7]:
prune: list = [] # prune is a list of all features we know we don't want to use
clip : list = [] # clip is a list of all values we do not want to use

values = dataset_1['Dataset'].values
columns = dataset_1['Dataset'].columns

print("Feature types:")
for i in range(dataset_1['Dataset'].shape[1]):
    if type(values[0][i]) == str and columns[i] != 'Label' and columns[i] != 'Label1':
        prune.append(columns[i]) 
    print(f"Column: {i}\tType: {type(values[0][i])}\tFeature: {columns[i]}")


print("\nFeature Samples:")
for i in range(dataset_1['Dataset'].shape[1]):
    print(f"Column: {i}\tSample: {values[0][i]}")

Feature types:
Column: 0	Type: <class 'str'>	Feature: Flow ID
Column: 1	Type: <class 'str'>	Feature: Src IP
Column: 2	Type: <class 'int'>	Feature: Src Port
Column: 3	Type: <class 'str'>	Feature: Dst IP
Column: 4	Type: <class 'int'>	Feature: Dst Port
Column: 5	Type: <class 'int'>	Feature: Protocol
Column: 6	Type: <class 'str'>	Feature: Timestamp
Column: 7	Type: <class 'int'>	Feature: Flow Duration
Column: 8	Type: <class 'int'>	Feature: Total Fwd Packet
Column: 9	Type: <class 'int'>	Feature: Total Bwd packets
Column: 10	Type: <class 'int'>	Feature: Total Length of Fwd Packet
Column: 11	Type: <class 'int'>	Feature: Total Length of Bwd Packet
Column: 12	Type: <class 'int'>	Feature: Fwd Packet Length Max
Column: 13	Type: <class 'int'>	Feature: Fwd Packet Length Min
Column: 14	Type: <class 'float'>	Feature: Fwd Packet Length Mean
Column: 15	Type: <class 'float'>	Feature: Fwd Packet Length Std
Column: 16	Type: <class 'int'>	Feature: Bwd Packet Length Max
Column: 17	Type: <class 'int'>	Feature

We can see from the sample above that the certain features have a structure.

Flow ID: (Source IP)-(Destination IP)-(Source Port)-(Destination Port)-(Protocol)


In [8]:
dataset_1['Dataset'].head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Fwd Segment Size Avg,Bwd Segment Size Avg,Fwd Bytes/Bulk Avg,Fwd Packet/Bulk Avg,Fwd Bulk Rate Avg,Bwd Bytes/Bulk Avg,Bwd Packet/Bulk Avg,Bwd Bulk Rate Avg,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,FWD Init Win Bytes,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Label1
0,10.152.152.11-216.58.220.99-57158-443-6,10.152.152.11,57158,216.58.220.99,443,6,24/07/2015 04:09:48 PM,229,1,1,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,8733.624454,229.0,0.0,229,229,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,4366.812227,4366.812227,0,0,0.0,0.0,0.0,2,0,0,0,2,0,0,0,1,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1892,1047,0,20,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,audio-streaming
1,10.152.152.11-216.58.220.99-57159-443-6,10.152.152.11,57159,216.58.220.99,443,6,24/07/2015 04:09:48 PM,407,1,1,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,4914.004914,407.0,0.0,407,407,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,2457.002457,2457.002457,0,0,0.0,0.0,0.0,2,0,0,0,2,0,0,0,1,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1987,1047,0,20,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,audio-streaming
2,10.152.152.11-216.58.220.99-57160-443-6,10.152.152.11,57160,216.58.220.99,443,6,24/07/2015 04:09:48 PM,431,1,1,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,4640.37123,431.0,0.0,431,431,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,2320.185615,2320.185615,0,0,0.0,0.0,0.0,2,0,0,0,2,0,0,0,1,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,2049,1047,0,20,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,audio-streaming
3,10.152.152.11-74.125.136.120-49134-443-6,10.152.152.11,49134,74.125.136.120,443,6,24/07/2015 04:09:48 PM,359,1,1,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,5571.030641,359.0,0.0,359,359,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,2785.51532,2785.51532,0,0,0.0,0.0,0.0,2,0,0,0,2,0,0,0,1,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,2008,1047,0,20,0,0,0,0,0.0,0.0,0.0,0.0,Non-Tor,audio-streaming
4,10.152.152.11-173.194.65.127-34697-19305-6,10.152.152.11,34697,173.194.65.127,19305,6,24/07/2015 04:09:45 PM,10778451,591,400,64530,6659,131,0,109.187817,22.283313,498,0,16.6475,46.833714,6604.75239,91.942711,10887.32424,11412.46641,78158,13,10778451,18268.56102,11786.14309,81171,126,10747836,26936.93233,15897.73845,78158,307,1,0,0,0,11820,8000,54.831627,37.111084,0,498,71.876008,56.93647,3241.761603,1,0,0,659,991,0,0,0,0,71.948537,109.187817,16.6475,0,0,0,0,659,6605,0,65,0,6,1382,2320,581,20,0,0,0,0,1437760000000000.0,3117718.131,1437760000000000.0,1437760000000000.0,Non-Tor,audio-streaming


In [9]:
dataset_1['Dataset'].describe()

Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Fwd Segment Size Avg,Bwd Segment Size Avg,Fwd Bytes/Bulk Avg,Fwd Packet/Bulk Avg,Fwd Bulk Rate Avg,Bwd Bytes/Bulk Avg,Bwd Packet/Bulk Avg,Bwd Bulk Rate Avg,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,FWD Init Win Bytes,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141483.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0,141530.0
mean,38450.268819,18124.647333,10.350427,20812800.0,152.800749,154.642062,112621.1,130453.0,208.92042,15.617078,69.594959,63.974203,229.971299,41.252194,107.286657,65.183434,inf,inf,2604871.0,3217701.0,9893961.0,907903.0,19768460.0,4101510.0,2590150.0,9198071.0,2481004.0,16632780.0,3054588.0,2205417.0,7408141.0,1617510.0,0.099562,0.0,0.0,0.0,2933.009,3099.272,6321.814,4574.66523,14.365604,323.627316,94.373958,108.474093,54881.04,0.455691,0.462284,0.038571,61.523168,268.251749,0.0,0.0,0.0,0.607214,108.900008,69.594959,107.286657,0.0,0.0,0.0,0.0,158.869455,45218.01,0.286745,43.128022,0.0,55.488568,5308.18967,1766.762411,96.881573,15.808182,0.0,0.0,0.0,0.0,702803000000000.0,55261370000000.0,730588500000000.0,599407800000000.0
std,19124.80199,22202.197159,5.431807,38091550.0,2378.323352,3418.715287,3251357.0,4566180.0,649.432333,31.312298,219.965954,168.646012,786.736861,100.785013,240.511916,193.995126,,,7124917.0,7753864.0,19685180.0,5665210.0,37656630.0,11455490.0,7138834.0,19212860.0,10496750.0,35907700.0,10364020.0,6824438.0,18016330.0,9098340.0,0.299416,0.0,0.0,0.0,47526.86,69951.75,35588.99,21380.969146,23.951595,966.025271,190.563975,207.64084,793355.2,0.556133,0.844423,0.342543,807.239077,5342.816116,0.0,0.0,0.0,12.497544,197.306424,219.965954,240.511916,0.0,0.0,0.0,0.0,3042.371342,1046669.0,0.452243,137.725733,0.0,134.367698,9895.622577,7563.995687,1582.814902,7.110714,0.0,0.0,0.0,0.0,705871000000000.0,193016500000000.0,724820200000000.0,713828700000000.0
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01666866,0.0,0.0,0.0,-2255.0,0.0,0.0,0.0,0.0,-458.0,0.0,0.0,0.0,0.0,-445.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32425.5,80.0,6.0,17781.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6411654,3021.5,0.0,16908.25,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.3798137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,43528.0,5355.0,6.0,416282.0,2.0,1.0,44.0,0.0,34.0,0.0,22.666667,0.0,0.0,0.0,0.0,0.0,74.07686,7.296616,207162.0,12976.21,411409.5,166.0,247678.5,68894.31,0.0,193295.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,16.0,4.869865,0.713498,0.0,59.0,28.363636,9.237604,85.33333,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,33.0,22.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,913.0,0.0,1.0,20.0,0.0,0.0,0.0,0.0,728125000000000.0,0.0,1427730000000000.0,5584614.0
75%,53338.0,40020.0,17.0,11814700.0,4.0,3.0,216.0,216.0,103.0,31.0,45.0,39.259818,213.0,0.0,96.969207,11.313709,990.826,400.6812,1882400.0,2213250.0,7210274.0,129120.8,9743915.0,2682830.0,1436683.0,6323130.0,102411.2,1321916.0,186044.2,45919.65,733554.0,34.0,0.0,0.0,0.0,0.0,92.0,72.0,280.7026,37.10676,29.0,345.0,95.837366,118.472453,14035.72,1.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,1.0,141.78373,45.0,96.969207,0.0,0.0,0.0,0.0,0.0,0.0,1.0,23.0,0.0,47.0,14348.0,1047.0,2.0,20.0,0.0,0.0,0.0,0.0,1456260000000000.0,10314570.0,1456320000000000.0,1456260000000000.0
max,65534.0,65535.0,17.0,120000000.0,238161.0,470862.0,769307400.0,670428700.0,64240.0,1350.0,28502.36897,15870.12298,48168.0,1350.0,9901.912706,11469.19348,inf,inf,119985600.0,84774620.0,119999400.0,119985600.0,120000000.0,119999700.0,84836840.0,119999700.0,119999700.0,120000000.0,119806500.0,84807190.0,119965700.0,119806500.0,1.0,0.0,0.0,0.0,4768644.0,9417240.0,2000000.0,1000000.0,1350.0,64240.0,6647.070671,12617.09549,159191100.0,2.0,7.0,71.0,48025.0,709023.0,0.0,0.0,0.0,4266.0,6647.188377,28502.36897,9901.912706,0.0,0.0,0.0,0.0,470862.0,151574100.0,1.0,6644.0,0.0,4872.0,65535.0,65535.0,113325.0,44.0,0.0,0.0,0.0,0.0,1460000000000000.0,1030000000000000.0,1460000000000000.0,1460000000000000.0


In [10]:
check_infs(dataset_1)

Unnamed: 0,0
Dataset,./original/Darknet.csv
Value,Inf
Flow Bytes/s,2.0
Flow Packets/s,49.0


In [11]:
check_nans(dataset_1)

Unnamed: 0,1
Dataset,./original/Darknet.csv
Value,


In [12]:
check_zeros(dataset_1)

Unnamed: 0,2
Dataset,./original/Darknet.csv
Value,Zero
Src Port,799.0
Dst Port,799.0
Protocol,799.0
Flow Duration,49.0
Total Bwd packets,40583.0
Total Length of Fwd Packet,39805.0
Total Length of Bwd Packet,75089.0
Fwd Packet Length Max,39805.0


In [13]:
check_zeros_over_threshold(dataset_1, 120000)

Unnamed: 0,2
Fwd PSH Flags,127439.0
Bwd PSH Flags,141530.0
Fwd URG Flags,141530.0
Bwd URG Flags,141530.0
RST Flag Count,138243.0
URG Flag Count,141530.0
CWE Flag Count,141530.0
ECE Flag Count,141530.0
Fwd Bytes/Bulk Avg,141530.0
Fwd Packet/Bulk Avg,141530.0


In [14]:
print(f'Last Execution: {datetime.datetime.now()}')
assert False, 'Nothing is complete after this point'

Last Execution: 2022-02-13 02:15:43.151696


AssertionError: Nothing is complete after this point

Here we try the tabgan library from 

In [None]:
import tabgan
from tabgan.sampler import OriginalGenerator, GANGenerator

print(
    f'''
    Last Execution: {datetime.datetime.now()}
    python:\t{platform.python_version()}

    \tmatplotlib:\t{mpl.__version__}
    \tnumpy:\t\t{np.__version__}
    \tpandas:\t\t{pd.__version__}
    \ttabgan:\t\t{tabgan.__version__}
    '''
)


    Last Execution: 2022-02-13 01:08:56.815693
    python:	3.7.10

    	matplotlib:	3.3.4
    	numpy:		1.20.3
    	pandas:		1.2.5
    	tabgan:		1.2.0
    


In [None]:
train = pd.DataFrame(np.random.randint(-10, 150, size=(150, 4)), columns=list("ABCD"))
target = pd.DataFrame(np.random.randint(0, 3, size=(150, 1)), columns=list("Y"))
test = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD"))

# generate data
new_train1, new_target1 = OriginalGenerator().generate_data_pipe(train, target, test, )
# new_train2, new_target2 = GANGenerator().generate_data_pipe(train, target, test, )



In [None]:
new_train1.shape
# new_target1.shape

(39, 4)