In [1]:
import os
import glob
from pathlib import Path

!pip install inquirer
import inquirer

import pyspark

from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.classification import  RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorSlicer

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
%autosave 60



Autosaving every 60 seconds


In [None]:
# 1. Take datasett name from user ( + option ALL)

# 2. Which feature selection user want to apply

# 3. selected/train.csv, test.csv

In [48]:
class SparkFactory:
    def __init__(self):
        self.base = Path('./datasets')
        self.aviable_datasets = ["CICIDS", "NET", "All"]
        self.spark = SparkSession.builder.appName(__name__).getOrCreate()
    
    def validate_available_dataset(self):
#         return [name for name in glob.glob(f'{self.source_dataset_folder}/**')]
        return [name for name in self.aviable_datasets]

    def preprocess(dataframe):
        dataframe.fillna(dataframe.mean(), inplace=True)
        TARGETS = ['binary_class', 'multi_class']
        ordinal_encoder = OrdinalEncoder()
        for target_column in TARGETS: 
            dataframe[f'{target_column}'] = ordinal_encoder.fit_transform(dataframe[[f'{target_column}']]).astype('int')
        
        X = dataframe.drop(['binary_class','multi_class'], axis=1)
        y = dataframe.drop()
        
        return X

    def read_dataset(self, dataset):
        """Read source dataset with sql interface"""
        print(f'Reading dataset: {dataset}')
        try:
            self.spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(dataset) 
            data = self.spark.read.option("header", "true").csv(dataset)
        except Exception as e:
            raise e
            
        return data
    
    def read_df(self, dataset):
        df = self.spark.createDataFrame(pd.read_csv(dataset))
        return df
    
    def _one_hot_endcoder(data):
        
        Vectorizer = CountVectorizer(inputCol="Color_Array", outputCol="Color_OneHotEncoded", vocabSize=4, minDF=1.0)
        return data
    
    def run(self):
        
        print(self.base)
        # validate and choose dataset to work with
        assert input('Hi! Would u like to see available datasets to process? Y/n: ') == 'Y', 'Okay, see u later!'    
        print(self.validate_available_dataset())
        dataset = input('Good! Choose one of datasets to work with pasting name shown in previous step: ')
        
        while False:
            path = f'' + self.source_dataset_folder + '/TRANSFORMED_{dataset}'
            print(path)
            assert os.path.isdir(path) == True, 'Wrong path, try one more time!'
        print(f'Nice, we gonna to preprocess {dataset}')
        data = self.read_dataset(dataset)
        # print(data.toPandas().head(10))
        
        # validate and choose method to use
        print('Available methods to apply: [OHE, B, C, D]')
        method = input('Now choose one of methods to apply ')
        if method == 'OHE':
            data = _one_hot_endcoder(data)


In [21]:
# SparkFactory().run()

In [49]:
TRANSFORMED_CICIDS_TRAIN = pd.read_csv('datasets/TRANSFORMED_CICIDS/train.csv')

_df = SparkFactory.preprocess(TRANSFORMED_CICIDS_TRAIN)

In [61]:
spark = SparkSession.builder.appName(__name__).getOrCreate()

In [62]:
spark_df = spark.createDataFrame(_df)

In [109]:
spark_df

DataFrame[src_port: bigint, pld_distinct: bigint, bytes_out: bigint, hdr_mean: double, num_pkts_out: bigint, pld_mean: double, rev_hdr_distinct: bigint, hdr_bin_40: bigint, pr: bigint, rev_hdr_bin_40: bigint, pld_bin_inf: bigint, rev_pld_mean: double, rev_pld_bin_128: bigint, hdr_distinct: bigint, dns_answer_cnt: double, id: bigint, dns_query_cnt: double, pld_median: bigint, time_length: double, rev_pld_distinct: bigint, num_pkts_in: bigint, rev_pld_var: double, pld_max: bigint, rev_pld_max: bigint, dst_port: bigint, bytes_in: bigint, tls_svr_key_exchange_len: double, tls_svr_cnt: double, tls_cnt: double, tls_key_exchange_len: double, tls_svr_ext_cnt: double, tls_svr_cs_cnt: double, tls_cs_cnt: double, tls_ext_cnt: double, http_method: double, http_code: double, http_content_len: double]

In [98]:
spark_df_

DataFrame[src_port: bigint, pld_distinct: bigint, bytes_out: bigint, hdr_mean: double, num_pkts_out: bigint, pld_mean: double, rev_hdr_distinct: bigint, hdr_bin_40: bigint, pr: bigint, rev_hdr_bin_40: bigint, pld_bin_inf: bigint, rev_pld_mean: double, rev_pld_bin_128: bigint, hdr_distinct: bigint, dns_answer_cnt: double, id: bigint, dns_query_cnt: double, pld_median: bigint, time_length: double, rev_pld_distinct: bigint, num_pkts_in: bigint, rev_pld_var: double, pld_max: bigint, rev_pld_max: bigint, dst_port: bigint, bytes_in: bigint, tls_svr_key_exchange_len: double, tls_svr_cnt: double, tls_cnt: double, tls_key_exchange_len: double, tls_svr_ext_cnt: double, tls_svr_cs_cnt: double, tls_cs_cnt: double, tls_ext_cnt: double, http_method: double, http_code: double, http_content_len: double, binary_class: bigint, multi_class: bigint]

In [103]:
drop_list = ['multi_class']

spark_df_ = spark_df_.select([column for column in spark_df_.columns if column not in drop_list])

In [113]:
spark_df_

DataFrame[src_port: bigint, pld_distinct: bigint, bytes_out: bigint, hdr_mean: double, num_pkts_out: bigint, pld_mean: double, rev_hdr_distinct: bigint, hdr_bin_40: bigint, pr: bigint, rev_hdr_bin_40: bigint, pld_bin_inf: bigint, rev_pld_mean: double, rev_pld_bin_128: bigint, hdr_distinct: bigint, dns_answer_cnt: double, id: bigint, dns_query_cnt: double, pld_median: bigint, time_length: double, rev_pld_distinct: bigint, num_pkts_in: bigint, rev_pld_var: double, pld_max: bigint, rev_pld_max: bigint, dst_port: bigint, bytes_in: bigint, tls_svr_key_exchange_len: double, tls_svr_cnt: double, tls_cnt: double, tls_key_exchange_len: double, tls_svr_ext_cnt: double, tls_svr_cs_cnt: double, tls_cs_cnt: double, tls_ext_cnt: double, http_method: double, http_code: double, http_content_len: double, binary_class: bigint]

In [110]:
num_var = [i[0] for i in spark_df_.dtypes if (((i[1]=='int') | (i[1]=='bigint') | (i[1]=='double')) & (i[0]!='binary_class'))]

In [114]:
label_indexes = StringIndexer(inputCol = 'binary_class', outputCol = 'label', handleInvalid = 'keep')

In [115]:
assembler = VectorAssembler(inputCols = num_var, outputCol = "features")

In [116]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed = 8464,numTrees=10, cacheNodeIds = True, subsamplingRate = 0.7)

In [117]:
pipe = Pipeline(stages = [assembler, label_indexes, rf])

In [119]:
mod = pipe.fit(spark_df_)

In [121]:
df2 = mod.transform(spark_df_)

In [122]:
mod.stages[-1].featureImportances

SparseVector(37, {0: 0.0027, 1: 0.0101, 2: 0.1089, 3: 0.0952, 4: 0.0041, 5: 0.1645, 6: 0.0011, 7: 0.0096, 8: 0.0004, 9: 0.0086, 10: 0.0003, 11: 0.0898, 12: 0.018, 13: 0.0106, 15: 0.0, 17: 0.0116, 18: 0.028, 19: 0.0002, 20: 0.0055, 21: 0.065, 22: 0.093, 23: 0.0791, 24: 0.0583, 25: 0.0763, 27: 0.008, 28: 0.0025, 29: 0.0055, 30: 0.0014, 32: 0.0021, 33: 0.0031, 34: 0.0, 35: 0.0059, 36: 0.0305})

In [124]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

In [125]:
varlist = ExtractFeatureImp(mod.stages[-1].featureImportances, df2, "features")

In [126]:
varidx = [x for x in varlist['idx'][0:10]]
varidx

[5, 2, 3, 22, 11, 23, 25, 21, 24, 36]

In [127]:
slicer = VectorSlicer(inputCol="features", outputCol="features2", indices=varidx)
df3 = slicer.transform(df2)

In [128]:
df3 = df3.drop('rawPrediction', 'probability', 'prediction')
rf2 = RandomForestClassifier(labelCol="label", featuresCol="features2", seed = 8464,
                            numTrees=10, cacheNodeIds = True, subsamplingRate = 0.7)
mod2 = rf2.fit(df3)
df4 = mod2.transform(df3)

In [129]:
df4

DataFrame[src_port: bigint, pld_distinct: bigint, bytes_out: bigint, hdr_mean: double, num_pkts_out: bigint, pld_mean: double, rev_hdr_distinct: bigint, hdr_bin_40: bigint, pr: bigint, rev_hdr_bin_40: bigint, pld_bin_inf: bigint, rev_pld_mean: double, rev_pld_bin_128: bigint, hdr_distinct: bigint, dns_answer_cnt: double, id: bigint, dns_query_cnt: double, pld_median: bigint, time_length: double, rev_pld_distinct: bigint, num_pkts_in: bigint, rev_pld_var: double, pld_max: bigint, rev_pld_max: bigint, dst_port: bigint, bytes_in: bigint, tls_svr_key_exchange_len: double, tls_svr_cnt: double, tls_cnt: double, tls_key_exchange_len: double, tls_svr_ext_cnt: double, tls_svr_cs_cnt: double, tls_cs_cnt: double, tls_ext_cnt: double, http_method: double, http_code: double, http_content_len: double, binary_class: bigint, features: vector, label: double, features2: vector, rawPrediction: vector, probability: vector, prediction: double]

In [None]:
encoding_var = [i[0] for i in df.dtypes if (i[1]=='string') & (i[0]!='y')]
num_var = [i[0] for i in df.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!='y')]

string_indexes = [StringIndexer(inputCol = c, outputCol = 'IDX_' + c, handleInvalid = 'keep') for c in encoding_var]
onehot_indexes = [OneHotEncoderEstimator(inputCols = ['IDX_' + c], outputCols = ['OHE_' + c]) for c in encoding_var]
label_indexes = StringIndexer(inputCol = 'y', outputCol = 'label', handleInvalid = 'keep')
assembler = VectorAssembler(inputCols = num_var + ['OHE_' + c for c in encoding_var], outputCol = "features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed = 8464,
                            numTrees=10, cacheNodeIds = True, subsamplingRate = 0.7)

pipe = Pipeline(stages = string_indexes + onehot_indexes + [assembler, label_indexes, rf])

In [4]:
df.head()

Unnamed: 0,src_port,pld_distinct,bytes_out,hdr_mean,num_pkts_out,pld_mean,rev_hdr_distinct,hdr_bin_40,pr,rev_hdr_bin_40,...,tls_key_exchange_len,tls_svr_ext_cnt,tls_svr_cs_cnt,tls_cs_cnt,tls_ext_cnt,http_method,http_code,http_content_len,binary_class,multi_class
0,56565,1,58,8.0,2,29.0,1,0,17,0,...,71.037562,2.415821,1.0,17.295687,7.264552,1.195444,206.271259,64397.744691,0,2
1,52995,1,148,8.0,4,37.0,1,0,17,0,...,71.037562,2.415821,1.0,17.295687,7.264552,1.195444,206.271259,64397.744691,0,2
2,40805,1,70,8.0,2,35.0,1,0,17,0,...,71.037562,2.415821,1.0,17.295687,7.264552,1.195444,206.271259,64397.744691,0,2
3,31833,2,146,8.0,4,36.5,1,0,17,0,...,71.037562,2.415821,1.0,17.295687,7.264552,1.195444,206.271259,64397.744691,0,2
4,64062,7,3272,32.8,15,218.13,3,14,6,12,...,258.0,1.0,1.0,19.0,1.0,1.195444,206.271259,64397.744691,0,2


In [3]:
TRANSFORMED_CICIDS_TRAIN = pd.read_csv('datasets/TRANSFORMED_CICIDS/train.csv')
#TRANSFORMED_CICIDS_TEST = pd.read_csv('datasets/TRANSFORMED_CICIDS/train.csv')

TRANSFORMED_NET_TRAIN = pd.read_csv('datasets/TRANSFORMED_NET/train.csv')
#TRANSFORMED_NET_TEST = pd.read_csv('datasets/TRANSFORMED_CICIDS/train.csv')

TARGETS = ['binary_class', 'multi_class']

def preprocess(dataframe: pd.DataFrame):
    dataframe.fillna(dataframe.mean(), inplace=True)
    print(f'Filled Nan values for dataset')
    print(f'Sanity check: Nan values before: {dataframe.isna().sum()}, after: {dataframe.isna().sum()}')
    
    ordinal_encoder = OrdinalEncoder()
    for target_column in TARGETS: 
        dataframe[f'{target_column}'] = ordinal_encoder.fit_transform(dataframe[[f'{target_column}']]).astype('int')
        print(f'Encoded column {target_column} with class number: {dataframe[f"{target_column}"].unique()}')
    return dataframe

df = preprocess(TRANSFORMED_CICIDS_TRAIN)

Filled Nan values for dataset
Sanity check: Nan values before: src_port                    0
pld_distinct                0
bytes_out                   0
hdr_mean                    0
num_pkts_out                0
pld_mean                    0
rev_hdr_distinct            0
hdr_bin_40                  0
pr                          0
rev_hdr_bin_40              0
pld_bin_inf                 0
rev_pld_mean                0
rev_pld_bin_128             0
hdr_distinct                0
dns_answer_cnt              0
id                          0
dns_query_cnt               0
pld_median                  0
time_length                 0
rev_pld_distinct            0
num_pkts_in                 0
rev_pld_var                 0
pld_max                     0
rev_pld_max                 0
dst_port                    0
bytes_in                    0
tls_svr_key_exchange_len    0
tls_svr_cnt                 0
tls_cnt                     0
tls_key_exchange_len        0
tls_svr_ext_cnt             0
tls_svr

In [4]:
# SparkFactory().run()
data = SparkFactory().read_dataset('datasets/CICIDS2017/train/fine/train.csv')

Reading dataset: datasets/CICIDS2017/train/fine/train.csv


In [4]:
from pyspark.sql.types import StringType, DoubleType

categorical_columns = [f.name for f in data.schema.fields if isinstance(f.dataType, StringType)]
numerical_columns = [f.name for f in data.schema.fields]
print(len(categorical_columns))
print(len(numerical_columns))

123
123


In [5]:
from collections import defaultdict

data_types = defaultdict(list)

for entry in data.schema.fields:
    data_types[str(entry.dataType)].append(entry.name)
    
data_types    

defaultdict(list,
            {'StringType': ['_c0',
              'ack_psh_rst_syn_fin_cnt_0',
              'ack_psh_rst_syn_fin_cnt_1',
              'ack_psh_rst_syn_fin_cnt_2',
              'ack_psh_rst_syn_fin_cnt_3',
              'ack_psh_rst_syn_fin_cnt_4',
              'bytes_in',
              'bytes_out',
              'dst_port',
              'hdr_bin_40',
              'hdr_ccnt_0',
              'hdr_ccnt_1',
              'hdr_ccnt_2',
              'hdr_ccnt_3',
              'hdr_ccnt_4',
              'hdr_ccnt_5',
              'hdr_ccnt_6',
              'hdr_ccnt_7',
              'hdr_ccnt_8',
              'hdr_ccnt_9',
              'hdr_ccnt_10',
              'hdr_ccnt_11',
              'hdr_distinct',
              'hdr_mean',
              'intervals_ccnt_0',
              'intervals_ccnt_1',
              'intervals_ccnt_2',
              'intervals_ccnt_3',
              'intervals_ccnt_4',
              'intervals_ccnt_5',
              'intervals_cc

In [9]:
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import OneHotEncoder, StringIndexer

# strings_used = [var for var in data_types["StringType"]]
# stage_string = [StringIndexer(inputCol= c, outputCol= c +"_string_encoded") for c in strings_used]
# stage_one_hot = [OneHotEncoder(inputCol= c + "_string_encoded", outputCol= c + "_one_hot") for c in strings_used]

# ppl = Pipeline(stages= stage_string + stage_one_hot)
# df = ppl.fit(data).transform(data)

In [13]:
df.drop(columns='Unnamed: 0', inplace=True)

In [15]:
X = df.drop(columns='y')
y = df.y

In [24]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441116 entries, 0 to 441115
Columns: 121 entries, ack_psh_rst_syn_fin_cnt_0 to time_length
dtypes: float64(121)
memory usage: 407.2 MB
