In [None]:
!pip install kfp
from typing import NamedTuple
from kfp.components import InputPath, OutputPath
from kfp.components import func_to_container_op

from datetime import datetime

import sys
sys.path.insert(0,"..")

In [1]:
def prepare_data(
    url : str,
    X_train_path:  OutputPath("PKL"),
    Y_train_path:  OutputPath("PKL"),
    X_test_path:  OutputPath("PKL"),
    X_test_path:  OutputPath("PKL")
):  
    import os
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import time

    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.metrics import accuracy_score
    import numpy as np
    import matplotlib.pyplot as plt
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.wrappers.scikit_learn import KerasClassifier
    import joblib
    import wget

    wget.download(url)
    cols="""duration,
    protocol_type,
    service,
    flag,
    src_bytes,
    dst_bytes,
    land,
    wrong_fragment,
    urgent,
    hot,
    num_failed_logins,
    logged_in,
    num_compromised,
    root_shell,
    su_attempted,
    num_root,
    num_file_creations,
    num_shells,
    num_access_files,
    num_outbound_cmds,
    is_host_login,
    is_guest_login,
    count,
    srv_count,
    serror_rate,
    srv_serror_rate,
    rerror_rate,
    srv_rerror_rate,
    same_srv_rate,
    diff_srv_rate,
    srv_diff_host_rate,
    dst_host_count,
    dst_host_srv_count,
    dst_host_same_srv_rate,
    dst_host_diff_srv_rate,
    dst_host_same_src_port_rate,
    dst_host_srv_diff_host_rate,
    dst_host_serror_rate,
    dst_host_srv_serror_rate,
    dst_host_rerror_rate,
    dst_host_srv_rerror_rate"""

    columns=[]
    for c in cols.split(','):
        if(c.strip()):
            columns.append(c.strip())
    columns.append('target')

    attacks_types = {
    'normal': 'normal',
    'back': 'dos',
    'buffer_overflow': 'u2r',
    'ftp_write': 'r2l',
    'guess_passwd': 'r2l',
    'imap': 'r2l',
    'ipsweep': 'probe',
    'land': 'dos',
    'loadmodule': 'u2r',
    'multihop': 'r2l',
    'neptune': 'dos',
    'nmap': 'probe',
    'perl': 'u2r',
    'phf': 'r2l',
    'pod': 'dos',
    'portsweep': 'probe',
    'rootkit': 'u2r',
    'satan': 'probe',
    'smurf': 'dos',
    'spy': 'r2l',
    'teardrop': 'dos',
    'warezclient': 'r2l',
    'warezmaster': 'r2l',
    }
    path = "kddcup.data_10_percent.gz"
    df = pd.read_csv(path,names=columns)

    #Adding Attack Type column
    df['Attack Type'] = df.target.apply(lambda r:attacks_types[r[:-1]])
    #Finding categorical features
    num_cols = df._get_numeric_data().columns

    cate_cols = list(set(df.columns)-set(num_cols))
    cate_cols.remove('target')
    cate_cols.remove('Attack Type')
    df = df.dropna('columns')# drop columns with NaN

    df = df[[col for col in df if df[col].nunique() > 1]]# keep columns where there are more than 1 unique values

    corr = df.corr()
    df.drop('num_root',axis = 1,inplace = True)
    df.drop('srv_serror_rate',axis = 1,inplace = True)
    df.drop('srv_rerror_rate',axis = 1, inplace=True)
    df.drop('dst_host_srv_serror_rate',axis = 1, inplace=True)
    df.drop('dst_host_serror_rate',axis = 1, inplace=True)
    df.drop('dst_host_rerror_rate',axis = 1, inplace=True)
    df.drop('dst_host_srv_rerror_rate',axis = 1, inplace=True)
    df.drop('dst_host_same_srv_rate',axis = 1, inplace=True)
    df_std = df.std()
    df_std = df_std.sort_values(ascending = True)
    #protocol_type feature mapping
    pmap = {'icmp':0,'tcp':1,'udp':2}
    df['protocol_type'] = df['protocol_type'].map(pmap)
    #flag feature mapping
    fmap = {'SF':0,'S0':1,'REJ':2,'RSTR':3,'RSTO':4,'SH':5 ,'S1':6 ,'S2':7,'RSTOS0':8,'S3':9 ,'OTH':10}
    df['flag'] = df['flag'].map(fmap)
    df.drop('service',axis = 1,inplace= True)
    df = df.drop(['target',], axis=1)
    print(df.shape)

    # Target variable and train set
    Y = df[['Attack Type']]
    X = df.drop(['Attack Type',], axis=1)

    sc = MinMaxScaler()
    X = sc.fit_transform(X)

    # Split test and train data 
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
    print(X_train.shape, X_test.shape)
    print(Y_train.shape, Y_test.shape)
    
prepare_data_op = func_to_container_op(
    func =  prepare_data ,
    packages_to_install = [
        "joblib",
        "wget",
        "scikit-learn",
        "matplotlib",
        "pandas",
        "numpy",
        "seaborn"
    ]   
)


IndentationError: expected an indented block (153950560.py, line 52)

In [None]:
def train_data(
    X_train_path:  InputPath("PKL"),
    Y_train_path:  InputPath("PKL"),
    X_test_path:  InputPath("PKL"),
    Y_test_path:  InputPath("PKL")
):  
    import os
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import time

    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.metrics import accuracy_score
    import numpy as np
    import matplotlib.pyplot as plt
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.wrappers.scikit_learn import KerasClassifier
    import joblib
    import wget
    
    X_train = joblib.load(X_train_path)
    Y_train = joblib.load(Y_train_path)
    X_test = joblib.load(X_test_path)
    Y_test = joblib.load(Y_test_path)
    def create_model():
        model = Sequential()
        model.add(Dense(30,input_dim =30,activation = 'relu',kernel_initializer='random_uniform'))
        model.add(Dense(1,activation='sigmoid',kernel_initializer='random_uniform'))
        #5 classes-normal,dos,probe,r2l,u2r
        model.add(Dense(5,activation='softmax'))
        model.compile(loss ='categorical_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
        return model
    
    model = KerasClassifier(build_fn=create_model,epochs=100,batch_size=64)
    
    start_time = time.time()
    model.fit(X_train,Y_train.values.ravel())
    end_time = time.time()
    print("Testing time: ",end_time-start_time)
    
    start_time = time.time()
    Y_train_pred = model.predict(X_train)
    end_time = time.time()
    accuracy_score(Y_train,Y_train_pred)
    
    
    start_time = time.time()
    Y_test_pred = model.predict(X_test)
    end_time = time.time()
    accuracy_score(Y_test,Y_test_pred)
    
    
train_op = func_to_container_op(
    func = train_data  ,
    packages_to_install = [
        "joblib",
        "wget",
        "scikit-learn",
        "matplotlib",
        "pandas",
        "numpy",
        "seaborn"
    ]   
)


In [None]:
import kfp.dsl as dsl
@dsl.pipeline(
  name='Deep Learning IDS/IPS',
  description='Pipeline'
)
def my_pipeline(url):
    prepare_data_task = prepare_data_op(url)
    train_task = train_op(x_train= prepare_data_task.outputs["X_train"],
                          y_train= prepare_data_task.outputs["Y_train"],
                          x_test= prepare_data_task.outputs["X_test"],
                          y_test= prepare_data_task.outputs["Y_test"],
                          ) 

session_cookies = ""
HOST = ""
client = kfp.Client(
  host= f"{HOST}/pipeline",
  cookies = f"authservice_session={session_cookies}",
  namespace="admin"
)    
client.create_run_from_pipeline_func(
  my_pipeline,
  arguments={
    'url': 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'
  }
)