## DDOS DETECTION

In [70]:
import sklearn # Check Sklearn version
sklearn.__version__

'0.22.1'

## 1. Initialize Boto3 SDK and create S3 bucket. 

In [71]:
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import datetime
import time
import tarfile
import boto3
import pandas as pd

sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'ddos-detection' # Mention the created S3 bucket name here
print("Using bucket " + bucket)

Using bucket ddos-detection


## 3. Data Exploration and Understanding.

In [None]:
df = pd.read_csv("cleaned_dataset.csv")

In [72]:
df.head()

Unnamed: 0,Protocol,Fwd Pkt Len Min,Bwd Pkt Len Min,Pkt Len Min,PSH Flag Cnt,ACK Flag Cnt,Fwd Seg Size Min,Label
0,6,0.0,0.0,0.0,0,0,0,1
1,6,0.0,0.0,0.0,0,0,0,1
2,6,0.0,0.0,0.0,0,0,0,1
3,6,0.0,0.0,0.0,0,0,0,1
4,6,0.0,0.0,0.0,0,1,0,1


In [8]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [10]:
df['Label'].value_counts(normalize=True)

1    0.50747
0    0.49253
Name: Label, dtype: float64

In [73]:
df.columns

Index(['Protocol', 'Fwd Pkt Len Min', 'Bwd Pkt Len Min', 'Pkt Len Min',
       'PSH Flag Cnt', 'ACK Flag Cnt', 'Fwd Seg Size Min', 'Label'],
      dtype='object')

In [12]:
df.shape

(638322, 8)

In [13]:
features = list(df.columns)
features

['Protocol',
 'Fwd Pkt Len Min',
 'Bwd Pkt Len Min',
 'Pkt Len Min',
 'PSH Flag Cnt',
 'ACK Flag Cnt',
 'Fwd Seg Size Min',
 'Label']

In [14]:
label = features.pop(-1)
label

'Label'

In [15]:
x = df[features]
y = df[label]

In [16]:
x.head()

Unnamed: 0,Protocol,Fwd Pkt Len Min,Bwd Pkt Len Min,Pkt Len Min,PSH Flag Cnt,ACK Flag Cnt,Fwd Seg Size Min
0,6,0.0,0.0,0.0,0,0,0
1,6,0.0,0.0,0.0,0,0,0
2,6,0.0,0.0,0.0,0,0,0
3,6,0.0,0.0,0.0,0,0,0
4,6,0.0,0.0,0.0,0,1,0


In [18]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Label, dtype: int64

In [19]:
x.shape

(638322, 7)

In [20]:
y.value_counts()

1    323929
0    314393
Name: Label, dtype: int64

In [22]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=101)

In [23]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(446825, 7)
(191497, 7)
(446825,)
(191497,)


## 4. Split the data into Train/Test CSV File. 

In [24]:
trainX = pd.DataFrame(X_train)
trainX[label] = y_train

testX = pd.DataFrame(X_test)
testX[label] = y_test

In [74]:
print(trainX.shape)
print(testX.shape)

(446825, 8)
(191497, 8)


In [26]:
trainX.head()

Unnamed: 0,Protocol,Fwd Pkt Len Min,Bwd Pkt Len Min,Pkt Len Min,PSH Flag Cnt,ACK Flag Cnt,Fwd Seg Size Min,Label
85057,6,0.0,0.0,0.0,0,0,0,1
51107,6,0.0,0.0,0.0,0,1,0,1
428144,6,0.0,0.0,0.0,1,0,20,0
36373,6,0.0,0.0,0.0,0,1,0,1
553088,17,25.0,41.0,25.0,0,0,8,0


## 5. Upload data into the S3 Bucket.

In [28]:
trainX.to_csv("train-V-1.csv",index = False)
testX.to_csv("test-V-1.csv", index = False)

In [29]:
# send data to S3. SageMaker will take training data from s3
sk_prefix = "sagemaker/ddos-detection/sklearncontainer"
trainpath = sess.upload_data(
    path="train-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)

testpath = sess.upload_data(
    path="test-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)

In [75]:
testpath

's3://ddos-detection/sagemaker/ddos-detection/sklearncontainer/test-V-1.csv'

In [76]:
trainpath

's3://ddos-detection/sagemaker/ddos-detection/sklearncontainer/train-V-1.csv'

## 6. Create Training Script

In [84]:
%%writefile script.py


from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd

# inference functions ---------------

# def input_fn(request_body, request_content_type):
#     print(request_body)
#     print(request_content_type)
#     if request_content_type == "text/csv":
#         request_body = request_body.strip()
#         try:
#             df = pd.read_csv(StringIO(request_body), header=None)
#             return df
        
#         except Exception as e:
#             print(e)
#     else:
#         return """Please use Content-Type = 'text/csv' and, send the request!!""" 
 
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

# def predict_fn(input_data, model):
#     if type(input_data) != str:
#         prediction = model.predict(input_data)
#         print(prediction)
#         return prediction
#     else:
#         return input_data
        
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("-hl1", type=int, default=1)
    parser.add_argument("--hl2", type=int, default=1)
    parser.add_argument("--alpha", type=float, default = 1.0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
  
    print("Training MLP MODEL.....")
    print()
    model = MLPClassifier(hidden_layer_sizes=(int(args.hl1), int(args.hl2)), activation='relu', solver='adam', alpha=args.alpha, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    print()
    

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    print("Model persisted at " + model_path)
    print()

    
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Overwriting script.py


In [78]:
! python script.py --hl2 5 \
                   --hl1 6 \
                   --alpha 0.005529107510830757 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \

[INFO] Extracting arguments
SKLearn Version:  0.22.1
Joblib Version:  1.2.0
[INFO] Reading data

Building training and testing datasets

Column order: 
['Protocol', 'Fwd Pkt Len Min', 'Bwd Pkt Len Min', 'Pkt Len Min', 'PSH Flag Cnt', 'ACK Flag Cnt', 'Fwd Seg Size Min']

Label column is:  Label

Data Shape: 

---- SHAPE OF TRAINING DATA (85%) ----
(446825, 7)
(446825,)

---- SHAPE OF TESTING DATA (15%) ----
(191497, 7)
(191497,)

Training MLP MODEL.....


Model persisted at ./model.joblib


---- METRICS RESULTS FOR TESTING DATA ----

Total Rows are:  191497
[TESTING] Model Accuracy is:  0.9382340193318955
[TESTING] Testing Report: 
              precision    recall  f1-score   support

           0       0.91      0.96      0.94     94374
           1       0.96      0.91      0.94     97123

    accuracy                           0.94    191497
   macro avg       0.94      0.94      0.94    191497
weighted avg       0.94      0.94      0.94    191497



## 7. Train script in-side Sagemaker container.

In [79]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="MLP-sklearn",
    hyperparameters={
        "hl1": 5,
        "hl2": 6,
        "alpha": 0.005529107510830757
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)

In [81]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2023-04-08 04:19:54 Starting - Preparing the instances for training
2023-04-08 04:19:54 Downloading - Downloading input data
2023-04-08 04:19:54 Training - Training image download completed. Training in progress.
2023-04-08 04:19:54 Uploading - Uploading generated training model
2023-04-08 04:19:54 Completed - Training job completed
Model artifact persisted at s3://sagemaker-ap-south-1-289991152277/MLP-sklearn-2023-04-08-04-16-14-201/output/model.tar.gz


## 9. Deploy Sagemaker Endpoint(API) for trained model, and test it. 

In [82]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name =  model_name,
    model_data=artifact,
    role=get_execution_role(),
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [83]:
endpoint_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

EndpointName=Custom-sklearn-model-2023-04-08-04-21-49


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2023-04-08-04-21-43
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2023-04-08-04-21-49
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2023-04-08-04-21-49


----!

In [41]:
testX[features][0:2].values.tolist()

[[6.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], [6.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]]

In [42]:
print(predictor.predict(testX[features][0:2].values.tolist()))

[1 1]


In [53]:
import io

In [49]:
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv,arr,delimiter=",",fmt="%g")
    return csv.getvalue().decode().rstrip()

##  deleting the endpoint !

In [43]:
sm_boto3.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '6d675b79-2c7d-4944-91e0-5222826d820e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6d675b79-2c7d-4944-91e0-5222826d820e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Tue, 04 Apr 2023 18:00:21 GMT'},
  'RetryAttempts': 0}}