In [None]:
import pandas as pd
s3_path = 's3://zero-trust-ml-dataset/datasets/external_threats.csv'
df = pd.read_csv(s3_path)
df.dropna(inplace=True)

  from pandas.core.computation.check import NUMEXPR_INSTALLED
severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import joblib
import shap
import numpy as np

In [3]:
# Preprocessing
X = df.drop('Label', axis=1)
y = LabelEncoder().fit_transform(df['Label'])

In [4]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, stratify=y)

In [None]:
# Sspliting and saving trained dataset
pd.DataFrame(X_train).to_csv('X_train_ext.csv', index=False)
pd.DataFrame(X_test).to_csv('X_test_ext.csv', index=False)
pd.DataFrame(y_train).to_csv('y_train_ext.csv', index=False)
pd.DataFrame(y_test).to_csv('y_test_ext.csv', index=False)

In [7]:
!aws s3 cp X_train_ext.csv s3://zero-trust-ml-dataset/X_train_ext.csv
!aws s3 cp X_test_ext.csv s3://zero-trust-ml-dataset/X_test_ext.csv
!aws s3 cp y_train_ext.csv s3://zero-trust-ml-dataset/y_train_ext.csv
!aws s3 cp y_test_ext.csv s3://zero-trust-ml-dataset/y_test_ext.csv

upload: ./X_train_ext.csv to s3://zero-trust-ml-dataset/X_train_ext.csv
upload: ./X_test_ext.csv to s3://zero-trust-ml-dataset/X_test_ext.csv
upload: ./y_train_ext.csv to s3://zero-trust-ml-dataset/y_train_ext.csv
upload: ./y_test_ext.csv to s3://zero-trust-ml-dataset/y_test_ext.csv


In [None]:
# RF Pipeline
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])
rf_pipeline.fit(X_train, y_train)

In [None]:
# Evaluate of RF
y_pred_rf = rf_pipeline.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9122222222222223
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.66      0.69      3600
           1       1.00      1.00      1.00      3600
           2       1.00      1.00      1.00      3600
           3       1.00      1.00      1.00      3600
           4       1.00      1.00      1.00      3600
           5       0.69      0.73      0.71      3600
           6       1.00      1.00      1.00      3600

    accuracy                           0.91     25200
   macro avg       0.91      0.91      0.91     25200
weighted avg       0.91      0.91      0.91     25200



In [None]:

joblib.dump(rf_pipeline, 'rf_pipeline.pkl')
!aws s3 cp rf_pipeline.pkl s3://zero-trust-ml-dataset/models/rf_pipeline.pkl

upload: ./rf_pipeline.pkl to s3://zero-trust-ml-dataset/models/rf_pipeline.pkl


In [None]:
# SHAP Explainability for RF
'''explainer_rf = shap.Explainer(rf_pipeline.named_steps['rf'], X_train)
shap_values_rf = explainer_rf(X_test[:100], check_additivity=False)
shap_df_rf = pd.DataFrame(shap_values_rf.values[:, :, 0])
shap_df_rf.to_json('shap_external_rf.json')
!aws s3 cp shap_external_rf.json s3://zero-trust-ml-dataset/explainability/shap_external_rf.json'''



upload: ./shap_external_rf.json to s3://zero-trust-ml-dataset/explainability/shap_external_rf.json


Simulation Script

In [None]:
import shap
import boto3
import json
import numpy as np
from  datetime  import datetime
start_time = time.time()



background_data = shap.sample(X_train, 1)  

#RF pipeline
rf_model = rf_pipeline.named_steps['rf']

# Created SHAP TreeExplainer
explainer = shap.TreeExplainer(rf_model)


sample_data = X_test[0:1]  

shap_values = explainer.shap_values(sample_data)


simulated_probability = 0.5556  

threat_level = round(0.1 + (0.9 * simulated_probability), 2)  


assert threat_level == 0.6, "Simulated threat level is not high enough."

# Policy enforcement simulation
def policy_action(threat_score):
    if threat_score < 0.5:
        return "Allow"
    elif 0.5 <= threat_score < 0.8:
        return "Require MFA"
    else:
        return "Revoke/Terminate Session"


action = policy_action(threat_level)

#data to send
simulated_threat_data = {
    'threat_level': threat_level,  
    'user_id': 'user123',
    'prediction': y_pred_rf[0],  
    'shap_values': shap_values[1].tolist() if len(shap_values) > 1 else shap_values[0].tolist(), 
    'recommended_action': action  
}

def convert_to_native_types(obj):
    if isinstance(obj, np.int64):
        return int(obj)
    elif isinstance(obj, np.float64):
        return float(obj)
    elif isinstance(obj, list):
        return [convert_to_native_types(i) for i in obj]
    elif isinstance(obj, dict):
        return {key: convert_to_native_types(value) for key, value in obj.items()}
    return obj

simulated_threat_data = convert_to_native_types(simulated_threat_data)

lambda_client = boto3.client('lambda')

def invoke_lambda(threat_data):
    response = lambda_client.invoke(
        FunctionName='External-test',  
        InvocationType='Event',  
        Payload=json.dumps(threat_data)  
    )
    print("Lambda invoked:", response)

# Trigger Lambda
invoke_lambda(simulated_threat_data)

end_time = time.time()
print(f"Time for simulation: {end_time - start_time:.2f} seconds")
dt = datetime.now()

print("current system time :", dt)


In [None]:
# SVM 
binary_mask = np.isin(y, [0, 1])
X_train_bin = X_train[binary_mask[:len(X_train)]]
X_test_bin = X_test[binary_mask[len(X_train):]]
y_train_bin = y_train[binary_mask[:len(X_train)]]
y_test_bin = y_test[binary_mask[len(X_train):]]

svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(probability=True))
])
svm_pipeline.fit(X_train_bin, y_train_bin)

In [None]:
# Evaluation of SVM
y_pred_svm = svm_pipeline.predict(X_test_bin)
print("SVM Accuracy:", accuracy_score(y_test_bin, y_pred_svm))
print("SVM Classification Report:")
print(classification_report(y_test_bin, y_pred_svm))

SVM Accuracy: 0.8964436571740019
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.44      0.57      1064
           1       0.99      0.98      0.99      1004
           2       0.98      1.00      0.99      1039
           3       1.00      1.00      1.00      1053
           4       0.99      1.00      1.00      1072
           5       0.60      0.87      0.71      1051
           6       1.00      1.00      1.00      1056

    accuracy                           0.90      7339
   macro avg       0.91      0.90      0.89      7339
weighted avg       0.91      0.90      0.89      7339



In [None]:
joblib.dump(svm_pipeline, 'svm_pipeline.pkl')
!aws s3 cp svm_pipeline.pkl s3://zero-trust-ml-dataset/models/svm_pipeline.pkl

upload: ./svm_pipeline.pkl to s3://zero-trust-ml-dataset/models/svm_pipeline.pkl
