In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [None]:
# Read the internal-threat dataset straight from S3 and keep only the rows
# that have no missing values.
s3_path = 's3://zero-trust-ml-dataset/datasets/internal_threats.csv'
df = pd.read_csv(s3_path).dropna()

In [None]:
# Features and labels: the 'Label' column is the target, every other column
# is a feature. The encoder is kept in `le` so class names can be recovered.
le = LabelEncoder()
y = le.fit_transform(df['Label'])
X = df.drop(columns='Label')

In [None]:
# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions equal in both splits, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#LR Pipeline
# Hyper-parameters of the logistic-regression step, gathered in one place.
# NOTE(review): `multi_class` is reported as deprecated in scikit-learn >= 1.5;
# confirm against the installed version before upgrading.
lr_params = {
    'multi_class': 'multinomial',
    'solver': 'lbfgs',
    'max_iter': 500,
    'C': 1.0,
    'random_state': 42,
}

# Standardise the features first, then classify; the step names 'scaler' and
# 'lr' are looked up again later through pipeline.named_steps.
lr_steps = [
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(**lr_params)),
]
pipeline = Pipeline(steps=lr_steps)

In [None]:
# Fit both pipeline steps (scaler, then logistic regression) on the training split.
pipeline.fit(X_train, y_train)



In [None]:
#Evaluation
# Score the held-out split: overall accuracy plus per-class precision/recall,
# with the report rows named after the original (un-encoded) labels.
y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred, target_names=le.classes_)
print("✅ Logistic Regression Accuracy:", test_accuracy)
print("✅ Classification Report:\n", report_text)

✅ Logistic Regression Accuracy: 0.9930555555555556
✅ Classification Report:
                precision    recall  f1-score   support

          bot       1.00      0.99      0.99      3600
infilteration       0.99      1.00      0.99      3600

     accuracy                           0.99      7200
    macro avg       0.99      0.99      0.99      7200
 weighted avg       0.99      0.99      0.99      7200



In [None]:
# Serialise the fitted pipeline (scaler + logistic regression) to a local file,
# then copy that file to the models/ prefix of the S3 bucket via the AWS CLI.
joblib.dump(pipeline, 'internal_lr_pipeline.pkl')
!aws s3 cp internal_lr_pipeline.pkl s3://zero-trust-ml-dataset/models/internal_lr_pipeline.pkl

upload: ./internal_lr_pipeline.pkl to s3://zero-trust-ml-dataset/models/internal_lr_pipeline.pkl


In [None]:
#save label encoder
# The encoder is stored next to the model so the integer predictions can be
# mapped back to the original label strings by whoever loads the pipeline.
joblib.dump(le, 'internal_label_encoder.pkl')
!aws s3 cp internal_label_encoder.pkl s3://zero-trust-ml-dataset/models/internal_label_encoder.pkl

upload: ./internal_label_encoder.pkl to s3://zero-trust-ml-dataset/models/internal_label_encoder.pkl


In [None]:
#for reference
# Write each train/test split to its own CSV file (no index column).
split_exports = {
    'X_train_int.csv': X_train,
    'X_test_int.csv': X_test,
    'y_train_int.csv': y_train,
    'y_test_int.csv': y_test,
}
for csv_name, split in split_exports.items():
    pd.DataFrame(split).to_csv(csv_name, index=False)

In [59]:
# Copy the four split CSVs from the working directory to the root of the
# zero-trust-ml-dataset bucket (no prefix is used for these objects).
!aws s3 cp X_train_int.csv s3://zero-trust-ml-dataset/X_train_int.csv
!aws s3 cp X_test_int.csv s3://zero-trust-ml-dataset/X_test_int.csv
!aws s3 cp y_train_int.csv s3://zero-trust-ml-dataset/y_train_int.csv
!aws s3 cp y_test_int.csv s3://zero-trust-ml-dataset/y_test_int.csv

upload: ./X_train_int.csv to s3://zero-trust-ml-dataset/X_train_int.csv
upload: ./X_test_int.csv to s3://zero-trust-ml-dataset/X_test_int.csv
upload: ./y_train_int.csv to s3://zero-trust-ml-dataset/y_train_int.csv
upload: ./y_test_int.csv to s3://zero-trust-ml-dataset/y_test_int.csv


Simulation

In [None]:
import shap
import boto3
import json
import numpy as np
from datetime import datetime
import time

start_time = time.time()

# The 'lr' step was fitted on the output of the 'scaler' step, so everything
# handed to it directly (SHAP background data, the sample row) has to go
# through the fitted scaler first. Feeding it raw features yields meaningless
# SHAP values and probabilities.
scaler = pipeline.named_steps['scaler']
lr_model = pipeline.named_steps['lr']

#use the pipeline
# Background data for the explainer, in the same (scaled) space the model saw.
explainer = shap.LinearExplainer(lr_model, scaler.transform(X_train))

# First held-out row, scaled the same way as the training data. Later code
# that calls the 'lr' step directly with `sample_data` relies on it being scaled.
sample_data = scaler.transform(X_test[0:1])

# Per-feature contributions for that single row.
shap_values = explainer.shap_values(sample_data)

print("SHAP values shape:", [v.shape for v in shap_values])

# Simulate the prediction
# Column 1 of predict_proba is the probability of the class encoded as 1.
probability = lr_model.predict_proba(sample_data)[0][1]

#calculate the threat level
# Linear map of the probability from [0, 1] onto [0.1, 1.0], rounded to 2 decimals.
threat_level = round(0.1 + (0.9 * probability), 2)


def policy_action(threat_score):
    """Translate a numeric threat score into the recommended policy action.

    Args:
        threat_score: Numeric score; compared against the 0.5 and 0.8 cut-offs.

    Returns:
        "User is okay" for scores below 0.5, "Verify user" for scores from 0.5
        up to (but excluding) 0.8, and "Revoke/Terminate Session" otherwise.
    """
    if threat_score < 0.5:
        return "User is okay"
    # Reaching this point means the score is not below 0.5, so only the upper
    # bound of the middle band still needs checking.
    if threat_score < 0.8:
        return "Verify user"
    return "Revoke/Terminate Session"


action = policy_action(threat_level)

# Preparing the data to send to Lambda
# Predicted (integer-encoded) class for the sample row.
predicted_class = pipeline.named_steps['lr'].predict(sample_data)[0]

# Use the entry at index 1 when `shap_values` has more than one entry,
# otherwise fall back to the only entry available.
if len(shap_values) > 1:
    shap_payload = shap_values[1].tolist()
else:
    shap_payload = shap_values[0].tolist()

simulated_threat_data = {
    'threat_level': threat_level,
    'user_id': 'user123',
    'prediction': predicted_class,
    'shap_values': shap_payload,
    'recommended_action': action,
}

# Function for converting numpy types to native Python types
def convert_to_native_types(obj):
    """Recursively replace numpy values with JSON-friendly Python objects.

    Handles every numpy scalar type (not only ``np.int64`` / ``np.float64``),
    numpy arrays, and values nested inside lists, tuples and dicts.

    Args:
        obj: Any object; containers are walked recursively.

    Returns:
        A structure of the same shape in which numpy scalars are native Python
        scalars and numpy arrays are (nested) lists. Lists stay lists, tuples
        stay tuples, dict keys are left untouched, and any other object is
        returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() already yields native scalars for numeric dtypes; recurse
        # so object-dtype arrays holding numpy scalars are covered as well.
        return convert_to_native_types(obj.tolist())
    if isinstance(obj, np.generic):
        # Covers all numpy scalar widths and kinds (int32, float32, bool_, ...).
        return obj.item()
    if isinstance(obj, list):
        return [convert_to_native_types(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_to_native_types(item) for item in obj)
    if isinstance(obj, dict):
        return {key: convert_to_native_types(value) for key, value in obj.items()}
    return obj

# Run the payload through convert_to_native_types before it is serialised
# with json.dumps inside invoke_lambda.
simulated_threat_data = convert_to_native_types(simulated_threat_data)


# Lambda client used by invoke_lambda. No region or credentials are passed
# here; presumably they come from the environment's default AWS configuration.
lambda_client = boto3.client('lambda')


def invoke_lambda(threat_data):
    """Send ``threat_data`` to the 'Internal-test' Lambda function.

    The payload is JSON-encoded and submitted through the module-level
    ``lambda_client`` with ``InvocationType='Event'``; the raw response of
    the invoke call is printed.

    Args:
        threat_data: JSON-serialisable object forwarded as the Lambda payload.
    """
    request = {
        'FunctionName': 'Internal-test',
        'InvocationType': 'Event',
        'Payload': json.dumps(threat_data),
    }
    response = lambda_client.invoke(**request)
    print("Lambda invoked:", response)


# Fire the Lambda invocation with the prepared payload.
invoke_lambda(simulated_threat_data)

# Wall-clock duration of the whole simulation cell, measured from start_time.
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"Time for simulation: {elapsed_seconds:.2f} seconds")

# Local timestamp of when the simulation finished.
dt = datetime.now()
print("current system time:", dt)
