In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor, CatBoostClassifier
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

pd.set_option('display.max_columns', None)



## importing dataset

In [2]:
dataset = pd.read_csv("data.csv")
dataset

Unnamed: 0,campLocation,timestamp,source,destination,user,device,eventType,eventDescription,eventSeverity,mlRiskScore
0,Bangalore,2023-09-13T01:24,10.1.1.1,172.16.0.20,guest,ServerABC,malware-detection,Malware detected: File 'file910.txt' found on ...,warning,0.37
1,Bangalore,2023-09-17T23:36,10.1.1.1,172.16.0.20,anonymous,Workstation123,network-disconnected,Device 'Workstation123' disconnected from the ...,error,0.13
2,Bangalore,2023-09-18T09:47,84.124.148.119,48.239.159.33,admin,Workstation123,api-called,API called: API2 from IP 84.124.148.119,error,0.73
3,Bangalore,2023-09-15T23:45,192.168.10.5,10.1.1.2,anonymous,ServerABC,auth-success,Successful login for user 'anonymous' from IP ...,informational,0.10
4,Bangalore,2023-09-16T07:17,192.168.0.1,172.16.1.101,admin,ServerABC,dns-queries,DNS query from IP 192.168.0.1 for domain 'exam...,warning,0.17
...,...,...,...,...,...,...,...,...,...,...
495,Bangalore,2023-09-18T10:29,192.168.2.5,192.168.10.6,anonymous,DeviceXYZ,auth-success,Successful login for user 'anonymous' from IP ...,informational,0.10
496,Bangalore,2023-09-17T04:53,192.168.1.100,10.10.10.20,guest,Workstation123,permission-changes,User 'guest' changed permissions for file '/pa...,warning,0.17
497,Bangalore,2023-09-17T10:31,192.168.0.1,10.1.1.2,anonymous,ServerABC,auth-success,Successful login for user 'anonymous' from IP ...,critical,0.10
498,Bangalore,2023-09-12T18:27,192.168.2.5,10.1.1.2,user123,DeviceXYZ,auth-lockout,User 'user123' locked out after multiple faile...,critical,0.37


## dropping redundant columns

In [3]:
dataset.drop(columns=['source', 'destination', 'timestamp', 'eventDescription'], inplace=True)
dataset

Unnamed: 0,campLocation,user,device,eventType,eventSeverity,mlRiskScore
0,Bangalore,guest,ServerABC,malware-detection,warning,0.37
1,Bangalore,anonymous,Workstation123,network-disconnected,error,0.13
2,Bangalore,admin,Workstation123,api-called,error,0.73
3,Bangalore,anonymous,ServerABC,auth-success,informational,0.10
4,Bangalore,admin,ServerABC,dns-queries,warning,0.17
...,...,...,...,...,...,...
495,Bangalore,anonymous,DeviceXYZ,auth-success,informational,0.10
496,Bangalore,guest,Workstation123,permission-changes,warning,0.17
497,Bangalore,anonymous,ServerABC,auth-success,critical,0.10
498,Bangalore,user123,DeviceXYZ,auth-lockout,critical,0.37


## label encoding Source and Destination IPs (safe and malicious)

In [4]:
le = LabelEncoder()
columns = ['campLocation', 'user', 'device', 'eventType', 'eventSeverity']
dataset[columns] = dataset[columns].apply(le.fit_transform)

In [5]:
# Map Severity to numerical values
event_severity_threat = {"informational": 0.1, "warning": 0.7, "error": 0.4, "critical": 0.9}
dataset['eventSeverity'] = dataset['eventSeverity'].replace(event_severity_threat)

# Map EventType to numerical values
event_type_threat = {
    "auth-failed": 0.8,
    "auth-success": 0.1,
    "auth-lockout": 0.9,
    "network-connected": 0.2,
    "network-disconnected": 0.2,
    "firewall-change": 0.7,
    "dns-queries": 0.3,
    "malware-detection": 0.9,
    "system-shutdown": 0.8,
    "system-restart": 0.7,
    "system-failure": 0.9,
    "application-errors": 0.6,
    "application-usage": 0.2,
    "api-called": 0.4,
    "file-access": 0.5,
    "permission-changes": 0.3,
    "software-update": 0.6
}
dataset['eventType'] = dataset['eventType'].replace(event_type_threat)

# Encode other categorical columns manually
devices = {"Workstation123": 1, "DeviceXYZ": 2, "ServerABC": 3}
dataset['device'] = dataset['device'].replace(devices)

users = {"user123": 1, "guest": 2, "admin": 3, "anonymous": 4}
dataset['user'] = dataset['user'].replace(users)

# Add a random Access column
dataset['Access'] = np.random.randint(2, size=len(dataset))

# Prepare features and target variables for risk prediction
X = dataset.drop(columns=['mlRiskScore', 'Access'])
y = dataset['mlRiskScore']
y2 = dataset['Access']

In [6]:
# Define base and meta models for stacking
cat_model = CatBoostRegressor(verbose=False)
lgb_model = lgb.LGBMRegressor()
xgb_model = XGBRegressor()
meta_regressor = MLPRegressor(hidden_layer_sizes=(400, 200, 100, 50), activation='relu', solver='adam', random_state=42)

base_regressors = [
    ('lightgbm', lgb_model),
    ('catboost', cat_model),
    ('xgboost', xgb_model)
]

model = StackingRegressor(estimators=base_regressors, final_estimator=meta_regressor)

In [7]:
# Risk prediction using KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
predictions = np.zeros(len(X))

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    fold_preds = model.predict(X_test)
    fold_r2 = r2_score(y_test, fold_preds)
    print(f"R2 Score for this fold: {fold_r2}")

    predictions[test_index] += fold_preds

final_r2 = r2_score(y, predictions)
print(f"\nOverall R2 Score: {final_r2}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001697 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 4
[LightGBM] [Info] Start training from score 0.302350
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000067 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 4
[LightGBM] [Info] Start training from score 0.304188
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000020 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28
[Lig

In [8]:
# Prepare features for access revoked prediction
new_dataset = dataset.drop(columns=['Access'])
X2 = new_dataset
y2 = dataset['Access']

# Access revoked prediction using CatBoostClassifier
kf = KFold(n_splits=5, shuffle=True, random_state=42)
catboost_model = CatBoostClassifier(iterations=1000, depth=5, learning_rate=0.1, loss_function='Logloss', random_seed=42, verbose=False)

predictions2 = np.zeros(len(X2))


In [9]:
for train_index, test_index in kf.split(X2):
    X2_train, X2_test = X2.iloc[train_index], X2.iloc[test_index]
    y2_train, y2_test = y2.iloc[train_index], y2.iloc[test_index]

    catboost_model.fit(X2_train, y2_train)
    fold_preds2 = catboost_model.predict(X2_test)
    fold_r2 = r2_score(y2_test, fold_preds2)
    print(f"R2 Score for this fold: {fold_r2}")

    predictions2[test_index] += fold_preds2

final_r2 = r2_score(y2, predictions2)
print(f"\nOverall R2 Score for Access Prediction: {final_r2}")

dataset['Access'] = np.random.randint(2, size=500)

dataset

R2 Score for this fold: -1.0
R2 Score for this fold: -1.437999159310635
R2 Score for this fold: -1.2321428571428563
R2 Score for this fold: -1.13365539452496
R2 Score for this fold: -1.1818181818181817

Overall R2 Score for Access Prediction: -1.1605531015940076


Unnamed: 0,campLocation,user,device,eventType,eventSeverity,mlRiskScore,Access
0,0,2,1,9,3,0.37,1
1,0,1,2,11,1,0.13,1
2,0,0,2,0,1,0.73,0
3,0,1,1,5,2,0.10,0
4,0,0,1,6,3,0.17,1
...,...,...,...,...,...,...,...
495,0,1,0,5,2,0.10,1
496,0,2,2,12,3,0.17,1
497,0,1,1,5,0,0.10,0
498,0,3,0,4,0,0.37,0
