In [1]:
import numpy as np
import pandas as pd

# Display every column of a DataFrame instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

## importing dataset

In [2]:
# Location of the log dataset on the Kaggle input mount; change this one
# constant when running the notebook somewhere else.
DATASET_PATH = "/kaggle/input/random-2/Log Dataset.csv"

dataset = pd.read_csv(DATASET_PATH)
dataset

Unnamed: 0,Timestamp,Source,SourceClass,Destination,DestinationClass,User,Device,EventType,Description,Severity,MLRiskScore
0,2023-09-17T01:42:31,192.168.10.5,safe,10.0.0.101,safe,user123,Workstation123,application-usage,User 'user123' used application 'App1',informational,0.13
1,2023-09-17T01:42:31,10.1.1.1,safe,192.168.10.6,safe,anonymous,ServerABC,system-shutdown,System shutdown: Reason2,critical,0.33
2,2023-09-17T01:42:31,10.0.0.2,malicious,10.0.0.3,safe,admin,DeviceXYZ,auth-lockout,User 'admin' locked out after multiple failed ...,informational,0.63
3,2023-09-17T01:42:31,31.15.164.90:27783,malicious,192.168.10.6,safe,user123,Workstation123,system-failure,Critical system failure: Error message: 78,error,0.63
4,2023-09-17T01:42:31,10.0.0.2,malicious,10.0.0.101,safe,guest,DeviceXYZ,file-access,File access: Read to file '/path/to/file120.tx...,error,0.50
...,...,...,...,...,...,...,...,...,...,...,...
495,2023-09-17T01:42:31,192.168.10.5,malicious,166.127.213.176,malicious,anonymous,DeviceXYZ,auth-success,Successful login for user 'anonymous' from IP ...,informational,0.63
496,2023-09-17T01:42:31,192.168.1.100,safe,192.168.10.6,safe,anonymous,DeviceXYZ,system-shutdown,System shutdown: Reason2,error,0.33
497,2023-09-17T01:42:31,172.16.0.10,safe,192.168.10.6,safe,anonymous,Workstation123,system-shutdown,System shutdown: Reason6,critical,0.33
498,2023-09-17T01:42:31,10.0.0.100,safe,192.168.2.6,safe,guest,DeviceXYZ,application-errors,Application error: Error message: 96,warning,0.27


## Dropping columns that are not used as model features

In [3]:
# Remove the raw address, timestamp and free-text columns; none of them is
# used as a model feature below.
#
# The frame is rebound instead of mutated with `inplace=True`, and
# `errors='ignore'` skips columns that are already gone, so this cell can be
# re-run without raising a KeyError.
dataset = dataset.drop(
    columns=['Source', 'Destination', 'Timestamp', 'Description'],
    errors='ignore',
)

## Label-encoding the SourceClass and DestinationClass columns (malicious → 0, safe → 1)

In [4]:
from sklearn.preprocessing import LabelEncoder

columns = ['SourceClass', 'DestinationClass']

# Fit ONE encoder on the labels found in both columns, then apply it to each.
# Re-fitting per column (`apply(le.fit_transform)`) only yields matching codes
# when every column happens to contain every label, and it leaves `le`
# describing just the last column, so `le.inverse_transform` could not be
# trusted for the others.
le = LabelEncoder()
le.fit(pd.unique(dataset[columns].to_numpy().ravel()))

# LabelEncoder numbers its sorted classes from 0, so with the labels
# 'malicious' and 'safe' the codes are malicious -> 0 and safe -> 1.
dataset[columns] = dataset[columns].apply(le.transform)
dataset

Unnamed: 0,SourceClass,DestinationClass,User,Device,EventType,Severity,MLRiskScore
0,1,1,user123,Workstation123,application-usage,informational,0.13
1,1,1,anonymous,ServerABC,system-shutdown,critical,0.33
2,0,1,admin,DeviceXYZ,auth-lockout,informational,0.63
3,0,1,user123,Workstation123,system-failure,error,0.63
4,0,1,guest,DeviceXYZ,file-access,error,0.50
...,...,...,...,...,...,...,...
495,0,0,anonymous,DeviceXYZ,auth-success,informational,0.63
496,1,1,anonymous,DeviceXYZ,system-shutdown,error,0.33
497,1,1,anonymous,Workstation123,system-shutdown,critical,0.33
498,1,1,guest,DeviceXYZ,application-errors,warning,0.27


In [5]:
def encode_column(column, mapping):
    """Return `column` with every label in `mapping` replaced by its numeric code.

    Parameters
    ----------
    column : pd.Series
        The categorical column to encode.
    mapping : dict
        Maps each expected label to the number that should replace it.

    Returns
    -------
    pd.Series
        A new series holding the numeric codes (the input is not modified).

    Raises
    ------
    ValueError
        If the column holds a non-null value that is neither a key nor a code
        of `mapping`. Failing here is clearer than letting a stray string
        reach the regressors.

    Notes
    -----
    Values that already equal one of the codes pass through unchanged, so the
    cell stays safe to re-run. `Series.map` is used instead of
    `Series.replace(dict)` because the latter relies on an object-to-numeric
    downcast that pandas 2.2 deprecates.
    """
    accepted = set(mapping) | set(mapping.values())
    unexpected = set(column.dropna().unique()) - accepted
    if unexpected:
        raise ValueError(
            f"Column '{column.name}' has values with no encoding: "
            f"{sorted(unexpected, key=str)}"
        )
    return column.map(lambda value: mapping.get(value, value))


## Severity Column
# NOTE(review): "warning" (0.7) is scored above "error" (0.4) -- confirm this
# ordering is intentional; the values are kept exactly as they were.
event_severity_threat = {"informational": 0.1, "warning": 0.7, "error": 0.4, "critical": 0.9}

## EventType Column
event_type_threat = {
    "auth-failed": 0.8,
    "auth-success": 0.1,
    "auth-lockout": 0.9,
    "network-connected": 0.2,
    "network-disconnected": 0.2,
    "firewall-change": 0.7,
    "dns-queries": 0.3,
    "malware-detection": 0.9,
    "system-shutdown": 0.8,
    "system-restart": 0.7,
    "system-failure": 0.9,
    "application-errors": 0.6,
    "application-usage": 0.2,
    "api-called": 0.4,
    "file-access": 0.5,
    "permission-changes": 0.3,
    "software-update": 0.6
}

## Device Column
devices = {"Workstation123": 1, "DeviceXYZ": 2, "ServerABC": 3}

## User Column
users = {"user123": 1, "guest": 2, "admin": 3, "anonymous":4}

# One pass over all four columns instead of four copy-pasted statements.
column_encodings = {
    'Severity': event_severity_threat,
    'EventType': event_type_threat,
    'Device': devices,
    'User': users,
}
for column_name, mapping in column_encodings.items():
    dataset[column_name] = encode_column(dataset[column_name], mapping)

dataset

Unnamed: 0,SourceClass,DestinationClass,User,Device,EventType,Severity,MLRiskScore
0,1,1,1,1,0.2,0.1,0.13
1,1,1,4,3,0.8,0.9,0.33
2,0,1,3,2,0.9,0.1,0.63
3,0,1,1,1,0.9,0.4,0.63
4,0,1,2,2,0.5,0.4,0.50
...,...,...,...,...,...,...,...
495,0,0,4,2,0.1,0.1,0.63
496,1,1,4,2,0.8,0.4,0.33
497,1,1,4,1,0.8,0.9,0.33
498,1,1,2,2,0.6,0.7,0.27


## Splitting into features (X) and target (y)

In [6]:
# Positional split: every column except the last is a feature, and the last
# column is the regression target.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

## importing catboost regressor

In [7]:
from catboost import CatBoostRegressor

# First base learner of the stack; verbose=False keeps the training log quiet.
cat_model = CatBoostRegressor(verbose=False)

## importing lightgbm regressor

In [8]:
import lightgbm as lgb

# Second base learner of the stack, left at the library defaults.
lgb_model = lgb.LGBMRegressor()

## importing xgboost regressor

In [9]:
from xgboost import XGBRegressor

# Third base learner of the stack, left at the library defaults.
xgb_model = XGBRegressor()

## stacking these models together

In [10]:
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor

# Meta-learner: a four-hidden-layer MLP that combines the base models'
# predictions; random_state pins its weight initialisation.
meta_regressor = MLPRegressor(
    hidden_layer_sizes=(400, 200, 100, 50),
    activation='relu',
    solver='adam',
    random_state=42,
)

# (name, estimator) pairs in the form StackingRegressor expects.
base_regressors = [
    ('lightgbm', lgb_model),
    ('catboost', cat_model),
    ('xgboost', xgb_model),
]

model = StackingRegressor(
    estimators=base_regressors,
    final_estimator=meta_regressor,
)
model

## Evaluating the stacked model with 5-fold cross-validation

In [11]:
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions: each row is filled in exactly once, by the fold in
# which that row belongs to the held-out part.
predictions = np.zeros(X.shape[0])

for train_index, test_index in kf.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Refit the stack on this fold's training rows, then score the held-out rows.
    # NOTE(review): after the loop `model` is fitted on the last fold's
    # training rows only, not on the full data.
    fold_preds = model.fit(X_train, y_train).predict(X_test)
    fold_r2 = r2_score(y_test, fold_preds)
    print(f"R2 Score for this fold: {fold_r2}")

    predictions[test_index] = fold_preds

# R2 over all rows, computed from the out-of-fold predictions.
final_r2 = r2_score(y, predictions)
print(f"\nOverall R2 Score: {final_r2}")


R2 Score for this fold: 0.9947904819895304
R2 Score for this fold: 0.9963572563928428
R2 Score for this fold: 0.9952153360158307
R2 Score for this fold: 0.9953399556347511
R2 Score for this fold: 0.9970876556901683

Overall R2 Score: 0.995817550654829
