#  Week 3 Capstone: Regularization & Feature Selection on Cybersecurity Datasets

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm


## BETH Dataset Analysis

In [12]:

import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Load the BETH dataset
beth_train_set = r"C:\Users\kegem\OneDrive\Datascience Masters\DS Summer 2025 SEMESTER 3\DX799S O1 Data Science Capstone (Summer 1 2025)\Beth DataSet\labelled_training_data.csv"
beth = pd.read_csv(beth_train_set)

# Drop problematic or high-cardinality columns
beth.drop(columns=['args', 'stackAddresses', 'hostName', 'processName', 'eventName', 'timestamp'], inplace=True, errors='ignore')

# Drop missing target labels
beth.dropna(subset=['eventId'], inplace=True)
beth.dropna(inplace=True)

# Separate target and features
y = pd.to_numeric(beth['eventId'], errors='coerce').astype(int)
X_raw = beth.drop(columns=['evil', 'eventId'], errors='ignore')

# Identify categorical columns with low cardinality for safe one-hot encoding
low_card_cols = [col for col in X_raw.select_dtypes(include='object').columns if X_raw[col].nunique() < 20]
X_encoded = pd.get_dummies(X_raw, columns=low_card_cols, drop_first=True)

# Drop any remaining object columns (likely high-cardinality)
X_encoded = X_encoded.select_dtypes(exclude='object')

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Fit Ridge and Lasso regression
ridge = RidgeCV(alphas=np.logspace(-3, 3, 100), cv=5).fit(X_scaled, y)
lasso = LassoCV(alphas=np.logspace(-3, 3, 100), cv=5, max_iter=10000).fit(X_scaled, y)

# Create a DataFrame for coefficients
coefs = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Ridge': ridge.coef_,
    'Lasso': lasso.coef_
})

# Print performance
print("R² Ridge:", ridge.score(X_scaled, y))
print("R² Lasso:", lasso.score(X_scaled, y))

# Display top 10 most influential features
top_lasso0 = coefs.reindex(coefs['Lasso'].abs().sort_values(ascending=False).index)
top_ridge0 = coefs.reindex(coefs['Ridge'].abs().sort_values(ascending=False).index)

print("\nTop 10 Influential Features by Lasso:")
print(top_lasso0.head(10))

print("\nTop 10 Influential Features by Ridge:")
print(top_ridge0.head(10))


R² Ridge: 0.4304687060032537
R² Lasso: 0.4301998332669419

Top 10 Influential Features by Lasso:
           Feature       Ridge       Lasso
5          argsNum  235.987814  234.292689
4   mountNamespace  -65.926731  -62.404625
6      returnValue  -28.467104  -26.094332
2  parentProcessId   -9.374198   -5.978298
0        processId   -6.803372   -3.908188
7              sus   -8.996285   -2.270552
1         threadId    1.566100   -0.080795
3           userId    5.092193   -0.000000

Top 10 Influential Features by Ridge:
           Feature       Ridge       Lasso
5          argsNum  235.987814  234.292689
4   mountNamespace  -65.926731  -62.404625
6      returnValue  -28.467104  -26.094332
2  parentProcessId   -9.374198   -5.978298
7              sus   -8.996285   -2.270552
0        processId   -6.803372   -3.908188
3           userId    5.092193   -0.000000
1         threadId    1.566100   -0.080795


In [3]:
# Forward Selection
numcols = X.shape[1]
cols_used = []
rsq_obtained = []
cols_unused = np.arange(numcols)

for _ in range(numcols):
    cur_col = None
    cur_rsq = -np.inf
    for col in cols_unused:
        test_cols = list(set(cols_used) | {col})
        rsq = sm.OLS(y, X[:, test_cols]).fit().rsquared
        if rsq > cur_rsq:
            cur_col = col
            cur_rsq = rsq
    cols_unused = list(set(cols_unused) - {cur_col})
    cols_used.append(cur_col)
    rsq_obtained.append(cur_rsq)

print("Forward Selection Order:", cols_used[:10])
print("R² Values:", [f"{x:.3f}" for x in rsq_obtained[:10]])


Forward Selection Order: [np.int64(5), np.int64(4), np.int64(6), np.int64(2), np.int64(1), np.int64(7), np.int64(3), np.int64(0)]
R² Values: ['0.256', '0.272', '0.275', '0.276', '0.276', '0.276', '0.276', '0.276']


In [4]:
# Backward Selection
cols_used = list(np.arange(numcols))
cols_removed = []
rsq_obtained = []

for _ in range(numcols - 1):
    cur_col = None
    cur_rsq = -np.inf
    for col in cols_used:
        test_cols = list(set(cols_used) - {col})
        rsq = sm.OLS(y, X[:, test_cols]).fit().rsquared
        if rsq > cur_rsq:
            cur_col = col
            cur_rsq = rsq
    cols_used = list(set(cols_used) - {cur_col})
    cols_removed.append(cur_col)
    rsq_obtained.append(cur_rsq)

print("Backward Selection Remaining:", cols_used[:10])
print("R² Values:", [f"{x:.3f}" for x in rsq_obtained[:10]])


Backward Selection Remaining: [np.int64(5)]
R² Values: ['0.276', '0.276', '0.276', '0.276', '0.275', '0.272', '0.256']


## CYBER Dataset Analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Load the Cybersecurity Attacks dataset from the defined path
cyber_path = r"C:\Users\kegem\OneDrive\Datascience Masters\DS Summer 2025 SEMESTER 3\DX799S O1 Data Science Capstone (Summer 1 2025)\Cybersecurity Attacks DataSets\cybersecurity_attacks.csv"
cyber_df = pd.read_csv(cyber_path)

# Drop rows with missing anomaly scores (our target variable)
cyber_df.dropna(subset=['Anomaly Scores'], inplace=True)

# Drop columns known to be high-cardinality or irrelevant to modeling
drop_cols = [
    'Timestamp', 'Source IP Address', 'Destination IP Address', 'Payload Data',
    'User Information', 'Device Information', 'Geo-location Data',
    'Proxy Information', 'Firewall Logs', 'IDS/IPS Alerts', 'Log Source'
]
cyber_df.drop(columns=[col for col in drop_cols if col in cyber_df.columns], inplace=True)

# Select only a subset of categorical columns to one-hot encode
low_card_cols = [col for col in cyber_df.select_dtypes(include='object').columns if cyber_df[col].nunique() < 20]
cyber_df = pd.get_dummies(cyber_df, columns=low_card_cols, drop_first=True)

# Drop any remaining rows with missing values
cyber_df.dropna(inplace=True)

# Define features and target
y = pd.to_numeric(cyber_df['Anomaly Scores'], errors='coerce').astype(int)
X = cyber_df.drop(columns=['Anomaly Scores'], errors='ignore')

# Standardize the features
X_scaled = StandardScaler().fit_transform(X)

# Fit Ridge and Lasso regression models with cross-validation
ridge = RidgeCV(alphas=np.logspace(-3, 3, 100), cv=5).fit(X_scaled, y)
lasso = LassoCV(alphas=np.logspace(-3, 3, 100), cv=5).fit(X_scaled, y)

# Print R² performance scores
print("R² Ridge:", ridge.score(X_scaled, y))
print("R² Lasso:", lasso.score(X_scaled, y))

# Identify top 10 most influential features from Lasso and Ridge
coefs = pd.DataFrame({
    'Feature': X.columns,
    'Ridge': ridge.coef_,
    'Lasso': lasso.coef_
})
top_lasso = coefs.reindex(coefs['Lasso'].abs().sort_values(ascending=False).index)
top_ridge = coefs.reindex(coefs['Ridge'].abs().sort_values(ascending=False).index)

print("\nTop 10 Influential Features by Lasso:")
print(top_lasso.head(10))

print("\nTop 10 Influential Features by Ridge:")
print(top_ridge.head(10))


R² Ridge: 0.00036333976298086323
R² Lasso: 9.259502058089897e-05

Top 10 Influential Features by Lasso:
                 Feature     Ridge    Lasso
6       Traffic Type_FTP -0.348084 -0.13018
0            Source Port  0.134826  0.00000
1       Destination Port -0.099901 -0.00000
3           Protocol_TCP  0.049704  0.00000
2          Packet Length -0.100596 -0.00000
4           Protocol_UDP -0.055639 -0.00000
5       Packet Type_Data -0.170402 -0.00000
7      Traffic Type_HTTP  0.003221  0.00000
8  Attack Type_Intrusion -0.113304 -0.00000
9    Attack Type_Malware -0.049484  0.00000

Top 10 Influential Features by Ridge:
                      Feature     Ridge    Lasso
6            Traffic Type_FTP -0.348084 -0.13018
14      Severity Level_Medium -0.220281 -0.00000
5            Packet Type_Data -0.170402 -0.00000
15  Network Segment_Segment B -0.141655 -0.00000
0                 Source Port  0.134826  0.00000
8       Attack Type_Intrusion -0.113304 -0.00000
2               Packet Length 

In [None]:
numcols = X.shape[1]
cols_used = []
rsq_obtained = []
cols_unused = np.arange(numcols)

for _ in range(min(numcols, 20)):
    cur_col = None
    cur_rsq = -np.inf
    for col in cols_unused:
        test_cols = list(set(cols_used) | {col})
        rsq = sm.OLS(y, X[:, test_cols]).fit().rsquared
        if rsq > cur_rsq:
            cur_col = col
            cur_rsq = rsq
    cols_unused = list(set(cols_unused) - {cur_col})
    cols_used.append(cur_col)
    rsq_obtained.append(cur_rsq)

print("Forward Selection Top 10:", cols_used[:10])
print("R² Forward:", [f"{x:.3f}" for x in rsq_obtained[:10]])


In [None]:
cols_used = list(np.arange(numcols))
cols_removed = []
rsq_obtained = []

for _ in range(min(numcols - 1, 20)):
    cur_col = None
    cur_rsq = -np.inf
    for col in cols_used:
        test_cols = list(set(cols_used) - {col})
        rsq = sm.OLS(y, X[:, test_cols]).fit().rsquared
        if rsq > cur_rsq:
            cur_col = col
            cur_rsq = rsq
    cols_used = list(set(cols_used) - {cur_col})
    cols_removed.append(cur_col)
    rsq_obtained.append(cur_rsq)

print("Backward Selection Remaining 10:", cols_used[:10])
print("R² Backward:", [f"{x:.3f}" for x in rsq_obtained[:10]])


##  UNSW Dataset Analysis

In [13]:
unsw_train_path = r"C:\Users\kegem\OneDrive\Datascience Masters\DS Summer 2025 SEMESTER 3\DX799S O1 Data Science Capstone (Summer 1 2025)\Network Security DataSet\CSV Files\Training and Testing Sets\UNSW_NB15_training-set.csv"


unsw_df = pd.read_csv(unsw_train_path)
unsw_df.dropna(subset=['label'], inplace=True)
unsw_df = pd.get_dummies(unsw_df.drop(columns=['id', 'proto', 'service', 'state', 'attack_cat'] if 'unsw'=='unsw' else [], errors='ignore'), drop_first=True)
unsw_df.dropna(inplace=True)

y = pd.to_numeric(unsw_df['label'], errors='coerce').astype(int)
X = StandardScaler().fit_transform(unsw_df.drop(columns=['label'], errors='ignore'))

ridge = RidgeCV(alphas=np.logspace(-3, 3, 100), cv=5).fit(X, y)
lasso = LassoCV(alphas=np.logspace(-3, 3, 100), cv=5).fit(X, y)

print("R² Ridge:", ridge.score(X, y))
print("R² Lasso:", lasso.score(X, y))


top_lasso1 = coefs.reindex(coefs['Lasso'].abs().sort_values(ascending=False).index)
top_ridge1 = coefs.reindex(coefs['Ridge'].abs().sort_values(ascending=False).index)

print("\nTop 10 Influential Features by Lasso:")
print(top_lasso1.head(10))

print("\nTop 10 Influential Features by Ridge:")
print(top_ridge1.head(10))



R² Ridge: 0.682015115929326
R² Lasso: 0.6758962489020018

Top 10 Influential Features by Lasso:
           Feature       Ridge       Lasso
5          argsNum  235.987814  234.292689
4   mountNamespace  -65.926731  -62.404625
6      returnValue  -28.467104  -26.094332
2  parentProcessId   -9.374198   -5.978298
0        processId   -6.803372   -3.908188
7              sus   -8.996285   -2.270552
1         threadId    1.566100   -0.080795
3           userId    5.092193   -0.000000

Top 10 Influential Features by Ridge:
           Feature       Ridge       Lasso
5          argsNum  235.987814  234.292689
4   mountNamespace  -65.926731  -62.404625
6      returnValue  -28.467104  -26.094332
2  parentProcessId   -9.374198   -5.978298
7              sus   -8.996285   -2.270552
0        processId   -6.803372   -3.908188
3           userId    5.092193   -0.000000
1         threadId    1.566100   -0.080795


In [7]:
numcols = X.shape[1]
cols_used = []
rsq_obtained = []
cols_unused = np.arange(numcols)

for _ in range(min(numcols, 20)):
    cur_col = None
    cur_rsq = -np.inf
    for col in cols_unused:
        test_cols = list(set(cols_used) | {col})
        rsq = sm.OLS(y, X[:, test_cols]).fit().rsquared
        if rsq > cur_rsq:
            cur_col = col
            cur_rsq = rsq
    cols_unused = list(set(cols_unused) - {cur_col})
    cols_used.append(cur_col)
    rsq_obtained.append(cur_rsq)

print("Forward Selection Top 10:", cols_used[:10])
print("R² Forward:", [f"{x:.3f}" for x in rsq_obtained[:10]])


Forward Selection Top 10: [np.int64(6), np.int64(28), np.int64(7), np.int64(16), np.int64(19), np.int64(38), np.int64(21), np.int64(31), np.int64(37), np.int64(24)]
R² Forward: ['0.153', '0.161', '0.171', '0.183', '0.193', '0.199', '0.204', '0.207', '0.209', '0.212']


In [8]:
cols_used = list(np.arange(numcols))
cols_removed = []
rsq_obtained = []

for _ in range(min(numcols - 1, 20)):
    cur_col = None
    cur_rsq = -np.inf
    for col in cols_used:
        test_cols = list(set(cols_used) - {col})
        rsq = sm.OLS(y, X[:, test_cols]).fit().rsquared
        if rsq > cur_rsq:
            cur_col = col
            cur_rsq = rsq
    cols_used = list(set(cols_used) - {cur_col})
    cols_removed.append(cur_col)
    rsq_obtained.append(cur_rsq)

print("Backward Selection Remaining 10:", cols_used[:10])
print("R² Backward:", [f"{x:.3f}" for x in rsq_obtained[:10]])


Backward Selection Remaining 10: [np.int64(0), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(9), np.int64(10), np.int64(11), np.int64(16), np.int64(19)]
R² Backward: ['0.218', '0.218', '0.218', '0.218', '0.218', '0.218', '0.218', '0.218', '0.218', '0.218']
