# Week 3: Feature Selection, PCR, and PLSR

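This notebook compares four regression strategies for predicting the severity level of cybersecurity incidents: an ordinary least-squares baseline on all predictors, forward feature selection, principal component regression (PCR), and partial least squares regression (PLSR). Severity is encoded as an ordinal number (Low = 0, Medium = 1, High = 2, Critical = 3), and the models are compared by mean squared error (MSE) on a held-out 20% test split.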


In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load data
# Keep only rows with a recorded severity label. `.copy()` makes the filtered
# frame independent, so the column assignment below cannot hit a view of the
# original frame (pandas SettingWithCopy behaviour).
df = pd.read_csv("cybersecurity_attacks.csv")
df = df[df["Severity Level"].notna()].copy()

# Encode the ordinal severity label as an integer regression target.
severity_map = {"Low": 0, "Medium": 1, "High": 2, "Critical": 3}
df["Severity_Level_Num"] = df["Severity Level"].map(severity_map)

# `.map` turns any label missing from `severity_map` into NaN, which would
# otherwise surface later as an opaque error inside the regressors.
unmapped_labels = df.loc[df["Severity_Level_Num"].isna(), "Severity Level"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Severity labels missing from severity_map: {sorted(map(str, unmapped_labels))}"
    )

# Select features and target
features = ["Packet Length", "Anomaly Scores", "Source Port", "Destination Port"]
X = df[features]
y = df["Severity_Level_Num"]

# Fail early, and name the offending columns, if any predictor has gaps.
missing_per_feature = X.isna().sum()
if missing_per_feature.any():
    raise ValueError(
        "Missing values in features: "
        f"{missing_per_feature[missing_per_feature > 0].to_dict()}"
    )

# Standardize features
# NOTE: this scaler is fit on every row, including rows that a later train/test
# split will hold out. For an unbiased evaluation, fit the scaler on the
# training rows only and use `X_scaled` for exploration rather than scoring.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [2]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

# Evaluation settings
TEST_SIZE = 0.2   # fraction of rows held out for the final comparison
SEED = 42         # fixes the train/test partition
CV_FOLDS = 5      # folds used to choose the number of PCR / PLSR components

# Train/test split
# Split the *unscaled* predictors and fit the scaler on the training rows only;
# scaling before the split lets test-set means/variances leak into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Candidate latent-dimension counts: 1 .. number of predictors.
component_grid = list(range(1, X_train.shape[1] + 1))

# Linear Regression (baseline on all predictors)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
lr_mse = mean_squared_error(y_test, y_pred_lr)

# Forward Selection
# Each model gets its own estimator so `lr` stays the fitted baseline instead
# of being silently refit on a different feature space.
sfs = SequentialFeatureSelector(
    LinearRegression(), direction='forward', n_features_to_select='auto'
)
sfs.fit(X_train, y_train)
X_train_sfs = sfs.transform(X_train)
X_test_sfs = sfs.transform(X_test)
lr_sfs = LinearRegression().fit(X_train_sfs, y_train)
y_pred_sfs = lr_sfs.predict(X_test_sfs)
sfs_mse = mean_squared_error(y_test, y_pred_sfs)

# PCR
# Keeping as many components as there are predictors only rotates the data,
# which reproduces the baseline fit exactly. The number of components is
# therefore chosen by cross-validation on the training set. Scaling sits
# inside the pipeline so every CV fold is standardized with its own statistics.
pcr_search = GridSearchCV(
    Pipeline([
        ("scale", StandardScaler()),
        ("pca", PCA()),
        ("ols", LinearRegression()),
    ]),
    param_grid={"pca__n_components": component_grid},
    scoring="neg_mean_squared_error",
    cv=CV_FOLDS,
)
pcr_search.fit(X_train_raw, y_train)
pcr_n_components = pcr_search.best_params_["pca__n_components"]
# Expose the fitted PCA and the projected data under their usual names.
pca = pcr_search.best_estimator_.named_steps["pca"]
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
y_pred_pcr = pcr_search.predict(X_test_raw)
pcr_mse = mean_squared_error(y_test, y_pred_pcr)

# PLSR
# Same idea: tune the number of latent components rather than hard-coding it.
pls_search = GridSearchCV(
    PLSRegression(),
    param_grid={"n_components": component_grid},
    scoring="neg_mean_squared_error",
    cv=CV_FOLDS,
)
pls_search.fit(X_train, y_train)
pls_n_components = pls_search.best_params_["n_components"]
pls = pls_search.best_estimator_
y_pred_pls = pls.predict(X_test)
pls_mse = mean_squared_error(y_test, y_pred_pls)


In [3]:
import pandas as pd

# Gather each model's test-set MSE into one comparison table (one row per model).
mse_by_model = {
    "Linear Regression": lr_mse,
    "Forward Selection": sfs_mse,
    "PCR": pcr_mse,
    "PLSR": pls_mse,
}
results = pd.DataFrame(list(mse_by_model.items()), columns=["Model", "MSE"])
results

Unnamed: 0,Model,MSE
0,Linear Regression,0.659429
1,Forward Selection,0.659161
2,PCR,0.659429
3,PLSR,0.659429
