In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from datetime import datetime, timedelta
import kagglehub
import os

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Identifiers for the Kaggle dataset and the CSV expected inside it
# (adjust CSV_NAME if the dataset's file name changes).
DATASET_HANDLE = "teamincribo/cyber-security-attacks"
CSV_NAME = "cybersecurity_attacks.csv"

# Download the latest version of the dataset; `path` is the local directory
# that holds the downloaded files.
path = kagglehub.dataset_download(DATASET_HANDLE)

# Build the full path to the CSV and load it into the DataFrame that the
# modelling cells below read from.
csv_file = os.path.join(path, CSV_NAME)
df = pd.read_csv(csv_file)

Downloading from https://www.kaggle.com/api/v1/datasets/download/teamincribo/cyber-security-attacks?dataset_version_number=20...


100%|██████████| 5.03M/5.03M [00:00<00:00, 78.1MB/s]

Extracting files...





In [11]:
# Quick schema check: column names, non-null counts and dtypes. Used below to
# decide which columns to drop and which are treated as categorical vs numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Timestamp               40000 non-null  object 
 1   Source IP Address       40000 non-null  object 
 2   Destination IP Address  40000 non-null  object 
 3   Source Port             40000 non-null  int64  
 4   Destination Port        40000 non-null  int64  
 5   Protocol                40000 non-null  object 
 6   Packet Length           40000 non-null  int64  
 7   Packet Type             40000 non-null  object 
 8   Traffic Type            40000 non-null  object 
 9   Payload Data            40000 non-null  object 
 10  Malware Indicators      20000 non-null  object 
 11  Anomaly Scores          40000 non-null  float64
 13  Attack Type             40000 non-null  object 
 14  Attack Signature        40000 non-null  object 
 15  Action Taken            40000 non-null

*Note: Unfortunately, due to the nature of the datasets for my project (Network Security), linear models will likely not perform well on them. More complex methods, such as neural networks, will be required to predict the target values effectively. For the sake of the assignment, I will still run the linear regression models on one of my datasets.*

**Lasso Regression**

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

In [14]:
# Columns excluded from modelling: unlikely to be useful, or mostly unstructured text
columns_to_drop = [
    'Timestamp',
    'Payload Data',
    'Malware Indicators',
    'Alerts/Warnings',
    'User Information',
    'Device Information',
    'Geo-location Data',
    'Proxy Information',
    'Firewall Logs',
    'IDS/IPS Alerts',
]
df_model = df.drop(columns=columns_to_drop)

# The anomaly score is the regression target; every remaining column is a feature
target_col = 'Anomaly Scores'
y = df_model[target_col]
X = df_model.drop(columns=[target_col])

# Split feature names by dtype so each group gets its own preprocessing.
# NOTE(review): the source/destination IP address columns are not dropped above,
# so they are one-hot encoded as categoricals. If they are near-unique per row
# (TODO confirm), they add many columns that carry no signal for unseen rows.
categorical_cols = list(X.select_dtypes(include='object').columns)
numeric_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Numeric branch: mean-impute, then standardise
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode-impute, then one-hot encode
# (categories unseen during fit are ignored at predict time)
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

# Preprocessing + Lasso (alpha is the L1 regularisation strength)
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('lasso', Lasso(alpha=0.1)),
])

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training split only, then score on the held-out split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.4f}')

Mean Squared Error: 835.3243


In [15]:
import numpy as np

# Derive the RMSE from the test-set `mse` computed by the Lasso cell instead of
# a rounded literal copied from its printed output, so this value stays in sync
# if the data, the split or the model changes.
rmse = np.sqrt(mse)
print(f'RMSE: {rmse:.2f}')

RMSE: 28.90


In [16]:
# Summarise the spread of the target so the RMSE above has a scale to be
# compared against.
score_min, score_max = y.min(), y.max()
score_std = y.std()

print("Anomaly Score range:", score_min, "to", score_max)
print("Standard deviation:", score_std)

Anomaly Score range: 0.0 to 100.0
Standard deviation: 28.853598250518676


Because the RMSE is nearly equal to the standard deviation of the target, the model appears to be little better than simply predicting the mean anomaly score for every instance. The categorical features are already one-hot encoded in the pipeline; handling them differently might improve things further, but I believe it still won't help much, as linear regression methods may not be well suited to this dataset.

**Ridge Regression**

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import numpy as np

In [18]:
# Columns excluded from modelling (high-cardinality or complex free-form fields)
columns_to_drop = [
    'Timestamp',
    'Payload Data',
    'Malware Indicators',
    'Alerts/Warnings',
    'User Information',
    'Device Information',
    'Geo-location Data',
    'Proxy Information',
    'Firewall Logs',
    'IDS/IPS Alerts',
]
df_model = df.drop(columns=columns_to_drop)

# Target vector and feature matrix
target_col = 'Anomaly Scores'
y = df_model[target_col]
X = df_model.drop(columns=[target_col])

# Group feature names by dtype.
# NOTE(review): the IP address columns are still present here and fall into
# categorical_cols; whether they are high-cardinality should be checked
# (TODO confirm), since the comment above says such columns are dropped.
categorical_cols = list(X.select_dtypes(include='object').columns)
numeric_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Numeric features: fill gaps with the column mean, then standardise
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the mode, then one-hot encode
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

# Preprocessing followed by Ridge regression (alpha is a tunable L2 penalty)
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('ridge', Ridge(alpha=1.0)),
])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train, then evaluate on the held-out rows
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")

Mean Squared Error: 835.97
Root Mean Squared Error: 28.91


The results are similar to those of Lasso regression. This could be due to underfitting, or because no true linear relationships are present in this dataset.

**Elastic Net Regression**

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import numpy as np

In [20]:
# Columns excluded from modelling (unstructured text or high-cardinality fields)
columns_to_drop = [
    'Timestamp',
    'Payload Data',
    'Malware Indicators',
    'Alerts/Warnings',
    'User Information',
    'Device Information',
    'Geo-location Data',
    'Proxy Information',
    'Firewall Logs',
    'IDS/IPS Alerts',
]
df_model = df.drop(columns=columns_to_drop)

# Regression target and the remaining feature columns
target_col = 'Anomaly Scores'
y = df_model[target_col]
X = df_model.drop(columns=[target_col])

# Partition feature names by dtype for the two preprocessing branches.
# NOTE(review): the IP address columns are kept and treated as categoricals;
# verify their cardinality (TODO confirm) before relying on the one-hot output.
categorical_cols = list(X.select_dtypes(include='object').columns)
numeric_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Numeric branch: mean imputation + standard scaling
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation + one-hot encoding
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

# Preprocessing followed by Elastic Net (l1_ratio=0.5 mixes L1 and L2 equally)
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('elasticnet', ElasticNet(alpha=1.0, l1_ratio=0.5)),
])

# Same 80/20 split and seed as the other models, so the scores are comparable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows and score on the held-out rows
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")

Mean Squared Error: 835.28
Root Mean Squared Error: 28.90


It appears that linear regression models perform poorly on this dataset, which makes sense given that the data is random network traffic. More complex methods, such as neural networks, are required to make effective predictions for this dataset.