# Marilyn: Load the libraries needed for data mining

In [49]:
# Data handling
import pandas as pd
import numpy as np

# Data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Models (choose based on what you're using)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans, DBSCAN  # for clustering (if needed)

# Model evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Association rules (optional, if you're doing market basket analysis)
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder


In [50]:
# Read the transformed intrusion dataset; the path is relative to the
# notebook's working directory.
data_path = "data/transformed/transformed_cybersecurity_intrusion_data.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,session_id,network_packet_size,protocol_type,login_attempts,session_duration,encryption_used,ip_reputation_score,failed_logins,browser_type,unusual_time_access,attack_detected
0,SID_00001,599,TCP,4,492.983263,DES,0.606818,1,Edge,0,1
1,SID_00002,472,TCP,3,1557.996461,DES,0.301569,0,Firefox,0,0
2,SID_00003,629,TCP,3,75.044262,DES,0.739164,2,Chrome,0,1
3,SID_00004,804,UDP,4,601.248835,DES,0.123267,0,Unknown,0,1
4,SID_00005,453,TCP,5,532.540888,AES,0.054874,1,Firefox,0,0


#### Marilyn: Inspecting the data to make sure it is ready for modeling and other data mining techniques

In [51]:
# Missing-value count per column. The printed Series is indexed by column
# name, so it also serves as the column listing (a bare `df.columns` placed
# before other statements is evaluated but never displayed).
print(df.isnull().sum())
# (rows, columns) of the loaded frame
print(df.shape)

session_id                0
network_packet_size       0
protocol_type             0
login_attempts            0
session_duration          0
encryption_used        1966
ip_reputation_score       0
failed_logins             0
browser_type              0
unusual_time_access       0
attack_detected           0
dtype: int64
(9537, 11)


In [52]:
# Build the working frame without touching `df`: drop the session identifier
# and give missing `encryption_used` entries the explicit label "None".
df_new = (
    df.drop(columns=["session_id"])
    .assign(encryption_used=lambda frame: frame["encryption_used"].fillna("None"))
)
# Confirm that no missing values remain
df_new.isnull().sum()


network_packet_size    0
protocol_type          0
login_attempts         0
session_duration       0
encryption_used        0
ip_reputation_score    0
failed_logins          0
browser_type           0
unusual_time_access    0
attack_detected        0
dtype: int64

In [53]:
# Show the dtype of every remaining column; `object` columns hold text values.
df_new.dtypes



network_packet_size      int64
protocol_type           object
login_attempts           int64
session_duration       float64
encryption_used         object
ip_reputation_score    float64
failed_logins            int64
browser_type            object
unusual_time_access      int64
attack_detected          int64
dtype: object

### Marilyn: Preprocessing the data for modeling

#### Marilyn: Separating the data into features and target variable

In [54]:
# Split the frame into the target vector and the feature matrix
# (every column except the target).
target_col = "attack_detected"
y = df_new[target_col]
X = df_new.drop(columns=[target_col])
print(X.columns)
X.head(2)



Index(['network_packet_size', 'protocol_type', 'login_attempts',
       'session_duration', 'encryption_used', 'ip_reputation_score',
       'failed_logins', 'browser_type', 'unusual_time_access'],
      dtype='object')


Unnamed: 0,network_packet_size,protocol_type,login_attempts,session_duration,encryption_used,ip_reputation_score,failed_logins,browser_type,unusual_time_access
0,599,TCP,4,492.983263,DES,0.606818,1,Edge,0
1,472,TCP,3,1557.996461,DES,0.301569,0,Firefox,0


In [None]:
# Columns treated as categories. `unusual_time_access` is already stored as
# int64; it is listed here so that it is handled as a category rather than
# as a continuous measurement.
categorical_cols = ['protocol_type', 'encryption_used', 'browser_type', 'unusual_time_access']

# Resolve the numeric columns BEFORE encoding and exclude the categorical
# ones by name. A dtype-based selection made after label encoding would also
# match the integer category codes and standardise them as if they were
# continuous values. Excluding by name also keeps the cell safe to re-run.
numeric_candidates = X.select_dtypes(include=['int64', 'float64']).columns
numerical_cols = numeric_candidates[~numeric_candidates.isin(categorical_cols)]

# Encoding categorical columns using Label Encoding.
# A separate encoder is fitted per column and kept in `encoders`, so each
# mapping can later be inverted or applied to new data. `le` ends up bound to
# the encoder of the last column.
# NOTE(review): label codes impose an arbitrary order on nominal categories;
# one-hot encoding may suit linear models better - confirm with the modelling cells.
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

# Scaling numerical features only (zero mean, unit variance).
# NOTE(review): the scaler is fitted on every row of X; if a train/test split
# follows, fit it on the training portion only to avoid leaking test
# statistics - confirm against the downstream cells.
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])