In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv("cybersecurity_intrusion_data.csv")

# Display first few rows
print(df.head())


  session_id  network_packet_size protocol_type  login_attempts  \
0  SID_00001                  599           TCP               4   
1  SID_00002                  472           TCP               3   
2  SID_00003                  629           TCP               3   
3  SID_00004                  804           UDP               4   
4  SID_00005                  453           TCP               5   

   session_duration encryption_used  ip_reputation_score  failed_logins  \
0        492.983263             DES             0.606818              1   
1       1557.996461             DES             0.301569              0   
2         75.044262             DES             0.739164              2   
3        601.248835             DES             0.123267              0   
4        532.540888             AES             0.054874              1   

  browser_type  unusual_time_access  attack_detected  
0         Edge                    0                1  
1      Firefox                    0 

In [3]:
# Display column names
print(df.columns)


Index(['session_id', 'network_packet_size', 'protocol_type', 'login_attempts',
       'session_duration', 'encryption_used', 'ip_reputation_score',
       'failed_logins', 'browser_type', 'unusual_time_access',
       'attack_detected'],
      dtype='object')


In [4]:
# Check for missing values
print(df.isnull().sum())


session_id                0
network_packet_size       0
protocol_type             0
login_attempts            0
session_duration          0
encryption_used        1966
ip_reputation_score       0
failed_logins             0
browser_type              0
unusual_time_access       0
attack_detected           0
dtype: int64


In [6]:
df.loc[:, 'encryption_used'] = df['encryption_used'].fillna(df['encryption_used'].mode()[0])


In [7]:
# Check for any remaining missing values
print(df.isnull().sum())


session_id             0
network_packet_size    0
protocol_type          0
login_attempts         0
session_duration       0
encryption_used        0
ip_reputation_score    0
failed_logins          0
browser_type           0
unusual_time_access    0
attack_detected        0
dtype: int64


In [8]:
# Check data types
print(df.dtypes)


session_id              object
network_packet_size      int64
protocol_type           object
login_attempts           int64
session_duration       float64
encryption_used         object
ip_reputation_score    float64
failed_logins            int64
browser_type            object
unusual_time_access      int64
attack_detected          int64
dtype: object


In [9]:
df['protocol_type'] = df['protocol_type'].map({'TCP': 0, 'UDP': 1})


In [10]:
df['encryption_used'] = df['encryption_used'].map({'DES': 0, 'AES': 1})


In [11]:
df['browser_type'] = df['browser_type'].map({'Chrome': 0, 'Firefox': 1, 'Edge': 2, 'Unknown': 3})


In [12]:
print(df.head())  # Shows first few rows to confirm changes
print(df.dtypes)  # Ensures all categorical values are now numerical


  session_id  network_packet_size  protocol_type  login_attempts  \
0  SID_00001                  599            0.0               4   
1  SID_00002                  472            0.0               3   
2  SID_00003                  629            0.0               3   
3  SID_00004                  804            1.0               4   
4  SID_00005                  453            0.0               5   

   session_duration  encryption_used  ip_reputation_score  failed_logins  \
0        492.983263                0             0.606818              1   
1       1557.996461                0             0.301569              0   
2         75.044262                0             0.739164              2   
3        601.248835                0             0.123267              0   
4        532.540888                1             0.054874              1   

   browser_type  unusual_time_access  attack_detected  
0           2.0                    0                1  
1           1.0       

In [14]:
print(df[['protocol_type', 'browser_type']].isna().sum())  # Check NaNs
print(df[['protocol_type', 'browser_type']].isin([float('inf')]).sum())  # Check infinity values


protocol_type    507
browser_type     485
dtype: int64
protocol_type    0
browser_type     0
dtype: int64


In [17]:
df['protocol_type'] = df['protocol_type'].astype(int)
df['browser_type'] = df['browser_type'].astype(int)


In [18]:
print(df.dtypes)  # Ensure `protocol_type` and `browser_type` are int64
print(df[['protocol_type', 'browser_type']].head())  # Verify data structure


session_id              object
network_packet_size      int64
protocol_type            int32
login_attempts           int64
session_duration       float64
encryption_used          int64
ip_reputation_score    float64
failed_logins            int64
browser_type             int32
unusual_time_access      int64
attack_detected          int64
dtype: object
   protocol_type  browser_type
0              0             2
1              0             1
2              0             0
3              1             3
4              0             1


In [19]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[['network_packet_size', 'session_duration', 'ip_reputation_score']] = scaler.fit_transform(df[['network_packet_size', 'session_duration', 'ip_reputation_score']])


In [20]:
print(df[['network_packet_size', 'session_duration', 'ip_reputation_score']].describe())


       network_packet_size  session_duration  ip_reputation_score
count          9537.000000       9537.000000          9537.000000
mean              0.357437          0.110189             0.356737
std               0.162473          0.109398             0.192205
min               0.000000          0.000000             0.000000
25%               0.246519          0.032191             0.205521
50%               0.356265          0.077300             0.338773
75%               0.467649          0.153671             0.489141
max               1.000000          1.000000             1.000000


In [21]:
print(df[['network_packet_size', 'session_duration', 'ip_reputation_score']].describe())


       network_packet_size  session_duration  ip_reputation_score
count          9537.000000       9537.000000          9537.000000
mean              0.357437          0.110189             0.356737
std               0.162473          0.109398             0.192205
min               0.000000          0.000000             0.000000
25%               0.246519          0.032191             0.205521
50%               0.356265          0.077300             0.338773
75%               0.467649          0.153671             0.489141
max               1.000000          1.000000             1.000000


In [22]:
# Separate features (X) and target (y)
X = df.drop(columns=['session_id', 'attack_detected'])  # Removing session_id (not useful for ML)
y = df['attack_detected']


In [23]:
from sklearn.model_selection import train_test_split

# 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Define X (features) and y (target)
X = df.drop(columns=['session_id', 'attack_detected'])
y = df['attack_detected']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Naïve Bayes": GaussianNB(),
    "Neural Network": MLPClassifier(max_iter=500)
}

# Train & evaluate each model
accuracy_results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")

# Find the best model
best_model = max(accuracy_results, key=accuracy_results.get)
print(f"\nBest Performing Model: {best_model} with Accuracy: {accuracy_results[best_model]:.4f}")


Logistic Regression Accuracy: 0.7521
Decision Tree Accuracy: 0.8344
Random Forest Accuracy: 0.8962
SVM Accuracy: 0.8438
KNN Accuracy: 0.8417
Naïve Bayes Accuracy: 0.8260
Neural Network Accuracy: 0.8873

Best Performing Model: Random Forest with Accuracy: 0.8962


In [25]:
import joblib

# Save Random Forest model
joblib.dump(models['Random Forest'], 'cyber_intrusion_model.pkl')

# Load model later when needed
loaded_model = joblib.load('cyber_intrusion_model.pkl')
