# 1) Data Exploration & Understanding

Import Data and understand the structures, features and variables 

In [None]:
import pandas as pd

df = pd.read_csv('GUIDE_Train.csv')

# Display basic information about the dataset
print(df.shape)
print(df.columns)
print(df.info())
df.head()

In [None]:
# Check the distribution of the target variable 'IncidentGrade'
# This is crucial for understanding class imbalance.
df['IncidentGrade'].value_counts()

# 2) Data Preprocessing

##### Handling Missing Data

In [None]:
df.isna().sum()

From above we have seen that few columns have more than 50% null values.   

Analyze missing values in specific columns to see if they correlate with any 'IncidentGrade'.  
This analysis helps in making an informed decision about dropping columns instead of imputation.

In [None]:
false_positive_nulls = df[(df['MitreTechniques'].isnull()) & 
                          (df['IncidentGrade'] == "FalsePositive")]

false_positive_nulls.shape

In [None]:
null_rows = df[df['MitreTechniques'].isnull()]
null_rows['IncidentGrade'].value_counts()

In [None]:
x_list = ['MitreTechniques',  'ActionGrouped', 'ActionGranular', 'EmailClusterId', 'ThreatFamily', 'ResourceType', 'Roles', 'AntispamDirection', 'SuspicionLevel', 'LastVerdict']

for column in x_list:
    print(column)
    null_rows = df[df[column].isnull()]
    print(null_rows['IncidentGrade'].value_counts())

From above we can clearly see that, the columns which have more null values are not related to a single incident grade.  
So we can continue with dropping the columns.

In [None]:
x_list = ['MitreTechniques',  'ActionGrouped', 'ActionGranular', 'EmailClusterId', 'ThreatFamily', 'ResourceType', 'Roles', 'AntispamDirection', 'SuspicionLevel', 'LastVerdict']

df.drop(columns = x_list, inplace = True)
df.shape

In [None]:
df.isna().sum()

In [None]:
df = df.dropna()

In [None]:
df.isna().sum()

##### Encoding

Convert categorical features and the target variable into a numerical format that machine learning models can process.

In [None]:
df.info()

In [None]:
for column in df.columns:
    print(column, df[column].nunique())

In [None]:
category_columns = ['Category', 'EntityType', 'EvidenceRole']

for column in category_columns:
    print(df[column].value_counts())

Apply Label Encoding to the target variable and selected categorical features.

In [None]:
from sklearn.preprocessing import LabelEncoder

category_columns = ['Category', 'EntityType', 'EvidenceRole']

# Encode target variable
label_encoding = LabelEncoder()
df['IncidentGrade'] = label_encoding.fit_transform(df['IncidentGrade'])

for col in category_columns:
    df[col] = LabelEncoder().fit_transform(df[col])

In [None]:
df.shape

In [None]:
df.info()

##### Feature Engineering

Create a new feature 'Month' from the 'Timestamp' column to capture potential time-based patterns.

In [None]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')

# Extract month number (1–12)
df['Month'] = df['Timestamp'].dt.month

df.drop(['Timestamp'], axis=1, inplace=True)

##### EDA

Visualize the data to gain insights into feature distributions, relationships, and potential issues like class imbalance.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

print(df['IncidentGrade'].value_counts())
print(df['IncidentGrade'].value_counts(normalize=True) * 100)

sns.countplot(x="IncidentGrade", data=df)
plt.title("Incident Grade Distribution")
plt.show()


In [None]:
for col in ['Category','EntityType','EvidenceRole']:
    print(f"\nValue counts for {col}:")
    print(df[col].value_counts().head(10))
    sns.countplot(y=col, data=df, order=df[col].value_counts().head(10).index)
    plt.title(f"Top 10 values in {col}")
    plt.show()

In [None]:
num_cols = df.select_dtypes(include=['int64','float64']).columns

df[num_cols].describe().T  # summary statistics

for col in num_cols:
    sns.histplot(df[col].dropna(), kde=True)
    plt.title(f"Distribution of {col}")
    plt.show()

In [None]:
sns.countplot(x="IncidentGrade", hue="EvidenceRole", data=df)
plt.title("Incident Grade vs EvidenceRole")
plt.show()

sns.countplot(x="IncidentGrade", hue="EntityType", data=df)
plt.title("Incident Grade vs EntityType")
plt.show()

In [None]:
corr = df[num_cols].corr()
plt.figure(figsize=(12,8))
sns.heatmap(corr, annot=False, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()


# 3) Data Splitting

In [None]:
import pandas as pd

df = pd.read_csv('prepped_dataset.csv')

Prepare the data for model training and evaluation by splitting it into training and validation sets.

In [3]:
from sklearn.model_selection import train_test_split

X = df.drop('IncidentGrade', axis=1)
y = df['IncidentGrade']

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,  
    stratify=y         # Ensures same proportion of classes in train and test
)

# 4) Model Selection & Training

##### Baseline Model

Train a simple Logistic Regression model to establish a performance baseline.  
This baseline will be used to benchmark more complex models like XGBoost.

1) Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

# Initialize Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')  # class_weight optional

# Train the model
lr_model.fit(X_train, y_train)

# Predict on test set
y_pred = lr_model.predict(X_val)

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

# Classification report (includes precision, recall, f1 for each class)
print("\nClassification Report:\n", classification_report(y_val, y_pred))

# Macro-averaged metrics
macro_f1 = f1_score(y_val, y_pred, average='macro')
precision = precision_score(y_val, y_pred, average='weighted') 
recall = recall_score(y_val, y_pred, average='weighted')

print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Confusion Matrix:
 [[742996 160585 329665]
 [323957  99616 186017]
 [410117 103344 483353]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.60      0.55   1233246
           1       0.27      0.16      0.20    609590
           2       0.48      0.48      0.48    996814

    accuracy                           0.47   2839650
   macro avg       0.42      0.42      0.41   2839650
weighted avg       0.45      0.47      0.45   2839650

Macro F1 Score: 0.4125
Precision: 0.4471
Recall: 0.4669


2) Decision Tree 

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report


# Initialize Decision Tree
dt_model = DecisionTreeClassifier(random_state=42, class_weight='balanced')

# Train the model
dt_model.fit(X_train, y_train)

# Predict on validation set
y_pred = dt_model.predict(X_val)

# Confusion matrix
print("Decision Tree - Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

# Classification report
print("\nDecision Tree - Classification Report:\n", classification_report(y_val, y_pred))

# Macro-averaged F1
macro_f1 = f1_score(y_val, y_pred, average='macro')

# Regular precision and recall
precision = precision_score(y_val, y_pred, average='weighted')
recall = recall_score(y_val, y_pred, average='weighted')

print(f"Decision Tree - Macro F1 Score: {macro_f1:.4f}")
print(f"Decision Tree - Precision: {precision:.4f}")
print(f"Decision Tree - Recall: {recall:.4f}")


Decision Tree - Confusion Matrix:
 [[1228372    2814    2060]
 [   3021  604303    2266]
 [   2128    2147  992539]]

Decision Tree - Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1233246
           1       0.99      0.99      0.99    609590
           2       1.00      1.00      1.00    996814

    accuracy                           0.99   2839650
   macro avg       0.99      0.99      0.99   2839650
weighted avg       0.99      0.99      0.99   2839650

Decision Tree - Macro F1 Score: 0.9944
Decision Tree - Precision: 0.9949
Decision Tree - Recall: 0.9949


##### Advanced Models

1) Random Forest 

In [1]:
import pandas as pd
df = pd.read_csv('prepped_dataset.csv')

In [2]:
from sklearn.model_selection import train_test_split

X = df.drop('IncidentGrade', axis=1)
y = df['IncidentGrade']

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

# Initialize Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Train the model
rf_model.fit(X_train, y_train)

# Predict on validation set
y_pred = rf_model.predict(X_val)

# Confusion matrix
print("Random Forest - Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

# Classification report
print("\nRandom Forest - Classification Report:\n", classification_report(y_val, y_pred))

# Macro F1 score
macro_f1 = f1_score(y_val, y_pred, average='macro')

# Weighted precision and recall
precision = precision_score(y_val, y_pred, average='weighted')
recall = recall_score(y_val, y_pred, average='weighted')

print(f"Random Forest - Macro F1 Score: {macro_f1:.4f}")
print(f"Random Forest - Precision: {precision:.4f}")
print(f"Random Forest - Recall: {recall:.4f}")

import joblib
joblib.dump(rf_model, "rf_model.pkl")

Random Forest - Confusion Matrix:
 [[1220267    7662    5317]
 [  13745  590129    5716]
 [  13405    6552  976857]]

Random Forest - Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98   1233246
           1       0.98      0.97      0.97    609590
           2       0.99      0.98      0.98    996814

    accuracy                           0.98   2839650
   macro avg       0.98      0.98      0.98   2839650
weighted avg       0.98      0.98      0.98   2839650

Random Forest - Macro F1 Score: 0.9802
Random Forest - Precision: 0.9816
Random Forest - Recall: 0.9815


['rf_model.pkl']

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

# Initialize Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Train the model
rf_model.fit(X_train, y_train)

# Predict on validation set
y_pred = rf_model.predict(X_val)

# Confusion matrix
print("Random Forest - Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

# Classification report
print("\nRandom Forest - Classification Report:\n", classification_report(y_val, y_pred))

# Macro F1 score
macro_f1 = f1_score(y_val, y_pred, average='macro')

# Weighted precision and recall
precision = precision_score(y_val, y_pred, average='weighted')
recall = recall_score(y_val, y_pred, average='weighted')

print(f"Random Forest - Macro F1 Score: {macro_f1:.4f}")
print(f"Random Forest - Precision: {precision:.4f}")
print(f"Random Forest - Recall: {recall:.4f}")


Random Forest - Confusion Matrix:
 [[1220267    7662    5317]
 [  13745  590129    5716]
 [  13405    6552  976857]]

Random Forest - Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98   1233246
           1       0.98      0.97      0.97    609590
           2       0.99      0.98      0.98    996814

    accuracy                           0.98   2839650
   macro avg       0.98      0.98      0.98   2839650
weighted avg       0.98      0.98      0.98   2839650

Random Forest - Macro F1 Score: 0.9802
Random Forest - Precision: 0.9816
Random Forest - Recall: 0.9815


2) XGBoost

In [7]:
from xgboost import XGBClassifier

# Initialize XGBoost
xgb_model = XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on validation set
y_pred = xgb_model.predict(X_val)

# Confusion matrix
print("XGBoost - Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

# Classification report
print("\nXGBoost - Classification Report:\n", classification_report(y_val, y_pred))

# Macro F1 score
macro_f1 = f1_score(y_val, y_pred, average='macro')

# Weighted precision and recall
precision = precision_score(y_val, y_pred, average='weighted')
recall = recall_score(y_val, y_pred, average='weighted')

print(f"XGBoost - Macro F1 Score: {macro_f1:.4f}")
print(f"XGBoost - Precision: {precision:.4f}")
print(f"XGBoost - Recall: {recall:.4f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost - Confusion Matrix:
 [[1181346   23317   28583]
 [  77650  506052   25888]
 [  80261   13759  902794]]

XGBoost - Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92   1233246
           1       0.93      0.83      0.88    609590
           2       0.94      0.91      0.92    996814

    accuracy                           0.91   2839650
   macro avg       0.92      0.90      0.91   2839650
weighted avg       0.91      0.91      0.91   2839650

XGBoost - Macro F1 Score: 0.9068
XGBoost - Precision: 0.9142
XGBoost - Recall: 0.9122


##### Hyper Parameter Tuning with Cross Validation

1) Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

param_dist_lr = {
    "C": np.logspace(-3, 2, 6),   # smaller search space
    "penalty": ["l2"],
    "solver": ["lbfgs", "saga"]
}

rand_lr = RandomizedSearchCV(
    LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    param_distributions=param_dist_lr,
    n_iter=5,         # try only 5 random combinations
    scoring="f1_macro",
    cv=3,             # use 3-fold instead of 5
    n_jobs=-1,
    verbose=1
)

rand_lr.fit(X_train, y_train)
print("Best Params:", rand_lr.best_params_)
print("Best CV Macro-F1:", rand_lr.best_score_)


Fitting 3 folds for each of 5 candidates, totalling 15 fits


MemoryError: Unable to allocate 573. MiB for an array with shape (34, 2208615) and data type int64

2) Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

param_grid_dt = {
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10, 50],
    "min_samples_leaf": [1, 5, 10],
    "max_features": ["sqrt", "log2", None]
}

grid_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    param_grid=param_grid_dt,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_dt.fit(X_train, y_train)
print("Best Params (DT):", grid_dt.best_params_)
print("Best CV Macro-F1:", grid_dt.best_score_)

3) Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

param_grid_rf = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "max_features": ["sqrt", "log2"]
}

grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_grid=param_grid_rf,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_rf.fit(X_train, y_train)
print("Best Params (RF):", grid_rf.best_params_)
print("Best CV Macro-F1:", grid_rf.best_score_)

4) XGBoost

In [None]:
from xgboost import XGBClassifier

param_grid_xgb = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "gamma": [0, 1]
}

grid_xgb = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42),
    param_grid=param_grid_xgb,
    scoring="f1_macro",
    cv=3,     # XGBoost tuning can be slower, so 3-fold is often used
    n_jobs=-1,
    verbose=1
)

grid_xgb.fit(X_train, y_train)
print("Best Params (XGB):", grid_xgb.best_params_)
print("Best CV Macro-F1:", grid_xgb.best_score_)

##### I couldn't do the Hyper Parameter Tuning , Cross Validation & SMOTE -- because doing that is causing memory error.