# Model and Evaluation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

## Importing the Cleaned Train CSV

In [None]:
# Read the cleaned training CSV into a DataFrame and preview the first rows.
train_csv_path = '/content/drive/MyDrive/Project/4.Microsoft/train_cleaned.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head(5)

Unnamed: 0,Id,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,IncidentGrade,EntityType,EvidenceRole,IpAddress,Url,AccountName,DeviceName,NetworkMessageId,RegistryKey,RegistryValueData,ApplicationName,OAuthApplicationId,FileName,ResourceIdName,OSVersion,City,Day,Year,Hour,Time
0,1709396985476,26,18583,687462,31,813,12,0,9,1,360606,160396,453297,153085,529644,1631,860,3421,881,4132,3586,66,10630,6,2024,5,21407
1,927712939180,33,5065,3990,38,25,9,0,17,0,360606,160396,453297,3142,529644,1631,860,3421,881,289573,3586,66,10630,3,2024,10,38709
2,1090921697002,201,150787,807590,419,444,6,0,17,0,360606,160396,453297,4181,529644,1631,860,3421,881,289573,3586,66,10630,13,2024,3,13000
3,1434519079555,204,108287,28575,44,1233,16,2,9,1,360606,160396,453297,153085,529644,1631,860,3421,881,97007,3586,66,10630,8,2024,16,60897
4,1005022347708,54,528202,1458226,102,58829,7,0,9,0,360606,160396,453297,153085,529644,1631,860,3421,881,39978,3586,66,10630,9,2024,2,10081


In [None]:
# Show the (rows, columns) dimensions of the loaded training frame.
df_train.shape

(4577400, 27)

In [None]:
# List the column labels so the feature/target selection below can be checked.
df_train.columns

Index(['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle',
       'Category', 'IncidentGrade', 'EntityType', 'EvidenceRole', 'IpAddress',
       'Url', 'AccountName', 'DeviceName', 'NetworkMessageId', 'RegistryKey',
       'RegistryValueData', 'ApplicationName', 'OAuthApplicationId',
       'FileName', 'ResourceIdName', 'OSVersion', 'City', 'Day', 'Year',
       'Hour', 'Time'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split

# Target is IncidentGrade; the features are every other column except 'Id'.
y = df_train['IncidentGrade']
X = df_train.drop(columns=['Id', 'IncidentGrade'])

# 70/30 hold-out split, stratified on the target so both parts keep the same
# class proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Absolute class counts in each part of the split.
train_class_counts = y_train.value_counts()
test_class_counts = y_test.value_counts()

print("y_train: ", train_class_counts)
print("\n")
print("y_test: ", test_class_counts)

y_train:  IncidentGrade
0    1381760
2    1126270
1     696150
Name: count, dtype: int64


y_test:  IncidentGrade
0    592183
2    482687
1    298350
Name: count, dtype: int64


In [None]:
# Relative class frequencies; with a stratified split both parts should match.
train_class_share = y_train.value_counts(normalize=True)
test_class_share = y_test.value_counts(normalize=True)

print("y_train: ", train_class_share)
print("\n")
print("y_test: ", test_class_share)

y_train:  IncidentGrade
0    0.431237
2    0.351500
1    0.217263
Name: proportion, dtype: float64


y_test:  IncidentGrade
0    0.431237
2    0.351500
1    0.217263
Name: proportion, dtype: float64


## Selecting the Best Model

In [None]:
from sklearn.metrics import classification_report

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline: logistic regression with class weights balanced against the
# class frequencies of the training labels.
class_labels = ['Class 0', 'Class 1', 'Class 2']
lr = LogisticRegression(class_weight='balanced', random_state=42)

print("\nTraining Logistic Regression...")
lr.fit(X_train, y_train)

# Score both the training part and the held-out part to expose any
# train/test gap.
y_pred_train_lr = lr.predict(X_train)
y_pred_test_lr = lr.predict(X_test)

lr_train_report = classification_report(y_train, y_pred_train_lr, target_names=class_labels)
lr_test_report = classification_report(y_test, y_pred_test_lr, target_names=class_labels)

print("Logistic Regression Classification Report for Train :\n", lr_train_report)
print("-----------------------------------------------------------------------")
print("Logistic Regression Classification Report for Test :\n", lr_test_report)



Training Logistic Regression...
Logistic Regression Classification Report for Train :
               precision    recall  f1-score   support

     Class 0       0.60      0.46      0.52   1381760
     Class 1       0.34      0.33      0.33    696150
     Class 2       0.52      0.69      0.60   1126270

    accuracy                           0.51   3204180
   macro avg       0.49      0.49      0.48   3204180
weighted avg       0.52      0.51      0.51   3204180

-----------------------------------------------------------------------
Logistic Regression Classification Report for Test :
               precision    recall  f1-score   support

     Class 0       0.60      0.45      0.52    592183
     Class 1       0.34      0.33      0.33    298350
     Class 2       0.52      0.69      0.60    482687

    accuracy                           0.51   1373220
   macro avg       0.49      0.49      0.48   1373220
weighted avg       0.52      0.51      0.51   1373220



### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with balanced class weights, evaluated the same way as the
# logistic-regression baseline.
class_labels = ['Class 0', 'Class 1', 'Class 2']
dt = DecisionTreeClassifier(class_weight='balanced', random_state=42)

print("\nTraining Decision Tree Classifier...")
dt.fit(X_train, y_train)

# Predictions on the training part and on the held-out part.
y_pred_train_dt = dt.predict(X_train)
y_pred_test_dt = dt.predict(X_test)

dt_train_report = classification_report(y_train, y_pred_train_dt, target_names=class_labels)
dt_test_report = classification_report(y_test, y_pred_test_dt, target_names=class_labels)

print("Decision Tree Classifier Classification Report for Train :\n", dt_train_report)
print("-----------------------------------------------------------------------")
print("Decision Tree Classifier Classification Report for Test :\n", dt_test_report)



Training Decision Tree Classifier...
Decision Tree Classifier Classification Report for Train :
               precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00   1381760
     Class 1       1.00      1.00      1.00    696150
     Class 2       1.00      1.00      1.00   1126270

    accuracy                           1.00   3204180
   macro avg       1.00      1.00      1.00   3204180
weighted avg       1.00      1.00      1.00   3204180

-----------------------------------------------------------------------
Decision Tree Classifier Classification Report for Test :
               precision    recall  f1-score   support

     Class 0       0.99      0.99      0.99    592183
     Class 1       0.98      0.98      0.98    298350
     Class 2       0.99      0.99      0.99    482687

    accuracy                           0.99   1373220
   macro avg       0.99      0.99      0.99   1373220
weighted avg       0.99      0.99      0.99   1373220



### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with balanced class weights; all other settings are the
# library defaults.
class_labels = ['Class 0', 'Class 1', 'Class 2']
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

print("\nTraining Random Forest Classifier...")
rf.fit(X_train, y_train)

# Predictions on the training part and on the held-out part.
y_pred_train_rf = rf.predict(X_train)
y_pred_test_rf = rf.predict(X_test)

rf_train_report = classification_report(y_train, y_pred_train_rf, target_names=class_labels)
rf_test_report = classification_report(y_test, y_pred_test_rf, target_names=class_labels)

print("Random Forest Classifier Classification Report for Train :\n", rf_train_report)
print("-----------------------------------------------------------------------")
print("Random Forest Classifier Classification Report for Test :\n", rf_test_report)



Training Random Forest Classifier...
Random Forest Classifier Classification Report for Train :
               precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00   1381760
     Class 1       1.00      1.00      1.00    696150
     Class 2       1.00      1.00      1.00   1126270

    accuracy                           1.00   3204180
   macro avg       1.00      1.00      1.00   3204180
weighted avg       1.00      1.00      1.00   3204180

-----------------------------------------------------------------------
Random Forest Classifier Classification Report for Test :
               precision    recall  f1-score   support

     Class 0       0.97      0.99      0.98    592183
     Class 1       0.98      0.96      0.97    298350
     Class 2       0.99      0.97      0.98    482687

    accuracy                           0.98   1373220
   macro avg       0.98      0.97      0.98   1373220
weighted avg       0.98      0.98      0.98   1373220



### LightGBM

In [None]:
import lightgbm as lgb

# LightGBM with library defaults apart from the seed. Unlike the
# scikit-learn models in this notebook, no class weighting is requested here.
class_labels = ['Class 0', 'Class 1', 'Class 2']
lgbm = lgb.LGBMClassifier(random_state=42)

print("\nTraining LightGBM Classifier...")
lgbm.fit(X_train, y_train)

# Predictions on the training part and on the held-out part.
y_pred_train_lgbm = lgbm.predict(X_train)
y_pred_test_lgbm = lgbm.predict(X_test)

lgbm_train_report = classification_report(y_train, y_pred_train_lgbm, target_names=class_labels)
lgbm_test_report = classification_report(y_test, y_pred_test_lgbm, target_names=class_labels)

print("LightGBM Classifier Classification Report for Train :\n", lgbm_train_report)
print("-----------------------------------------------------------------------")
print("LightGBM Classifier Classification Report for Test :\n", lgbm_test_report)



Training LightGBM Classifier...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.171963 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3642
[LightGBM] [Info] Number of data points in the train set: 3204180, number of used features: 25
[LightGBM] [Info] Start training from score -0.841098
[LightGBM] [Info] Start training from score -1.526646
[LightGBM] [Info] Start training from score -1.045545
LightGBM Classifier Classification Report for Train :
               precision    recall  f1-score   support

     Class 0       0.86      0.96      0.91   1381760
     Class 1       0.93      0.80      0.86    696150
     Class 2       0.94      0.89      0.91   1126270

    accuracy                           0.90   3204180
   macro avg       0.91      0.88      0.89   3204180
weighted avg       0.90      0.90      0.90   3204180

-----------------------------------------------------------------------
LightG

### XGBoost

In [None]:
# Import the estimator class directly rather than aliasing the whole module
# as `xgb`: a later cell binds the name `xgb` to a fitted estimator, so
# keeping a module alias of the same name makes the notebook fragile when
# cells are re-run.
from xgboost import XGBClassifier

# XGBoost baseline on the full training split.
# `use_label_encoder` is intentionally not passed: it is a deprecated option
# that recent XGBoost releases no longer use (they only warn about it), and
# the IncidentGrade labels are already the integers 0, 1 and 2.
class_labels = ['Class 0', 'Class 1', 'Class 2']
xg = XGBClassifier(random_state=42, eval_metric='mlogloss')

print("\nTraining XGBoost Classifier...")
xg.fit(X_train, y_train)

# Predictions on the training part and on the held-out part.
y_pred_train_xg = xg.predict(X_train)
y_pred_test_xg = xg.predict(X_test)

xg_train_report = classification_report(y_train, y_pred_train_xg, target_names=class_labels)
xg_test_report = classification_report(y_test, y_pred_test_xg, target_names=class_labels)

print("XGBoost Classifier Classification Report for Train :\n", xg_train_report)
print("-----------------------------------------------------------------------")
print("XGBoost Classifier Classification Report for Test :\n", xg_test_report)



Training XGBoost Classifier...
XGBoost Classifier Classification Report for Train :
               precision    recall  f1-score   support

     Class 0       0.89      0.96      0.92   1381760
     Class 1       0.93      0.85      0.89    696150
     Class 2       0.95      0.91      0.93   1126270

    accuracy                           0.92   3204180
   macro avg       0.92      0.91      0.91   3204180
weighted avg       0.92      0.92      0.92   3204180

-----------------------------------------------------------------------
XGBoost Classifier Classification Report for Test :
               precision    recall  f1-score   support

     Class 0       0.89      0.96      0.92    592183
     Class 1       0.93      0.85      0.89    298350
     Class 2       0.95      0.91      0.93    482687

    accuracy                           0.92   1373220
   macro avg       0.92      0.91      0.91   1373220
weighted avg       0.92      0.92      0.92   1373220



## Choosing best model
1. Logistic Regression
Training Accuracy: 51%, Test Accuracy: 51% — Poor performance with low precision and recall for Class 1, and weaker performance overall.
2. Decision Tree Classifier
Training Accuracy: 100%, Test Accuracy: 99% — Excellent performance but shows signs of overfitting, with perfect training accuracy and slight drop in test accuracy.
3. Random Forest Classifier
Training Accuracy: 100%, Test Accuracy: 98% — Similar to Decision Tree, strong performance but potentially overfitting due to perfect training scores.
4. LightGBM Classifier
Training Accuracy: 90%, Test Accuracy: 90% — Balanced performance with stable test results, no signs of overfitting.
5. XGBoost Classifier
Training Accuracy: 92%, Test Accuracy: 92% — High performance with good generalization and no overfitting.


## Conclusion
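* Best Performing Models: LightGBM and XGBoost show the smallest gap between training and test scores (90%/90% and 92%/92%), which indicates stable generalization.
* Overfitting Concern: Decision Tree and Random Forest reach perfect training scores, which suggests they memorize the training data, even though their test accuracy (99% and 98%) is the highest of the models tried.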
* Logistic Regression: Should be reconsidered as it performs poorly on both training and test sets.




## Taking 20% of X_train, y_train


In [None]:
# Draw a reproducible 20% random sample of the training rows and pick the
# labels with the same index values so features and targets stay aligned.
X_train_sample = X_train.sample(frac=0.2, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

for size_label, frame in (
    ("Original Training Data Size:", X_train),
    ("Sampled Training Data Size:", X_train_sample),
):
    print(size_label, frame.shape)

Original Training Data Size: (3204180, 25)
Sampled Training Data Size: (640836, 25)


## SMOTE TECHNIQUE

In [None]:
# Applying SMOTE to the training data
# Only the 20% training sample is resampled; the held-out X_test / y_test
# split is not passed to SMOTE.
# NOTE(review): the feature columns shown in the preview look like
# integer-encoded identifiers/categories. SMOTE builds synthetic rows by
# numeric interpolation between neighbours, so synthetic rows may carry codes
# that correspond to no real category - confirm this is acceptable
# (imblearn's SMOTENC is the categorical-aware variant).
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_sample, y_train_sample)

In [None]:
# Compare the label counts before and after oversampling.
for distribution_title, labels in (
    ("Original class distribution:\n", y_train_sample),
    ("After SMOTE class distribution:\n", y_train_smote),
):
    print(distribution_title, labels.value_counts())

Original class distribution:
 IncidentGrade
0    276464
2    225252
1    139120
Name: count, dtype: int64
After SMOTE class distribution:
 IncidentGrade
2    276464
0    276464
1    276464
Name: count, dtype: int64


### Fitting XGBoost on the SMOTE-Resampled Data

In [None]:
from xgboost import XGBClassifier

# Fit XGBoost on the SMOTE-balanced sample using the histogram tree method
# on a CUDA device. The estimator keeps the name `xgb` because the
# feature-importance cell reads `xgb.feature_importances_`.
xgb = XGBClassifier(tree_method='hist', device='cuda', random_state=42)
xgb.fit(X_train_smote, y_train_smote)

# Score on the untouched (non-resampled) held-out split.
y_test_smote_pred = xgb.predict(X_test)
smote_report = classification_report(
    y_test,
    y_test_smote_pred,
    target_names=['Class 0', 'Class 1', 'Class 2'],
)

print("Validation Set Classification Report (After SMOTE):")
print(smote_report)

Validation Set Classification Report (After SMOTE):
              precision    recall  f1-score   support

     Class 0       0.92      0.92      0.92    592183
     Class 1       0.83      0.90      0.86    298350
     Class 2       0.95      0.90      0.92    482687

    accuracy                           0.91   1373220
   macro avg       0.90      0.90      0.90   1373220
weighted avg       0.91      0.91      0.91   1373220



## Feature Importance

In [None]:
# Tabulate the fitted model's feature importances as percentages, most
# important first.
importance_pct = xgb.feature_importances_ * 100
feature_importance = (
    pd.DataFrame({"Column": X_train_smote.columns, "Score": importance_pct})
    .sort_values('Score', ascending=False)
)
feature_importance

Unnamed: 0,Column,Score
0,OrgId,12.725735
11,DeviceName,8.15239
1,IncidentId,7.918359
4,AlertTitle,7.769856
7,EvidenceRole,7.709688
3,DetectorId,6.139986
5,Category,6.0933
2,AlertId,5.951309
12,NetworkMessageId,5.537221
20,City,4.686828


In [None]:
# Taking top features based on feature importance
# NOTE(review): the importance rows displayed above include DeviceName,
# EvidenceRole and NetworkMessageId, none of which are selected here, while
# Hour, Time, Day and EntityType are selected but do not appear in those
# displayed rows - confirm how this list was derived.
top_features = ['OrgId', 'IncidentId', 'AlertTitle', 'DetectorId',
                   'AlertId', 'Category', 'Hour', 'Time', 'Day', 'EntityType', 'City']

In [None]:
# Restrict both the resampled training data and the held-out split to the
# selected columns, in the order given by top_features.
X_train_features = X_train_smote.loc[:, top_features]
X_test_features = X_test.loc[:, top_features]

In [None]:
# Sanity check: both frames should now have len(top_features) columns.
for shape_label, frame in (
    ("Shape of training set with top features:", X_train_features),
    ("Shape of validation set with top features:", X_test_features),
):
    print(shape_label, frame.shape)

Shape of training set with top features: (829392, 11)
Shape of validation set with top features: (1373220, 11)


## Training XGB with top_features

In [None]:
# Refit XGBoost with the same settings as before, using only the selected
# feature columns of the SMOTE-balanced sample.
xgb_top_features = XGBClassifier(tree_method='hist', device='cuda', random_state=42)
xgb_top_features.fit(X_train_features, y_train_smote)

# Score on the held-out split restricted to the same columns.
y_test_top_features_pred = xgb_top_features.predict(X_test_features)
top_features_report = classification_report(
    y_test,
    y_test_top_features_pred,
    target_names=['Class 0', 'Class 1', 'Class 2'],
)

print("Validation Set Classification Report (After SMOTE):")
print(top_features_report)

Validation Set Classification Report (After SMOTE):
              precision    recall  f1-score   support

     Class 0       0.92      0.92      0.92    592183
     Class 1       0.84      0.90      0.87    298350
     Class 2       0.95      0.90      0.93    482687

    accuracy                           0.91   1373220
   macro avg       0.90      0.91      0.90   1373220
weighted avg       0.91      0.91      0.91   1373220



## HyperParameter Tuning

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Search space: 3 x 3 x 3 = 27 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}

# Base estimator; everything except the seed is left at the library default
# and then overridden per candidate by the grid.
xgb_tuned = XGBClassifier(random_state=42)

# 3-fold cross-validated grid search, ranked by macro-averaged F1 so every
# class counts equally regardless of its frequency.
# NOTE(review): the folds are cut from SMOTE-resampled rows, so synthetic
# points derived from a training-fold row can land in the validation fold.
# `best_score_` may therefore be optimistic; treat the held-out report below
# as the more trustworthy number.
grid_search = GridSearchCV(
    xgb_tuned,
    param_grid,
    cv=3,
    scoring='f1_macro',
    verbose=2,
    n_jobs=-1,
)

# Fit on the SMOTE-balanced, top-feature training data.
grid_search.fit(X_train_features, y_train_smote)

# Best hyperparameters and their mean cross-validated macro-F1.
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the refit best estimator on the held-out split. The report uses
# the same class labels as every other report in this notebook so results
# can be compared side by side.
y_pred = grid_search.best_estimator_.predict(X_test_features)
print(classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1', 'Class 2']))

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300}
Best Score: 0.9274068016594391
              precision    recall  f1-score   support

           0       0.95      0.95      0.95    592183
           1       0.90      0.93      0.91    298350
           2       0.96      0.94      0.95    482687

    accuracy                           0.94   1373220
   macro avg       0.94      0.94      0.94   1373220
weighted avg       0.94      0.94      0.94   1373220



# Saving the best model using Joblib

In [None]:
import joblib
# Persist the refit best estimator from the grid search to
# 'best_model.joblib' (a relative path, so it lands in the kernel's current
# working directory).
joblib.dump(grid_search.best_estimator_, 'best_model.joblib')


['best_model.joblib']

# Evaluating the Held-Out Split of the Train Dataset using the Saved Joblib Model

In [None]:
import pandas as pd
from sklearn.metrics import classification_report

# Round-trip check: reload the saved estimator from disk and confirm it
# scores the held-out split the same way the in-memory model did.
model = joblib.load('best_model.joblib')
y_pred = model.predict(X_test_features)

reloaded_report = classification_report(
    y_test,
    y_pred,
    target_names=['Class 0', 'Class 1', 'Class 2'],
)

print("Test Set Classification Report (Top Features):")
print(reloaded_report)


Test Set Classification Report (Top Features):
              precision    recall  f1-score   support

     Class 0       0.95      0.95      0.95    592183
     Class 1       0.90      0.93      0.91    298350
     Class 2       0.96      0.94      0.95    482687

    accuracy                           0.94   1373220
   macro avg       0.94      0.94      0.94   1373220
weighted avg       0.94      0.94      0.94   1373220



# Evaluating the Test Dataset using the Best Model Joblib File

In [None]:
# Read the cleaned test CSV into a DataFrame and preview the first rows.
test_csv_path = '/content/drive/MyDrive/Project/4.Microsoft/test_cleaned.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head(5)

Unnamed: 0,Id,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,IncidentGrade,EntityType,EvidenceRole,...,ApplicationName,OAuthApplicationId,FileName,ResourceIdName,OSVersion,City,Day,Year,Hour,Time
0,1245540519230,657,11767,87199,524,563,11,0,28,0,...,3421,881,289573,3586,66,10630,4,2024,22,82520
1,1400159342154,3,91158,632273,2,2,1,0,15,0,...,3421,881,289573,3586,0,10630,3,2024,12,46649
2,1279900255923,145,32247,131719,2932,10807,11,0,23,1,...,3421,881,14,3586,66,10630,8,2024,3,12037
3,60129547292,222,15294,917686,0,0,10,1,7,1,...,3421,881,289573,3586,66,10630,12,2024,12,43594
4,515396080539,363,7615,5944,27,18,5,0,28,0,...,3421,881,289573,3586,66,10630,6,2024,17,63665


In [None]:
from sklearn.model_selection import train_test_split
# Separate the test-file features from the IncidentGrade target; 'Id' is
# dropped as well. This rebinds X and y, which earlier cells bound to the
# training file.
X = df_test.drop(columns=['Id', 'IncidentGrade'])
y = df_test['IncidentGrade']

# NOTE(review): this rebinds X_train / X_test / y_train / y_test, previously
# the split of the training file. Only the 20% X_test / y_test portion of the
# test file is scored by the cells that follow - confirm that scoring the
# whole test file was not the intent. Unlike the earlier split, `stratify`
# is not passed here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Re-declares the same column list that was used to build the training
# features for the tuned model, so the test-file features line up with them.
top_features = ['OrgId', 'IncidentId', 'AlertTitle', 'DetectorId',
                   'AlertId', 'Category', 'Hour', 'Time', 'Day', 'EntityType', 'City']

In [None]:
# Restrict both parts of the test-file split to the selected columns.
# Note: X_train_features is rebound here but is not referenced again in the
# visible cells; only X_test_features is passed to the model.
X_train_features = X_train[top_features]
X_test_features = X_test[top_features]

In [None]:
# Load the estimator from the same path the notebook's `joblib.dump` call
# writes to ('best_model.joblib' in the working directory). No cell in this
# notebook writes a copy under the Drive project folder, so loading from
# there would depend on a manual copy and could pick up a stale model.
model = joblib.load('best_model.joblib')

# Predict IncidentGrade for the selected columns of the test-file split.
y_pred = model.predict(X_test_features)

In [None]:
# Per-class precision/recall/F1 of the saved model on the scored portion of
# the test file.
final_report = classification_report(
    y_test,
    y_pred,
    target_names=['Class 0', 'Class 1', 'Class 2'],
)

print("Test Set Classification Report (Top Features):")
print(final_report)

Test Set Classification Report (Top Features):
              precision    recall  f1-score   support

     Class 0       0.95      0.82      0.88    350749
     Class 1       0.73      0.92      0.81    181180
     Class 2       0.91      0.92      0.92    297649

    accuracy                           0.88    829578
   macro avg       0.86      0.89      0.87    829578
weighted avg       0.89      0.88      0.88    829578

