# Microsoft Classification Problem

Install required libraries

In [1]:
!pip install category-encoders
!pip install xgboost

Collecting category-encoders
  Downloading category_encoders-2.8.0-py3-none-any.whl.metadata (7.9 kB)
Collecting patsy>=0.5.1 (from category-encoders)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting statsmodels>=0.9.0 (from category-encoders)
  Downloading statsmodels-0.14.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.2 kB)
Downloading category_encoders-2.8.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.9/232.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading statsmodels-0.14.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages

In [2]:
#Import necessary libraries for this project

import pandas as pd
import numpy as np
import pickle

import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold

import xgboost as xgb

import category_encoders as ce

**## 1. Reading input Train Data**

In [3]:
def read_data_custom():
  # Reading from Goggle drive as running millions of records in local is not possible
  from google.colab import drive
  drive.mount('/content/drive')
  # Reading data from drive

  # Reading 0.01 percent of data inorder to make this test with smaller data before moving to whole data
  #nd_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train_data/GUIDE_Train.csv', skiprows=lambda i: i>0 and np.random.rand() > 0.01)

  # Once everything done, comment above line and run below
  nd_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train_data/GUIDE_Train.csv')
  return nd_df

# 2. Data Exploration and feature Engineering

In [4]:
def data_quality(df):
  df = df.copy()
  # Drop duplicate columns
  df = df.drop_duplicates(keep=False)
  print(df.shape)
  #Dropping ID columns as it will not help in machine learning prediction
  id_columns = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'DeviceId']
  df.drop(columns=id_columns)

  df['Timestamp'] = pd.to_datetime(df['Timestamp'])

  df['day_of_week'] = df['Timestamp'].dt.dayofweek

  #dropping timestamp column
  df.drop('Timestamp', axis=1, inplace=True)

  df['day_of_week'] = df['day_of_week'].astype('int64')

  return df

# 3. Data Cleaning function for train and test data

In [5]:
def data_cleaning(df):
  df["EmailClusterId"] = df["EmailClusterId"].fillna(-1)
  #Dropping majority of missing columns as it will have lesser impact on final prediction
  major_null_columns = ['ActionGrouped', 'ActionGranular', 'ThreatFamily', 'ResourceType', 'Roles', 'AntispamDirection']
  df.drop(columns=major_null_columns, inplace=True)

  df[['MitreTechniques', 'SuspicionLevel', 'LastVerdict']] = df[['MitreTechniques', 'SuspicionLevel', 'LastVerdict']].fillna('Unknown')

  # Filling the EmailClusterId with the most frequent values in list
  df['EmailClusterId'] = df['EmailClusterId'].fillna(df['EmailClusterId'].mode()[0])

  df = df.dropna(subset=['IncidentGrade']).copy()
  label_encoder = LabelEncoder()
  df['EncodedIncidentGrade'] = label_encoder.fit_transform(df['IncidentGrade'])
  df.drop(columns='IncidentGrade', inplace=True)

  return df


# 4. Data Preprocessing function for train and test data

In [6]:
def encode_categorical_values(df):
  label_encoder = LabelEncoder()
  df['EncodedCategory'] = label_encoder.fit_transform(df['Category'].fillna('Unknown'))  # Handling NaN

  # Encoding 'MitreTechniques' using Target Encoding (Mean Encoding)
  target_encoder = ce.TargetEncoder(cols=['MitreTechniques'])
  df['EncodedMitreTechniques'] = target_encoder.fit_transform(df['MitreTechniques'], df['EncodedCategory'])

  # One-Hot Encoding for 'EntityType'
  df = pd.get_dummies(df, columns=['EntityType'], drop_first=True)

  # Encoding 'EvidenceRole', 'SuspicionLevel', and 'LastVerdict' using Label Encoding
  df['EncodedEvidenceRole'] = label_encoder.fit_transform(df['EvidenceRole'])
  df['EncodedSuspicionLevel'] = label_encoder.fit_transform(df['SuspicionLevel'])
  df['EncodedLastVerdict'] = label_encoder.fit_transform(df['LastVerdict'])

  # Step 2: Dropping Redundant Columns (Original Categorical Columns)
  df = df.drop(columns=['Category', 'MitreTechniques', 'EvidenceRole', 'SuspicionLevel', 'LastVerdict'])

  return df

# 5. Train Test Split function for train and test data

In [7]:
def train_test_split_custom(df):
  X = df.drop('EncodedIncidentGrade', axis=1)  # Drop the target column to get features
  y = df['EncodedIncidentGrade']  # The target variable
  # Split the data into training and test sets
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  return X_train, X_test, y_train, y_test, X, y

# 6. Logistic Regression Model for train data

In [8]:
def logistic_regression_model(X_train, X_test, y_train, y_test):
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  # Save the fitted scaler
  with open("log_scaler.pkl", "wb") as scaler_file:
      pickle.dump(scaler, scaler_file)

  # Initialize the Logistic Regression model
  log_reg = LogisticRegression(max_iter=1000)
  # Train the model
  log_reg.fit(X_train_scaled, y_train)

  # Make predictions on the test set
  y_pred = log_reg.predict(X_test_scaled)

  # Evaluate the model
  accuracy = accuracy_score(y_test, y_pred)
  print(f"Accuracy: {accuracy:.4f}")

  # Detailed classification report
  print("\nClassification Report:")
  print(classification_report(y_test, y_pred))

  with open("logistic_regression_model.pkl", "wb") as model_file:
    pickle.dump(log_reg, model_file)

  return accuracy

# 7. Xgboost model function for Tran data

In [9]:
def xgboost_model(X_train, X_test, y_train, y_test, y):
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)


  xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(y.unique()), eval_metric='mlogloss')
  xgb_model.fit(X_train_scaled, y_train)


  y_pred = xgb_model.predict(X_test_scaled)
  accuracy = accuracy_score(y_test, y_pred)
  print(f"Accuracy: {accuracy:.4f}")
  print("\nClassification Report:")
  print(classification_report(y_test, y_pred))

  with open("xgboost_model.pkl", "wb") as model_file:
    pickle.dump(xgb_model, model_file)
    # Save the fitted scaler
  with open("xgb_scaler.pkl", "wb") as scaler_file:
      pickle.dump(scaler, scaler_file)
  return accuracy

# 8. Random Forest Model for Train Data

In [10]:
def random_forest_model(X_train, X_test, y_train, y_test):
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)
  rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
  rf_model.fit(X_train_scaled, y_train)
  y_pred = rf_model.predict(X_test_scaled)
  accuracy = accuracy_score(y_test, y_pred)
  print(f"Accuracy: {accuracy:.4f}")

  # Detailed classification report
  print("\nClassification Report:")
  print(classification_report(y_test, y_pred))

  with open("random_forest_model.pkl", "wb") as model_file:
    pickle.dump(rf_model, model_file)

  with open("rf_scaler.pkl", "wb") as scaler_file:
      pickle.dump(scaler, scaler_file)
  return accuracy

# 9. KFold Cross Validation for train data

In [11]:

def kfold_xgboost(X, y, k=5):
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    accuracies = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Standardize features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # XGBoost model
        xgb_model = xgb.XGBClassifier(objective='multi:softmax',
                                      num_class=len(y.unique()),
                                      eval_metric='mlogloss')
        xgb_model.fit(X_train_scaled, y_train)

        # Predictions
        y_pred = xgb_model.predict(X_test_scaled)
        acc = accuracy_score(y_test, y_pred)
        accuracies.append(acc)

        print(f"Fold Accuracy: {acc:.4f}")
        print("\nClassification Report:\n", classification_report(y_test, y_pred))

    print(f"\nAverage Accuracy across {k} folds: {np.mean(accuracies):.4f}")

# Execution all the function to get prediction score of models

In [12]:
df = read_data_custom()

Mounted at /content/drive


In [13]:
df = data_quality(df)

(9481718, 45)


In [14]:
df = data_cleaning(df)

In [15]:
df = encode_categorical_values(df)

In [16]:
expected_columns = df.columns
print(expected_columns)

Index(['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle',
       'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
       'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
       'EmailClusterId', 'RegistryKey', 'RegistryValueName',
       'RegistryValueData', 'ApplicationId', 'ApplicationName',
       'OAuthApplicationId', 'FileName', 'FolderPath', 'ResourceIdName',
       'OSFamily', 'OSVersion', 'CountryCode', 'State', 'City', 'day_of_week',
       'EncodedIncidentGrade', 'EncodedCategory', 'EncodedMitreTechniques',
       'EntityType_AmazonResource', 'EntityType_AzureResource',
       'EntityType_Blob', 'EntityType_BlobContainer',
       'EntityType_CloudApplication', 'EntityType_CloudLogonRequest',
       'EntityType_CloudLogonSession', 'EntityType_Container',
       'EntityType_ContainerImage', 'EntityType_ContainerRegistry',
       'EntityType_File', 'EntityType_GenericEntity',
       'EntityType_GoogleCloudResource', 'Entit

In [17]:
X_train, X_test, y_train, y_test, X, y= train_test_split_custom(df)

# --> Logistic Regression Model validation for Train data

In [22]:
print(logistic_regression_model(X_train, X_test, y_train, y_test))

Accuracy: 0.6185

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.85      0.69    822245
           1       0.57      0.20      0.29    404816
           2       0.71      0.59      0.65    659022

    accuracy                           0.62   1886083
   macro avg       0.62      0.55      0.54   1886083
weighted avg       0.63      0.62      0.59   1886083

0.6184876275328286


# -->  XGBoost Model validation for Train data

In [23]:
print(xgboost_model(X_train, X_test, y_train, y_test, y))

Accuracy: 0.9150

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92    822245
           1       0.93      0.84      0.88    404816
           2       0.94      0.91      0.93    659022

    accuracy                           0.91   1886083
   macro avg       0.92      0.90      0.91   1886083
weighted avg       0.92      0.91      0.91   1886083

0.9149793513859146


# --> Random Forest Model validation for Train data

In [20]:
print(random_forest_model(X_train, X_test, y_train, y_test))

Accuracy: 0.9764

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98    822245
           1       0.97      0.96      0.97    404816
           2       0.99      0.97      0.98    659022

    accuracy                           0.98   1886083
   macro avg       0.98      0.97      0.97   1886083
weighted avg       0.98      0.98      0.98   1886083

0.9763843902945947


In [21]:
print(kfold_xgboost(X, y))

Fold Accuracy: 0.9153

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.96      0.92    822136
           1       0.93      0.84      0.88    405432
           2       0.94      0.91      0.93    658515

    accuracy                           0.92   1886083
   macro avg       0.92      0.90      0.91   1886083
weighted avg       0.92      0.92      0.91   1886083

Fold Accuracy: 0.9146

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.96      0.92    822136
           1       0.93      0.85      0.89    405432
           2       0.94      0.91      0.92    658515

    accuracy                           0.91   1886083
   macro avg       0.92      0.90      0.91   1886083
weighted avg       0.92      0.91      0.91   1886083

Fold Accuracy: 0.9159

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.96      0.9

# **Testing Data to test the model**

In [24]:
# Read test data

def read_test_data_custom():
  # Reading from Goggle drive as running millions of records in local is not possible
  from google.colab import drive
  drive.mount('/content/drive')
  # Reading data from drive

  # Reading 0.01 percent of data inorder to make this test with smaller data before moving to whole data
  #nd_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train_data/GUIDE_Test.csv', skiprows=lambda i: i>0 and np.random.rand() > 0.01)

  # Once everything done, comment above line and run below
  nd_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train_data/GUIDE_Test.csv')
  return nd_df

In [25]:
test_df = read_test_data_custom()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  nd_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train_data/GUIDE_Test.csv')


In [26]:
test_df = data_quality(test_df)

(4147784, 46)


In [27]:
test_df = data_cleaning(test_df)

In [28]:
test_df = encode_categorical_values(test_df)

In [29]:
for col in expected_columns:
  if col not in test_df.columns:
    test_df[col] = 0

test_df = test_df[expected_columns]

# **Logistic Regression Model for Test data**

In [30]:
def test_logistic_regression_model(test_df):

  # Load the trained scaler
  with open("log_scaler.pkl", "rb") as scaler_file:
      scaler = pickle.load(scaler_file)

  X_test_new = test_df.drop(columns=['EncodedIncidentGrade'])

  X_test_scaled = scaler.transform(X_test_new)

  with open("logistic_regression_model.pkl", "rb") as model_file:
    xgb_model = pickle.load(model_file)

  y_pred = xgb_model.predict(X_test_scaled)

  y_test_new = test_df['EncodedIncidentGrade']
  print(f"Accuracy: {accuracy_score(y_test_new, y_pred):.4f}")
  print("\nClassification Report:")
  print(classification_report(y_test_new, y_pred))

In [31]:
print(test_logistic_regression_model(test_df))

Accuracy: 0.6194

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.87      0.69   1752850
           1       0.62      0.11      0.19    902630
           2       0.72      0.64      0.68   1492304

    accuracy                           0.62   4147784
   macro avg       0.64      0.54      0.52   4147784
weighted avg       0.63      0.62      0.58   4147784

None


# **XGBoost Model for Test data**

In [32]:
def test_xgboost_model(test_df):

  # Load the trained scaler
  with open("xgb_scaler.pkl", "rb") as scaler_file:
      scaler = pickle.load(scaler_file)

  X_test_new = test_df.drop(columns=['EncodedIncidentGrade'])
  X_test_scaled = scaler.transform(X_test_new)
  with open("xgboost_model.pkl", "rb") as model_file:
    xgb_model = pickle.load(model_file)

  y_pred = xgb_model.predict(X_test_scaled)

  y_test_new = test_df['EncodedIncidentGrade']
  print(f"Accuracy: {accuracy_score(y_test_new, y_pred):.4f}")
  print("\nClassification Report:")
  print(classification_report(y_test_new, y_pred))

In [33]:
print(test_xgboost_model(test_df))

Accuracy: 0.8822

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89   1752850
           1       0.86      0.79      0.83    902630
           2       0.90      0.90      0.90   1492304

    accuracy                           0.88   4147784
   macro avg       0.88      0.87      0.87   4147784
weighted avg       0.88      0.88      0.88   4147784

None


# **Random Forest Model for Test data**

In [36]:
def test_random_forest_model(test_df):

  # Load the trained scaler
  with open("rf_scaler.pkl", "rb") as scaler_file:
      scaler = pickle.load(scaler_file)

  X_test_new = test_df.drop(columns=['EncodedIncidentGrade'])
  X_test_scaled = scaler.transform(X_test_new)
  with open("random_forest_model.pkl", "rb") as model_file:
    xgb_model = pickle.load(model_file)

  y_pred = xgb_model.predict(X_test_scaled)

  y_test_new = test_df['EncodedIncidentGrade']
  print(f"Accuracy: {accuracy_score(y_test_new, y_pred):.4f}")
  print("\nClassification Report:")
  print(classification_report(y_test_new, y_pred))

In [37]:
print(test_random_forest_model(test_df))

Accuracy: 0.9134

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92   1752850
           1       0.92      0.84      0.88    902630
           2       0.94      0.92      0.93   1492304

    accuracy                           0.91   4147784
   macro avg       0.92      0.90      0.91   4147784
weighted avg       0.91      0.91      0.91   4147784

None
