In [3]:
%pip install dask[parquet]


Collecting dask[parquet]
  Downloading dask-2024.10.0-py3-none-any.whl.metadata (3.7 kB)
Collecting cloudpickle>=3.0.0 (from dask[parquet])
  Downloading cloudpickle-3.1.0-py3-none-any.whl.metadata (7.0 kB)
Collecting fsspec>=2021.09.0 (from dask[parquet])
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting partd>=1.4.0 (from dask[parquet])
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting pyyaml>=5.3.1 (from dask[parquet])
  Downloading PyYAML-6.0.2-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting locket (from partd>=1.4.0->dask[parquet])
  Downloading locket-1.0.0-py2.py3-none-any.whl.metadata (2.8 kB)
Downloading cloudpickle-3.1.0-py3-none-any.whl (22 kB)
Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
Downloading partd-1.4.2-py3-none-any.whl (18 kB)
Downloading PyYAML-6.0.2-cp312-cp312-win_amd64.whl (156 kB)
Downloading dask-2024.10.0-py3-none-any.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:



In [1]:
import dask.dataframe as dd

# Let dask parse the parquet file lazily, then materialise the whole result as an
# in-memory pandas DataFrame (this only works if the data fits in RAM).
parquet_path = "GUIDE_train_Feature_engineered_2.parquet"
df = dd.read_parquet(parquet_path).compute()


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
y = df['IncidentGrade']
X = df.drop(columns=['IncidentGrade'])

# 70/30 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Random forest with library defaults; only the seed is fixed.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predictions for the held-out 30% split.
y_pred = rf.predict(X_test)

# Persist the fitted estimator so later cells can reload it without retraining.
filename = 'random_forest_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

print(f"Model saved as {filename}")


Model saved as random_forest_model.pkl


In [3]:
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Restore the estimator written by the training cell.
# pickle.load executes code embedded in the file, so only load files you created.
filename = 'random_forest_model.pkl'
with open(filename, 'rb') as model_file:
    rf = pickle.load(model_file)

print("Model loaded successfully")

# Score the reloaded model on the held-out split produced by train_test_split.
y_pred = rf.predict(X_test)

holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_report = classification_report(y_test, y_pred)
print("Accuracy:", holdout_accuracy)
print("Classification Report:\n", holdout_report)

Model loaded successfully
Accuracy: 0.8904511695538022
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.92      0.91    587318
           1       0.89      0.87      0.88    587318
           2       0.88      0.88      0.88    587318

    accuracy                           0.89   1761954
   macro avg       0.89      0.89      0.89   1761954
weighted avg       0.89      0.89      0.89   1761954



In [4]:
# Show the main size-related hyper-parameters of the loaded forest, one per line.
for param_name in ("n_estimators", "max_depth", "min_samples_split", "min_samples_leaf"):
    print(f"{param_name}: {getattr(rf, param_name)}")

n_estimators: 100
max_depth: None
min_samples_split: 2
min_samples_leaf: 1


In [5]:
import numpy as np
import pandas as pd

# Raw test data.
# NOTE(review): the recorded output of this cell ends with a pandas warning that
# points at this read_csv call, presumably a mixed-dtype warning. Consider passing
# an explicit `dtype=` once the affected columns are known.
df_test = pd.read_csv('Guide_test.csv')

# Redundant identifier columns, each dropped in favour of a sibling that is kept.
redundant_columns = [
    'OSFamily',         # keep OSVersion
    'City',             # keep State
    'ApplicationId',    # keep ApplicationName
    'AccountObjectId',  # keep AccountSid
    'CountryCode',      # keep State
]
df_test = df_test.drop(columns=redundant_columns)

# Merge folder and file name into a single FullFilePath column and drop the two
# source columns. astype(str) turns missing values into the literal text 'nan',
# so rows without a file end up as 'nan/nan'.
folder_path = df_test['FolderPath'].astype(str)
file_name = df_test['FileName'].astype(str)
df_test['FullFilePath'] = folder_path.str.rstrip('/') + '/' + file_name
df_test = df_test.drop(columns=['FileName', 'FolderPath'])

# Replace Timestamp with a two-hour bucket index: hours 0-1 -> 0, 2-3 -> 1, ...,
# 22-23 -> 11. Integer division gives the same buckets as an explicit chain of
# range tests. Unparseable/missing timestamps yield NaN hours, which fall into
# bucket 11 exactly as the catch-all branch of such a chain would place them.
hour = pd.to_datetime(df_test['Timestamp']).dt.hour
df_test['Timestamp'] = (hour // 2).fillna(11).astype('int64')


  df_test = pd.read_csv('Guide_test.csv')


In [6]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import category_encoders as ce

# NOTE(review): every encoder in this cell is fitted on df_test itself.
#   * The integer codes / dummy columns produced here are therefore derived from
#     the test data alone; nothing in this cell ties them to the encoding used
#     for the training frame. Confirm they match, or load the training-time
#     encoders and call `.transform` instead.
#   * The target encoder is fitted with df_test['IncidentGrade'] as its target,
#     so the encoded ApplicationName / DeviceName / AccountName columns carry
#     information about the very labels the model is later scored on. Metrics
#     computed from this frame are optimistic.

# One-hot encode the low-cardinality categoricals.
one_hot_columns = ['Category', 'EntityType', 'EvidenceRole']
df_test = pd.get_dummies(df_test, columns=one_hot_columns)

# Label-encode with one encoder per column so that each mapping stays available
# afterwards (e.g. grade_enc.classes_ / grade_enc.inverse_transform to turn the
# integer grades back into their original labels).
grade_enc = LabelEncoder()
df_test['IncidentGrade'] = grade_enc.fit_transform(df_test['IncidentGrade'])

state_enc = LabelEncoder()
df_test['State'] = state_enc.fit_transform(df_test['State'])

path_enc = LabelEncoder()
df_test['FullFilePath_encoded'] = path_enc.fit_transform(df_test['FullFilePath'])
df_test = df_test.drop(columns=['FullFilePath'])

# Backward-compatible alias: `label_enc` used to be the single shared encoder and
# ended up fitted on FullFilePath.
label_enc = path_enc

# Target-encode the high-cardinality categoricals (see the leakage note above).
target_enc = ce.TargetEncoder(cols=['ApplicationName', 'DeviceName', 'AccountName'])
df_test = target_enc.fit_transform(df_test, df_test['IncidentGrade'])


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [7]:
# Preview the first rows of the encoded test frame. A bare last expression uses
# the notebook's rich table rendering instead of the wrapped plain-text print.
df_test.head()

              Id  OrgId  IncidentId  AlertId  Timestamp  DetectorId  \
0  1245540519230    657       11767    87199         11         524   
1  1400159342154      3       91158   632273          6           2   
2  1279900255923    145       32247   131719          1        2932   
3    60129547292    222       15294   917686          6           0   
4   515396080539    363        7615     5944          8          27   

   AlertTitle                  MitreTechniques  IncidentGrade ActionGrouped  \
0         563      T1021;T1047;T1105;T1569.002              0           NaN   
1           2                              NaN              0           NaN   
2       10807  T1021;T1027.002;T1027.005;T1105              0           NaN   
3           0                  T1078;T1078.004              1           NaN   
4          18                  T1087;T1087.002              0           NaN   

   ... EntityType_OAuthApplication  EntityType_Process  \
0  ...                       False      

In [8]:
import pickle

# Column names of the training frame, in order. The target column
# 'IncidentGrade' is part of df and is therefore included in this list.
final_features = list(df.columns)

# File that stores the feature list (a plain list of column names, not a model).
filename = 'final_features.pkl'

# Save the feature list to disk.
with open(filename, 'wb') as file:
    pickle.dump(final_features, file)

print(f"Feature list saved as {filename}")

# Read the list back from the same file to confirm the round trip.
with open(filename, 'rb') as file:
    final_features = pickle.load(file)

Model saved as final_features.pkl


In [9]:
# Training columns that are absent from the encoded test frame. The column set is
# built once instead of rebuilding a list for every feature that is checked.
existing_columns = set(df_test.columns)
missing_columns = [col for col in final_features if col not in existing_columns]

# Add each absent column as a constant 0 so the selection below cannot fail.
for col in missing_columns:
    df_test[col] = 0

# Keep only the training columns, in training order.
df_test1 = df_test[final_features]

In [10]:
# Display the column-aligned test frame (the notebook shows a truncated view).
df_test1

Unnamed: 0,Timestamp,DetectorId,AlertTitle,IncidentGrade,DeviceId,Sha256,IpAddress,Url,AccountSid,AccountUpn,...,EntityType_OAuthApplication,EntityType_Process,EntityType_RegistryKey,EntityType_RegistryValue,EntityType_SecurityGroup,EntityType_Url,EntityType_User,EvidenceRole_Impacted,EvidenceRole_Related,FullFilePath_encoded
0,11,524,563,0,98799,138268,360606,160396,2610,3699,...,False,False,False,False,False,False,True,True,False,55599
1,6,2,2,0,1239,138268,360606,160396,441377,673934,...,False,False,False,False,False,False,False,True,False,55599
2,1,2932,10807,0,98799,4296,360606,160396,441377,673934,...,False,True,False,False,False,False,False,False,True,86849
3,6,0,0,1,98799,138268,360606,160396,441377,673934,...,False,False,False,False,False,False,False,False,True,55599
4,8,27,18,0,98799,138268,360606,160396,133549,673934,...,False,False,False,False,False,False,True,True,False,55599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4147987,1,139,120,0,98799,138268,360606,160396,13354,13012,...,False,False,False,False,False,False,True,True,False,55599
4147988,9,219,196,0,98799,138268,360606,160396,95744,172717,...,False,False,False,False,False,False,True,True,False,55599
4147989,0,57,29,1,98799,138268,1084,160396,441377,673934,...,False,False,False,False,False,False,False,False,True,55599
4147990,8,1,1,0,98799,138268,360606,160396,53146,59351,...,False,False,False,False,False,False,False,True,False,55599


In [12]:
# Reload the persisted forest. pickle.load runs code stored in the file, so only
# load files you produced yourself.
with open('random_forest_model.pkl', 'rb') as model_file:
    loaded_rf = pickle.load(model_file)

# Split the aligned test frame into the target and the feature matrix.
y_test_final = df_test1['IncidentGrade']
X_test_final = df_test1.drop('IncidentGrade', axis=1)

# Predict a grade for every row of the final test set.
y_pred_final = loaded_rf.predict(X_test_final)




In [13]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Compute all three summaries of the final test-set predictions first.
accuracy = accuracy_score(y_test_final, y_pred_final)
final_report = classification_report(y_test_final, y_pred_final)
macro_f1 = f1_score(y_test_final, y_pred_final, average='macro')

# Then print them: overall accuracy, per-class report, macro-averaged F1.
print(f"Accuracy: {accuracy}")
print("\nClassification Report:")
print(final_report)
print(f"Macro-F1 Score: {macro_f1}")

Accuracy: 0.8322851153039832

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.86      0.87   1752940
           1       0.70      0.79      0.74    902698
           2       0.88      0.82      0.85   1492354

    accuracy                           0.83   4147992
   macro avg       0.82      0.83      0.82   4147992
weighted avg       0.84      0.83      0.83   4147992

Macro-F1 Score: 0.820002170344648
