Model Training and Evaluation

In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [3]:
#%%bigquery df_train
#SELECT * FROM `data.mscyberdataset.train_table`

In [9]:
# Load the dataset
df_train = pd.read_csv('C:/Users/MR/Desktop/Aproject4/test_data_processed.csv')

df_train.head()


Unnamed: 0,Id,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,IncidentGrade,EntityType,EvidenceRole,...,RegistryKey,RegistryValueName,ApplicationId,OAuthApplicationId,ResourceIdName,OSFamily,CountryCode,Day,Month,Hour
0,1245540519230,657,11767,87199,524,563,11,0,32,0,...,1631,635,2251,881,3586,5,242,4,6,22
1,1400159342154,3,91158,632273,2,2,1,0,19,0,...,1631,635,2251,881,3586,0,242,3,6,12
2,1279900255923,145,32247,131719,2932,10807,11,0,27,1,...,1631,635,2251,881,3586,5,242,8,6,3
3,60129547292,222,15294,917686,0,0,10,1,7,1,...,1631,635,2251,881,3586,5,242,12,6,12
4,515396080539,363,7615,5944,27,18,5,0,32,0,...,1631,635,2251,881,3586,5,242,6,6,17


In [10]:
#Splitting data
X= df_train.drop('IncidentGrade',axis=1)
y= df_train['IncidentGrade']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
#Selecting top features using anova 
from sklearn.feature_selection import f_classif, SelectKBest

selector = SelectKBest(score_func=f_classif, k=15)  # Adjust k as needed
X_new = selector.fit_transform(X_train, y_train)

selected_features = X_train.columns[selector.get_support()]
print("Selected Features:", selected_features)

Selected Features: Index(['OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle',
       'Category', 'EntityType', 'EvidenceRole', 'Sha256', 'IpAddress',
       'DeviceName', 'NetworkMessageId', 'CountryCode', 'Day', 'Month'],
      dtype='object')


In [12]:
X_new=X[['OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle',
       'Category', 'EntityType', 'EvidenceRole', 'Sha256', 'IpAddress',
       'AccountSid', 'DeviceName', 'NetworkMessageId', 'CountryCode', 'Day']]
X_new.head()

Unnamed: 0,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,EntityType,EvidenceRole,Sha256,IpAddress,AccountSid,DeviceName,NetworkMessageId,CountryCode,Day
0,657,11767,87199,524,563,11,32,0,138268,360606,2610,153085,529644,242,4
1,3,91158,632273,2,2,1,19,0,138268,360606,441377,2833,529644,242,3
2,145,32247,131719,2932,10807,11,27,1,4296,360606,441377,153085,529644,242,8
3,222,15294,917686,0,0,10,7,1,138268,360606,441377,153085,529644,242,12
4,363,7615,5944,27,18,5,32,0,138268,360606,133549,153085,529644,242,6


In [13]:
#Training train data with selected features

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# Initialize the models
model_rf = RandomForestClassifier(random_state=42)
model_xgb = XGBClassifier(random_state=42)

#RANDOM FOREST
print(f"\nEvaluating RandomForest...")

# Fit the model
model_rf.fit(X_train, y_train)

# Predict on test data
y_pred_rf = model_rf.predict(X_test)

# Print accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

#XGBOOST
print(f"\nEvaluating XGBoost...")

# Fit the model 
model_xgb.fit(X_train, y_train)

# Predict on test data
y_pred_xgb = model_xgb.predict(X_test)

# Print accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:")
print(classification_report(y_test, y_pred_xgb))


Evaluating RandomForest...
Accuracy: 0.9860398901610217
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99    350749
           1       0.98      0.98      0.98    181180
           2       0.99      0.99      0.99    297649

    accuracy                           0.99    829578
   macro avg       0.99      0.98      0.98    829578
weighted avg       0.99      0.99      0.99    829578


Evaluating XGBoost...
Accuracy: 0.9232633941594401
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.93    350749
           1       0.94      0.87      0.90    181180
           2       0.95      0.91      0.93    297649

    accuracy                           0.92    829578
   macro avg       0.93      0.91      0.92    829578
weighted avg       0.92      0.92      0.92    829578



In [15]:
import os
import joblib

# Create directory if it doesn't exist
os.makedirs('model', exist_ok=True)

# Save the trained RandomForest model to the file
joblib.dump(model_rf, 'model/random_forest_model.pkl')

print("Model saved successfully!")



Model saved successfully!


In [17]:
import joblib

# Save the trained RandomForest model to a file
joblib.dump(model_rf, 'model/random_forest_model.pkl')

['model/random_forest_model.pkl']

In [18]:
#%%bigquery df_test
#SELECT * FROM `data.mscyberdataset.test_table`


In [21]:
import pandas as pd

# Load the test dataset
df_test = pd.read_csv("C:/Users/MR/Desktop/Aproject4/test_data_processed.csv")

# View the first few rows
print(df_test.head())


              Id  OrgId  IncidentId  AlertId  DetectorId  AlertTitle  \
0  1245540519230    657       11767    87199         524         563   
1  1400159342154      3       91158   632273           2           2   
2  1279900255923    145       32247   131719        2932       10807   
3    60129547292    222       15294   917686           0           0   
4   515396080539    363        7615     5944          27          18   

   Category  IncidentGrade  EntityType  EvidenceRole  ...  RegistryKey  \
0        11              0          32             0  ...         1631   
1         1              0          19             0  ...         1631   
2        11              0          27             1  ...         1631   
3        10              1           7             1  ...         1631   
4         5              0          32             0  ...         1631   

   RegistryValueName  ApplicationId  OAuthApplicationId  ResourceIdName  \
0                635           2251            

In [22]:
X2 = df_test[['OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle',
       'Category', 'EntityType', 'EvidenceRole', 'Sha256', 'IpAddress',
       'AccountSid', 'DeviceName', 'NetworkMessageId', 'CountryCode', 'Day']]
y2= df_test['IncidentGrade']

In [23]:
# Load the model from file
loaded_model_rf = joblib.load('model/random_forest_model.pkl')

In [24]:
y_pred_new = loaded_model_rf.predict(X2)

In [25]:
print("Accuracy:", accuracy_score(y2, y_pred_new))
print("Classification Report:")
print(classification_report(y2, y_pred_new))


Accuracy: 0.997207253426322
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1752895
           1       1.00      1.00      1.00    902664
           2       1.00      1.00      1.00   1492329

    accuracy                           1.00   4147888
   macro avg       1.00      1.00      1.00   4147888
weighted avg       1.00      1.00      1.00   4147888

