In [1]:
import sys
import os

# Make the repository root importable so the `API` package can be resolved.
# The root is taken to be the parent of the current working directory, i.e. the
# notebook is assumed to be launched from <project>/notebooks/.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

# Now, import your module
from API.model.random_forest_classifier import RandomForestClassifier

In [2]:
import numpy as np
import pandas as pd

# Build the paths with os.path.join instead of hard-coding "\" separators, so
# the notebook also runs on Linux/macOS. On Windows the resulting paths are
# identical to the original literals. Paths stay relative to the notebook's
# working directory.
data_dir = os.path.join("..", "API", "data", "processed")

df = pd.read_csv(os.path.join(data_dir, "train_data.csv"))
df_test = pd.read_csv(os.path.join(data_dir, "test_data.csv"))

# Preview the first rows of the training frame.
df.head()

Unnamed: 0,service,flag,src_bytes,dst_bytes,same_srv_rate,diff_srv_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_serror_rate,class
0,-0.645371,0.753006,-0.009889,-0.039309,0.772093,-0.349275,-0.813968,-0.779141,-0.280668,-0.641791,1
1,0.76891,0.753006,-0.010032,-0.039309,-1.32054,0.490826,-1.030875,-1.157808,2.764349,-0.641791,1
2,1.090338,-0.739909,-0.010092,-0.039309,-1.388778,0.042772,-0.804931,-0.935063,-0.173825,1.603803,0
3,-0.452515,0.753006,-0.009996,0.052472,0.772093,-0.349275,1.264717,1.069642,-0.440932,-0.574424,1
4,-0.452515,0.753006,-0.01001,-0.034581,0.772093,-0.349275,1.264717,1.069642,-0.440932,-0.641791,1


In [3]:
# (rows, columns) of the training frame; the column count includes the `class` label.
df.shape

(25192, 11)

In [4]:
from sklearn.model_selection import train_test_split

# Split the training frame into a fitting part and a 20% hold-out part.
# random_state is fixed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=["class"]),  # features: every column except the label
    df["class"],                 # label column
    test_size=0.2,
    random_state=42,
)

In [5]:
# Baseline forest. Note this is the project's own RandomForestClassifier
# (imported from API.model.random_forest_classifier), not scikit-learn's.
classifier = RandomForestClassifier(
    n_learners=250,
    max_depth=10,
    min_samples_leaf=5,
    min_information_gain=0.01,
)

In [6]:
# Fit the baseline forest; the pandas objects are converted to NumPy arrays
# before being handed to the project classifier.
classifier.train(np.array(x_train),np.array(y_train))

In [7]:
# Predictions of the baseline forest on the hold-out split; reused by the
# metrics cell that follows.
y_pred=classifier.predict(np.array(x_test))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Score the fitted forest on the training split (fresh predictions) and on the
# hold-out split (using the `y_pred` computed earlier).
train_acc = accuracy_score(y_train, classifier.predict(np.array(x_train)))
test_acc = accuracy_score(y_test, y_pred)

# Precision/recall/F1 are averaged across classes, weighted by class support.
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
# Store the value as `f1`, not `f1_score`: reusing the name would rebind the
# imported sklearn function to a float, and any later f1_score(...) call would
# fail until the import is re-run.
f1 = f1_score(y_test, y_pred, average="weighted")
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Training accuracy: {train_acc} \n Testing accuracy: {test_acc} \n Precision: {precision} \n Recall: {recall} \n F1 Score: {f1} \n \n Confusion Matrix: \n {conf_matrix} \n \n')
print(classification_report(y_test, y_pred))

Training accuracy: 0.9964273309184737 
 Testing accuracy: 0.9946417940067473 
 Precision: 0.9946428905100789 
 Recall: 0.9946417940067473 
 F1 Score: 0.9946419884432824 
 Confusion Matrix: 
 [[2353   12]
 [  15 2659]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2365
           1       1.00      0.99      0.99      2674

    accuracy                           0.99      5039
   macro avg       0.99      0.99      0.99      5039
weighted avg       0.99      0.99      0.99      5039



In [11]:
from itertools import product

# Exhaustive grid search over the project RandomForestClassifier's settings.
# Every combination is trained on the training split and scored by accuracy on
# the hold-out split; the best-scoring combination is kept in `best_param`.
# NOTE(review): the hold-out split is used for model selection here, so
# `best_score` is an optimistic estimate - presumably `df_test` is meant for
# the final, unbiased evaluation; confirm.
param_grid = {
    'n_learners': [100, 250, 500],
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [1, 5, 10],
    'min_information_gain': [0.01, 0.05, 0.1]
}

best_score = 0
best_param = None

# product() walks the grid in the same order as nested loops over the dict's
# keys (the right-most key varies fastest). Dicts preserve insertion order, so
# each tuple lines up with the keys of `param_grid`.
for combo in product(*param_grid.values()):
    params = dict(zip(param_grid, combo))

    # Loop-local names on purpose: the notebook-level `classifier`, `y_pred`
    # and `test_acc` from the baseline cells must not be overwritten by the
    # last grid candidate.
    candidate = RandomForestClassifier(**params)
    candidate.train(np.array(x_train), np.array(y_train))
    candidate_pred = candidate.predict(np.array(x_test))
    candidate_acc = accuracy_score(y_test, candidate_pred)
    print(f'Param: {combo} \nScore: {candidate_acc} \n')

    # Strict '>' keeps the first combination encountered when scores tie.
    if candidate_acc > best_score:
        best_score = candidate_acc
        best_param = params

print(f'Best score: {best_score} \n Best param: {best_param}')

Param: (100, 5, 1, 0.01) 
 Score: 0.9704306410001985 

Param: (100, 5, 1, 0.05) 
 Score: 0.9243897598729907 

Param: (100, 5, 1, 0.1) 
 Score: 0.9069259773764636 

Param: (100, 5, 5, 0.01) 
 Score: 0.9708275451478467 

Param: (100, 5, 5, 0.05) 
 Score: 0.9239928557253423 

Param: (100, 5, 5, 0.1) 
 Score: 0.9069259773764636 

Param: (100, 5, 10, 0.01) 
 Score: 0.9708275451478467 

Param: (100, 5, 10, 0.05) 
 Score: 0.9243897598729907 

Param: (100, 5, 10, 0.1) 
 Score: 0.9069259773764636 

Param: (100, 10, 1, 0.01) 
 Score: 0.9958325064496923 

Param: (100, 10, 1, 0.05) 
 Score: 0.9277634451280016 

Param: (100, 10, 1, 0.1) 
 Score: 0.9069259773764636 

Param: (100, 10, 5, 0.01) 
 Score: 0.9952371502282199 

Param: (100, 10, 5, 0.05) 
 Score: 0.9277634451280016 

Param: (100, 10, 5, 0.1) 
 Score: 0.9069259773764636 

Param: (100, 10, 10, 0.01) 
 Score: 0.9948402460805715 

Param: (100, 10, 10, 0.05) 
 Score: 0.9275649930541774 

Param: (100, 10, 10, 0.1) 
 Score: 0.9069259773764636 

P

In [12]:
import json

# Save best hyperparameters
# The path is assembled with os.path.join rather than a literal with "\"
# separators, so the file lands in the intended directory on every OS (on
# Windows the path is identical to the original literal).
hparam_path = os.path.join("..", "API", "model", "Hyperparams", "Rf_hparam.json")
# Create the target directory if it is missing instead of failing on open().
os.makedirs(os.path.dirname(hparam_path), exist_ok=True)
with open(hparam_path, "w") as f:
    json.dump(best_param, f)