# This notebook compares the trained Random Forest models to choose the best one

## Import the necessary libraries

In [12]:
import pandas as pd
from src.models.RandomForestClassifier import RFClassifier
import os
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import time

## Load the necessary files

### Load the test data and fit the label encoders

In [15]:
# Test split of the dataset variant that still contains outliers:
# "class" is the target column, every other column is a feature.
test_with = pd.read_csv('../data/processed/with_outliers/test.csv')
y_test_with = test_with["class"]
X_test_with = test_with.drop("class", axis=1)

# Test split of the dataset variant with outliers removed.
test_without = pd.read_csv('../data/processed/without_outliers/test.csv')
y_test_without = test_without["class"]
X_test_without = test_without.drop("class", axis=1)

# Only the training labels are needed here (to fit the encoders), so read
# just the "class" column instead of the whole training frame.
y_train_with = pd.read_csv('../data/processed/with_outliers/train.csv', usecols=["class"])["class"]
y_train_without = pd.read_csv('../data/processed/without_outliers/train.csv', usecols=["class"])["class"]

# Encode labels. Each variant gets its own encoder, fitted on that variant's
# training labels and applied to the matching test labels.
# NOTE(review): presumably this mirrors the encoding used when the models were
# trained -- confirm against the training notebook.
label_encoder_with = LabelEncoder().fit(y_train_with)
y_test_with = label_encoder_with.transform(y_test_with)
label_encoder_without = LabelEncoder().fit(y_train_without)
y_test_without = label_encoder_without.transform(y_test_without)


### Load the models

In [9]:
models_dir = '../models/RF/'

# Maps each saved model's filename to the estimator held by its wrapper.
models = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `models` (and therefore of the reports printed later) reproducible.
for filename in sorted(os.listdir(models_dir)):
    clf = RFClassifier()
    # NOTE(review): presumably restores the model stored at models_dir/filename
    # into the wrapper -- confirm against RFClassifier.load.
    clf.load(new_name=filename, path=models_dir)
    models[filename] = clf.model

## Evaluate the models

### Classification Report

In [17]:
# Evaluate every loaded model on the test split that matches its filename and
# print its average per-sample inference time and classification report.
for name, model in models.items():
    print(name)
    # The filename says which dataset variant the model belongs to; pick the
    # matching test features, encoded test labels and label encoder together
    # so they can never get out of sync.
    if "_without_" in name:
        X_eval, y_eval, encoder = X_test_without, y_test_without, label_encoder_without
    else:
        X_eval, y_eval, encoder = X_test_with, y_test_with, label_encoder_with

    # perf_counter() is the monotonic, high-resolution clock intended for
    # measuring short durations.
    start_time = time.perf_counter()
    y_pred = model.predict(X_eval)
    end_time = time.perf_counter()

    # Average wall-clock time per predicted sample.
    inference_time = (end_time - start_time) / len(y_pred)
    print("Inference time: ", inference_time)

    # Map the encoded labels back to the original class names so the report
    # rows are readable.
    print(classification_report(encoder.inverse_transform(y_eval), encoder.inverse_transform(y_pred)))

RandomForest_with_outliers.joblib
Inference time:  4.04603362083435e-05
              precision    recall  f1-score   support

      GALAXY       0.98      0.99      0.98     11889
         QSO       0.96      0.93      0.94      3792
        STAR       1.00      1.00      1.00      4319

    accuracy                           0.98     20000
   macro avg       0.98      0.97      0.97     20000
weighted avg       0.98      0.98      0.98     20000

RandomForest_without_outliers_oversampled.joblib
Inference time:  4.765960077444712e-05
              precision    recall  f1-score   support

      GALAXY       0.98      0.98      0.98     11605
         QSO       0.95      0.94      0.94      3719
        STAR       0.99      1.00      1.00      3876

    accuracy                           0.98     19200
   macro avg       0.97      0.97      0.97     19200
weighted avg       0.98      0.98      0.98     19200

RandomForest_without_outliers.joblib
Inference time:  5.301563690106074e-05
  