# This notebook compares the trained variants of one model category and helps choose the best one.

## Import the necessary libraries

In [3]:
import pandas as pd
from src.models.RandomForestClassifier import RFClassifier
from src.models.SVMClassifier import SVMClassifier
from src.models.NaiveBayesClassifier import NaiveBayesClassifier
from src.models.KNNClassifier import KNNClassifier
from src.models.LogisticRegClassifier import LogisticRegClassifier
from src.models.NNClassifier import NNClassifier

import os
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import time

# Model Comparison Notebook

This Jupyter Notebook documents the workflow for evaluating the trained variants of the selected model category (chosen with `model_choice`) on the test sets.

## Data Processing

In the following cells, we import the necessary libraries and load the training and testing datasets as well as the trained models. There is one variable that can be changed depending on the model category we want to compare:
* `model_choice` : RF | SVM | KNN | LR | NB | NN |


NOTE: Make sure you have already built the datasets with the commands:
- `make data`
- `make features`

NOTE: Make sure you have already trained the models, or download them with the command:
- `make download_models`

We also perform data preprocessing steps such as encoding labels. Other preprocessing steps, such as removing correlations with PCA and scaling the data, are done within the pipeline of the model.

In [4]:
# Load the held-out test splits and encode their labels.
# Two variants of the processed dataset are read from disk: one under
# `with_outliers/` and one under `without_outliers/`. Each variant gets its
# own feature frame, label vector and LabelEncoder.

# Test split of the "with outliers" variant: target column "class" vs. the remaining columns.
test_with = pd.read_csv('../../data/processed/with_outliers/test.csv')
y_test_with = test_with["class"]
X_test_with = test_with.drop("class", axis=1)

# Test split of the "without outliers" variant.
test_without = pd.read_csv('../../data/processed/without_outliers/test.csv')
y_test_without = test_without["class"]
X_test_without = test_without.drop("class", axis=1)

# Only the training labels are needed here (to fit the encoders), so parse just
# the "class" column instead of the whole training CSV.
y_train_with = pd.read_csv('../../data/processed/with_outliers/train.csv', usecols=["class"])["class"]
y_train_without = pd.read_csv('../../data/processed/without_outliers/train.csv', usecols=["class"])["class"]

# Fit each encoder on the training labels, then apply it to the matching test labels.
# NOTE: from here on `y_test_with` / `y_test_without` hold the encoded arrays
# returned by `transform`, not the raw label Series read above.
label_encoder_with = LabelEncoder().fit(y_train_with)
y_test_with = label_encoder_with.transform(y_test_with)
label_encoder_without = LabelEncoder().fit(y_train_without)
y_test_without = label_encoder_without.transform(y_test_without)

In [7]:
# Set the type of model to use from this list: ['RF', 'SVM', 'KNN', 'LR', 'NB', 'NN']
model_choice = 'NB'

# Directory holding every saved variant of the chosen model category.
models_dir = '../../models/'+model_choice+'/'

# Maps each model-category code to its project wrapper class instance.
choice_clf = {'RF': RFClassifier(),
            'SVM': SVMClassifier(),
            'KNN': KNNClassifier(),
            'LR': LogisticRegClassifier(),
            'NB': NaiveBayesClassifier(),
            'NN': NNClassifier()}

# Fail early with a readable message instead of a KeyError inside the loop.
if model_choice not in choice_clf:
    raise ValueError(
        "Unknown model_choice %r; expected one of %s" % (model_choice, sorted(choice_clf))
    )

# Saved models are expected to carry this extension; anything else in the
# directory (e.g. .gitkeep, .DS_Store) is skipped rather than mis-parsed.
MODEL_EXT = '.joblib'

# Keyed by file name (extension included) -> loaded model object.
models = {}
# Sorted so the evaluation order (and printed output) is reproducible across
# machines; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(models_dir)):
    stem, ext = os.path.splitext(filename)
    if ext != MODEL_EXT:
        continue
    # Use a fresh wrapper per file so that entries in `models` can never alias
    # state of a shared wrapper instance.
    # NOTE(review): assumes the wrappers are constructible without arguments,
    # as they are in `choice_clf` above.
    clf = type(choice_clf[model_choice])()
    clf.load(new_name=stem, path=models_dir)
    models[filename] = clf.model

## Evaluate the models

### Classification Report

In [8]:
# Evaluate every loaded model on the test split matching the dataset variant
# it is named after, printing per-sample inference time and a classification report.
for name, model in models.items():
    print(name)
    # The file name tells which dataset variant applies: names containing
    # "_without_" are scored on the outlier-free test split, all others on
    # the split that keeps the outliers.
    if "_without_" in name:
        X_eval, y_eval, encoder = X_test_without, y_test_without, label_encoder_without
    else:
        X_eval, y_eval, encoder = X_test_with, y_test_with, label_encoder_with

    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # short durations; time.time() is wall-clock and can jump.
    start_time = time.perf_counter()
    y_pred = model.predict(X_eval)
    end_time = time.perf_counter()

    # Average prediction time per test sample, in seconds.
    inference_time = (end_time - start_time)/len(y_pred)
    print("Inference time: ", inference_time)

    # Map encoded labels back to the original class names so the report is readable.
    print(classification_report(encoder.inverse_transform(y_eval), encoder.inverse_transform(y_pred)))

NaiveBayes_with_outliers_oversampled.joblib
Inference time:  1.393890380859375e-06
              precision    recall  f1-score   support

      GALAXY       0.97      0.93      0.95     11889
         QSO       0.82      0.91      0.86      3792
        STAR       0.98      0.99      0.98      4319

    accuracy                           0.94     20000
   macro avg       0.92      0.94      0.93     20000
weighted avg       0.94      0.94      0.94     20000

NaiveBayes_without_outliers.joblib
Inference time:  2.9258430004119875e-07
              precision    recall  f1-score   support

      GALAXY       0.95      0.96      0.95     11605
         QSO       0.89      0.84      0.86      3719
        STAR       0.98      0.99      0.99      3876

    accuracy                           0.94     19200
   macro avg       0.94      0.93      0.93     19200
weighted avg       0.94      0.94      0.94     19200

NaiveBayes_without_outliers_oversampled.joblib
Inference time:  2.94260680675506