# Notebook overview
Computes evaluation metrics for model predictions (MLP or k-NN) on test datasets and saves score reports and confusion matrices.

- Loads prediction CSVs for high/low ID and OOD test splits
- Computes balanced accuracy, precision, F1, classification reports, and confusion matrices
- Saves score reports and confusion matrices to the results folder

The notebook was used for both datasets: adapt the variables in the section "Path - df_dir_path, ..." to select the dataset ('origin' or 'resized') and the model ('mlp' or 'knn').

# Preparation

### Import

In [61]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn import metrics

from typing import List

### Path - df_dir_path, ...

In [None]:
### Select which dataset and which model the prediction/score folders belong to.
# For knn the prediction CSVs carry a k suffix in their file name
# (resized: "..._prediction_k_13.csv", origin: "..._prediction_k_10.csv"),
# so the file names in the "Load df" cell have to be adjusted as well.
use_dataset = 'resized'
# use_dataset = 'origin'
# use_model = 'knn'
use_model = 'mlp'


# Folder the prediction CSVs are loaded from
DF_DIR_PATH = rf'/home/stud/jleick/masterArbeitProjekt/final_release/models/{use_model}/{use_dataset}/prediction'
df_dir_path = Path(DF_DIR_PATH)
# is_dir() also rejects a regular file that happens to sit at the folder path
if not df_dir_path.is_dir():
    raise FileNotFoundError(f"Folder does not exist: {DF_DIR_PATH}")

# Folder the score reports and confusion matrices are written to
RESULT_DIR_PATH = rf'/home/stud/jleick/masterArbeitProjekt/final_release/models/{use_model}/{use_dataset}/scores'
result_dir_path = Path(RESULT_DIR_PATH)
# The results folder is not created here on purpose; a missing folder aborts the run early
if not result_dir_path.is_dir():
    raise FileNotFoundError(f"Folder does not exist: {RESULT_DIR_PATH}")

### Load df - high_id_test_prediction_df, ...

In [63]:
# File names depend on the model: for knn the files carry a k suffix
# (origin: "..._prediction_k_10.csv", resized: "..._prediction_k_13.csv"),
# for mlp they have no suffix (e.g. "low_id_test_prediction.csv").

def _read_prediction_csv(file_name: str) -> pd.DataFrame:
    """Read one prediction CSV from df_dir_path without using any column as the index."""
    return pd.read_csv(df_dir_path / file_name, index_col=False)


# In-distribution (ID) test splits
high_id_test_prediction_df = _read_prediction_csv('high_id_test_prediction.csv')
low_id_test_prediction_df = _read_prediction_csv('low_id_test_prediction.csv')

# Out-of-distribution (OOD) test splits
high_ood_test_prediction_df = _read_prediction_csv('high_ood_test_prediction.csv')
low_ood_test_prediction_df = _read_prediction_csv('low_ood_test_prediction.csv')

# Scores

### Calculate - balanced_accuracy_score

In [64]:
# Check Done
# https://scikit-learn.org/stable/modules/model_evaluation.html#balanced-accuracy-score # macro-average

# Balanced accuracy of the high ID test split ('label' = ground truth, 'prediction' = model output)
high_id_test_balanced_accuracy_score = metrics.balanced_accuracy_score(high_id_test_prediction_df['label'], high_id_test_prediction_df['prediction'])
print(f'high_id_test balanced accuracy score: {high_id_test_balanced_accuracy_score}')

# Balanced accuracy of the low ID test split; the printed label now names the correct split
low_id_test_balanced_accuracy_score = metrics.balanced_accuracy_score(low_id_test_prediction_df['label'], low_id_test_prediction_df['prediction'])
print(f'low_id_test balanced accuracy score: {low_id_test_balanced_accuracy_score}')

high_id_test balanced accuracy score: 0.9232018313316476
high_id_test balanced accuracy score: 0.4378806316005604


### Calculate - precision_score

In [65]:
# Check
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html

# Macro-averaged precision of the high ID test split
high_id_test_precision_score = metrics.precision_score(high_id_test_prediction_df['label'], high_id_test_prediction_df['prediction'], average='macro')
print(f'high_id_test precision score: {high_id_test_precision_score}')

# Macro-averaged precision of the low ID test split; the printed label now names the correct split
low_id_test_precision_score = metrics.precision_score(low_id_test_prediction_df['label'], low_id_test_prediction_df['prediction'], average='macro')
print(f'low_id_test precision score: {low_id_test_precision_score}')

high_id_test precision score: 0.9247269707618303
high_id_test precision score: 0.4551161644280467


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Calculate - f1_score

In [66]:
# Check
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

# Macro-averaged F1 of the high ID test split
high_id_test_f1_score = metrics.f1_score(high_id_test_prediction_df['label'], high_id_test_prediction_df['prediction'], average='macro')
print(f'high_id_test f1 score: {high_id_test_f1_score}')

# Macro-averaged F1 of the low ID test split; the printed label now names the correct split
low_id_test_f1_score = metrics.f1_score(low_id_test_prediction_df['label'], low_id_test_prediction_df['prediction'], average='macro')
print(f'low_id_test f1 score: {low_id_test_f1_score}')

high_id_test f1 score: 0.9231443454888021
high_id_test f1 score: 0.39195639356735373


### Save scores

In [67]:
# scores_df = pd.DataFrame({
#     "balanced_accuracy": [high_id_test_balanced_accuracy_score, low_id_test_balanced_accuracy_score],
#     "precision": [high_id_test_precision_score, low_id_test_precision_score],
#     "f1": [high_id_test_f1_score, low_id_test_f1_score]
# }, index=["high_id_test", "low_id_test"])

# scores_df.to_csv( result_dir_path / 'scores.csv')

# Function

### Function - calculate_scores

In [None]:
def _build_score_report(labels, predictions) -> pd.DataFrame:
    """Return the classification report of one split as a DataFrame, extended by the balanced accuracy."""
    report_df = pd.DataFrame(metrics.classification_report(labels, predictions, output_dict=True))
    # classification_report does not contain the balanced accuracy, so it is stored in an
    # extra row 'balanced_accuracy' under the 'macro avg' column; the other cells of that row stay NaN.
    report_df.loc['balanced_accuracy', 'macro avg'] = metrics.balanced_accuracy_score(labels, predictions)
    return report_df


def calculate_scores( labels_high: List[int], prediction_high: List[int], labels_low: List[int], prediction_low: List[int], result_dir_path):
    """Compute the score reports of the high and low ID test split and save them as CSV.

    For each split the sklearn classification report is combined with the
    balanced accuracy and written to ``result_dir_path`` as
    ``high_id_test_score_report.csv`` and ``low_id_test_score_report.csv``.

    Args:
        labels_high: Ground-truth labels of the high split (passed as y_true).
        prediction_high: Predicted labels of the high split (passed as y_pred).
        labels_low: Ground-truth labels of the low split (passed as y_true).
        prediction_low: Predicted labels of the low split (passed as y_pred).
        result_dir_path: Existing folder the two CSV files are written to;
            a ``pathlib.Path`` or a plain path string.

    Returns:
        None. The results are only written to disk.
    """
    # Accept plain strings as well as Path objects for the output folder
    result_dir_path = Path(result_dir_path)

    # Build both reports first so that nothing is written if one of them fails
    high_id_test_score_report_df = _build_score_report(labels_high, prediction_high)
    low_id_test_score_report_df = _build_score_report(labels_low, prediction_low)

    # save scores
    high_id_test_score_report_df.to_csv( result_dir_path / 'high_id_test_score_report.csv')
    low_id_test_score_report_df.to_csv( result_dir_path / 'low_id_test_score_report.csv')

# calculate_scores

In [69]:
# Write the score reports of both ID test splits into the results folder
calculate_scores(
    labels_high=high_id_test_prediction_df['label'],
    prediction_high=high_id_test_prediction_df['prediction'],
    labels_low=low_id_test_prediction_df['label'],
    prediction_low=low_id_test_prediction_df['prediction'],
    result_dir_path=result_dir_path,
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Calculate - confusion_matrix

In [70]:
# Confusion matrix per ID test split ('label' = ground truth, 'prediction' = model output).
# NOTE(review): no explicit `labels=` is passed, so each matrix only covers the classes that
# occur in its own split - confirm both splits contain the same classes before comparing them.
high_id_test_confusion_matrix = metrics.confusion_matrix(
    y_true=high_id_test_prediction_df['label'],
    y_pred=high_id_test_prediction_df['prediction'],
)
low_id_test_confusion_matrix = metrics.confusion_matrix(
    y_true=low_id_test_prediction_df['label'],
    y_pred=low_id_test_prediction_df['prediction'],
)

# Persist both matrices as .npy files in the results folder
high_matrix_path = result_dir_path / "high_id_test_confusion_matrix.npy"
low_matrix_path = result_dir_path / "low_id_test_confusion_matrix.npy"
np.save(high_matrix_path, high_id_test_confusion_matrix)
np.save(low_matrix_path, low_id_test_confusion_matrix)

# Review Result

In [71]:
# One 2x2 matrix per class for the low ID test split, laid out as [[TN, FP], [FN, TP]]
mlc = metrics.multilabel_confusion_matrix(low_id_test_prediction_df['label'], low_id_test_prediction_df['prediction'])

# Classes picked for manual inspection.
# NOTE(review): the index is the position in the sorted set of labels - presumably
# identical to the class id if the labels are 0..N-1 without gaps; verify.
for class_index in (12, 18, 35, 38, 42, 71, 74, 90):
    print(mlc[class_index])

[[2124    7]
 [   7    0]]
[[2118   13]
 [   6    1]]
[[2114    7]
 [  17    0]]
[[2121    0]
 [  17    0]]
[[2130    2]
 [   6    0]]
[[2131    1]
 [   6    0]]
[[2123    0]
 [  15    0]]
[[2132    0]
 [   6    0]]


In [72]:
# Print the text classification report of the high split first, then of the low split.
# Possible option to consider for classes without predictions: zero_division=np.nan
for split_prediction_df in (high_id_test_prediction_df, low_id_test_prediction_df):
    print( metrics.classification_report(split_prediction_df['label'], split_prediction_df['prediction']) )

              precision    recall  f1-score   support

           0       0.91      0.89      0.90       199
           1       0.97      0.90      0.93       200
           2       0.96      0.98      0.97       167
           3       0.93      0.96      0.94       138
           4       0.94      0.91      0.93       164
           5       0.92      0.91      0.91       200
           6       0.98      0.97      0.98       200
           7       0.91      0.93      0.92       200
           8       0.98      0.95      0.96       200
           9       0.93      0.92      0.92       200
          10       0.90      0.92      0.91       200
          11       0.95      0.80      0.87       200
          12       0.96      0.88      0.92       199
          13       0.87      0.93      0.90       200
          14       0.90      0.96      0.93       200
          15       0.93      0.96      0.95       200
          16       0.94      0.94      0.94       200
          17       0.89    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
