## Model results

### Import libraries

In [None]:
import pandas as pd
import os
from IPython.display import display
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import pygwalker as pyg

### Load results

In [None]:
# Root folder that holds one sub-folder of result files per model
results_path = '/home/juandres/aml/CheXBias/data/processed/'

# Model whose results are analysed in this notebook
model = 'densenet121'

# List every entry stored in the selected model's results folder
# (no extension filter is applied, so non-.csv entries are printed too)
model_results_dir = os.path.join(results_path, model)
for result_name in os.listdir(model_results_dir):
    print('Results :', result_name)

### Ground truth

In [None]:
# Ground-truth table read from the CheXpert training CSV (VisualCheXbert file).
# Later cells read its 'Path' and 'Sex' columns and the class columns named in `classes`.
df_gt = pd.read_csv('/home/juandres/aml/CheXBias/data/raw/CheXpert-v1.0/train_VisualCheXbert.csv')

### Model1

This model was trained with the following parameters:
* lr : 0.00004
* epochs : 4
* ... all default

The training dataset was not split into subgroups based on age or sex. Given this, we want to analyze these results to see whether there is any bias.

In [None]:
# Read the predictions saved for model1 of the selected architecture
model1_csv = os.path.join(results_path, model, 'model1.csv')
df_model1 = pd.read_csv(model1_csv)

# Every column after the first one is treated as a class label
# (presumably the first column is 'Path' -- later cells sort on it)
classes = df_model1.columns[1:]

#### Get metrics per class

In [None]:
# Compute accuracy, precision, recall and F1 for each class over the whole dataset.
# `classes` holds the label column names present in both `df_gt` and `df_model1`.

# The sklearn metrics compare the two label vectors position by position, so
# row i of the ground truth must describe the same image as row i of the
# predictions. Align sorted copies on 'Path' here (the original frames are
# left untouched) and fail loudly if the two files do not list the same images.
gt_sorted = df_gt.sort_values(by='Path').reset_index(drop=True)
model1_sorted = df_model1.sort_values(by='Path').reset_index(drop=True)
assert gt_sorted['Path'].equals(model1_sorted['Path']), \
    'Ground truth and predictions do not cover the same image paths'

metrics = {}

for class_name in classes:
    true_labels = gt_sorted[class_name]
    predicted_labels = model1_sorted[class_name]

    accuracy = accuracy_score(true_labels, predicted_labels)
    # zero_division=0 keeps the value sklearn already returns for an undefined
    # score (0) but without the warning, matching `calculate_metrics` below.
    precision = precision_score(true_labels, predicted_labels, zero_division=0)
    recall = recall_score(true_labels, predicted_labels, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, zero_division=0)

    metrics[class_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1
    }

# One row per class, one column per metric
metrics_per_class_df = pd.DataFrame(metrics).T


Sort the model predictions and the ground truth by path so that their rows line up. The verification step below confirms that the two orders coincide.

In [None]:
# Give both tables the same row order: sort each one by 'Path' and
# renumber its index from 0 so that positions can be compared directly.
df_gt = df_gt.sort_values(by='Path').reset_index(drop=True)
df_model1 = df_model1.sort_values(by='Path').reset_index(drop=True)

Verification step

In [None]:
# Check that ground truth and predictions list the same images in the same order.
# The whole 'Path' columns are compared at once instead of looking rows up one
# by one, which is far quicker on a large table.
gt_paths = df_gt['Path'].to_numpy()
model1_paths = df_model1['Path'].to_numpy()

# Only the overlapping prefix can be compared element-wise when lengths differ
n_common = min(len(gt_paths), len(model1_paths))
yes = int(np.sum(gt_paths[:n_common] == model1_paths[:n_common]))

# Both tables must have the same number of rows AND every path must match;
# extra or missing prediction rows are reported as a failure.
if len(gt_paths) == len(model1_paths) and yes == len(gt_paths):
    print('Same order!')
else:
    print('Something went wrong')

Convert the DataFrames to NumPy matrices so the metrics can be calculated faster.

In [None]:
# Build integer label matrices from the class columns of both tables
# (one row per table row, one column per entry of `classes`)
model1_matrix = df_model1[classes].astype(int).to_numpy()
gt_matrix = df_gt[classes].astype(int).to_numpy()

# Sanity check: both matrices should have identical dimensions
shapes_agree = gt_matrix.shape == model1_matrix.shape
if shapes_agree:
    print('Perfect match')

Get metrics

In [None]:
def calculate_metrics(y_pred, y):
    """Compute accuracy, precision, recall and F1 for binary label vectors.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels; entries equal to 1 count as positive and entries
        equal to 0 as negative.
    y : array-like
        Ground-truth labels, using the same 0/1 encoding as ``y_pred``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``. Precision, recall and F1 are
        returned as 0 when their denominator is zero.
    """
    # Coerce to arrays so plain Python lists are compared element-wise too;
    # without this `list == list` and `list == 1` collapse to a single bool
    # and every metric silently comes out as 0.
    y_pred = np.asarray(y_pred)
    y = np.asarray(y)

    # Fraction of positions where prediction and ground truth agree
    accuracy = np.mean(y_pred == y)

    # Confusion counts needed for precision and recall
    tp = np.sum(np.logical_and(y_pred == 1, y == 1))
    fp = np.sum(np.logical_and(y_pred == 1, y == 0))
    fn = np.sum(np.logical_and(y_pred == 0, y == 1))

    # Guard every ratio against a zero denominator
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return accuracy, precision, recall, f1

Get all metrics per image

In [None]:
# One row per row of `gt_matrix`; the four columns hold, in order:
# accuracy, precision, recall, f1
n_rows = gt_matrix.shape[0]
metrics = np.zeros((n_rows, 4))

for row in tqdm(range(n_rows), desc="Getting metrics", ncols=100):
    # calculate_metrics returns a 4-tuple that fills the whole row at once
    metrics[row, :] = calculate_metrics(model1_matrix[row], gt_matrix[row])


Add the per-image metrics to `df_gt` as new columns

In [None]:
# Column names for the four metric columns, in the same order as the
# columns of the `metrics` array.
# NOTE(review): 'Precission' is misspelled; it is left unchanged here because
# it becomes a DataFrame column name that other code may already rely on.
metrics_names = ['Accuracy','Precission','Recall','F1 Score']

# Attach each column of the metrics array to df_gt under its name
for column_name, column_values in zip(metrics_names, metrics.T):
    df_gt[column_name] = column_values

In [None]:
# Average the per-image metrics within each 'Sex' group so the cell shows a
# comparison table; a bare groupby() would only display the GroupBy object.
df_gt.groupby('Sex')[metrics_names].mean()

In [None]:
# Hand the ground-truth table (now including the per-image metric columns)
# to pygwalker for interactive exploration.
walker = pyg.walk(df_gt)

In [None]:
# Explore the per-class metrics with pygwalker; reset_index() moves the class
# names out of the index into a regular column before the table is passed on.
# Note: this rebinds `walker`, replacing the object created for df_gt.
walker = pyg.walk(metrics_per_class_df.reset_index())