### 7. Evaluation of the binary classification task

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the validation results. The files are tab-separated despite the
# .csv extension, hence sep='\t'.
# Forward slashes are used because a backslash-separated relative path only
# resolves on Windows; this form works on Windows, Linux and macOS alike.
val_bin_results = pd.read_csv('./Results/acsess_bin_results_val.csv', sep='\t')
# Load the test results (same layout as the validation file).
test_bin_results = pd.read_csv('./Results/acsess_bin_results_test.csv', sep='\t')


### Change according to number of examples used during training
k = 0  # zero-shot
# k = 1  # one-shot
# k = 5  # few-shot

In [None]:
# Remove the literal substring 'binary ' (plain text match, not a regex) from
# every column name of both result frames, in place.
# NOTE(review): presumably this turns headers such as 'binary k=0' into 'k=0'
# so they match the column lookups used later -- confirm against the CSV files.
for _frame in (val_bin_results, test_bin_results):
    _frame.columns = _frame.columns.str.replace('binary ', '', regex=False)
# Unused alternative kept for reference (aggregates predictions per note):
#grouped = bin_results.groupby('note_nr').agg({'k=0': 'max', 'k=1': 'max', 'k=5': 'max'}).reset_index()

In [None]:
def calculate_metrics(df, shots=None,
                      folds=('fold1', 'fold2', 'fold3', 'fold4', 'fold5')):
    """Compute per-fold precision, recall and F1 for each ``k=<n>`` column.

    Args:
        df: Results frame with a ``fold`` column, a ``relevance_manual``
            ground-truth column and one ``k=<n>`` prediction column for every
            evaluated shot count.
        shots: Shot count to evaluate, either a single int or an iterable of
            ints. When omitted, the module-level ``k`` setting is used.
        folds: Fold labels (values of the ``fold`` column) to evaluate, in
            order.

    Returns:
        A dict mapping ``'k=<n>'`` to ``{'precision': [...], 'recall': [...],
        'f1_score': [...]}``, each list holding one score per fold.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.
    """
    if shots is None:
        # Fall back to the notebook-level setting. The loop variable below is
        # deliberately NOT called `k`: assigning to `k` inside the function
        # would make it local and turn this read into an UnboundLocalError.
        shots = k
    if isinstance(shots, (int, np.integer)):
        shots = (shots,)
    shots = list(shots)

    # One list of per-fold scores per metric and per shot count.
    results = {
        f'k={shot}': {'precision': [], 'recall': [], 'f1_score': []}
        for shot in shots
    }

    for fold in folds:
        # Filter once per fold; the selection does not depend on the shot count.
        fold_data = df[df['fold'] == fold]

        for shot in shots:
            # Rows without a prediction are ignored, and the ground truth is
            # aligned to the remaining rows by index label.
            y_pred = fold_data[f'k={shot}'].dropna()
            y_true = fold_data['relevance_manual'].loc[y_pred.index]

            scores = results[f'k={shot}']
            # zero_division=0 reports 0 instead of warning when a fold has no
            # positive predictions or no positive labels.
            scores['precision'].append(
                precision_score(y_true, y_pred, zero_division=0))
            scores['recall'].append(
                recall_score(y_true, y_pred, zero_division=0))
            scores['f1_score'].append(
                f1_score(y_true, y_pred, zero_division=0))

    return results

def generate_results_table(df):
    """Summarise the per-fold metrics of ``df`` as one table row per setting.

    Calls ``calculate_metrics(df)`` and, for every ``'k=<n>'`` entry it
    returns, reports the mean and standard deviation across folds of
    precision, recall and F1. The standard deviation is ``np.std`` with its
    default settings (population standard deviation, ``ddof=0``).

    Args:
        df: Results frame accepted by ``calculate_metrics``.

    Returns:
        A ``pandas.DataFrame`` with columns ``k``, ``Precision (Avg)``,
        ``Precision (Std)``, ``Recall (Avg)``, ``Recall (Std)``,
        ``F1-score (Avg)`` and ``F1-score (Std)``.
    """
    results = calculate_metrics(df)

    # (column prefix in the output table, key in the results dict)
    metrics = (
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1-score', 'f1_score'),
    )
    columns = ['k']
    for title, _ in metrics:
        columns += [f'{title} (Avg)', f'{title} (Std)']

    # Iterate over the settings actually present in `results` rather than over
    # range(k): a local loop variable named `k` shadows the module-level
    # setting and raises UnboundLocalError before the loop can start.
    rows = []
    for label, scores in results.items():
        row = {'k': label}
        for title, key in metrics:
            values = scores[key]
            # No folds scored -> NaN, without numpy's empty-slice warning.
            row[f'{title} (Avg)'] = np.mean(values) if values else np.nan
            row[f'{title} (Std)'] = np.std(values) if values else np.nan
        rows.append(row)

    # Passing `columns` keeps the column order fixed and the header present
    # even when there are no rows.
    return pd.DataFrame(rows, columns=columns)


In [None]:
# Validation results: print a header, build the summary table from the
# validation frame, then print the table without the DataFrame index.
print("ACSESS validation results")
results_table_val = generate_results_table(val_bin_results)
print(results_table_val.to_string(index=False))

In [None]:
# Test results: print a header, build the summary table from the test frame,
# then print the table without the DataFrame index.
print("ACSESS test results")
results_table_test = generate_results_table(test_bin_results)
print(results_table_test.to_string(index=False))