# IMPORTS

## PACKAGES

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import classification_report, f1_score

## LABELS

In [None]:
# Anomaly category names; this list is passed as `target_names` to
# classification_report and used as the x-axis of the distribution plot,
# so its order must match the label columns.
ANOMALY_LABELS = [
    'Deviation / Discrepancy - Procedural',
    'Aircraft Equipment',
    'Conflict',
    'Inflight Event / Encounter',
    'ATC Issue',
    'Deviation - Altitude',
    'Deviation - Track / Heading',
    'Ground Event / Encounter',
    'Flight Deck / Cabin / Aircraft Event',
    'Ground Incursion',
    'Airspace Violation',
    'Deviation - Speed',
    'Ground Excursion',
    'No Specific Anomaly Occurred',
]

## DATA

In [None]:
# Load the inputs for the analysis below.
# NOTE: joblib.load unpickles its input; only load files from a trusted source.
# Scores of the other models; read later through the 'Model Name', 'Category' and 'f1-score' columns.
results = pd.read_csv('../out/results-sub.csv')
# Test subset; its `Anomaly` column is converted into the ground-truth label array.
subset = pd.read_parquet('../out/subset_test_data.parquet')
# Few-shot predictions.
fs_pred = joblib.load('../out/ypred_fs0.joblib')
# Zero-shot predictions (the `_abv` file). '../out/ypred_p5_zs.joblib' is the alternative file.
# NOTE(review): the previous comment said the alternative file holds the results "with abbreviations",
# but the `_abv` suffix and the "(ABV)" plot title suggest it is this file that does - confirm.
zs_pred = joblib.load('../out/ypred_p5_zs_abv.joblib')

# DATA PREPARATION

In [None]:
# Ground-truth label matrix built from the `Anomaly` column of the subset.
ytrue = np.array(subset.Anomaly.values.tolist())
# Build the zero-shot report once and read both its keys and values from it
# (previously the identical report was computed twice).
zs_report = classification_report(ytrue, zs_pred, target_names=ANOMALY_LABELS, output_dict=True)
l_values = list(zs_report.values())
l_keys = list(zs_report.keys())

In [None]:
# Collect the F1 score of every report entry (per-category rows and averages).
f1score = [report_entry['f1-score'] for report_entry in l_values]
# One row per report entry, all attributed to the LLAMA model.
llama_results = pd.DataFrame(
    {
        'Model Name': ['LLAMA-2-7B' for _ in l_values],
        'Category': l_keys,
        'f1-score': f1score,
    }
)

In [None]:
# Keep only the columns shared with the LLAMA frame, then stack the LLAMA rows underneath.
baseline_scores = results[llama_results.columns]
results_full = pd.concat([baseline_scores, llama_results])

In [None]:
# Display name keyed by the lower-cased first four characters of a raw model name.
models = dict(
    bert='BERT',
    nasa='SafeAeroBERT',
    alle='Longformer',
    aero='AeroBOT',
    llam='LLAMA-2-7B',
)
# Configuration labels that later replace the raw 'Model Name' values.
short_name = [
    'BertBase_Unfrz_BCE',
    'SafeAeroBERT_Unfrz_BCE',
    'Longformer_Unfrz_BCE',
    'AeroBOT_BertBase_Unfrz_BCE',
    'LLAMA-2-7B',
]
# Report rows left out of the per-category violin plots.
cat_exclude = [
    'No Specific Anomaly Occurred',
    'micro avg',
    'macro avg',
    'weighted avg',
    'samples avg',
]

In [None]:
# Current seaborn palette (the duplicated assignment was removed).
color_palette = sns.color_palette()
# Plot order by configuration label, and the same order by display name.
model_order = ['SafeAeroBERT_Unfrz_BCE', 'Longformer_Unfrz_BCE', 'AeroBOT_BertBase_Unfrz_BCE', 'BertBase_Unfrz_BCE', 'LLAMA-2-7B']
model_order_bis = ['SafeAeroBERT', 'Longformer', 'AeroBOT', 'BERT', 'LLAMA-2-7B']
# Entries at the same position in both orders get the same colour; the modulo
# wraps around if there are more models than palette colours.
model_palette = {model: color_palette[index % len(color_palette)] for index, model in enumerate(model_order)}
model_palette_bis = {model: color_palette[index % len(color_palette)] for index, model in enumerate(model_order_bis)}

In [None]:
# Display name looked up from the lower-cased first four characters of the raw model name.
results_full['Model'] = [models[raw_name[:4].lower()] for raw_name in results_full['Model Name']]
# Pair each distinct raw name (in order of first appearance) with the `short_name` entry at the same position.
# NOTE(review): this relies on that order matching `short_name` - confirm against the CSV.
# NOTE: `map` shadows the builtin; the name is kept because the following cell reads it.
map = {
    raw_name: label
    for raw_name, label in zip(results_full['Model Name'].unique().tolist(), short_name)
}

In [None]:
# Swap every raw model name for its configuration label (KeyError if a name has no label).
results_full['Model Name'] = [map[raw_name] for raw_name in results_full['Model Name']]
# Order rows from lowest to highest F1 score.
results_full = results_full.sort_values('f1-score')

# PLOTS

## ZERO SHOT

### W/ FORMAT ERRORS

In [None]:
# Text report for the zero-shot predictions, format errors included.
zs_report_text = classification_report(ytrue, np.array(zs_pred), target_names=ANOMALY_LABELS)
print(zs_report_text)

In [None]:
# Violin plot of per-category F1 scores per model, with each model's macro average overlaid.
plt.figure(figsize=(8, 6))
sns.set_theme(style="ticks")
plt.grid()
# Filter once: macro-average rows for the markers/labels, per-category rows for the violins.
macro_rows = results_full[results_full['Category'].isin(['macro avg'])]
per_category_rows = results_full[~results_full['Category'].isin(cat_exclude)]
sns.stripplot(data=macro_rows, x="Model Name", y="f1-score", color='yellow', size=10, jitter=True)
sns.violinplot(data=per_category_rows, x="Model Name", y="f1-score", inner="points", saturation=0.75, palette=model_palette)
plt.title('Violin Plot of F1-scores by best Unfrozen Model Configurations with LLAMA-2-7B (ABV)')
# Write each macro-average value slightly above its marker.
for model_name, macro_f1 in zip(macro_rows['Model Name'], macro_rows['f1-score']):
    plt.text(model_name, macro_f1 + 0.02,
             f"{macro_f1:.3f}", ha='center', va='bottom', fontsize=9, color='yellow')

plt.axvline(x=1, color='red', linestyle='--')
plt.ylabel('F1 Score')
plt.xlabel('')
plt.xticks(rotation = 45, ha='right', fontsize=9)
plt.gca().set_ylim(0, 1)
plt.show()

### W/O FORMAT ERRORS

In [None]:
# Rows whose zero-shot prediction vector sums to zero are treated as format errors.
zs_pred_arr = np.array(zs_pred)
zs_error_mask = np.sum(zs_pred_arr, axis=1) == 0
errors_idx = np.where(zs_error_mask)[0]
# Drop those rows with the boolean mask. Applying `~` to the integer indices
# is a bitwise NOT (i -> -i - 1) and would select the wrong rows instead of
# the complement.
ytrue_gf = ytrue[~zs_error_mask]
zs_pred_gf = zs_pred_arr[~zs_error_mask]
# Build the report once and read both its keys and values from it.
gf_report = classification_report(ytrue_gf, zs_pred_gf, target_names=ANOMALY_LABELS, output_dict=True)
l_gf_values = list(gf_report.values())
l_gf_keys = list(gf_report.keys())
f1score = [item['f1-score'] for item in l_gf_values]
llama_gf_results = pd.DataFrame({'Model Name' : ['LLAMA-2-7B']*len(l_gf_values), 'Category' : l_gf_keys, 'f1-score': f1score})
# Same preparation as for the unfiltered results: stack, add display names, relabel, sort.
results_gf = results[llama_gf_results.columns]
results_gf = pd.concat([results_gf, llama_gf_results])
results_gf['Model'] = results_gf['Model Name'].apply(lambda cell: models[cell[:4].lower()])
# NOTE(review): pairing relies on the order of first appearance matching `short_name` - confirm.
# NOTE: `map` shadows the builtin; the name is kept because it is a notebook-level variable.
map = dict(zip(results_gf['Model Name'].unique().tolist(), short_name))
results_gf['Model Name'] = results_gf['Model Name'].apply(lambda cell: map[cell])
results_gf = results_gf.sort_values(by='f1-score', ascending=True)

In [None]:
# Text report for the zero-shot predictions after the format-error filtering.
zs_gf_report_text = classification_report(ytrue_gf, np.array(zs_pred_gf), target_names=ANOMALY_LABELS)
print(zs_gf_report_text)

In [None]:
# Violin plot of per-category F1 scores per model (format-error-filtered LLAMA results),
# with each model's macro average overlaid.
plt.figure(figsize=(8, 6))
sns.set_theme(style="ticks")
plt.grid()
# Filter once: macro-average rows for the markers/labels, per-category rows for the violins.
macro_rows_gf = results_gf[results_gf['Category'].isin(['macro avg'])]
per_category_rows_gf = results_gf[~results_gf['Category'].isin(cat_exclude)]
sns.stripplot(data=macro_rows_gf, x="Model Name", y="f1-score", color='yellow', size=10, jitter=True)
sns.violinplot(data=per_category_rows_gf, x="Model Name", y="f1-score",
               inner="points", saturation=0.75, palette=model_palette)
plt.title('Violin Plot of F1-scores by best Unfrozen Model Configurations with LLAMA-2-7B')
# Write each macro-average value slightly above its marker.
for model_name, macro_f1 in zip(macro_rows_gf['Model Name'], macro_rows_gf['f1-score']):
    plt.text(model_name, macro_f1 + 0.02,
             f"{macro_f1:.3f}", ha='center', va='bottom', fontsize=9, color='yellow')
plt.ylabel('F1 Score')
plt.xticks(rotation = 45, ha='right', fontsize=9)
plt.gca().set_ylim(0, 1)
plt.show()

## FEW SHOT

### W/ FORMAT ERRORS

In [None]:
# Few-shot report, restricted to the first label column of truth and predictions.
fs_first_label_true = ytrue[:, 0]
fs_first_label_pred = np.array(fs_pred)[:, 0]
print(classification_report(fs_first_label_true, fs_first_label_pred))

### W/O FORMAT ERRORS

In [None]:
# Integer row indices of few-shot predictions whose label vector sums to zero.
fs_row_sums = np.array(fs_pred).sum(axis=1)
fs_format_errors = np.flatnonzero(fs_row_sums == 0)

In [None]:
# Few-shot report for the first label column, without the format-error rows.
# `~` on an integer index array is a bitwise NOT (i -> -i - 1), which picks
# the wrong rows instead of dropping the errors. Build an explicit keep-mask;
# the assignment works whether `fs_format_errors` holds row indices or a
# boolean mask.
fs_valid_rows = np.ones(len(ytrue), dtype=bool)
fs_valid_rows[fs_format_errors] = False
print(classification_report(ytrue[:, 0][fs_valid_rows], np.array(fs_pred)[:, 0][fs_valid_rows]))

### PLOT

In [None]:
# F1 score of the first label column, without and with the format-error rows.
# `~` on an integer index array is a bitwise NOT (i -> -i - 1) and would pick
# the wrong rows, so exclude the error rows through an explicit keep-mask
# (valid for both an index array and a boolean mask in `fs_format_errors`).
fs_valid_rows = np.ones(len(ytrue), dtype=bool)
fs_valid_rows[fs_format_errors] = False
gf_fs_f1_score = f1_score(ytrue[:, 0][fs_valid_rows], np.array(fs_pred)[:, 0][fs_valid_rows])
fs_f1_score = f1_score(ytrue[:, 0], np.array(fs_pred)[:, 0])

In [None]:
# Compare the few-shot F1 score of the first category with and without format-error rows.
sns.barplot(y=[fs_f1_score, gf_fs_f1_score], x=['With Format Errors', 'Without Format Errors'], width=0.2)
plt.ylabel('F1 Score')
# Title now spells the category exactly as in ANOMALY_LABELS[0] (was misspelled and reordered).
plt.title('F1 Scores for Category <<Deviation / Discrepancy - Procedural>>')
plt.grid()

## FORMAT ERRORS PERCENTAGES

In [None]:
# Percentage of prediction rows whose label vector sums to zero, per prompting strategy.
fs_pred_matrix = np.array(fs_pred)
fs_error_count = len(np.where(fs_pred_matrix.sum(axis=1) == 0)[0])
fs_fe_per = (fs_error_count / len(fs_pred_matrix)) * 100

zs_pred_matrix = np.array(zs_pred)
zs_error_count = len(np.where(zs_pred_matrix.sum(axis=1) == 0)[0])
zs_fe_per = (zs_error_count / len(zs_pred_matrix)) * 100

In [None]:
# Bar chart of the format-error percentage for each prompting strategy.
strategy_names = ['Zero Shot Prompting', 'Few Shot Prompting']
error_percentages = [zs_fe_per, fs_fe_per]
sns.barplot(x=strategy_names, y=error_percentages, width=0.2)
plt.title('Percentage of Format Errors by Prompting Strategy')
plt.ylabel('%')
plt.grid()

## SUBSET DISTRIBUTION PLOT

In [None]:
# Column sums of the ground-truth matrix, plotted as one bar per anomaly label.
label_counts = np.sum(ytrue, axis=0)
plt.bar(x=ANOMALY_LABELS, height=label_counts, width=0.4, color='r', alpha=0.6)
plt.title('Grouped anomalies types in the subset of the Test Set')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right', fontsize=9)
plt.grid()
plt.show()