# Model evaluation

We assess the performance of the individual classifiers and test whether combining them into an ensemble improves model fit.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import openai
import os

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

# Set the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret is never written into the notebook. os.getenv returns None when the
# variable is not set.

openai.api_key = os.getenv("OPENAI_API_KEY")

In [2]:
# Show the working directory: the relative '../Classifiers/Data/...' paths used
# in the cells below are resolved against it.
# NOTE(review): the rendered output exposes an absolute local path - consider
# clearing it before sharing the notebook.
os.getcwd()

'/Users/mn/Library/CloudStorage/OneDrive-UniversityofExeter/Projects/GitHub/contrarian-discourses-against-cap-and-trade/Code'

## Performance on the validation data

In [3]:
val = pd.read_json('../Classifiers/Data/cat_hearings_03_10_val.json')

# Append every classifier's predictions to the validation set.
# Each entry is (prediction file, prediction column to take from it); the
# zero-shot file supplies both the chat and the llm predictions.
val_prediction_sources = [
    ('../Classifiers/Data/cat_hearings_03_10_val_roberta_labels.json', 'predicted_labels_roberta'),
    ('../Classifiers/Data/cat_hearings_03_10_val_zeroshot_labels.json', 'predicted_labels_chat'),
    ('../Classifiers/Data/cat_hearings_03_10_val_zeroshot_labels.json', 'predicted_labels_llm'),
    ('../Classifiers/Data/cat_hearings_03_10_val_ada_labels.json', 'predicted_labels_ada'),
    ('../Classifiers/Data/cat_hearings_03_10_val_curie_labels.json', 'predicted_labels_curie'),
    ('../Classifiers/Data/cat_hearings_03_10_val_davinci_labels.json', 'predicted_labels_davinci'),
]

for prediction_file, prediction_column in val_prediction_sources:
    predictions = pd.read_json(prediction_file)
    # Name the join key explicitly instead of relying on whichever columns the
    # two frames happen to share. The default inner join is kept, so rows
    # without a prediction are still dropped.
    val = val.merge(predictions[['id', prediction_column]], on='id')

# Preview the gold labels next to every model's predictions, selected by name
# so the preview does not depend on the column positions in the source file.
preview_columns = ['labels'] + [column for _, column in val_prediction_sources]
val.loc[:, preview_columns].iloc[0:25]

Unnamed: 0,labels,predicted_labels_roberta,predicted_labels_chat,predicted_labels_llm,predicted_labels_ada,predicted_labels_curie,predicted_labels_davinci
0,"[0, 0, 1]","[0, 0, 1]","[0, 0, 1]","[0, 0, 1]","[0, 0, 0]","[0, 0, 1]","[0, 0, 1]"
1,"[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
2,"[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
3,"[0, 0, 0]","[0, 0, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 1]","[0, 0, 1]"
4,"[0, 1, 0]","[0, 1, 0]","[1, 0, 0]","[0, 1, 0]","[0, 0, 0]","[0, 1, 0]","[0, 1, 0]"
5,"[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
6,"[1, 0, 0]","[0, 0, 0]","[1, 0, 0]","[0, 1, 0]","[1, 0, 0]","[1, 0, 0]","[1, 0, 0]"
7,"[0, 0, 1]","[0, 0, 1]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
8,"[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"
9,"[0, 1, 0]","[0, 0, 0]","[0, 1, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]","[0, 0, 0]"


In [7]:
# Prepend a "no claim" indicator to every label vector: 1 when none of the
# three sub-claims is set, 0 otherwise. Afterwards each vector has four
# entries, with the no-claim indicator at position 0.

N_SUB_CLAIMS = 3


def add_no_claim_flag(sub_claims):
    """Return the label vector with a leading no-claim indicator.

    Vectors that already hold more than N_SUB_CLAIMS entries are returned
    unchanged, so re-running this cell does not prepend a second indicator.
    Assumes the loaded vectors hold exactly the three sub-claim indicators,
    as in the preview above.
    """
    if len(sub_claims) > N_SUB_CLAIMS:
        return sub_claims
    return [0 if sum(sub_claims) > 0 else 1] + sub_claims


for label_column in (
    "labels",
    "predicted_labels_roberta",
    "predicted_labels_chat",
    "predicted_labels_llm",
    "predicted_labels_ada",
    "predicted_labels_curie",
    "predicted_labels_davinci",
):
    val[label_column] = val[label_column].apply(add_no_claim_flag)

In [8]:
# Sub-claim level evaluation: one multi-label classification report per
# classifier, each scored against the gold labels of the validation set.
print("Performance at the sub-claim level\n")

gold_label_vectors = val.labels.values.tolist()

# (header to print, column holding that classifier's predictions)
sub_claim_reports = [
    ("RoBERta\n-----------", "predicted_labels_roberta"),
    ("Chat\n-----------", "predicted_labels_chat"),
    ("LLM\n-----------", "predicted_labels_llm"),
    ("ADA\n-----------", "predicted_labels_ada"),
    ("Curie\n-----------", "predicted_labels_curie"),
    ("Davinci\n-----------", "predicted_labels_davinci"),
]

for header, prediction_column in sub_claim_reports:
    print(header)
    print(classification_report(gold_label_vectors,
                                val[prediction_column].values.tolist()))

Performance at the sub-claim level

RoBERta
-----------
              precision    recall  f1-score   support

           0       0.90      0.84      0.87       153
           1       0.83      0.75      0.79        60
           2       0.75      0.67      0.71        27
           3       0.58      0.72      0.64        36

   micro avg       0.82      0.79      0.80       276
   macro avg       0.76      0.75      0.75       276
weighted avg       0.83      0.79      0.81       276
 samples avg       0.82      0.81      0.81       276

Chat
-----------
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       153
           1       0.68      0.78      0.73        60
           2       0.57      0.30      0.39        27
           3       0.56      0.25      0.35        36

   micro avg       0.76      0.70      0.73       276
   macro avg       0.66      0.54      0.58       276
weighted avg       0.74      0.70      0.71       276
 sam

In [9]:
# Claim level evaluation: collapse every label vector to a single binary
# flag (any sub-claim present) and report each classifier on that flag.
print("Performance at the claim level\n")


def claim_level(label_vectors):
    """Return 1 for each vector with a sub-claim set in positions 1-3, else 0."""
    return [1 if sum(vector[1:4]) > 0 else 0 for vector in label_vectors]


gold_claims = claim_level(val.labels)

# (header to print, column holding that classifier's predictions)
claim_reports = [
    ("RoBERta\n-----------", "predicted_labels_roberta"),
    ("Chat\n-----------", "predicted_labels_chat"),
    ("LLM\n-----------", "predicted_labels_llm"),
    ("ADA\n-----------", "predicted_labels_ada"),
    ("Curie\n-----------", "predicted_labels_curie"),
    ("Davinci\n-----------", "predicted_labels_davinci"),
]

for header, prediction_column in claim_reports:
    print(header)
    print(classification_report(gold_claims, claim_level(val[prediction_column])))
                                  

Performance at the claim level

RoBERta
-----------
              precision    recall  f1-score   support

           0       0.90      0.84      0.87       153
           1       0.78      0.85      0.81       100

    accuracy                           0.85       253
   macro avg       0.84      0.85      0.84       253
weighted avg       0.85      0.85      0.85       253

Chat
-----------
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       153
           1       0.76      0.75      0.75       100

    accuracy                           0.81       253
   macro avg       0.80      0.80      0.80       253
weighted avg       0.81      0.81      0.81       253

LLM
-----------
              precision    recall  f1-score   support

           0       0.74      0.95      0.83       153
           1       0.86      0.50      0.63       100

    accuracy                           0.77       253
   macro avg       0.80      0.72      0.73

In [16]:
# Calculate ensemble predictions: a label is predicted when at least
# `minimum_agreement` of the voting models predict it. With a threshold of 1
# a single model is enough, i.e. the ensemble is the union of the members.

minimum_agreement = 1

# Models that vote; append 'predicted_labels_curie' to let curie vote as well.
ensemble_members = ['predicted_labels_roberta', 'predicted_labels_davinci']


def vote_on_labels(member_predictions, threshold):
    """Combine the label vectors of several models into one ensemble vector.

    Parameters
    ----------
    member_predictions : sequence of label vectors, one per voting model,
        each with the no-claim indicator at position 0 followed by the
        sub-claim indicators at positions 1-3.
    threshold : minimum number of models that must predict a sub-claim for
        the ensemble to predict it.

    Returns
    -------
    list with the no-claim indicator followed by the three voted sub-claims.
    """
    sub_claims = [
        1 if sum(prediction[position] for prediction in member_predictions) >= threshold else 0
        for position in range(1, 4)
    ]
    # The no-claim indicator is derived from the voted sub-claims rather than
    # voted on itself, so it can never contradict them.
    return [0 if sum(sub_claims) > 0 else 1] + sub_claims


val['predicted_labels_ensemble'] = [
    vote_on_labels(member_predictions, minimum_agreement)
    for member_predictions in zip(*[val[column] for column in ensemble_members])
]


# Print the classification reports: sub-claim level first, then claim level
print("Ensemble model\n-----------")
print(classification_report(val['labels'].values.tolist(), val['predicted_labels_ensemble'].values.tolist()))
print(classification_report([1 if sum(i)>0 else 0 for i in val['labels'].apply(lambda x: x[1:4])], 
                            [1 if sum(i)>0 else 0 for i in val['predicted_labels_ensemble'].apply(lambda x: x[1:4])]))


Ensemble model
-----------
              precision    recall  f1-score   support

           0       0.94      0.82      0.87       153
           1       0.75      0.87      0.81        60
           2       0.70      0.78      0.74        27
           3       0.55      0.75      0.64        36

   micro avg       0.80      0.82      0.81       276
   macro avg       0.74      0.80      0.76       276
weighted avg       0.83      0.82      0.81       276
 samples avg       0.82      0.82      0.81       276

              precision    recall  f1-score   support

           0       0.94      0.82      0.87       153
           1       0.77      0.92      0.84       100

    accuracy                           0.86       253
   macro avg       0.85      0.87      0.86       253
weighted avg       0.87      0.86      0.86       253



In [17]:
# Ensemble by agreement: a label is kept only when at least
# `minimum_agreement` of the voting models predict it.

minimum_agreement = 2

# Columns of the models that take part in the vote (curie is left out).
voting_columns = ['predicted_labels_roberta', 'predicted_labels_davinci']

ensemble_rows = []
for member_predictions in zip(*[val[column] for column in voting_columns]):
    agreed = []
    for position in range(4):
        n_votes = sum(prediction[position] for prediction in member_predictions)
        agreed.append(1 if n_votes >= minimum_agreement else 0)

    # Logical correction of the "no-claim" category: it is set exactly when
    # no sub-claim survived the vote.
    sub_claims = agreed[1:4]
    ensemble_rows.append([0 if sum(sub_claims) > 0 else 1] + sub_claims)

val['predicted_labels_ensemble'] = ensemble_rows


# Report the ensemble at the sub-claim level and at the claim level
print("Ensemble model\n-----------")
gold_vectors = val['labels']
ensemble_vectors = val['predicted_labels_ensemble']
print(classification_report(gold_vectors.values.tolist(), ensemble_vectors.values.tolist()))
print(classification_report(
    [1 if sum(vector[1:4]) > 0 else 0 for vector in gold_vectors],
    [1 if sum(vector[1:4]) > 0 else 0 for vector in ensemble_vectors]))


Ensemble model
-----------
              precision    recall  f1-score   support

           0       0.85      0.93      0.88       153
           1       0.88      0.72      0.79        60
           2       0.86      0.67      0.75        27
           3       0.67      0.50      0.57        36

   micro avg       0.83      0.80      0.82       276
   macro avg       0.81      0.70      0.75       276
weighted avg       0.83      0.80      0.81       276
 samples avg       0.83      0.82      0.82       276

              precision    recall  f1-score   support

           0       0.85      0.93      0.88       153
           1       0.87      0.74      0.80       100

    accuracy                           0.85       253
   macro avg       0.86      0.83      0.84       253
weighted avg       0.86      0.85      0.85       253



## Performance of the final model on the testing data

In [23]:
test = pd.read_json('../Classifiers/Data/cat_hearings_03_10_test.json')

# Append the RoBERTa predictions to the test set. The join key is named
# explicitly instead of relying on whichever columns the frames share; the
# default inner join is kept.
roberta_test_predictions = pd.read_json('../Classifiers/Data/cat_hearings_03_10_test_roberta_labels.json')
test = test.merge(roberta_test_predictions[['id', 'predicted_labels_roberta']], on='id')

# Preview the gold labels next to the predictions, selected by name so the
# preview does not depend on the column positions in the source file.
test.loc[:, ['labels', 'predicted_labels_roberta']].iloc[0:25]

Unnamed: 0,labels,predicted_labels_roberta
0,"[0, 0, 0]","[0, 1, 0]"
1,"[0, 0, 1]","[0, 0, 1]"
2,"[0, 0, 0]","[0, 0, 0]"
3,"[0, 0, 0]","[0, 0, 0]"
4,"[0, 0, 0]","[0, 0, 0]"
5,"[0, 0, 0]","[0, 0, 0]"
6,"[0, 0, 0]","[0, 0, 0]"
7,"[0, 0, 0]","[0, 0, 0]"
8,"[0, 0, 0]","[0, 0, 0]"
9,"[0, 0, 0]","[0, 0, 0]"


In [24]:
# Prepend a "no claim" indicator to every label vector: 1 when none of the
# three sub-claims is set, 0 otherwise. Vectors that already hold more than
# three entries are left untouched, so re-running this cell does not prepend
# a second indicator.

n_sub_claims = 3

for label_column in ("labels", "predicted_labels_roberta"):
    test[label_column] = test[label_column].apply(
        lambda x: x if len(x) > n_sub_claims else [0 if sum(x) > 0 else 1] + x)

In [26]:
# Sub-claim level evaluation of the final model on the test set
print("Performance at the sub-claim level\n")

print("RoBERta\n-----------")
test_gold_vectors = test["labels"].values.tolist()
test_roberta_vectors = test["predicted_labels_roberta"].values.tolist()
print(classification_report(test_gold_vectors, test_roberta_vectors))


Performance at the sub-claim level

RoBERta
-----------
              precision    recall  f1-score   support

           0       0.95      0.81      0.88       178
           1       0.67      0.84      0.74        38
           2       0.44      0.41      0.42        17
           3       0.53      0.74      0.62        35

   micro avg       0.79      0.78      0.79       268
   macro avg       0.65      0.70      0.67       268
weighted avg       0.83      0.78      0.80       268
 samples avg       0.81      0.80      0.80       268



In [25]:
# Claim level evaluation of the final model on the test set: every label
# vector is collapsed to 1 when any sub-claim (positions 1-3) is set, else 0.
print("Performance at the claim level\n")

print("RoBERta\n-----------")
test_gold_claims = [1 if sum(vector[1:4]) > 0 else 0 for vector in test.labels]
test_roberta_claims = [1 if sum(vector[1:4]) > 0 else 0
                       for vector in test.predicted_labels_roberta]
print(classification_report(test_gold_claims, test_roberta_claims))


Performance at the claim level

RoBERta
-----------
              precision    recall  f1-score   support

           0       0.95      0.81      0.88       178
           1       0.67      0.91      0.77        75

    accuracy                           0.84       253
   macro avg       0.81      0.86      0.83       253
weighted avg       0.87      0.84      0.85       253

