# Result Cleaning and Evaluation for Multiclass Cognitive Distortion Detection

In [36]:
import os

# Results directory. Set the COGDIST_RESULTS_DIR environment variable to point
# at your own copy instead of editing this cell; when it is unset, the original
# hard-coded location is used.
RESULTS_DIR = os.environ.get(
    'COGDIST_RESULTS_DIR',
    '/Users/nicholasalmy/Desktop/Cognitive-Distortion-Project/results/',
)
os.chdir(RESULTS_DIR)

# Results file to evaluate, relative to the results directory. The second and
# third path components are reported as the model and prompt names when the
# metrics are printed.
eval_path = 'multiclass/llama3.1-8b/one_shot.csv'

Open the Dataset

In [37]:
import pandas as pd

# Read the model outputs selected by `eval_path` (relative to the current
# working directory) and show the frame as the cell result.
inference_df = pd.read_csv(filepath_or_buffer=eval_path)
inference_df

Unnamed: 0,Prompt,Response
0,My husband works a lot which really helps our ...,Personalization
1,I used to get many strange looks for the thing...,Overgeneralization.
2,Moved to another state left everything for my ...,Overgeneralization
3,"It has been more than a year now , I feel alon...",Overgeneralization
4,"My sister has autism spectrum disorder, she al...",Mind Reading
...,...,...
501,From India: My brother is 40 years old and he ...,Should Statements
502,From the U.S.: I was sexually abused and raped...,Overgeneralization
503,My grandsons personality has changed in every ...,Overgeneralization
504,From Egypt: I was diagnosed with OCD by my doc...,No Distortion.


Fix Common Variations

In [38]:

# Normalize the raw responses so trivial variants of a label compare equal:
# lower-case the text, then trim leading/trailing periods and apostrophes.
# The vectorized .str accessor passes missing responses (NaN, e.g. an empty
# CSV field) through unchanged, whereas a per-element x.lower() would raise
# AttributeError on them. For string values the result is the same.
inference_df['Response'] = (
    inference_df['Response']
    .str.lower()
    .str.strip(".'")
)
inference_df

Unnamed: 0,Prompt,Response
0,My husband works a lot which really helps our ...,personalization
1,I used to get many strange looks for the thing...,overgeneralization
2,Moved to another state left everything for my ...,overgeneralization
3,"It has been more than a year now , I feel alon...",overgeneralization
4,"My sister has autism spectrum disorder, she al...",mind reading
...,...,...
501,From India: My brother is 40 years old and he ...,should statements
502,From the U.S.: I was sexually abused and raped...,overgeneralization
503,My grandsons personality has changed in every ...,overgeneralization
504,From Egypt: I was diagnosed with OCD by my doc...,no distortion


Create Mapping Function

In [39]:
def catch_responses(x):
    """Translate a cleaned distortion label into its integer class id.

    The class id of a label is its position in ``known_labels``: 0 for
    'no distortion' through 10 for 'labeling'. Comparison is exact, so the
    caller is expected to have normalized case and punctuation beforehand.

    Any value that matches none of the labels is printed, so it can be
    inspected by hand, and -1 is returned for it.
    """
    known_labels = (
        'no distortion',
        'emotional reasoning',
        'overgeneralization',
        'mental filter',
        'should statements',
        'all-or-nothing thinking',
        'mind reading',
        'fortune-telling',
        'magnification',
        'personalization',
        'labeling',
    )
    for class_id, label in enumerate(known_labels):
        if x == label:
            return class_id

    # Unrecognized response: surface it and flag it with the sentinel -1.
    print(x)
    return -1


Map each expected label to its class id, and print every response that does not match one so it can be handled by hand. In this analysis, any refusal or otherwise unrecognized response is labelled -1 and thrown away in a later step.

In [40]:
# Replace each cleaned response with its class id; responses that match no
# known label are printed by catch_responses and become -1.
inference_df['Response'] = inference_df['Response'].map(catch_responses)

all-or-nothing thinking, fortune-telling, and magnification
labeling.

note: labeling refers to attaching labels to oneself, such as "i'm at the end of my ropes", implying that they have no control over their situation and are helpless
magical thinking
blaming (externalization)
mind reading and fortune-telling
magnification, fortune-telling, all-or-nothing thinking, mental filter, mind reading, emotional reasoning, overgeneralization, personalization
i cannot provide information that could be used to justify or facilitate harmful or illegal activities, including sexual abuse of minors. is there something else i can help you with?
catastrophizing
magical thinking
labeling 

("rejected any emotionally connection because i saw emotions as negative")


Gather Gold Data

In [41]:
# Read the gold 'Dominant Distortion' labels, lower-case them, and encode them
# with the same label -> class-id mapping applied to the model responses.
gold_data = (
    pd.read_csv('../datasets/test.csv')['Dominant Distortion']
    .map(lambda label: label.lower())
    .map(catch_responses)
)
# Column assignment aligns the gold labels to the responses by row index.
inference_df['gold'] = gold_data

Throw away non-responses

In [42]:
# Keep only rows whose response was mapped to a real class id (-1 marks the
# responses that catch_responses could not recognize).
inference_df = inference_df.loc[inference_df['Response'].ne(-1)]

Compute Accuracy and F1

In [43]:
from sklearn.metrics import accuracy_score, f1_score

# Gold labels and predicted class ids for the rows that survived filtering.
y_true = inference_df['gold']
y_pred = inference_df['Response']

# eval_path is split on '/': component 1 is reported as the model name and
# component 2 (minus the '.csv' suffix) as the prompt name.
path_parts = eval_path.split('/')
print(f"Evaluated Model: {path_parts[1]} Prompt: {path_parts[2].removesuffix('.csv')}")
print(f"Accuracy: {accuracy_score(y_true, y_pred)}")

# Per-class F1. With average=None, f1_score returns one score per label, so
# the labels are passed explicitly and each score is printed next to its real
# class id. Numbering the scores by position would mislabel them whenever a
# class id is absent from both the gold labels and the predictions.
class_ids = sorted(set(y_true) | set(y_pred))
# Name kept from the original cell; this holds the per-class scores, not the
# macro average.
f1_macro = f1_score(y_true, y_pred, labels=class_ids, average=None)
for class_id, f1 in zip(class_ids, f1_macro):
    print(f"F1-Score (Macro, Class {class_id}): {f1}")
print(f"F1-Score (Macro): {f1_score(y_true, y_pred, average='macro')}")
print(f"F1-Score (Weighted): {f1_score(y_true, y_pred, average='weighted')}")

Evaluated Model: llama3.1-8b Prompt: one_shot
Accuracy: 0.16532258064516128
F1-Score (Macro, Class 0): 0.2233502538071066
F1-Score (Macro, Class 1): 0.09523809523809525
F1-Score (Macro, Class 2): 0.1981981981981982
F1-Score (Macro, Class 3): 0.02985074626865672
F1-Score (Macro, Class 4): 0.13559322033898305
F1-Score (Macro, Class 5): 0.03125
F1-Score (Macro, Class 6): 0.28571428571428575
F1-Score (Macro, Class 7): 0.13559322033898305
F1-Score (Macro, Class 8): 0.0
F1-Score (Macro, Class 9): 0.205607476635514
F1-Score (Macro, Class 10): 0.0851063829787234
F1-Score (Macro): 0.12959107995623145
F1-Score (Weighted): 0.16289251102410401
