# Result Cleaning and Evaluation for Binary Cognitive Distortion Detection

In [4]:
import os
#Make this your results directory
os.chdir('/Users/ulugsali/Desktop/Cognitive-Distortion-Project/results/')
eval_path = 'multiclass/baseline/llama3.1-8b/zero_shot.csv'

Open the Dataset

In [5]:
import pandas as pd
inference_df = pd.read_csv(eval_path)
inference_df

Unnamed: 0,Prompt,Response
0,My husband works a lot which really helps our ...,Mental Filter
1,I used to get many strange looks for the thing...,Overgeneralization
2,Moved to another state left everything for my ...,Overgeneralization
3,"It has been more than a year now , I feel alon...",Mind Reading
4,"My sister has autism spectrum disorder, she al...",Mind Reading
...,...,...
501,From India: My brother is 40 years old and he ...,Overgeneralization
502,From the U.S.: I was sexually abused and raped...,All-or-Nothing Thinking
503,My grandsons personality has changed in every ...,Overgeneralization
504,From Egypt: I was diagnosed with OCD by my doc...,Emotional Reasoning


Fix Common Variations

In [6]:
inference_df['Response'] = inference_df['Response'].apply(lambda x: x.lower())
inference_df['Response'] = inference_df['Response'].apply(lambda x: x.split('\n')[0])
inference_df['Response'] = inference_df['Response'].apply(lambda x: x.strip(".'\n "))
inference_df

Unnamed: 0,Prompt,Response
0,My husband works a lot which really helps our ...,mental filter
1,I used to get many strange looks for the thing...,overgeneralization
2,Moved to another state left everything for my ...,overgeneralization
3,"It has been more than a year now , I feel alon...",mind reading
4,"My sister has autism spectrum disorder, she al...",mind reading
...,...,...
501,From India: My brother is 40 years old and he ...,overgeneralization
502,From the U.S.: I was sexually abused and raped...,all-or-nothing thinking
503,My grandsons personality has changed in every ...,overgeneralization
504,From Egypt: I was diagnosed with OCD by my doc...,emotional reasoning


Create Mapping Function

In [7]:
def catch_responses(x):
    match x:
        case 'no distortion': 
            return 0
        case 'emotional reasoning': 
            return 1
        case 'overgeneralization': 
            return 2
        case 'mental filter':
            return 3
        case 'should statements':
            return 4
        case 'all-or-nothing thinking':
            return 5
        case 'mind reading': 
            return 6
        case 'fortune-telling':
            return 7
        case 'fortune telling':
            return 7
        case 'magnification': 
            return 8
        case 'personalization': 
            return 9
        case 'labeling': 
            return 10
        case _:
            print(x)
            return -1


Map common desired input, display any that are undesired for fine handling... For me, any denial should be thrown away (by being labelled -1).

In [8]:
inference_df['Response'] = inference_df['Response'].apply(catch_responses)

should statements, labeling, personalization, mental filter
magnification, fortune-telling, and emotional reasoning
overgeneralization, fortune-telling, magnification, labeling (twice), emotional reasoning, personalization, should statements
overgeneralization, and also all-or-nothing thinking
overgeneralization, fortune-telling, magnification, mental filter, emotional reasoning, mind reading, all-or-nothing thinking, personalization, labeling
fortune-telling (predicting that an event will always result in the worst possible outcome)
all-or-nothing thinking, personalization, fortune-telling, magnification, labeling
should statements, all-or-nothing thinking, fortune-telling, personalization, labeling, emotional reasoning
emotional reasoning, mind reading
personalization and labeling
all-or-nothing thinking and personalization
mental filter and labeling are present
fortune-telling, mental filter, labeling, personalization, all-or-nothing thinking, and magnification
emotional reasoning -

Gather Gold Data

In [9]:
gold_dominant_data = pd.read_csv('../datasets/test.csv')['Dominant Distortion']\
    .apply(lambda x: x.lower())\
    .apply(catch_responses)
gold_secondary_data = pd.read_csv('../datasets/test.csv')['Secondary Distortion (Optional)']\
    .apply(lambda x: x if x is None else str(x).lower())\
    .apply(catch_responses)

inference_df['gold_dominant'] = gold_dominant_data
inference_df['gold_secondary'] = gold_secondary_data

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [10]:
def match_either(row):
    if row['Response'] == -1:
        row['gold'] = row['gold_dominant']
        return row
    if row['Response'] == row['gold_secondary']:
        row['gold'] = row['gold_secondary']
    else:
        row['gold'] = row['gold_dominant']
    return row

In [11]:
inference_df = inference_df.apply(match_either, axis=1)

Throw away non-responses

In [12]:
#inference_df = inference_df[inference_df['Response'] != -1]

Compute Accuracy and F1

In [13]:
from sklearn.metrics import accuracy_score, f1_score

print(f"Evaluated Model: {eval_path.split('/')[1]} Prompt: {eval_path.split('/')[2].removesuffix('.csv')}")
f1_macro = f1_score(inference_df['gold'], inference_df['Response'], average=None)
for i, f1 in enumerate(f1_macro):
    print(f1)
print()
print(f1_score(inference_df['gold'], inference_df['Response'], labels=range(0,11), average='macro'))
print(accuracy_score(inference_df['gold'], inference_df['Response']))
#print(f"F1-Score (Weighted): {f1_score(inference_df['gold'], inference_df['Response'], average='weighted')}")

Evaluated Model: baseline Prompt: llama3.1-8b
0.0
0.11363636363636363
0.08695652173913043
0.23754789272030652
0.07894736842105263
0.12903225806451613
0.058823529411764705
0.18421052631578946
0.15625
0.0
0.1941747572815534
0.13333333333333333

0.12481023190216457
0.1482213438735178


In [14]:
from sklearn.metrics import accuracy_score, f1_score

print(f"Evaluated Model: {eval_path.split('/')[1]} Prompt: {eval_path.split('/')[2].removesuffix('.csv')}")
print(f"Accuracy: {accuracy_score(inference_df['gold_dominant'], inference_df['Response'])}")
f1_macro = f1_score(inference_df['gold_dominant'], inference_df['Response'], labels=[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], average=None)
for i, f1 in enumerate(f1_macro):
    print(f"F1-Score (Macro, Class {i}): {f1}")
print(f"F1-Score (Macro): {f1_score(inference_df['gold_dominant'], inference_df['Response'], average='macro')}")
print(f"F1-Score (Weighted): {f1_score(inference_df['gold_dominant'], inference_df['Response'], average='weighted')}")

Evaluated Model: baseline Prompt: llama3.1-8b
Accuracy: 0.116600790513834
F1-Score (Macro, Class 0): 0.0
F1-Score (Macro, Class 1): 0.11363636363636363
F1-Score (Macro, Class 2): 0.0851063829787234
F1-Score (Macro, Class 3): 0.20077220077220076
F1-Score (Macro, Class 4): 0.02702702702702703
F1-Score (Macro, Class 5): 0.12121212121212122
F1-Score (Macro, Class 6): 0.057971014492753624
F1-Score (Macro, Class 7): 0.13333333333333333
F1-Score (Macro, Class 8): 0.09230769230769231
F1-Score (Macro, Class 9): 0.0
F1-Score (Macro, Class 10): 0.10204081632653061
F1-Score (Macro, Class 11): 0.12244897959183673
F1-Score (Macro): 0.08798799430654856
F1-Score (Weighted): 0.10803732135734104
