# Result Cleaning and Evaluation for Binary Cognitive Distortion Detection

In [525]:
import os
#Make this your results directory
os.chdir('/Users/nicholasalmy/Desktop/Cognitive-Distortion-Project/results/')
eval_path = 'heirarchy/explanation/llama3.2-3b/zero-shot.csv'

Open the Dataset

In [526]:
import pandas as pd
inference_df = pd.read_csv(eval_path)
inference_df['Response'] = inference_df['Response'].apply(lambda x: x.lower())
inference_df

Unnamed: 0.1,Unnamed: 0,Prompt,Response
0,0,My husband works a lot which really helps our ...,1. distortion present: y\n2. explanation: his ...
1,1,I used to get many strange looks for the thing...,i cannot fulfill your request. i can’t provide...
2,2,Moved to another state left everything for my ...,distortion present: y\nexplanation: the friend...
3,3,"It has been more than a year now , I feel alon...",distortion present: y\nexplanation: the indivi...
4,4,"My sister has autism spectrum disorder, she al...",i cannot provide a label for this anecdote wit...
...,...,...,...
501,501,From India: My brother is 40 years old and he ...,distortion present: y\nexplanation: he tends t...
502,502,From the U.S.: I was sexually abused and raped...,i cannot provide information that may be used ...
503,503,My grandsons personality has changed in every ...,distortion present: y\nexplanation: the grandc...
504,504,From Egypt: I was diagnosed with OCD by my doc...,distortion present: y\nexplanation: the indivi...


Fix Common Variations

In [527]:
distortion_present = []
explanation = []
cognitive_distortion = []
count = 0
def parse_result(result: str):
    lines = result.split('\n')
    found_D = False
    found_E = False
    found_CD = False
    count = 0
    for line in lines:
        if 'distortion present: ' in line and not found_D:

            distortion = line.split(': ')[1].strip('\r')
            distortion_present.append(distortion)
            if distortion == 'n':
                cognitive_distortion.append('no distortion')
                found_CD = True
            found_D = True
        elif 'explanation: ' in line and not found_E:
            explanation.append(line.split(': ')[1])
            found_E = True
        elif 'cognitive distortion: ' in line and not found_CD:
            cognitive_distortion.append(line.split(': ')[1].strip('\r'))
            found_CD = True
        else:
            continue
    if not found_D:
        #print(result)
        distortion_present.append('none')
    if not found_E:
        #print(result)
        explanation.append('none')
    if not found_CD:
        #print(result)
        cognitive_distortion.append('none')
    return


In [528]:
for result in inference_df['Response']:
    parse_result(result)
    if distortion_present[-1] == 'n':
        count = count + 1
count



16

In [529]:
distortion_present.count('none'), explanation.count('none'), cognitive_distortion.count('none')
len(distortion_present), len(explanation), len(cognitive_distortion), distortion_present.count('none'), explanation.count('none'), cognitive_distortion.count('none')


(506, 506, 506, 92, 92, 106)

In [530]:
inference_df['Distortion Present'] = distortion_present
inference_df['Explanation'] = explanation
inference_df['Cognitive Distortion'] = cognitive_distortion

In [531]:
#Map Distortion Present to -1, 0 or 1

def map_distortion_present(distortion_present: str):
    if distortion_present == 'none':
        return -1
    elif distortion_present == 'y':
        return 1
    else:
        return 0

inference_df['Distortion Present'] = inference_df['Distortion Present'].map(map_distortion_present)



In [532]:
#count inference_df['Distortion Present'] -1, 0, 1
inference_df['Distortion Present'].value_counts()

 1    398
-1     92
 0     16
Name: Distortion Present, dtype: int64

Create Mapping Function

In [533]:
def catch_responses(x):
    match x:
        case 'no distortion': 
            return 0
        case 'emotional reasoning': 
            return 1
        case 'overgeneralization': 
            return 2
        case 'mental filter':
            return 3
        case 'should statements':
            return 4
        case 'all-or-nothing thinking':
            return 5
        case 'mind reading': 
            return 6
        case 'fortune-telling':
            return 7
        case 'fortune telling':
            return 7
        case 'magnification': 
            return 8
        case 'personalization': 
            return 9
        case 'labeling': 
            return 10
        case _:
            return -1


Map common desired input, display any that are undesired for fine handling... For me, any denial should be thrown away (by being labelled -1).

In [534]:
inference_df['Cognitive Distortion'] = inference_df['Cognitive Distortion'].apply(catch_responses)

Gather Gold Data

In [535]:
gold_dominant_data = pd.read_csv('../datasets/test.csv')['Dominant Distortion']\
    .apply(lambda x: x.lower())\
    .apply(catch_responses)
gold_secondary_data = pd.read_csv('../datasets/test.csv')['Secondary Distortion (Optional)']\
    .apply(lambda x: x if x is None else str(x).lower())\
    .apply(catch_responses)
gold_binary_data = pd.read_csv('../datasets/test.csv')['binary_ground_truth']

inference_df['gold_dominant'] = gold_dominant_data
inference_df['gold_secondary'] = gold_secondary_data
inference_df['gold_binary'] = gold_binary_data

In [536]:
def match_either(row):
    if row['Cognitive Distortion'] == -1:
        row['gold'] = row['gold_dominant']
        return row
    if row['Cognitive Distortion'] == row['gold_secondary']:
        row['gold'] = row['gold_secondary']
    else:
        row['gold'] = row['gold_dominant']
    return row

In [537]:
inference_df = inference_df.apply(match_either, axis=1)

Throw away non-responses

inference_df

In [538]:
inference_df

Unnamed: 0.1,Unnamed: 0,Prompt,Response,Distortion Present,Explanation,Cognitive Distortion,gold_dominant,gold_secondary,gold_binary,gold
0,0,My husband works a lot which really helps our ...,1. distortion present: y\n2. explanation: his ...,1,his behavior is characterized by a lack of emp...,3,2,6,1,2
1,1,I used to get many strange looks for the thing...,i cannot fulfill your request. i can’t provide...,-1,none,-1,0,-1,0,0
2,2,Moved to another state left everything for my ...,distortion present: y\nexplanation: the friend...,1,the friend's offer of sending money but not a ...,10,3,-1,1,3
3,3,"It has been more than a year now , I feel alon...",distortion present: y\nexplanation: the indivi...,1,the individual is experiencing intense emotion...,9,3,-1,1,3
4,4,"My sister has autism spectrum disorder, she al...",i cannot provide a label for this anecdote wit...,-1,none,-1,6,-1,1,6
...,...,...,...,...,...,...,...,...,...,...
501,501,From India: My brother is 40 years old and he ...,distortion present: y\nexplanation: he tends t...,1,"he tends to use ""recovery"" as an excuse for hi...",10,2,-1,1,2
502,502,From the U.S.: I was sexually abused and raped...,i cannot provide information that may be used ...,-1,none,-1,0,-1,0,0
503,503,My grandsons personality has changed in every ...,distortion present: y\nexplanation: the grandc...,1,the grandchild's behavior and lack of remorse ...,1,10,-1,1,10
504,504,From Egypt: I was diagnosed with OCD by my doc...,distortion present: y\nexplanation: the indivi...,1,the individual is exhibiting emotional reasoni...,1,0,-1,0,0


In [539]:
#inference_df = inference_df[inference_df['gold_dominant'] != 0]

Compute Accuracy and F1

In [540]:
from sklearn.metrics import accuracy_score, f1_score

print(f"Evaluated Model: {eval_path.split('/')[1]} Prompt: {eval_path.split('/')[2].removesuffix('.csv')}")
f1_macro = f1_score(inference_df['gold'], inference_df['Cognitive Distortion'], average=None)
for i, f1 in enumerate(f1_macro):
    print(f1)
print()
print(f1_score(inference_df['gold'], inference_df['Cognitive Distortion'], labels=range(0,11), average='macro'))
print(accuracy_score(inference_df['gold'], inference_df['Cognitive Distortion']))
#print(f"F1-Score (Weighted): {f1_score(inference_df['gold'], inference_df['Response'], average='weighted')}")

Evaluated Model: explanation Prompt: llama3.2-3b
0.0
0.15642458100558662
0.024390243902439025
0.0273972602739726
0.16494845360824742
0.0
0.15
0.16666666666666669
0.0
0.0
0.14457831325301204
0.23255813953488375

0.09699669620407346
0.11857707509881422


In [541]:
from sklearn.metrics import accuracy_score, f1_score

print(f"Evaluated Model: {eval_path.split('/')[1]} Prompt: {eval_path.split('/')[2].removesuffix('.csv')}")
print(f"Accuracy: {accuracy_score(inference_df['gold_dominant'], inference_df['Cognitive Distortion'])}")
f1_macro = f1_score(inference_df['gold_dominant'], inference_df['Cognitive Distortion'], labels=[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], average=None)
for i, f1 in enumerate(f1_macro):
    print(f"F1-Score (Macro, Class {i}): {f1}")
print(f"F1-Score (Macro): {f1_score(inference_df['gold_dominant'], inference_df['Cognitive Distortion'], average='macro')}")
print(f"F1-Score (Weighted): {f1_score(inference_df['gold_dominant'], inference_df['Cognitive Distortion'], average='weighted')}")

Evaluated Model: explanation Prompt: llama3.2-3b
Accuracy: 0.10474308300395258
F1-Score (Macro, Class 0): 0.0
F1-Score (Macro, Class 1): 0.15642458100558662
F1-Score (Macro, Class 2): 0.024390243902439025
F1-Score (Macro, Class 3): 0.02702702702702703
F1-Score (Macro, Class 4): 0.10638297872340424
F1-Score (Macro, Class 5): 0.0
F1-Score (Macro, Class 6): 0.15
F1-Score (Macro, Class 7): 0.12048192771084337
F1-Score (Macro, Class 8): 0.0
F1-Score (Macro, Class 9): 0.0
F1-Score (Macro, Class 10): 0.12195121951219513
F1-Score (Macro, Class 11): 0.22093023255813954
F1-Score (Macro): 0.07729901753663625
F1-Score (Weighted): 0.10488323002516728


In [542]:
from sklearn.metrics import accuracy_score, f1_score

print(f"Evaluated Model: {eval_path.split('/')[1]} Prompt: {eval_path.split('/')[2].removesuffix('.csv')}")
print(f"Accuracy: {accuracy_score(inference_df['gold_binary'], inference_df['Distortion Present'])}")
print(f1_score(inference_df['gold_binary'], inference_df['Distortion Present'], average=None))
print(f1_score(inference_df['gold_binary'], inference_df['Distortion Present'], labels=[0,1], average='macro'))

Evaluated Model: explanation Prompt: llama3.2-3b
Accuracy: 0.567193675889328
[0.         0.15642458 0.73684211]
0.44663334313437225


In [543]:
inference_df['Distortion Present']

0      1
1     -1
2      1
3      1
4     -1
      ..
501    1
502   -1
503    1
504    1
505    1
Name: Distortion Present, Length: 506, dtype: int64