# Uploading dataset

In [None]:
# Import frequently used libraries
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import re
import string

# Install Dutch spacy model
!python -m spacy download nl_core_news_sm

# Install rouge_score
!pip install rouge_score
from rouge_score import rouge_scorer

In [None]:
# Load the semicolon-separated dataset; the first row holds the column names
df = pd.read_csv(
    'INSERT DATASET',
    sep=';',
    quotechar='"',   # quoted fields may contain the ';' separator
    header=0,
    encoding='utf-8',
)


# Show the first rows as a quick sanity check
print(df.head())

In [None]:
# Print the column names, dtypes and non-null counts of the freshly loaded frame
df.info()

In [None]:
# Delete column 'Unnamed: 12' - transformation error.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
df.drop(columns='Unnamed: 12', inplace=True, errors='ignore')

In [None]:
# Re-inspect columns, dtypes and non-null counts to confirm 'Unnamed: 12' is gone
df.info()

In [None]:
'''
Transform data into string objects for subsequent analysis.
'''

In [None]:
# Label columns compared in the agreement analysis
label_cols = [
    'human_label',
    'cot_gpt_label',
    'mp_gpt_label',
    'ps_gpt_label',
    'sc_gpt_label',
]

# Report missing labels BEFORE casting: astype(str) can turn NaN into the
# literal string 'nan', which a later isnull() check no longer detects and
# which would be counted as a label category of its own.
print("Missing values in label columns before string conversion:")
print(df[label_cols].isnull().sum())

# Ensure correct column types for analysis
df[label_cols] = df[label_cols].astype(str)

In [None]:
# Free-text explanation columns used by the ROUGE and similarity analyses
explanation_cols = [
    'human_explanation',
    'cot_gpt_explanation',
    'mp_gpt_explanation',
    'ps_gpt_explanation',
    'sc_gpt_explanation',
]

# Report missing explanations BEFORE casting: astype(str) can turn NaN into
# the literal string 'nan', which a later isnull() check no longer detects
# and which would then be scored as if it were a real explanation.
print("Missing values in explanation columns before string conversion:")
print(df[explanation_cols].isnull().sum())

# Ensure correct column types for analysis
df[explanation_cols] = df[explanation_cols].astype(str)

In [None]:
# Count missing values per column.
# NOTE(review): the label and explanation columns were cast with astype(str)
# in the preceding cells; depending on the pandas version that turns NaN into
# the string 'nan', in which case those columns always report 0 here - confirm.
df.isnull().sum()

In [None]:
'''The preliminary checks are complete and analysis can be started.
We start with the Cohen's Kappa agreement between the human expert and
the gpt evaluations.
'''

# Cohen's Kappa human expert-gpt

COHEN'S KAPPA

In [None]:
'''
This section calculates Cohen's Kappa score to measure the agreement between human expert classifications
and the classifications generated by various GPT prompting strategies (CoT, MP, PS, SC).
Scores are calculated overall, per specific criterion, and per individual case.
'''

# Grouping keys and the prompt strategies under comparison
criteria = df['criteria'].unique()
case_ids = df['case_id'].unique()
gpt_variants = ['cot', 'mp', 'ps', 'sc']


def _kappa_against_human(frame, variant):
    """Return Cohen's Kappa between the human labels and one variant's labels in *frame*."""
    return cohen_kappa_score(frame['human_label'], frame[f'{variant}_gpt_label'])


# Overall agreement per prompt strategy
print("\n Cohen’s Kappa per prompt strategy:")
for variant in gpt_variants:
    print(f"  {variant.upper():>3} vs Human: {_kappa_against_human(df, variant):.3f}")

# Agreement within each criterion
print("\n Cohen's Kappa per criterium:")
for variant in gpt_variants:
    print(f"\n{variant.upper()} vs Human:")
    for crit in criteria:
        crit_rows = df[df['criteria'] == crit]
        print(f"  {crit:<15}: {_kappa_against_human(crit_rows, variant):.3f}")

# Agreement within each case
print("\n Cohen's Kappa per case:")
for variant in gpt_variants:
    print(f"\n{variant.upper()} vs Human:")
    for case in case_ids:
        case_rows = df[df['case_id'] == case]
        print(f"  Case {case}: {_kappa_against_human(case_rows, variant):.3f}")



In [None]:
'''
For better interpretability, the scores will also be plotted using bar plots.
'''

In [None]:
# Collect one Kappa value per (prompt strategy, criterion) pair
results = []
for variant in gpt_variants:
    for crit in criteria:
        subset = df[df['criteria'] == crit]
        kappa = cohen_kappa_score(subset['human_label'], subset[f'{variant}_gpt_label'])
        results.append({'Prompt Strategy': variant.upper(), 'Criterion': crit, 'Cohen Kappa': kappa})

summary_df = pd.DataFrame(results)

# Plot Cohen's Kappa per criterion, one bar per prompt strategy
plt.figure(figsize=(10,6))
sns.barplot(data=summary_df, x='Criterion', y='Cohen Kappa', hue='Prompt Strategy')
plt.axhline(0, color='grey', linewidth=0.8)  # chance-level reference line
plt.title("Cohen's Kappa per Criterion by Prompting Strategy")
# Kappa can be negative (worse than chance). A lower limit of 0 would clip
# such bars, so use the same lower bound as the notebook's Kappa heatmaps.
plt.ylim(-0.4, 1)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Collect one Kappa value per (prompt strategy, case) pair
results = []
for variant in gpt_variants:
    for case in case_ids:
        subset = df[df['case_id'] == case]
        kappa = cohen_kappa_score(subset['human_label'], subset[f'{variant}_gpt_label'])
        results.append({'Prompt Strategy': variant.upper(), 'Case': case, 'Cohen Kappa': kappa})

case_df = pd.DataFrame(results)

# Plot Cohen's Kappa per case, one bar per prompt strategy
plt.figure(figsize=(10,6))
sns.barplot(data=case_df, x='Case', y='Cohen Kappa', hue='Prompt Strategy')
plt.axhline(0, color='grey', linewidth=0.8)  # chance-level reference line
plt.title("Cohen's Kappa per Case by Prompt Strategy")
# Kappa can be negative (worse than chance). A lower limit of 0 would clip
# such bars, so use the same lower bound as the notebook's Kappa heatmaps.
plt.ylim(-0.4, 1)
plt.tight_layout()
plt.show()


In [None]:
'''
To visualize the inter-relationships better,
the scores are also represented as a heatmap.
'''
gpt_variants = ['cot', 'mp', 'ps', 'sc']
criteria = df['criteria'].unique()
case_ids = df['case_id'].unique()

# Shared colour limits so both heatmaps use the same scale
vmin, vmax = -0.4, 1


def _kappa_records(group_col, group_values, label):
    """Collect one human-vs-GPT Kappa record per (group value, prompt strategy) pair."""
    records = []
    for value in group_values:
        rows = df[df[group_col] == value]
        for variant in gpt_variants:
            records.append({
                label: value,
                'Prompt Strategy': variant.upper(),
                'Kappa': cohen_kappa_score(rows['human_label'], rows[f'{variant}_gpt_label']),
            })
    return records


def _show_kappa_heatmap(matrix, title):
    """Draw an annotated heatmap of *matrix* using the shared colour limits."""
    plt.figure(figsize=(8,6))
    sns.heatmap(matrix, annot=True, cmap='coolwarm', vmin=vmin, vmax=vmax, fmt=".3f")
    plt.title(title)
    plt.tight_layout()
    plt.show()


# Heatmap per criterion (rows) and prompt strategy (columns)
results_crit = _kappa_records('criteria', criteria, 'Criterion')
heatmap_data_crit = pd.DataFrame(results_crit).pivot(index='Criterion', columns='Prompt Strategy', values='Kappa')
_show_kappa_heatmap(heatmap_data_crit, "Cohen's Kappa per Criterion by Prompt Strategy")

# Heatmap per case (rows) and prompt strategy (columns)
results_case = _kappa_records('case_id', case_ids, 'Case')
heatmap_data_case = pd.DataFrame(results_case).pivot(index='Case', columns='Prompt Strategy', values='Kappa')
_show_kappa_heatmap(heatmap_data_case, "Cohen's Kappa per Case by Prompt Strategy")


# ROUGE EVALUATION

In [None]:
'''
This section evaluates the quality of GPT-generated explanations by comparing
them against human expert explanations using ROUGE scores.
ROUGE-1, ROUGE-2, and ROUGE-L F1 scores are calculated to assess
unigram, bigram, and longest sequence overlap.
'''

ROUGE

In [None]:
# Dutch spaCy pipeline; only its token boundaries are used below
nlp = spacy.load("nl_core_news_sm")

def preprocess(text):
    """Normalise an explanation string before ROUGE scoring.

    Steps: lowercase, spell out percentages ("10%" -> "10 procent"), strip
    ASCII punctuation, tokenize with spaCy and re-join the non-whitespace
    tokens with single spaces.
    """
    lowered = text.lower()

    # Must run before punctuation removal, otherwise the '%' sign is already gone
    spelled_out = re.sub(r'(\d+)%', r'\1 procent', lowered)

    # Delete every character listed in string.punctuation
    without_punct = spelled_out.translate(str.maketrans('', '', string.punctuation))

    # Dropping whitespace tokens and joining on single spaces normalises spacing
    words = (token.text for token in nlp(without_punct) if not token.is_space)
    return " ".join(words)


In [None]:
# One shared scorer for the rouge1, rouge2 and rougeL metrics.
# NOTE(review): use_stemmer=True enables rouge_score's built-in stemmer,
# which is presumably English-oriented - confirm it suits Dutch text.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def compute_rouge(human_text, gpt_text):
    """Score a GPT explanation against the human explanation.

    Both texts are passed through ``preprocess`` first. The human text is
    given to ``scorer.score`` as the first argument and the GPT text as the
    second. Returns whatever ``scorer.score`` returns; callers index it by
    metric name and read ``.fmeasure``.
    """
    reference = preprocess(human_text)
    candidate = preprocess(gpt_text)
    return scorer.score(reference, candidate)


In [None]:
# Calculate ROUGE F1 scores for every prompt strategy
gpt_variants = ['cot', 'mp', 'ps', 'sc']
rouge_types = ['rouge1', 'rouge2', 'rougeL']

for variant in gpt_variants:
    gpt_col = f'{variant}_gpt_explanation'

    # Score each (human, GPT) pair exactly once. A single compute_rouge call
    # already yields all three metrics, so there is no need to preprocess and
    # score the same pair separately for rouge1, rouge2 and rougeL.
    row_scores = [
        compute_rouge(human_text, gpt_text)
        for human_text, gpt_text in zip(df['human_explanation'], df[gpt_col])
    ]

    # Creates rouge1_f1_<variant>, rouge2_f1_<variant>, rougeL_f1_<variant>
    for rouge_type in rouge_types:
        df[f'{rouge_type}_f1_{variant}'] = [scores[rouge_type].fmeasure for scores in row_scores]


In [None]:
# Preview df to verify output successful
df.head()

In [None]:
# All ROUGE F1 columns, grouped per prompt strategy (cot, mp, ps, sc)
rouge_f1_columns = [
    f'{metric}_f1_{variant}'
    for variant in ('cot', 'mp', 'ps', 'sc')
    for metric in ('rouge1', 'rouge2', 'rougeL')
]
rouge_scores = df[rouge_f1_columns]

# Column-wise mean and standard deviation of the F1 scores
rouge_means = rouge_scores.mean()
rouge_stds = rouge_scores.std()

print(rouge_means)
print(rouge_stds)


In [None]:
# Determine low, moderate, and high range

# Select ROUGE columns
rouge_cols = [
    'rouge1_f1_cot', 'rouge2_f1_cot', 'rougeL_f1_cot',
    'rouge1_f1_mp',  'rouge2_f1_mp',  'rougeL_f1_mp',
    'rouge1_f1_ps',  'rouge2_f1_ps',  'rougeL_f1_ps',
    'rouge1_f1_sc',  'rouge2_f1_sc',  'rougeL_f1_sc'
]

rouge_df = df[rouge_cols]

# Restructure to long format: one row per (metric column, score)
rouge_long = rouge_df.melt(var_name='rouge_metric', value_name='score')

# Plot one density curve per ROUGE column
plt.figure(figsize=(12, 7))
ax = sns.kdeplot(data=rouge_long, x='score', hue='rouge_metric', fill=True, common_norm=False, alpha=0.5)
plt.title('ROUGE Score Distributions')
plt.xlabel('ROUGE Score')
plt.ylabel('Density')
# Reposition seaborn's own hue legend. A bare plt.legend(...) call builds a
# new legend from labelled artists only and would discard the hue entries.
sns.move_legend(ax, loc='upper right', fontsize='small')
plt.show()

# Print the quartiles of every ROUGE column
quantiles = rouge_long.groupby('rouge_metric')['score'].quantile([0.25, 0.5, 0.75]).unstack()
print(quantiles)



In [None]:
'''
Visualizing agreement and ROUGE scores together. This section prepares a
styled df that displays binary label agreement and ROUGE-1, ROUGE-2, and ROUGE-L
F1 scores for each 'case_id' and 'criteria'. The cells are colour coded to visually
distinguish between low, moderate, and high scores/agreement levels for quicker inspection.
'''
# Binary agreement flag per prompt strategy: 1 when the GPT label equals the human label
for variant in ['cot', 'mp', 'ps', 'sc']:
    labels_match = df['human_label'] == df[f'{variant}_gpt_label']
    df[f'agreement_{variant}'] = labels_match.astype(int)

# Identifier columns first, then per strategy: agreement followed by its three ROUGE F1 scores
columns_to_show = ['case_id', 'criteria']
for variant in ['cot', 'mp', 'ps', 'sc']:
    columns_to_show += [f'agreement_{variant}']
    columns_to_show += [f'{metric}_{variant}' for metric in ['rouge1_f1', 'rouge2_f1', 'rougeL_f1']]

# Independent copy so later operations on the table do not touch df
df_display = df.loc[:, columns_to_show].copy()

# Define color functions to present low, moderate, high range

def color_agreement(val):
    """Return the CSS background for an agreement cell: green for 1, salmon otherwise."""
    return 'background-color: lightgreen' if val == 1 else 'background-color: salmon'

def color_rouge(val):
    """Return the CSS background for a ROUGE cell.

    Below 0.05 is orange, below 0.15 is yellow, everything else is green.
    """
    shade = 'lightgreen'
    if val < 0.05:
        shade = 'orange'
    elif val < 0.15:
        shade = 'yellow'
    return f'background-color: {shade}'

# Split the displayed columns by prefix so each group gets its own colour rule
agreement_cols = list(filter(lambda name: name.startswith('agreement'), df_display.columns))
rouge_cols = list(filter(lambda name: name.startswith('rouge'), df_display.columns))

# Colour agreement cells first, then the ROUGE cells
styled_df = df_display.style.applymap(color_agreement, subset=agreement_cols)
styled_df = styled_df.applymap(color_rouge, subset=rouge_cols)

# Last expression of the cell: rendered by the notebook
styled_df


In [None]:
'''
This section identifies specific rows from the dataset that represent
interesting cases for deeper qualitative analysis based on label agreement
and ROUGE-L scores.
'''
rows_of_interest = []

for variant in ['cot', 'mp', 'ps', 'sc']:
    label_col = f'{variant}_gpt_label'
    explanation_col = f'{variant}_gpt_explanation'
    agreement = df_display[f'agreement_{variant}']
    rouge_l = df_display[f'rougeL_f1_{variant}']

    # The two contradictory patterns worth a closer look, in reporting order
    flagged = [
        # CASE 1: same label, but hardly any textual overlap
        ('AGREE / LOW ROUGE', (agreement == 1) & (rouge_l < 0.05)),
        # CASE 2: different label, despite high textual overlap
        ('DISAGREE / HIGH ROUGE', (agreement == 0) & (rouge_l > 0.15)),
    ]

    for flag, mask in flagged:
        picked = df.loc[mask, [
            'case_id', 'criteria', 'human_label', label_col, 'human_explanation', explanation_col
        ]].copy()
        picked['variant'] = variant.upper()
        picked['flag'] = flag

        # Variant-independent column names so the pieces can be stacked
        rows_of_interest.append(picked.rename(columns={
            label_col: 'gpt_label',
            explanation_col: 'gpt_explanation',
        }))

# Stack all flagged rows and order them for reading
df_review = pd.concat(rows_of_interest, ignore_index=True)
df_review = df_review.sort_values(['case_id', 'criteria', 'variant'])

# Last expression of the cell: rendered by the notebook
df_review

In [None]:
# Tally how many flagged rows fall into each flag category
flag_counts = df_review['flag'].value_counts()

# Header and counts on separate lines
print("Case Counts:", flag_counts, sep="\n")

In [None]:
# ROUGE-L F1 values below this count as "low"
rouge_threshold = 0.05

disagree_low_rouge_cases = []

for variant in ['cot', 'mp', 'ps', 'sc']:
    label_col = f'{variant}_gpt_label'
    explanation_col = f'{variant}_gpt_explanation'

    # Rows where the labels differ and the explanations barely overlap
    labels_differ = df[f'agreement_{variant}'] == 0
    low_overlap = df[f'rougeL_f1_{variant}'] < rouge_threshold

    picked = df.loc[labels_differ & low_overlap, [
        'case_id', 'criteria',
        'human_label', label_col,
        'human_explanation', explanation_col,
    ]].copy()
    picked = picked.assign(variant=variant.upper(), flag='DISAGREE / LOW ROUGE')

    # Variant-independent column names so the pieces can be stacked
    disagree_low_rouge_cases.append(picked.rename(columns={
        label_col: 'gpt_label',
        explanation_col: 'gpt_explanation',
    }))

df_disagree_low_rouge = pd.concat(disagree_low_rouge_cases, ignore_index=True)
df_disagree_low_rouge = df_disagree_low_rouge.sort_values(['case_id', 'criteria', 'variant'])

# Last expression of the cell: rendered by the notebook
df_disagree_low_rouge


# SEMANTIC SIMILARITY (SENTENCE-BERT)

In [None]:
'''
This section assesses the semantic similarity of ChatGPT-generated explanations
compared to human evaluations using a pre-trained Sentence BERT transformer model.
This method captures contextual meaning beyond N-gram overlap, offering
a deeper insight into explanation quality.
'''

import re
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

In [None]:
'''
'paraphrase-multilingual-MiniLM-L12-v2' is a good choice for efficiency and performance
across languages, including Dutch, the language used here.
'''
# Load the Sentence-Transformers checkpoint whose embeddings are compared
# with cosine similarity in the cells below
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
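'''
Note: 'preprocess' is redefined here for the semantic similarity step.
It applies the same lowercasing, percentage normalization, and punctuation removal
as the earlier ROUGE version, but normalizes whitespace with a regular expression
instead of the spaCy tokenizer. From this cell onward, this definition replaces the earlier one.
'''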
def preprocess(text):
    """Normalise an explanation string before it is embedded.

    Steps: lowercase, spell out percentages ("10%" -> "10 procent"), strip
    ASCII punctuation, collapse whitespace runs into single spaces and trim
    both ends. This definition replaces the earlier spaCy-based ``preprocess``.
    """
    lowered = text.lower()

    # Must run before punctuation removal, otherwise the '%' sign is already gone
    spelled_out = re.sub(r'(\d+)%', r'\1 procent', lowered)

    # Delete every character listed in string.punctuation
    without_punct = spelled_out.translate(str.maketrans('', '', string.punctuation))

    # split() discards leading/trailing whitespace and splits on whitespace runs
    return " ".join(without_punct.split())

In [None]:
# Compute semantic similarity for each prompt variant
variants = ['cot', 'mp', 'ps', 'sc']

# Apply preprocessing to ALL explanation columns. 'sc_gpt_explanation' was
# previously left out, so SC texts were embedded raw while every other text
# was normalised. The columns are overwritten in place.
for col in ['human_explanation'] + [f'{variant}_gpt_explanation' for variant in variants]:
    df[col] = df[col].apply(preprocess)

# The human explanation is the same for every variant, so embed it once per
# row instead of once per (row, variant).
print("Encoding human explanations")
human_embeddings = [
    model.encode(text, convert_to_tensor=True)
    for text in tqdm(df['human_explanation'], total=len(df))
]

for variant in variants:
    col_gpt = f'{variant}_gpt_explanation'
    sim_scores = []

    print(f"Computing semantic similarity for: {variant.upper()}")

    for emb_human, gpt_text in tqdm(zip(human_embeddings, df[col_gpt]), total=len(df)):
        emb_gpt = model.encode(gpt_text, convert_to_tensor=True)
        # Cosine similarity of the two sentence embeddings as a plain float
        sim_scores.append(util.pytorch_cos_sim(emb_human, emb_gpt).item())

    df[f'semantic_similarity_{variant}'] = sim_scores

# Preview results
df[['case_id', 'criteria',
    'semantic_similarity_cot',
    'semantic_similarity_mp',
    'semantic_similarity_ps',
    'semantic_similarity_sc']].head()

In [None]:
# Inspect the score distributions to choose low / moderate / high cut-offs

# One similarity column per prompt strategy
similarity_columns = [f'semantic_similarity_{variant}' for variant in ('cot', 'mp', 'ps', 'sc')]
sim_df = df[similarity_columns]

# Long format: one row per (column name, score)
sim_long = sim_df.melt(var_name='variant', value_name='similarity_score')

# One density curve per prompt strategy, each normalised on its own
plt.figure(figsize=(10,6))
sns.kdeplot(
    data=sim_long,
    x='similarity_score',
    hue='variant',
    fill=True,
    common_norm=False,
    alpha=0.5,
)
plt.title('Semantic Similarity Score Distributions by Variant')
plt.xlabel('Semantic Similarity Score')
plt.ylabel('Density')
plt.show()

# Quartiles per prompt strategy
scores_by_variant = sim_long.groupby('variant')['similarity_score']
quantiles = scores_by_variant.quantile([0.25, 0.5, 0.75]).unstack()
print(quantiles)


In [None]:
'''
Visualizing agreement and cosine similarity scores together. This section prepares a
styled df that displays binary label agreement and similarity scores
for each 'case_id' and 'criteria'. The cells are colour coded to visually distinguish
between low, moderate, and high scores/agreement levels for quicker inspection.
'''
# Identifier columns first, then per strategy: agreement followed by semantic similarity
columns_to_show = ['case_id', 'criteria'] + [
    f'{prefix}_{variant}'
    for variant in ['cot', 'mp', 'ps', 'sc']
    for prefix in ('agreement', 'semantic_similarity')
]

# Independent copy so later operations on the table do not touch df
df_semantic = df.loc[:, columns_to_show].copy()

# Define color functions for low, moderate, and high range

def color_agreement(val):
    """Return the CSS background for an agreement cell: green for 1, salmon otherwise."""
    if val == 1:
        return 'background-color: lightgreen'
    return 'background-color: salmon'

def color_similarity(val):
    """Return the CSS background for a semantic-similarity cell.

    Below 0.2 is orange, below 0.65 is yellow, everything else is green.
    """
    # Ascending upper bounds; the first bound the value falls under wins
    for upper_bound, shade in ((0.2, 'orange'), (0.65, 'yellow')):
        if val < upper_bound:
            return f'background-color: {shade}'
    return 'background-color: lightgreen'

# Split the displayed columns by prefix so each group gets its own colour rule
agreement_cols = list(filter(lambda name: name.startswith('agreement'), df_semantic.columns))
similarity_cols = list(filter(lambda name: name.startswith('semantic_similarity'), df_semantic.columns))

# Colour agreement cells first, then the similarity cells
styled_semantic = df_semantic.style.applymap(color_agreement, subset=agreement_cols)
styled_semantic = styled_semantic.applymap(color_similarity, subset=similarity_cols)

# Last expression of the cell: rendered by the notebook
styled_semantic


In [None]:
'''
This next section again identifies specific rows that represent interesting cases for
further qualitative analysis based on label agreement and semantic similarity scores.
'''

In [None]:
'''
Merge semantic similarity scores back into the main df for combined analysis.
'''
# Left-join the similarity scores from df_semantic onto df by (case_id, criteria).
# df already holds columns with the same names, so pandas applies its default
# suffixes: '<name>_x' is the copy from df, '<name>_y' the copy from df_semantic.
# The cells below read the '_x' columns.
# NOTE(review): if a (case_id, criteria) pair occurs more than once, this join
# multiplies those rows - confirm the pair is unique per row.
similarity_columns = [f'semantic_similarity_{variant}' for variant in ('cot', 'mp', 'ps', 'sc')]
join_keys = ['case_id', 'criteria']

df_merged = df.merge(
    df_semantic[join_keys + similarity_columns],
    how='left',
    on=join_keys,
)


In [None]:
case1_list = []
case2_list = []

# Define thresholds (based on quantile analysis)
low_sim_threshold = 0.2
high_sim_threshold = 0.65

for variant in ['cot', 'mp', 'ps', 'sc']:
    agree_col = f'agreement_{variant}'
    # '_x' is the suffix pandas gave df's own similarity column in the merge
    sim_col = f'semantic_similarity_{variant}_x'
    gpt_exp_col = f'{variant}_gpt_explanation'
    gpt_label_col = f'{variant}_gpt_label'

    selections = [
        # CASE 1: Agreement with low semantic similarity
        (case1_list, 'AGREE / LOW SEMANTIC SIMILARITY',
         (df_merged[agree_col] == 1) & (df_merged[sim_col] < low_sim_threshold)),
        # CASE 2: Disagreement with high semantic similarity
        (case2_list, 'DISAGREE / HIGH SEMANTIC SIMILARITY',
         (df_merged[agree_col] == 0) & (df_merged[sim_col] > high_sim_threshold)),
    ]

    for target_list, flag, mask in selections:
        picked = df_merged.loc[mask, [
            'case_id', 'criteria', 'human_label', gpt_label_col,
            'human_explanation', gpt_exp_col, sim_col
        ]].copy()
        picked['variant'] = variant.upper()
        picked['flag'] = flag

        # Variant-independent column names so the pieces can be stacked
        picked.rename(columns={
            gpt_label_col: 'gpt_label',
            gpt_exp_col: 'gpt_explanation',
            sim_col: 'semantic_similarity'
        }, inplace=True)
        target_list.append(picked)

# Concatenate separately
df_agree_low_sim = pd.concat(case1_list, ignore_index=True)
df_agree_low_sim.sort_values(['case_id', 'criteria', 'variant'], inplace=True)

df_disagree_high_sim = pd.concat(case2_list, ignore_index=True)
df_disagree_high_sim.sort_values(['case_id', 'criteria', 'variant'], inplace=True)

# A notebook cell only renders its LAST bare expression, so the first table
# was never shown. display() (provided by the IPython kernel) renders both.
display(df_agree_low_sim)
display(df_disagree_high_sim)


In [None]:
# Similarity scores below this count as "low"
similarity_threshold = 0.2

rows_of_interest = []

for variant in ['cot', 'mp', 'ps', 'sc']:
    label_col = f'{variant}_gpt_label'
    explanation_col = f'{variant}_gpt_explanation'
    # '_x' is the suffix pandas gave df's own similarity column in the merge
    score_col = f'semantic_similarity_{variant}_x'

    # Rows where the labels differ and the explanations are semantically far apart
    labels_differ = df_merged[f'agreement_{variant}'] == 0
    dissimilar = df_merged[score_col] < similarity_threshold

    picked = df_merged.loc[labels_differ & dissimilar, [
        'case_id', 'criteria', 'human_label', label_col,
        'human_explanation', explanation_col, score_col,
    ]].copy()
    picked = picked.assign(variant=variant.upper(), flag='DISAGREE / LOW SEMANTIC SIMILARITY')

    # Variant-independent column names so the pieces can be stacked
    rows_of_interest.append(picked.rename(columns={
        label_col: 'gpt_label',
        explanation_col: 'gpt_explanation',
        score_col: 'semantic_similarity',
    }))

df_disagree_low_sim = pd.concat(rows_of_interest, ignore_index=True)
df_disagree_low_sim = df_disagree_low_sim.sort_values(['case_id', 'criteria', 'variant'])

# Last expression of the cell: rendered by the notebook
df_disagree_low_sim


# Mean scores per prompting strategy

In [None]:
'''
This section provides an aggregated view of the mean performance metrics
for each prompting strategy. This table offers quick overview and comparison of
the overall effectiveness of each method.
'''

In [None]:
# Prompt strategies to summarise
variants = ['cot', 'mp', 'ps', 'sc']

# Metric name -> callable that builds the df_merged column name for a variant.
# The similarity columns carry the '_x' suffix added by the merge.
metrics = {
    'agreement': 'agreement_{}'.format,
    'rouge1_f1': 'rouge1_f1_{}'.format,
    'rouge2_f1': 'rouge2_f1_{}'.format,
    'rougeL_f1': 'rougeL_f1_{}'.format,
    'semantic_similarity': 'semantic_similarity_{}_x'.format,
}


def _column_mean(column):
    """Return the mean of a df_merged column, or None when the column is absent."""
    if column not in df_merged.columns:
        return None
    return df_merged[column].mean()


# One inner dict of metric means per upper-cased strategy name
results = {
    variant.upper(): {
        metric_name: _column_mean(build_column(variant))
        for metric_name, build_column in metrics.items()
    }
    for variant in variants
}

# Strategies as rows, metrics as columns
results_df = pd.DataFrame(results).T
results_df.index.name = 'Variant'

print(results_df)


# CONSISTENCY CHECK

In [None]:
'''
This section focuses on evaluating the internal consistency of the GPT models
themselves by comparing outputs from two separate runs for the same prompt
strategy.
'''

In [None]:
'''
Upload the dataset specifically for the consistency check. This dataset is expected to contain
one column per run of each prompt strategy: cot1, cot2, mp1, mp2, ps1, and ps2.
'''
# Load the consistency-check dataset. This rebinds `df`, so the first
# dataset is no longer reachable under that name from here on.
import pandas as pd
df = pd.read_csv(
    'INSERT DATASET',
    sep=';',
    quotechar='"',            # quoted fields may contain the ';' separator
    header=0,
    encoding='utf-8',
)


# Show the first rows as a quick sanity check
print(df.head())

In [None]:
# Count missing values per column of the consistency-check dataset
df.isnull().sum()

In [None]:
'''
No missing values indicates that the transformation into the csv dataset was successful
and no data was lost. Now we can proceed to the analysis.
First, we calculate the Cohen's Kappa score across all CoT, MP, and PS data.
'''

In [None]:
def _run_to_run_kappa(strategy):
    """Return Cohen's Kappa between run 1 and run 2 of one prompt strategy."""
    return cohen_kappa_score(df[f'{strategy}1'], df[f'{strategy}2'])


# Overall run-to-run kappa for cot, mp and ps (in that order)
kappa_cot, kappa_mp, kappa_ps = (_run_to_run_kappa(strategy) for strategy in ('cot', 'mp', 'ps'))

# One value per line
print(kappa_cot, kappa_mp, kappa_ps, sep="\n")

In [None]:
'''
Next, we calculate Cohen's Kappa for CoT, MP, and PS per case.
'''

In [None]:
# Run-to-run kappa of the cot strategy, one line per case
for case in df['case_id'].unique():
    case_rows = df.loc[df['case_id'] == case]
    case_kappa = cohen_kappa_score(case_rows['cot1'], case_rows['cot2'])
    print(f"Case {case} kappa cot: ", case_kappa)

In [None]:
# Run-to-run kappa of the mp strategy, one line per case
for case in df['case_id'].unique():
    subset = df[df['case_id'] == case]
    # The label used to read "kappa cot" although mp1/mp2 are compared
    print(f"Case {case} kappa mp: ", cohen_kappa_score(subset['mp1'], subset['mp2']))

In [None]:
# Run-to-run kappa of the ps strategy, one line per case
for case in df['case_id'].unique():
    subset = df[df['case_id'] == case]
    # The label used to read "kappa cot" although ps1/ps2 are compared
    print(f"Case {case} kappa ps: ", cohen_kappa_score(subset['ps1'], subset['ps2']))

In [None]:
'''
Lastly, we look at Cohen's Kappa score per criteria
'''

In [None]:
criteria_list = df['criteria'].unique()

for crit in criteria_list:
    subset = df[df['criteria'] == crit]

    # Run-to-run kappa of each strategy within this criterion
    kappa_cot, kappa_mp, kappa_ps = (
        cohen_kappa_score(subset[f'{strategy}1'], subset[f'{strategy}2'])
        for strategy in ('cot', 'mp', 'ps')
    )

    # One report block per criterion, closed by a separator line
    report_lines = [
        f"Criteria: {crit}",
        f"  Cohen's Kappa cot1 vs cot2: {kappa_cot:.3f}",
        f"  Cohen's Kappa mp1 vs mp2:   {kappa_mp:.3f}",
        f"  Cohen's Kappa ps1 vs ps2:   {kappa_ps:.3f}",
        "-" * 40,
    ]
    print("\n".join(report_lines))


In [None]:
'''
Check the unique values for cot1, cot2, ps1 and ps2.
The output for criteria 'measures_and_actions' is nan.
This means that there is no variability in the classification labels.
To check whether this is correct we look at the unique values.
'''

In [None]:
# Look at the labels that actually occur for the criterion with nan kappa
crit = "measures_and_actions"
subset = df[df['criteria'] == crit]

for run_col in ('cot1', 'cot2', 'ps1', 'ps2'):
    print(f"{run_col} unique values:", subset[run_col].unique())


In [None]:
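'''
Indeed, only 'accept' appears as a classification, meaning that
there is perfect agreement. However, Cohen's Kappa is undefined (nan) in this
situation: without any variability in the labels, the agreement expected
by chance is also perfect, so the score cannot be interpreted.
'''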

In [None]:
# One record per (criterion, strategy) with the run-to-run kappa
kappa_results = []

criteria_list = df['criteria'].unique()
for crit in criteria_list:
    subset = df[df['criteria'] == crit]
    kappa_results.extend(
        {
            'criteria': crit,
            'type': strategy,
            'kappa': cohen_kappa_score(subset[f'{strategy}1'], subset[f'{strategy}2']),
        }
        for strategy in ('cot', 'mp', 'ps')
    )

kappa_df = pd.DataFrame(kappa_results)

# NaN kappa values are drawn as 0. This overwrites the column, so any later
# use of kappa_df sees the filled values as well.
kappa_df['kappa'] = kappa_df['kappa'].fillna(0)

# Grouped bars: criteria on the x-axis, one bar per strategy
plt.figure(figsize=(12, 6))
sns.barplot(x='criteria', y='kappa', hue='type', data=kappa_df)
plt.axhline(0, color='grey', linewidth=0.8)
plt.ylim(-0.2, 1.05)
plt.xticks(rotation=45)
plt.ylabel("Cohen's Kappa")
plt.title("Cohen's Kappa Scores per Criteria and Classification Type")
plt.legend(title='Classification Type')
plt.tight_layout()
plt.show()


In [None]:
# Criteria as rows, strategies as columns, kappa as cell value
heatmap_data = kappa_df.pivot(index='criteria', columns='type', values='kappa')

# Diverging palette centred on 0
plt.figure(figsize=(8, 6))
sns.heatmap(
    heatmap_data,
    annot=True,
    cmap='coolwarm',
    vmin=-0.2,
    vmax=1,
    center=0,
)
plt.xlabel("Classification Type")
plt.ylabel("Criteria")
plt.title("Heatmap of Cohen's Kappa Scores")
plt.tight_layout()
plt.show()


In [None]:
# One record per case holding the run-to-run kappa of every strategy

cases = df['case_id'].unique()
results = []

for case in cases:
    subset = df[df['case_id'] == case]
    record = {'case_id': case}
    for strategy in ('cot', 'mp', 'ps'):
        record[strategy] = cohen_kappa_score(subset[f'{strategy}1'], subset[f'{strategy}2'])
    results.append(record)

# Cases as rows, strategies as columns
kappa_case_df = pd.DataFrame(results)

# Index by case and draw NaN kappa values as 0
heatmap_data = kappa_case_df.set_index('case_id').fillna(0)

# Diverging palette centred on 0
plt.figure(figsize=(8, 6))
sns.heatmap(
    heatmap_data,
    annot=True,
    cmap='coolwarm',
    vmin=-0.2,
    vmax=1,
    center=0,
)
plt.xlabel("Classification Type")
plt.ylabel("Case ID")
plt.title("Heatmap of Cohen's Kappa per Case and Classification Type")
plt.tight_layout()
plt.show()




In [None]:
'''
Next I would like to create tables that present which cases and criteria
do not match for the three strategies. This way I can do some more
qualitative analysis into the underlying reason for the divergences.
'''

In [None]:
def _run_mismatches(strategy):
    """Return the rows where run 1 and run 2 of *strategy* differ, sorted by criteria."""
    first_run, second_run = f'{strategy}1', f'{strategy}2'
    differs = df[first_run] != df[second_run]
    return df.loc[differs, ['case_id', 'criteria', first_run, second_run]].sort_values(by='criteria')


# One mismatch table per prompt strategy
cot_mismatches = _run_mismatches('cot')
mp_mismatches = _run_mismatches('mp')
ps_mismatches = _run_mismatches('ps')


In [None]:
# A notebook cell only renders its LAST bare expression, so the cot and mp
# tables were never shown. display() (provided by the IPython kernel)
# renders all three.
display(cot_mismatches)
display(mp_mismatches)
display(ps_mismatches)