In [1]:
import json
from collections import defaultdict

In [3]:
# Load the annotated interventions. JSON is read explicitly as UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open("validation.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Preview a single record instead of dumping the whole corpus into the cell output
# (slicing keeps this safe even if `data` is empty).
dict(list(data.items())[:1])

{'CLINTON_199_2': {'intervention_id': 'CLINTON_199_2',
  'intervention': 'CLINTON: "which may prove to be an intelligence benefit\nwe\'ve got to do everything we can to vacuum up intelligence from Europe, from the Middle East\nThat means we\'ve got to work more closely with our allies, and that\'s something that Donald has been very dismissive of\nWe\'re working with NATO, the longest military alliance in the history of the world, to really turn our attention to terrorism\nWe\'re working with our friends in the Middle East, many of which, as you know, are Muslim majority nations\nDonald has consistently insulted Muslims abroad, Muslims at home, when we need to be cooperating with Muslim nations and with the American Muslim community\nThey\'re on the front lines\nThey can provide information to us that we might not get anywhere else\nThey need to have close working cooperation with law enforcement in these communities, not be alienated and pushed away as some of Donald\'s rhetoric, unfo

### Datasets

In [5]:
# Count how many interventions belong to each source dataset.
datasets = {}
for key, value in data.items():
    dataset = value["dataset"]
    datasets[dataset] = datasets.get(dataset, 0) + 1

# Most frequent dataset first; sorted() is stable, so ties keep first-seen order.
datasets = dict(sorted(datasets.items(), key=lambda item: item[1], reverse=True))

print("Total number of datasets:", len(datasets))
print("\nDatasets with total number of interventions (sorted in descending order):")
datasets

Total number of datasets: 4

Datasets with total number of interventions (sortedn in descending order):


{'US2016': 80, 'rrd': 72, 'moral_maze_schemes': 20, 'us2016reddit': 14}

### Authors

In [6]:
# Count interventions per author. The author is the part of the intervention ID
# before the first underscore (e.g. "CLINTON_199_2" -> "CLINTON").
# NOTE(review): an author name that itself contains "_" would be truncated here - confirm ID format.
authors = {}
for key, value in data.items():
    author = key.split("_")[0]
    authors[author] = authors.get(author, 0) + 1

# Most active author first; sorted() is stable, so ties keep first-seen order.
authors = dict(sorted(authors.items(), key=lambda item: item[1], reverse=True))

print("Total number of authors:", len(authors))
print("\nAuthors with total number of interventions (sorted in descending order):")
authors

Total number of autors: 69

Authors with total number of interventions (sorted in descending order):


{'TRUMP': 43,
 'CLINTON': 34,
 'Antanagoge': 4,
 'JJMurray': 4,
 'MT': 4,
 'howie': 4,
 'HOLT': 3,
 'CF': 3,
 'JL': 3,
 'Mulder': 3,
 'SofieM': 3,
 'CL': 2,
 'atraveller': 2,
 'cd38': 2,
 'citizen-s': 2,
 'darawayne': 2,
 'JW': 2,
 'JetJock': 2,
 'MP': 2,
 'ND': 2,
 'NYCMuscleman18': 2,
 'PracticalJo': 2,
 'SWong': 2,
 'dberger': 2,
 'dlpoole': 2,
 'elizwestley': 2,
 'golff4fun': 2,
 'mcliverty': 2,
 'secretcurse': 2,
 'smr': 2,
 'travellots': 2,
 'Doctor-Mom': 1,
 'Elmattador': 1,
 'FoodAllergyMom': 1,
 'Frequent-Flyer': 1,
 'Glblwrmingisfak': 1,
 'Helen': 1,
 '17th': 1,
 'AFCHF': 1,
 'AK-traveler': 1,
 'AllergyDad': 1,
 'AngelComa': 1,
 'Bill': 1,
 'Tuatho': 1,
 'Vec': 1,
 'Velshtein': 1,
 'Zewstain': 1,
 'aimwill': 1,
 'ambersky': 1,
 'annoyed': 1,
 'JDwyer': 1,
 'Javier': 1,
 'KHenrickson': 1,
 'MR': 1,
 'Melanie': 1,
 'Mpogoda': 1,
 'MrFordization': 1,
 'PeanutAllergy': 1,
 'Qubbin': 1,
 'Sithsaber': 1,
 'drgreg': 1,
 'grayk47': 1,
 'hgranato': 1,
 'kateinhawaii': 1,
 'lauraclare'

### Argumentation schemes

In [7]:
# Tally every argumentation scheme occurrence across all interventions
# (an intervention may list several schemes, and each listing counts once).
schemes = {}
for key, value in data.items():
    scheme = value["schemes"]
    for sch in scheme:
        schemes[sch] = schemes.get(sch, 0) + 1

# Re-order the tally so the most frequent scheme comes first.
ranked_scheme_pairs = sorted(schemes.items(), key=lambda pair: pair[1], reverse=True)
schemes = dict(ranked_scheme_pairs)
del ranked_scheme_pairs  # keep the notebook namespace identical to the original cell

print("Total number of schemes:", len(schemes))
print("\nSchemes with total number of occurrences (sorted in descending order):")
schemes

Total number of schemes: 28

Schemes with total number of occurrences (sorted in descending order):


{'ERPracticalReasoning': 97,
 'Example': 91,
 'ERExample': 84,
 'CauseToEffect': 55,
 'PracticalReasoning': 38,
 'Consequences': 36,
 'VerbalClassification': 25,
 'Sign': 24,
 'CircumstantialAdHominem': 22,
 'GenericAdHominem': 15,
 'Analogy': 11,
 'Values': 10,
 'PositionToKnow': 10,
 'PopularOpinion': 8,
 'FearAppeal': 7,
 'DangerAppeal': 7,
 'Ad hominem': 7,
 'ERAdHominem': 7,
 'ExpertOpinion': 6,
 'Alternatives': 6,
 'PopularPractice': 6,
 'Bias': 4,
 'ERExpertOpinion': 4,
 'ArgumentFromAuthority': 4,
 'DirectAdHominem': 2,
 'PositiveConsequences': 1,
 'NegativeConsequences': 1,
 'SignFromOtherEvents': 1}

### Critical questions

In [8]:
def compute_cq_stats(data):
    """Aggregate critical-question (CQ) counts and labels by origin.

    A CQ is classified by its ID: IDs containing "_T_" are theoretical, otherwise
    IDs containing "_LLM_" are LLM-generated. CQs matching neither marker only
    contribute to "total_cqs". CQs without an ID are reported and skipped, but
    still counted in "total_cqs".

    Args:
        data: Mapping of intervention ID to a record; a record may hold a "cqs"
            list of dicts with "id", "cq" and "label" entries.

    Returns:
        A tuple ``(cq_stats, theoretical_cqs, llm_cqs)``:
        - ``cq_stats``: {"total_cqs", "theoretical", "llm_generated"}, where each
          origin holds "count", "labels" (label -> count, descending) and
          "percentage_useful" (Useful / count as a 0-1 fraction rounded to 2 decimals).
        - ``theoretical_cqs`` / ``llm_cqs``: CQ ID -> {"cq", "label"}. A repeated
          ID prints a notice and the later entry replaces the earlier one, so
          these dicts can be shorter than the corresponding "count".
    """
    # (ID marker, key in the stats dict, wording used in the duplicate notice);
    # markers are tested in this order and the first match wins.
    origins = (
        ("_T_", "theoretical", "theoretical"),
        ("_LLM_", "llm_generated", "LLM"),
    )

    totals = {stats_key: 0 for _, stats_key, _ in origins}
    label_counts = {stats_key: defaultdict(int) for _, stats_key, _ in origins}
    questions_by_id = {stats_key: {} for _, stats_key, _ in origins}
    total_cqs = 0

    for intervention in data.values():
        for cq in intervention.get("cqs", []):
            total_cqs += 1
            cq_id = cq.get("id")
            label = cq.get("label")

            if cq_id is None:
                print("Warning: Missing ID in CQ ->", cq)
                continue  # nothing to classify without an ID

            for marker, stats_key, display_name in origins:
                if marker not in cq_id:
                    continue

                totals[stats_key] += 1
                label_counts[stats_key][label] += 1

                seen = questions_by_id[stats_key]
                if cq_id in seen:
                    print(f"Duplicate {display_name} CQ ID found: {cq_id}")
                seen[cq_id] = {"cq": cq["cq"], "label": label}
                break

    # Assemble the summary in the documented key order.
    cq_stats = {"total_cqs": total_cqs}
    for _, stats_key, _ in origins:
        ranked_labels = dict(
            sorted(label_counts[stats_key].items(), key=lambda pair: pair[1], reverse=True)
        )
        count = totals[stats_key]
        useful_share = round(ranked_labels.get("Useful", 0) / count, 2) if count > 0 else 0.0
        cq_stats[stats_key] = {
            "count": count,
            "labels": ranked_labels,
            "percentage_useful": useful_share,
        }

    return cq_stats, questions_by_id["theoretical"], questions_by_id["llm_generated"]


cq_stats_result, theoretical_cqs, llm_cqs = compute_cq_stats(data)

# Sanity check: "count" is incremented once per CQ occurrence, while the returned
# dicts are keyed by CQ ID. "Found" lower than "Expected" therefore means some IDs
# occurred more than once and later entries replaced earlier ones.
print("\n")
print("Expected theoretical CQs:", cq_stats_result["theoretical"]["count"], "-> Found:", len(theoretical_cqs))
print("Expected LLM CQs:", cq_stats_result["llm_generated"]["count"], "-> Found:", len(llm_cqs))


Duplicate LLM CQ ID found: TRUMP_240_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_3_L
Duplicate LLM CQ ID found: TRUMP_240_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_0_L
Duplicate theoretical CQ ID found: TRUMP_240_2_T__7


Expected theoretical CQs: 993 -> Found: 992
Expected LLM CQs: 3143 -> Found: 3141


In [9]:
# Aggregated CQ statistics. Note that "percentage_useful" holds a fraction in [0, 1]
# (Useful count / total count, rounded to two decimals), not a 0-100 percentage.
cq_stats_result

{'total_cqs': 4136,
 'theoretical': {'count': 993,
  'labels': {'Useful': 415, 'Unhelpful': 394, 'Invalid': 184},
  'percentage_useful': 0.42},
 'llm_generated': {'count': 3143,
  'labels': {'Useful': 2375, 'Unhelpful': 499, 'Invalid': 269},
  'percentage_useful': 0.76}}

### Overall stats per dataset

In [11]:
def _sorted_by_count(counts):
    """Return ``counts`` as a plain dict ordered by value, largest first.

    ``sorted`` is stable, so keys with equal counts keep their insertion order.
    """
    return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))


def _useful_ratio(labels_count, total):
    """Return the share of ``total`` labelled "Useful", rounded to 2 decimals.

    Yields 0.0 when ``total`` is 0 and treats a missing "Useful" key as zero.
    """
    if total <= 0:
        return 0.0
    return round(labels_count.get("Useful", 0) / total, 2)


def compute_stats(data):
    """Summarise authors, argumentation schemes and critical questions per dataset.

    Args:
        data: Mapping of intervention ID to a record. Each record must have a
            "dataset" key and may have "schemes" (iterable of scheme names) and
            "cqs" (iterable of dicts with "id" and "label"). The author is the
            part of the intervention ID before the first underscore.

    Returns:
        A dict keyed by dataset name, ordered by number of interventions
        (descending). Each value contains:
        - "num_interventions": number of records in the dataset.
        - "authors": {"num_authors", "authors_count", "avg_interventions_per_author"}.
        - "schemes": {"unique_schemes", "num_schemes", "schemes_count"}.
        - "cqs": {"total_cqs", "theoretical", "llm_generated"}, where each origin
          holds "count", "labels_count" (descending) and "percentage_useful"
          (Useful / count as a 0-1 fraction rounded to 2 decimals).

    Raises:
        KeyError: if a record lacks "dataset" or a CQ lacks "id" or "label".
    """
    stats = defaultdict(lambda: {
        "num_interventions": 0,
        "authors_count": defaultdict(int),
        "schemes_count": defaultdict(int),
        "cqs": {
            "total_cqs": 0,
            "theoretical": {"count": 0, "labels_count": defaultdict(int), "percentage_useful": 0.0},
            "llm_generated": {"count": 0, "labels_count": defaultdict(int), "percentage_useful": 0.0}
        }
    })

    for key, value in data.items():
        ds_stats = stats[value["dataset"]]
        author = key.split("_")[0]  # Extract author name

        ds_stats["num_interventions"] += 1
        ds_stats["authors_count"][author] += 1

        for scheme in value.get("schemes", []):
            ds_stats["schemes_count"][scheme] += 1

        # Process Critical Questions (CQs): "_T_" IDs are theoretical, otherwise
        # "_LLM_" IDs are LLM-generated; anything else only counts towards the total.
        cq_totals = ds_stats["cqs"]
        for cq in value.get("cqs", []):
            cq_totals["total_cqs"] += 1
            cq_id = cq["id"]
            label = cq["label"]

            if "_T_" in cq_id:
                origin = cq_totals["theoretical"]
            elif "_LLM_" in cq_id:
                origin = cq_totals["llm_generated"]
            else:
                continue

            origin["count"] += 1
            origin["labels_count"][label] += 1

    # Convert defaultdicts to regular dicts and add final stats
    result = {}
    for ds, ds_data in stats.items():
        authors_count = _sorted_by_count(ds_data["authors_count"])
        num_authors = len(authors_count)
        avg_interventions = ds_data["num_interventions"] / num_authors if num_authors > 0 else 0

        schemes_count = _sorted_by_count(ds_data["schemes_count"])

        # Each origin's ratio is guarded by its OWN count, and a missing "Useful"
        # label counts as zero instead of raising KeyError.
        for origin_key in ("theoretical", "llm_generated"):
            origin = ds_data["cqs"][origin_key]
            origin["labels_count"] = _sorted_by_count(origin["labels_count"])
            origin["percentage_useful"] = _useful_ratio(origin["labels_count"], origin["count"])

        result[ds] = {
            "num_interventions": ds_data["num_interventions"],
            "authors": {
                "num_authors": num_authors,
                "authors_count": authors_count,
                "avg_interventions_per_author": avg_interventions
            },
            "schemes": {
                "unique_schemes": len(schemes_count),
                "num_schemes": sum(schemes_count.values()),
                "schemes_count": schemes_count
            },
            "cqs": ds_data["cqs"]
        }

    return dict(sorted(result.items(), key=lambda item: item[1]["num_interventions"], reverse=True))


# Build the per-dataset summary (datasets ordered by number of interventions,
# descending), print a heading, and display the nested result as the cell output.
stats_result = compute_stats(data)
print("Total stats on authors, argumentation schemes, and critical questions per dataset:\n")
stats_result


Total stats on authors, argumentation schemes, and critical questions per dataset:



{'US2016': {'num_interventions': 80,
  'authors': {'num_authors': 3,
   'authors_count': {'TRUMP': 43, 'CLINTON': 34, 'HOLT': 3},
   'avg_interventions_per_author': 26.666666666666668},
  'schemes': {'unique_schemes': 18,
   'num_schemes': 342,
   'schemes_count': {'Example': 74,
    'CauseToEffect': 45,
    'Consequences': 36,
    'PracticalReasoning': 34,
    'VerbalClassification': 25,
    'Sign': 24,
    'CircumstantialAdHominem': 22,
    'GenericAdHominem': 15,
    'Values': 10,
    'PositionToKnow': 10,
    'Analogy': 7,
    'FearAppeal': 7,
    'DangerAppeal': 7,
    'PopularOpinion': 6,
    'Alternatives': 6,
    'PopularPractice': 6,
    'ExpertOpinion': 4,
    'Bias': 4}},
  'cqs': {'total_cqs': 2121,
   'theoretical': {'count': 722,
    'labels_count': {'Unhelpful': 283, 'Useful': 270, 'Invalid': 169},
    'percentage_useful': 0.37},
   'llm_generated': {'count': 1399,
    'labels_count': {'Useful': 1117, 'Unhelpful': 166, 'Invalid': 116},
    'percentage_useful': 0.8}}},
 '

### Critical questions in depth

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from transformers import AutoTokenizer
from huggingface_hub import login
# Authenticate with the Hugging Face Hub.
# SECURITY: an access token had been pasted into a comment on this line. It has been
# removed and must be revoked/rotated. Never store tokens in the notebook; supply
# them interactively or through the environment instead.
login()

In [55]:
# Model identifiers whose tokenizers are used to measure CQ token lengths; each
# entry is passed to AutoTokenizer.from_pretrained by compute_token_stats_for_models.
models_list = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "Qwen/Qwen2.5-7B-Instruct"
]

def compute_token_stats_for_models(cq_dict, models_list):
    """Measure CQ lengths in tokens under each model's tokenizer.

    Args:
        cq_dict: Mapping of CQ ID to a dict whose "cq" entry is the question text.
        models_list: Iterable of model identifiers accepted by
            ``AutoTokenizer.from_pretrained``.

    Returns:
        dict mapping each model identifier to {"max", "min", "avg"} token counts,
        with "avg" rounded to two decimals. When ``cq_dict`` is empty, every
        model maps to {"max": 0, "min": 0, "avg": 0.0}.
    """
    summary = {}

    for model_name in models_list:
        print(f"Processing {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # NOTE(review): the tokenizer is called with its defaults, so any special
        # tokens it adds are included in the counts - confirm this is intended.
        lengths = []
        for entry in cq_dict.values():
            lengths.append(len(tokenizer(entry["cq"])["input_ids"]))

        if lengths:
            summary[model_name] = {
                "max": max(lengths),
                "min": min(lengths),
                "avg": round(sum(lengths) / len(lengths), 2)
            }
        else:
            # Nothing to measure: report zeros rather than failing on max()/min().
            summary[model_name] = {"max": 0, "min": 0, "avg": 0.0}

    return summary

# Token-length statistics per tokenizer, first for the theoretical CQs and then
# for the LLM-generated ones (evaluated in that order).
theoretical_token_stats, llm_token_stats = [
    compute_token_stats_for_models(cq_group, models_list)
    for cq_group in (theoretical_cqs, llm_cqs)
]

# Report one line per model for each CQ origin
print("\n\n")
print("Theoretical CQ Token Stats:")
for model, stats in theoretical_token_stats.items():
    print(f"{model}: {stats}")

print("\nLLM-Generated CQ Token Stats:")
for model, stats in llm_token_stats.items():
    print(f"{model}: {stats}")


Processing meta-llama/Meta-Llama-3-8B-Instruct...
Processing deepseek-ai/DeepSeek-R1-Distill-Llama-8B...
Processing mistralai/Mixtral-8x7B-Instruct-v0.1...
Processing Qwen/Qwen2.5-7B-Instruct...
Processing meta-llama/Meta-Llama-3-8B-Instruct...
Processing deepseek-ai/DeepSeek-R1-Distill-Llama-8B...
Processing mistralai/Mixtral-8x7B-Instruct-v0.1...
Processing Qwen/Qwen2.5-7B-Instruct...



Theoretical CQ Token Stats:
meta-llama/Meta-Llama-3-8B-Instruct: {'max': 65, 'min': 9, 'avg': 27.01}
deepseek-ai/DeepSeek-R1-Distill-Llama-8B: {'max': 65, 'min': 9, 'avg': 27.01}
mistralai/Mixtral-8x7B-Instruct-v0.1: {'max': 73, 'min': 9, 'avg': 28.8}
Qwen/Qwen2.5-7B-Instruct: {'max': 64, 'min': 8, 'avg': 26.11}

LLM-Generated CQ Token Stats:
meta-llama/Meta-Llama-3-8B-Instruct: {'max': 72, 'min': 9, 'avg': 28.9}
deepseek-ai/DeepSeek-R1-Distill-Llama-8B: {'max': 72, 'min': 9, 'avg': 28.9}
mistralai/Mixtral-8x7B-Instruct-v0.1: {'max': 79, 'min': 9, 'avg': 31.55}
Qwen/Qwen2.5-7B-Instruct: {'max': 71, '

In [47]:
def cluster_cqs_with_labels(cq_dict, num_clusters=5):
    """Group CQ texts with K-Means over TF-IDF features and summarise labels per cluster.

    Args:
        cq_dict: Mapping of CQ ID to a dict with "cq" (question text) and "label".
            Labels are tallied into fixed 'Useful' / 'Unhelpful' / 'Invalid'
            counters, so any other label value raises KeyError.
        num_clusters: Number of K-Means clusters to fit.

    Returns:
        dict mapping each cluster index in ``range(num_clusters)`` to
        {"questions": [...], "metadata": {"tot_questions", "labels",
        "percentage_useful"}}, where "percentage_useful" is the Useful share as a
        0-1 fraction rounded to two decimals (left at 0.0 for an empty cluster).
    """
    entries = list(cq_dict.values())
    questions = [entry["cq"] for entry in entries]
    annotations = [entry["label"] for entry in entries]

    # Vectorise the questions (TF-IDF) and assign each one to a cluster;
    # the fixed random_state keeps the assignment reproducible.
    tfidf_matrix = TfidfVectorizer().fit_transform(questions)
    assignments = KMeans(n_clusters=num_clusters, random_state=42, n_init=10).fit_predict(tfidf_matrix)

    # Start every cluster with empty contents and zeroed counters
    clustered_cqs = {}
    for index in range(num_clusters):
        clustered_cqs[index] = {
            "questions": [],
            "metadata": {
                "tot_questions": 0,
                "labels": {'Useful': 0, 'Unhelpful': 0, "Invalid": 0},
                "percentage_useful": 0.0,
            },
        }

    for question, annotation, cluster_id in zip(questions, annotations, assignments):
        bucket = clustered_cqs[cluster_id]
        bucket["questions"].append(question)
        bucket["metadata"]["tot_questions"] += 1
        bucket["metadata"]["labels"][annotation] += 1

    for bucket in clustered_cqs.values():
        meta = bucket["metadata"]
        if meta["tot_questions"] > 0:
            meta["percentage_useful"] = round(meta["labels"]["Useful"] / meta["tot_questions"], 2)

    return clustered_cqs


In [50]:
# Theoretical CQs: cluster into 10 groups, then summarise each group with its
# size, label distribution, Useful share and up to three sample questions.
theoretical_clusters = cluster_cqs_with_labels(theoretical_cqs, num_clusters=10)

for cluster_id in theoretical_clusters:
    cluster_data = theoretical_clusters[cluster_id]
    print(f"\n=== Cluster {cluster_id} ===")
    print(f"Total Questions: {cluster_data['metadata']['tot_questions']}")
    print(f"Label Distribution: {cluster_data['metadata']['labels']}")
    print(f"Percentage Useful: {cluster_data['metadata']['percentage_useful'] * 100:.0f}%")
    print("\nSample Questions:")
    for question in cluster_data["questions"][:3]:
        print(f"- {question}")



=== Cluster 0 ===
Total Questions: 173
Label Distribution: {'Useful': 72, 'Unhelpful': 62, 'Invalid': 39}
Percentage Useful: 42%

Sample Questions:
- If Donald insults Muslims, will they not be on the front lines anymore? What evidence supports this claim? And how likely are the consequences?
- If Donald Trump insults Muslims abroad and at home, will they not cooperate with us and provide information that we can't get elsewhere? What evidence supports this claim? How likely are the consequences?
- How strong is the generalization that if Clinton achieved putting together a coalition to impose tough sanctions on Iran, then the USA would drive Iranians to the negotiation table?

=== Cluster 1 ===
Total Questions: 110
Label Distribution: {'Useful': 71, 'Unhelpful': 29, 'Invalid': 10}
Percentage Useful: 65%

Sample Questions:
- Could working more closely with the USA's allies have consequences that we should take into account? Is it practically possible?
- What other consequences should a

In [57]:
# LLM-generated CQs: cluster into 10 groups and summarise each one.
# The result is stored under its own name so that it no longer overwrites the
# theoretical clustering held in `theoretical_clusters`.
llm_clusters = cluster_cqs_with_labels(llm_cqs, num_clusters=10)

for cluster_id, cluster_data in llm_clusters.items():
    print(f"\n=== Cluster {cluster_id} ===")
    print(f"Total Questions: {cluster_data['metadata']['tot_questions']}")
    print(f"Label Distribution: {cluster_data['metadata']['labels']}")
    print(f"Percentage Useful: {cluster_data['metadata']['percentage_useful'] * 100:.0f}%")
    print("\nSample Questions:")
    for question in cluster_data["questions"][:3]:
        print(f"- {question}")



=== Cluster 0 ===
Total Questions: 304
Label Distribution: {'Useful': 232, 'Unhelpful': 40, 'Invalid': 32}
Percentage Useful: 76%

Sample Questions:
- What specific intelligence benefits have been gained from working with European and Middle Eastern allies in the past, and how do these benefits justify increased cooperation?
- What specific plans do you have to address the root causes of income inequality, and how would you measure the success of these plans?
- What is the track record of the speaker in implementing similar policies in the past, and what were the results?

=== Cluster 1 ===
Total Questions: 461
Label Distribution: {'Useful': 299, 'Unhelpful': 100, 'Invalid': 62}
Percentage Useful: 65%

Sample Questions:
- How does Clinton's proposal compare to Trump's proposal in terms of their potential impact on the economy, and what are the key differences between their approaches?
- How do Clinton's statements about NATO and Iran relate to the broader topic of discussion, and what