# Explore the dataset

In [2]:
import os

# Sanity check: one line per dataset file, True when the file is present in
# the current working directory.
print(*map(os.path.exists, ("sample.json", "validation.json")), sep="\n")


True
True


In [3]:
import json
import pandas as pd
import os

# Map each dataset's display name to its path; both live next to the notebook,
# so name and path coincide.
file_paths = {name: name for name in ("sample.json", "validation.json")}

def load_json(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return the parsed content.

    Errors from ``open`` (e.g. missing file) and from the JSON parser
    (malformed content) propagate to the caller unchanged.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

# Parse every configured file once; keys are the display names from file_paths,
# values are whatever load_json returns for the matching path.
data = dict(
    (dataset_name, load_json(dataset_path))
    for dataset_name, dataset_path in file_paths.items()
)

# Build one summary record per intervention across all loaded files, then
# collect the records into a single DataFrame for exploration.
exploration_data = []

for file_name, content in data.items():
    for intervention_id, details in content.items():
        # Word count = number of whitespace-separated tokens in the text.
        intervention_text = details.get("intervention", "")
        num_words = len(intervention_text.split())

        # Extract number of CQs and their label distribution.
        cqs = details.get("cqs", [])
        num_cqs = len(cqs)
        # .get keeps this consistent with the other optional fields: a CQ
        # without a "label" key no longer aborts the whole pass with KeyError.
        # Such a CQ still counts toward num_cqs but toward none of the three
        # label buckets (same as any label outside Useful/Unhelpful/Invalid).
        labels = [cq.get("label") for cq in cqs]
        num_useful = labels.count("Useful")
        num_unhelpful = labels.count("Unhelpful")
        num_invalid = labels.count("Invalid")

        exploration_data.append({
            "File": file_name,
            "Intervention ID": intervention_id,
            "Dataset": details.get("dataset", "Unknown"),
            "Number of CQs": num_cqs,
            "Useful CQs": num_useful,
            "Unhelpful CQs": num_unhelpful,
            "Invalid CQs": num_invalid,
            "Total Words in Intervention": num_words,
            # Schemes are joined as-is, so repeated scheme names stay repeated.
            "Scheme Types": ", ".join(details.get("schemes", []))
        })

# Create DataFrame (one row per record, built in a single call)
df_exploration = pd.DataFrame(exploration_data)

# Display the dataframe
display(df_exploration)


Unnamed: 0,File,Intervention ID,Dataset,Number of CQs,Useful CQs,Unhelpful CQs,Invalid CQs,Total Words in Intervention,Scheme Types
0,sample.json,CLINTON_1_1,US2016,32,12,3,17,149,"PracticalReasoning, PracticalReasoning, Practi..."
1,sample.json,Javier_84,rrd,19,8,11,0,179,ERPracticalReasoning
2,sample.json,TRUMP_125_1,US2016,46,23,11,12,201,"Example, Example, Sign, Sign, VerbalClassifica..."
3,sample.json,travellots_133_1,rrd,18,10,4,4,172,"ERPracticalReasoning, ERPracticalReasoning"
4,sample.json,Zewstain__641,us2016reddit,18,9,9,0,34,Example
...,...,...,...,...,...,...,...,...,...
186,validation.json,CLINTON_1_1,US2016,32,12,3,17,149,"PracticalReasoning, PracticalReasoning, Practi..."
187,validation.json,TRUMP_99,US2016,30,17,10,3,199,"VerbalClassification, Example, VerbalClassific..."
188,validation.json,TRUMP_174_1,US2016,22,11,4,7,184,"CircumstantialAdHominem, GenericAdHominem, Pos..."
189,validation.json,TRUMP_112,US2016,27,15,3,9,77,"VerbalClassification, CircumstantialAdHominem,..."


In [10]:
# Rows per source file: summing a boolean mask counts the matching rows.
num_interventions_sam = df_exploration["File"].eq("sample.json").sum()
num_interventions_val = df_exploration["File"].eq("validation.json").sum()

print("Number of sample examples", num_interventions_sam)
print("Number of validation examples", num_interventions_val)

Number of sample examples 5
Number of validation examples 186


In [11]:
# Share of each label among all CQs, pooled over every row of df_exploration.
# NOTE(review): the displayed frame lists CLINTON_1_1 under both sample.json
# and validation.json with identical counts; if the two files overlap, these
# pooled totals count the shared interventions twice -- confirm.
total_cqs = df_exploration["Number of CQs"].sum()

if total_cqs:
    useful_ratio = df_exploration["Useful CQs"].sum() / total_cqs
    unhelpful_ratio = df_exploration["Unhelpful CQs"].sum() / total_cqs
    invalid_ratio = df_exploration["Invalid CQs"].sum() / total_cqs
else:
    # No CQs at all: report 0 rather than dividing by zero.
    useful_ratio = unhelpful_ratio = invalid_ratio = 0

print(f"Ratio of Useful CQs: {useful_ratio:.2%}")
print(f"Ratio of Unhelpful CQs: {unhelpful_ratio:.2%}")
print(f"Ratio of Invalid CQs: {invalid_ratio:.2%}")


Ratio of Useful CQs: 66.81%
Ratio of Unhelpful CQs: 21.81%
Ratio of Invalid CQs: 11.38%
