In [1]:
## How well do our datasets match?
import json
import re

# Dataset tags; each tag selects data/filtered_utterances_sample_<tag>.json.
filenames = ['Matt', 'MW', 'HNEB', 'MW_merge']
for filepath in filenames:
    # Despite its name, `filepath` holds only the dataset tag, not a path.
    print(filepath)
    with open(f"data/filtered_utterances_sample_{filepath}.json", 'r') as file:
        transcripts = json.load(file)
    print("   Total Transcripts:", len(transcripts))
    # An entry counts as fully graded when none of the values in its
    # "matched_terms" mapping is the placeholder "ungraded".
    filtered_data = [
        item
        for item in transcripts
        if "ungraded" not in item["matched_terms"].values()
    ]
    print("   Fully Graded Transcripts:", len(filtered_data))

Matt
   Total Transcripts: 1000
   Fully Graded Transcripts: 102
MW
   Total Transcripts: 246
   Fully Graded Transcripts: 98
HNEB
   Total Transcripts: 246
   Fully Graded Transcripts: 246
MW_merge
   Total Transcripts: 203
   Fully Graded Transcripts: 200


In [25]:
import json
# Tags of the two annotation files to compare; each tag selects
# data/filtered_utterances_sample_<tag>.json.
file_comparison = ['HNEB', 'MW_merge']


def _utterance_keys(records):
    """Return the set of (transcript_id, statement) pairs found in `records`."""
    return {(rec["transcript_id"], rec["statement"]) for rec in records}


with open(f"data/filtered_utterances_sample_{file_comparison[0]}.json", 'r') as file1:
    json1 = json.load(file1)
with open(f"data/filtered_utterances_sample_{file_comparison[1]}.json", 'r') as file2:
    json2 = json.load(file2)

# An utterance is identified by its (transcript_id, statement) pair, so the
# "IDs" reported below are pairs rather than bare transcript ids.
set1 = _utterance_keys(json1)
set2 = _utterance_keys(json2)

# Overlap between the two files, and what is exclusive to each of them.
matching_ids = set1 & set2
unique_in_json1 = set1 - set2
unique_in_json2 = set2 - set1

# Report the size and the full contents of each group.
print(f"Matching IDs ({len(matching_ids)}): {matching_ids}")
print(f"Unique in JSON1 ({len(unique_in_json1)}): {unique_in_json1}")
print(f"Unique in JSON2 ({len(unique_in_json2)}): {unique_in_json2}")

Matching IDs (100): {('CNN-67617', "Let's get away from the diplomatic wrangling here for a moment. Let's talk <ABOUT> the military front. Tommy Franks, head of the Central Command, yesterday at the Pentagon says the military is ready to go, is in position now, if <GIVEN> the order from the White House. To the Pentagon from yesterday, back again today, Barbara Starr for more on this -- Barbara, good morning to you. Turkey, we <KNOW>, not granting that permission to stage <ABOUT> 60,000 troops on its territory, which has the Pentagon scrambling right now. We are hearing that some movement, some aircraft carriers in the eastern Mediterranean <<COULD>> be on the move. What are we learning <ABOUT> this?"), ('CNN-84518', 'Sir, the training of the Geneva Convention is inherent <EVERY> time from as a recruit <ALL> the way up to my rank level. In terms of these M.P.s, as far as internment and resettlement, some of them received training at home station and the mob station and some did not. And

In [26]:
from util import group_by_key
from collections import defaultdict
# Group each file's entries by utterance. The resulting keys are used below as
# (transcript_id, statement) pairs.
# NOTE(review): key shape is inferred from key[0]/key[1] usage -- confirm in util.group_by_key.
grouped_json1 = group_by_key(json1)
grouped_json2 = group_by_key(json2)

# Only utterances present in both files can be compared.
common_keys = set(grouped_json1.keys()).intersection(grouped_json2.keys())

# Merge the labels both files assigned to each term of every shared utterance.
merged_json = []
# Iterate in sorted order: a set of string tuples has no stable iteration order
# across interpreter runs (hash randomization), which would otherwise reshuffle
# merged_json and the written file on every kernel restart.
# Assumes the key components are mutually comparable (strings) -- TODO confirm.
for key in sorted(common_keys):
    combined_matched_terms = defaultdict(list)
    for item in grouped_json1[key] + grouped_json2[key]:
        for term, value in item['matched_terms'].items():
            labels = combined_matched_terms[term]
            # Keep each distinct label once, in first-seen order; a term that
            # ends up with exactly one label was graded identically everywhere.
            if value not in labels:
                labels.append(value)

    # One merged entry per shared utterance.
    new_entry = {
        "transcript_id": key[0],
        "matched_terms": dict(combined_matched_terms),
        # Taken from the first entry of the first file for this utterance.
        "previous_statement": grouped_json1[key][0]["previous_statement"],
        "statement": key[1]
    }
    merged_json.append(new_entry)

# Persist the merged annotations as JSON.
with open('data/filtered_utterances_samples_MW_HNEB_MERGE.json', 'w') as file:
    json.dump(merged_json, file, indent=4)

In [32]:
# Agreement statistics over the merged annotations. Each sample's
# "matched_terms" maps a term to the list of distinct labels it received;
# a list of length 1 means every grading of that term agreed.
json_data = merged_json


def _percent(part, whole):
    """Return part/whole as a percentage rounded to 1 decimal; 0.0 when whole is 0."""
    return round(100 * part / whole, 1) if whole else 0.0


# Initialize counters
total_samples = len(json_data)
total_matched_terms = 0
total_length_one = 0            # terms with a single (agreed) label
total_length_more_than_one = 0  # terms with conflicting labels
all_length_one = 0              # samples in which every term agreed
any_length_more_than_one = 0    # samples with at least one conflicting term
actual_prm = 0                  # agreed terms labelled "hedge" or "authority"

# Analysis of each sample
for sample in json_data:
    matched_terms = sample['matched_terms']
    sample_terms_count = len(matched_terms)
    total_matched_terms += sample_terms_count

    length_one_count = 0
    length_more_than_one_count = 0

    for term, values in matched_terms.items():
        if len(values) == 1:
            total_length_one += 1
            length_one_count += 1
            if values[0] in ("hedge", "authority"):
                actual_prm += 1
        elif len(values) > 1:
            total_length_more_than_one += 1
            length_more_than_one_count += 1

    # Every term of this sample has exactly one label (vacuously true for a
    # sample with no terms).
    if length_one_count == sample_terms_count:
        all_length_one += 1

    # At least one term of this sample has conflicting labels.
    if length_more_than_one_count > 0:
        any_length_more_than_one += 1

# Guard the ratios so an empty merge (no shared utterances, or no matched
# terms) reports zeros instead of raising ZeroDivisionError.
terms_per_sample = total_matched_terms / total_samples if total_samples else 0.0

# Printing the results
print("Total number of samples:      ", total_samples)
print("Total number of matched terms:", total_matched_terms)
print(f"Terms per sample utterance:    {terms_per_sample:.2f}")
print(f"Perfect Match Terms:           {_percent(total_length_one, total_matched_terms)}%  ({total_length_one}/{total_matched_terms})")
print(f"PrM Positive Match Count:           {actual_prm}")
print(f"Total Match Statements:        {_percent(all_length_one, total_samples)}%  ( {all_length_one}/ {total_samples})")
# print("Total samples where at least one matched term has list length > 1:", any_length_more_than_one)

Total number of samples:       100
Total number of matched terms: 414
Terms per sample utterance:    4.14
Perfect Match Terms:           66.4%  (275/414)
PrM Positive Match Count:           65
Total Match Statements:        34.0%  ( 34/ 100)


In [4]:
from util import count_prms_in_utt, print_sorted_json
# Count PRM occurrences in the dialogue sample, then print a sorted count file.
# Paths handed to util.count_prms_in_utt, in order: PRM list, dialogue JSON,
# output file.
# NOTE(review): argument roles are inferred from the variable names -- confirm in util.
prms_filepath = 'data/prms.json'
large_json_filepath = 'data/news_dialogue_sample.json'
output_filepath = 'data/prms_sample_count.json'

# Run the count on the sample JSON (output path: data/prms_sample_count.json).
count_prms_in_utt(prms_filepath, large_json_filepath, output_filepath)

# Disabled full-dataset run; re-enabling it would target data/prms_full_count.json.
# large_json_filepath = 'data/news_dialogue.json'
# output_filepath = 'data/prms_full_count.json'
# count_prms_in_utt(prms_filepath, large_json_filepath, output_filepath)

# NOTE(review): the file printed below is the FULL count file, not the sample
# count produced by the call above. With the full run commented out, this cell
# depends on data/prms_full_count.json already existing from an earlier run.
input_filepath = 'data/prms_full_count.json'
# Print the contents of the count file via util.print_sorted_json.
print_sorted_json(input_filepath)

and: 462792
but: 447460
so: 419813
now: 395928
or: 391821
like: 360746
also: 340402
really: 322127
even: 297664
first: 296996
then: 287166
look: 267461
actually: 238482
next: 217319
well: 194280
listen: 83200
finally: 54925
totally: 35285
yes: 22851
oh: 14775
besides: 8540
un: 4137
genuinely: 3093
okay: 2047
hence: 1121
admittedly: 862
likewise: 662
er: 498
uh: 272
alright: 161
cr: 150
anyhow: 113
alternatively: 100
conversely: 95
em: 59
lol: 56
uhu: 1


In [22]:
from util import print_sample_json

# Show records from the dialogue sample file via util.print_sample_json.
# NOTE(review): sample_size=200 presumably caps how much is printed; its exact
# meaning is defined in util -- confirm. The printed output is very large.
print_sample_json('data/news_dialogue_sample.json', sample_size=200)

[
    {
        "id": "CNN-369804",
        "program": "CONNECT THE WORLD",
        "date": "2019-05-16",
        "url": "http://transcripts.cnn.com/TRANSCRIPTS/1905/16/ctw.01.html",
        "utt": [
            "You're watching CNN. This is CONNECT THE WORLD. I'm Becky Anderson. Welcome back, it's just half past seven here in the UAE. They call it the forgotten war but not for those living the bloodshed every single day. And it is important that you know about it. I'm talking about the conflict in Yemen, where today the Saudi-led coalition carried out several air strikes in what is the Houthi controlled capital of Sanaa. Now Saudi state-run news says forces hit a number of what it calls legitimate military targets. That's after Houthi militia claimed responsibility for the drone attack that targeted two oil pumping stations in Saudi Arabia. It's messy times. Early I spoke to Martin Griffiths who's the U.N. Special Counsel Envoy for Yemen to the Security Council and I got his take on h