## Make New Samples to Grade

In [6]:
import json

# Load your initial JSON data and the graded dataset
with open('data/filtered_utterances.json', 'r') as file:
    merged_json = json.load(file)

# Define the list of markers
with open('data/short_prm_list.json', 'r') as merge_file:
    markers = json.load(merge_file)

# with open('data/filtered_utterances_samples_MW_HNEB_MERGE.json', 'r') as merge_file:
#     graded_data = json.load(merge_file)

with open('data/new_filtered_data_combined.json', 'r') as merge_file:
    graded_data = json.load(merge_file)

# Extract statements from graded data for exclusion
graded_statements = {item['statement'] for item in graded_data}
print(len(graded_statements))

798


In [7]:
# Initialize the results list
filtered_data = []
i = 0
j=0
# Filter the data
for sample in merged_json:
    # Check if the statement has been graded already
    if sample['statement'] not in graded_statements:
        j+=1
        # Filter matched_terms by markers
        filtered_terms = {key: value for key, value in sample['matched_terms'].items() if key in markers}
        
        # Only add the sample if there are any filtered terms
        if filtered_terms:
            new_sample = sample.copy()
            new_sample['matched_terms'] = filtered_terms
            filtered_data.append(new_sample)
        if j%100==0:
            print(f"Filtered {j} samples: {new_sample}")
    else:
        i +=1
        if i >90:
            # if i%10==0:
            print(f"Graded Statement Skipped {i}: ", sample['statement'])

# Output the filtered results as JSON
with open('data/new_filtered_utterances.json', 'w') as output_file:
    json.dump(filtered_data, output_file, indent=2)


Filtered 100 samples: {'transcript_id': 'CNN-85093', 'matched_terms': {'kind of': 'ungraded'}, 'previous_statement': 'Accenture does not disclose how much they pay in U.S. taxes, and the U.S. Visit was technically awarded to Accenture LLP. Accenture LLP falls under Accenture Incorporated, and Accenture Incorporated is a branch of the Accenture Company in Bermuda -- Lou.', 'statement': '<KIND OF> convoluted for a company that means to streamline an operation on border security, Lisa.'}
Filtered 200 samples: {'transcript_id': 'CNN-394456', 'matched_terms': {'think': 'ungraded', 'could': 'ungraded', 'maybe': 'ungraded', 'should': 'ungraded', 'around': 'ungraded', 'fact': 'ungraded', 'clear': 'ungraded'}, 'previous_statement': 'Should these sick people be going to work?', 'statement': "Well, there is no question that if you are diagnosed with coronavirus, you <SHOULD> not be going to work and be <AROUND> people because you <COULD> infect them. And, in <FACT>, that's what we say for people 

In [10]:
import math
import random
import json

# Number of samples per file
random.shuffle(filtered_data)

samples_per_file = 100
total_samples = len(filtered_data)
total_files = math.ceil(total_samples / samples_per_file)

file_num = 3
# Generate a list of indices in random order
indices = list(range(file_num))

# Split data into chunks and save to separate files using shuffled indices
for i in indices:
    start_index = i * samples_per_file
    end_index = start_index + samples_per_file
    chunk = filtered_data[start_index:end_index]
    
    # Generate filename based on the chunk number
    filename = f'data/new_filtered_utterances_to_grade/authority_markers_HN_part_{i}.json'
    
    # Save the chunk to a file
    with open(filename, 'w') as file:
        json.dump(chunk, file, indent=2)
    filename = f'data/new_filtered_utterances_to_grade/authority_markers_MW_part_{i}.json'
    
    # Save the chunk to a file
    with open(filename, 'w') as file:
        json.dump(chunk, file, indent=2)

print(f"Data split into {file_num} files with up to {samples_per_file} samples each.")

Data split into 3 files with up to 100 samples each.


In [None]:
import json
import os

def load_json(filename):
    with open(filename, 'r') as file:
        return json.load(file)

def combine_data(HN_part, MW_part):
    combined_data = []
    for hn, mw in zip(HN_part, MW_part):
        if hn['transcript_id'] == mw['transcript_id'] and hn['statement'] == mw['statement']:
            combined_terms = {}
            graders_count = 0
            for term, value in hn['matched_terms'].items():
                mw_value = mw['matched_terms'].get(term, 'ungraded')  # default to 'ungraded' if not found
                unique_values = {value, mw_value} - {"ungraded"}
                graders_count += len(unique_values)
                combined_terms[term] = list(unique_values)
            combined_data.append({
                "transcript_id": hn["transcript_id"],
                "matched_terms": combined_terms,
                "graders": graders_count,
                "previous_statement": hn["previous_statement"],
                "statement": hn["statement"]
            })
    return combined_data

base_path = 'data/graded_new/'
all_combined_data = []

# Process files from part 1 to 20
for i in range(1, 21):
    hn_file = os.path.join(base_path, f'filtered_data_HN_part_{i}.json')
    mw_file = os.path.join(base_path, f'filtered_data_MW_part_{i}.json')

    # Load data if both files exist
    if os.path.exists(hn_file) and os.path.exists(mw_file):
        HN_part = load_json(hn_file)
        MW_part = load_json(mw_file)
        combined_part = combine_data(HN_part, MW_part)
        all_combined_data.extend(combined_part)

# Save the combined data to a file
with open('data/new_data/combined_data.json', 'w') as file:
    json.dump(all_combined_data, file, indent=4)

print("Data combined and saved successfully.")


### Here's where the annotated samples merge

In [9]:
from util import group_by_key
import json
from collections import defaultdict


file_comparison = ['HNEB', 'MW_merge']

with open(f"data/filtered_utterances_sample_{file_comparison[0]}.json", 'r') as file1:
    HN_part = json.load(file1)
with open(f"data/filtered_utterances_sample_{file_comparison[1]}.json", 'r') as file2:
    MW_part = json.load(file2)
    
for i in range(3, 10):
    with open(f'data/graded_new/filtered_data_HN_part_{i}.json', 'r') as file:
        HN_part += json.load(file)

    with open(f'data/graded_new/filtered_data_MW_part_{i}.json', 'r') as file:
        MW_part += json.load(file)
    print(f"Appended part {i}")

grouped_json1 = group_by_key(HN_part)
grouped_json2 = group_by_key(MW_part)

# Finding common keys
common_keys = set(grouped_json1.keys()).intersection(grouped_json2.keys())

# Merging matched terms for common keys
merged_json = []
for key in common_keys:
    combined_matched_terms = defaultdict(list)
    for item in grouped_json1[key] + grouped_json2[key]:
        for term, value in item['matched_terms'].items():
            if value not in combined_matched_terms[term]:
                combined_matched_terms[term].append(value)
                
    
    # Create a new entry for each common key with merged matched terms
    new_entry = {
        "transcript_id": key[0],
        "matched_terms": dict(combined_matched_terms),
        "previous_statement": grouped_json1[key][0]["previous_statement"],  # Example, using the first found
        "statement": key[1]
    }
    merged_json.append(new_entry)

# Print or process the resulting merged_json
# print(merged_json)

# Output the sampled data as a JSON file
with open('data/graded_new/new_filtered_data_combined.json', 'w') as file:
    json.dump(merged_json, file, indent=4)

Appended part 3
Appended part 4
Appended part 5
Appended part 6
Appended part 7
Appended part 8
Appended part 9


In [18]:
mismatch_json = []

# Initialize counters
total_samples = len(merged_json)
total_matched_terms = 0
total_length_one = 0
total_length_more_than_one = 0
all_length_one = 0
any_length_more_than_one = 0
actual_prm = 0
statement_with_match = 0
authority = 0
hedge = 0
none_match = 0
# Analysis of each sample
for sample in merged_json:
    matched_terms = sample['matched_terms']
    sample_terms_count = len(matched_terms)
    total_matched_terms += sample_terms_count
    sample_contains_match = 0

    length_one_count = False
    length_more_than_one_count = 0

    for term, values in matched_terms.items():
        if len(values) == 1:
            length_one_count = True
            total_length_one += 1
            length_one_count += 1
            if values[0] == "hedge":
                actual_prm+=1
                hedge +=1
            if values[0] == "authority":
                actual_prm+=1
                authority +=1
            if values[0] == "none":
                none_match +=1
        elif len(values) > 1:
            total_length_more_than_one += 1
            length_more_than_one_count += 1
            
    if length_more_than_one_count > 0:
        mismatch_json.append(sample)

    # Check if all terms in a sample have lists of length 1
    if length_one_count == sample_terms_count:
        all_length_one += 1
    
    # Check if any term in a sample has a list longer than 1
    if length_more_than_one_count > 0:
        any_length_more_than_one += 1
    
    if length_one_count:
        statement_with_match+=1



# Printing the results
print("Total number of graded samples: ", total_samples)
print("Total number of matched terms:  ", total_matched_terms)
print("Total number of statement with match:", statement_with_match)
print(f"Terms per sample utterance:    {total_matched_terms / total_samples:.2f}")
print(f"Perfect Match Terms:           {round(100*total_length_one / total_matched_terms, 1)}%  ({total_length_one}/{total_matched_terms})")
print(f"PrM Positive Match Count:      {actual_prm}")
print(f"Hedge Match Count:                {hedge}")
print(f"Authority Match Count:            {authority}")
print(f"None Match Count:              {none_match}")
print(f"Total Match Statements:        {round(100*all_length_one / total_samples, 1)}%  ( {all_length_one}/ {total_samples})")
# print("Total samples where at least one matched term has list length > 1:", any_length_more_than_one)

Total number of graded samples:  800
Total number of matched terms:   1895
Total number of statement with match: 599
Terms per sample utterance:    2.37
Perfect Match Terms:           61.3%  (1161/1895)
PrM Positive Match Count:      506
Hedge Match Count:                441
Authority Match Count:            65
None Match Count:              655
Total Match Statements:        20.0%  ( 160/ 800)


In [11]:
import json
import uuid

# Load the JSON data from a file
with open("data/graded_new/new_filtered_data_combined.json", 'r') as file1:
    data = json.load(file1)  # Changed variable name from 'json' to 'data'
final_list = []
# Iterate through each item in the data list
for item in data:
    # Print each term in the 'matched_terms' dictionary
    PrMs = {}
    for key, value in item['matched_terms'].items():
        if len(value) == 1:
        #    print(key, value)
           PrMs[key] = {"correct": value[0]}
    # print(PrMs)
    if len(PrMs)>0:
        sample = {
            "transcript_id": item['transcript_id'],
            "statement_id": str(uuid.uuid4()),  # Generate a random UUID for the 'statement_id
            "matched_terms": PrMs,
            "previous_statement": item['previous_statement'],
            "statement": item['statement']
        }
        # print(sample)
        final_list.append(sample)

# Convert all sets to lists in the final_list
def convert_sets_to_lists(item):
    if isinstance(item, set):
        return list(item)
    elif isinstance(item, dict):
        return {key: convert_sets_to_lists(value) for key, value in item.items()}
    elif isinstance(item, list):
        return [convert_sets_to_lists(element) for element in item]
    else:
        return item

final_list_converted = [convert_sets_to_lists(item) for item in final_list]

with open('data/human_annotated_dataset.json', 'w') as file:
    json.dump(final_list_converted, file, indent=4)
    
print("Final Length:", len(final_list_converted))

Final Length: 599


## Update Model Responses File to add in new samples

In [13]:
import json

# Step 1: Read the existing data
with open('data/human_annotated_model_response.json', 'r') as f:
    existing_data = json.load(f)

# Step 2: Read the new data
with open('data/human_annotated_dataset.json', 'r') as f:
    new_data = json.load(f)

# Step 3: Create a function to match and merge records
def merge_records(existing, new):
    existing_lookup = {(item['transcript_id'], item['previous_statement']): item for item in existing}
    
    for new_item in new:
        key = (new_item['transcript_id'], new_item['previous_statement'])
        if key in existing_lookup:
            asdfsdaf = 0
            # Update the existing record with the new data (assuming the new data has the same format)
            
        else:
            # If the new item doesn't exist in the existing data, add it
            existing.append(new_item)
    
    return existing

# Step 4: Merge the records
updated_data = merge_records(existing_data, new_data)

# Step 5: Write the updated data back to the file
with open('data/human_annotated_model_response.json', 'w') as f:
    json.dump(updated_data, f, indent=4)

print("Data merge complete.")
print("Total Statements:", len(updated_data))


Data merge complete.
Total Statements: 599
