In [None]:
#import packages
import ast
import swifter
import matplotlib.pyplot as plt
import numpy as np
import ipywidgets
import pandas as pd
import json
import random

This notebook extracts the output of the Burnham DEBATE runs and processes it in several ways, both for further analysis and for annotation in Label Studio.

In [None]:
#notes:
#Blame and praise
# entailments
#does not entail an actual entailment

#Blame vs endorsement
#Also report the initial inspection of the model where praise and neutral were added to the hypothesis parameter instead of blame/not blame.
# something about hierarchical order and the words not being complete opposites, and therefore the relative probabilities carry needed information
# in addition to the absolute probability

In [None]:
#define functions


def extract_blame_from_paragraph_lookup(input_str, threshold=0.8):
    """
    Turn the per-sentence label scores of one paragraph into binary blame flags.

    Parameters
    ----------
    input_str : str or list
        Either a stringified Python list (as stored in the CSV) or an already
        parsed list/tuple, holding one dict per sentence. Each dict must have the
        parallel sequences 'labels' and 'scores' (in any order) and must contain
        the label 'blame'.
    threshold : float, default 0.8
        Minimum blame score required for a sentence to count as blame.

    Returns
    -------
    list of int
        One entry per sentence: 1 if the blame score is >= `threshold` and also
        >= the 'praise' and 'neutral' scores (a missing label counts as 0.0, ties
        count as blame), otherwise 0. An empty list is returned when the input
        cannot be parsed or is neither a string nor a list/tuple (e.g. NaN).

    Raises
    ------
    KeyError
        If a sentence dict lacks 'labels', 'scores' or the 'blame' label.
    """
    if isinstance(input_str, str):
        try:
            sentence_list = ast.literal_eval(input_str)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # These are the errors ast.literal_eval raises on malformed input.
            return []
    elif isinstance(input_str, (list, tuple)):
        # Already parsed (e.g. the column was never round-tripped through CSV).
        sentence_list = input_str
    else:
        return []

    blame_binary = []
    for sent in sentence_list:
        # Pair labels with scores so the lookup does not depend on label order.
        label_score = dict(zip(sent['labels'], sent['scores']))
        # Blame must beat both competing labels and the absolute threshold.
        bar = max(label_score.get('praise', 0.0), label_score.get('neutral', 0.0), threshold)
        blame_binary.append(int(label_score['blame'] >= bar))

    return blame_binary

#example usage: final_data['blame_binary'] = final_data['blame_in_text'].swifter.apply(extract_blame_from_paragraph_lookup)


#get row indices (paragraphs) that contain blame
def get_rows_with_blame(df, col="blame_binary"):
    """
    Collect the index labels of all rows whose `col` entry contains a 1.

    Each cell may be a real list or its string representation; strings are
    parsed with ast.literal_eval before the membership check.
    """
    def _parsed(cell):
        # Only strings need converting; anything else is used as-is.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    return [
        row_label
        for row_label, cell in zip(df.index, df[col])
        if 1 in _parsed(cell)
    ]

#example usage: row_indices = get_rows_with_blame(final_data, col="blame_binary")
#print(row_indices[:10])


#get paragraphs and sentence indices of blame True
def get_rows_and_positions(df, col="blame_binary"):
    """
    Map each row index label to the positions at which its `col` list equals 1.

    Cells may be real lists or stringified lists. Rows without any 1 are left
    out of the returned dictionary.
    """
    hits = {}
    for row_label, cell in zip(df.index, df[col]):
        flags = ast.literal_eval(cell) if isinstance(cell, str) else cell
        positions = [pos for pos, flag in enumerate(flags) if flag == 1]
        if not positions:
            continue
        hits[row_label] = positions
    return hits

#example usage: rows_with_positions = get_rows_and_positions(final_data, col="blame_binary")
#print(rows_with_positions)

#
#get danish sentences containing blame from indices extracted as above

def danish_sentences_with_blame_extraction(dict, data, text_column):
    """
    Look up the sentences referenced by a {row_label: [sentence_positions]} mapping.

    dict: mapping from row label to the sentence positions to extract
          (the shape produced by get_rows_and_positions / list_to_dict)
    data: DataFrame whose `text_column` holds stringified lists of sentences
    text_column: name of the column with the segmented text

    Returns {row_label: {sentence_position: sentence}}.
    """
    extracted = {}
    for row_label, positions in dict.items():
        # The segmented text is stored as the string repr of a list.
        segmented = ast.literal_eval(data.loc[row_label][text_column])
        extracted[row_label] = {pos: segmented[pos] for pos in positions}
    return extracted


#example usage: danish_sentences_with_blame = danish_sentences_with_blame_extraction(rows_with_positions, final_data, 'da_segmented_text')


#extract blame percentage:

import ast

def total_blame_percentage(string_rows):
    """
    Calculate the overall percentage of blame sentences across all rows.

    Parameters
    ----------
    string_rows : iterable of list or str
        Each element is one row of binary blame labels, either as a real list
        (e.g. [1, 0, 1]) or as its string representation ('[1, 0, 1]').
        Elements that are not lists, cannot be parsed, or contain values that
        cannot be converted to int are skipped.

    Returns
    -------
    tuple (float, int, int)
        (percentage of blame sentences in the range 0-100,
         total number of sentences, total number of blame sentences).
        When no valid sentences are found the result is (0.0, 0, 0), so callers
        can always unpack three values.
    """
    total_sentences = 0
    total_blame = 0

    for row in string_rows:
        try:
            if isinstance(row, str):
                row = ast.literal_eval(row)  # convert string to list
            if not isinstance(row, list):
                continue  # skip if not a list
            row = [int(val) for val in row]  # ensure integers
        except (ValueError, SyntaxError, TypeError):
            continue  # skip invalid rows
        total_sentences += len(row)
        total_blame += sum(row)

    if total_sentences == 0:
        # Same shape as the normal return value; avoids an unpacking error.
        return 0.0, 0, 0

    return (total_blame / total_sentences) * 100, total_sentences, total_blame

#Example usage: percentage_blame, total_sent, total_blame= total_blame_percentage(final_data['blame_binary'])

#extract blame scores

def get_blame_scores(data, blame_in_text_column = 'blame_in_text'):
    """
    Collect the blame score of every sentence in every row into one flat list.

    data: DataFrame whose `blame_in_text_column` holds stringified lists with one
          dict per sentence; each dict has parallel 'labels' and 'scores' lists
          that include the label 'blame'.
    blame_in_text_column: name of the column to read.

    Returns a list of blame scores in row order, then sentence order.
    """
    all_blame_scores = []

    # Iterate over the column values directly so the function does not depend
    # on the DataFrame having a default 0..n-1 index.
    for cell in data[blame_in_text_column]:
        for sent in ast.literal_eval(cell):
            label_score = dict(zip(sent['labels'], sent['scores']))
            all_blame_scores.append(label_score['blame'])

    return all_blame_scores


#make vizualization of the distribution

def vizualize_blame_prob(blame_scores):
    """
    Plot histograms of blame probabilities, one row of subplots per template.

    blame_scores: sequence of score lists, one list per template. For each
                  template the left panel uses a log-scaled count axis and the
                  right panel uses absolute counts.
    """
    len_blame_scores = len(blame_scores)
    # squeeze=False keeps `axs` two-dimensional even when there is only one
    # template; otherwise axs[i, 0] fails for a single row of subplots.
    fig, axs = plt.subplots(len_blame_scores, 2, figsize=(12, 10), squeeze=False)
    for i in range(len_blame_scores):
        axs[i,0].hist(blame_scores[i], log = True, bins = 50)
        axs[i,1].hist(blame_scores[i], bins = 50)

    # Column headers only need to be set once, on the top row.
    axs[0, 0].set_title('Log transformed y axis')
    axs[0, 1].set_title('absolute y-axis')

    fig.suptitle('Distribution of blame probabilities by template (log transformed counts/absolute counts)')

    for ax in axs.flat:
        ax.set(xlabel='probability', ylabel='count')
    fig.show()


#change format from a list of (paragraph, sentence) pairs to a dictionary compatible with the other functions

def list_to_dict(pairs):
    """
    Group (key, value) pairs into {key: [values...]}, keeping the order in
    which the values were encountered.
    """
    grouped = {}
    for key, value in pairs:
        # setdefault creates the list the first time a key is seen.
        grouped.setdefault(key, []).append(value)
    return grouped

# Example usage: converted = list_to_dict(data)



def danish_sentences_without_blame_extraction(blame_dict, data, text_column):
    """
    Extract, for every row of `data`, the sentences that are NOT listed in
    `blame_dict`.

    blame_dict: {row_index: positions}; `positions` may be a list of sentence
                positions (as returned by get_rows_and_positions) or a dict
                keyed by sentence position (only the keys are used).
    data: DataFrame with a column containing lists of sentences (as lists or
          stringified lists)
    text_column: name of the column in `data` containing the text sentences

    Returns {row_index: {sentence_position: sentence}}; rows in which every
    sentence is listed in `blame_dict` are omitted.
    """

    sentences = {}

    for para in data.index:
        raw_text = data.loc[para][text_column]
        # Accept both real lists and their string representation.
        text_sentences = ast.literal_eval(raw_text) if isinstance(raw_text, str) else raw_text
        # A set makes the per-sentence membership test constant time.
        blame_positions = set(blame_dict.get(para, ()))

        non_blame_sentence_dict = {
            idx: sentence
            for idx, sentence in enumerate(text_sentences)
            if idx not in blame_positions  # only keep sentences not containing blame
        }

        if non_blame_sentence_dict:  # only add if there are non-blame sentences
            sentences[para] = non_blame_sentence_dict

    return sentences



#------------------------------------#
#json related functions


# Convert dictionary to a JSON string and write to file
def convert_to_json_and_write(file_name, sentences, output_dir='/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files'):
    """
    Flatten a {paragraph: {sentence_nr: text}} dictionary into a list of records
    and write it as `<output_dir>/<file_name>.json` for import into Label Studio.

    file_name: name of the output file, without the '.json' extension
    sentences: nested dictionary {paragraph: {sentence_nr: text}}
    output_dir: directory the file is written to (defaults to the project's
                json_files folder)

    Each record has the keys "paragraph", "sentence_nr" and "text". The
    paragraph and sentence numbers are stored as strings, because dictionary
    keys become strings when serialised to JSON.
    """
    # Round-trip through JSON in memory so keys are normalised exactly as they
    # would be after writing the dictionary to disk and loading it again.
    data = json.loads(json.dumps(sentences))

    flattened = [
        {
            "paragraph": paragraph,
            "sentence_nr": sentence_nr,
            "text": text
        }
        for paragraph, paragraph_sentences in data.items()
        for sentence_nr, text in paragraph_sentences.items()
    ]

    # Save in a format Label Studio can import
    with open(f'{output_dir}/{file_name}.json', "w", encoding="utf-8") as f:
        json.dump(flattened, f, ensure_ascii=False, indent=2)

    return


#final data is csv file as output of PolDebate model

def json_append_meta_data(file_name, data, output_dir='/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files'):
    """
    Add speaker and party metadata to every record of a flattened sentence file.

    file_name: name of the JSON file (without '.json') inside `output_dir`; the
               file is read, updated and overwritten in place
    data: DataFrame with at least the columns 'Unnamed: 0', 'speaker', 'party'
    output_dir: directory containing the file (defaults to the project's
                json_files folder)

    Records are matched on int(record["paragraph"]) == value of the
    'Unnamed: 0' column (not the DataFrame index). Records without a match are
    left unchanged.
    """
    path = f'{output_dir}/{file_name}.json'

    # NaN is not valid JSON, so missing values are stored as None (null).
    meta_data = data[['Unnamed: 0','speaker','party']].replace({np.nan: None})

    # Convert metadata to dict for fast lookup
    meta_dict = meta_data.set_index("Unnamed: 0").to_dict(orient="index")

    # Load the flattened sentence JSON
    with open(path, "r", encoding="utf-8") as f:
        sentences = json.load(f)

    # Merge
    for item in sentences:
        paragraph = int(item["paragraph"])
        if paragraph in meta_dict:
            item.update(meta_dict[paragraph])

    # Save merged dataset
    with open(path, "w", encoding="utf-8") as f:
        json.dump(sentences, f, ensure_ascii=False, indent=2)

    return




#find parties in government by dataset
def find_government(gov_data, date):
    """
    Return the party letters of the government in office on `date`.

    gov_data: DataFrame with comparable "Start Date" / "End Date" columns and a
              "Party Letter" column holding a stringified list of party letters
    date: the date to look up (both interval ends are inclusive)

    Returns the parsed party list of the first matching row, or None (after
    printing 'empty') when no row covers the date.
    """
    in_office = (gov_data["Start Date"] <= date) & (gov_data["End Date"] >= date)
    match = gov_data[in_office]

    if match.empty:
        print('empty')
        return None

    # "Party Letter" is stored as the string repr of a list.
    return ast.literal_eval(match['Party Letter'].iloc[0])


#append context and government related data to the json file
def json_government_and_context(file_name, data, government_data, output_dir='/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files'):
    """
    Add sentence context and government information to every record of a
    flattened sentence file.

    file_name: name of the JSON file (without '.json') inside `output_dir`; the
               file is read, updated and overwritten in place
    data: DataFrame indexed by paragraph number with the columns
          'da_segmented_text' (stringified list of sentences) and 'date'
    government_data: DataFrame passed on to find_government
    output_dir: directory containing the file (defaults to the project's
                json_files folder)

    Each record gains the keys 'preceding_sentence', 'succeeding_sent',
    'current_speaker_in_government', 'parties_in_government' and 'date'.
    A missing neighbouring sentence is stored as ''. If no government covers the
    date, 'parties_in_government' and 'current_speaker_in_government' are None.
    """
    path = f"{output_dir}/{file_name}.json"

    # Load the flattened sentence JSON
    with open(path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    for entry in json_data:
        paragraph_nr = int(entry['paragraph'])
        sentence_nr = int(entry['sentence_nr'])

        #initialize information on current paragraph
        temp_data = data.loc[paragraph_nr]
        temp_data_da_text = ast.literal_eval(temp_data['da_segmented_text'])

        #get preceding sentence (empty string for the first sentence)
        pr_sentence_index = sentence_nr - 1
        if pr_sentence_index < 0:
            pr_sent = ''
        else:
            pr_sent = f'{temp_data_da_text[pr_sentence_index]}'

        #get succeeding sentence (empty string for the last sentence);
        #an explicit bounds check replaces the former bare `except`
        suc_sent_index = sentence_nr + 1
        if 0 <= suc_sent_index < len(temp_data_da_text):
            suc_sent = f'{temp_data_da_text[suc_sent_index]}'
        else:
            suc_sent = ''

        #Find out if speaker is member of party in Government
        party = entry.get('party')  # absent when no metadata was merged in
        date = temp_data['date']
        parties_gov = find_government(government_data, date)
        if parties_gov is None:
            # No government found for this date: membership is unknown.
            in_gov = None
        else:
            in_gov = party in parties_gov

        context_dict = {'preceding_sentence': pr_sent,
                        'succeeding_sent': suc_sent,
                        'current_speaker_in_government': in_gov,
                        'parties_in_government': parties_gov,
                        'date': str(date)
                        }

        #update
        entry.update(context_dict)

    #save data
    with open(path, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)
    return



def merge_json_files(json_files, output_file_path):
    """
    Concatenate the contents of several JSON files into one JSON file.

    json_files: paths of the input files, read in the given order; each file's
                top-level items are appended to one combined list
    output_file_path: path of the JSON file to write (UTF-8, non-ASCII kept)
    """
    combined = []

    for path in json_files:
        with open(path, 'r', encoding='utf-8') as source:
            combined += json.load(source)

    with open(output_file_path, 'w', encoding='utf-8') as target:
        json.dump(combined, target, ensure_ascii=False, indent=2)

    return


In [None]:
# Load the annotation data that contains the model output for all five templates
# (one blame column per template, see `blame_columns` usage further down).


all_templates_data = pd.read_csv("/work/MarkusLundsfrydJensen#1865/Bachelor_project/annotation_data_fifth_template_appended.csv")

# Quick visual check of the first rows
all_templates_data.head()

In [None]:
# Columns holding the per-sentence label scores, one column per template (template 1 first).
blame_columns = ["blame_in_text","second_template_blame_in_text","third_template_blame_in_text","fourth_template_blame_in_text","fifth_template_blame_in_text"]


In [None]:
#Sanity checks:
#check that the original (Danish) sentences, the translated sentences and the
#blame predictions have the same length for every paragraph and every template

for column in blame_columns:
    for indx in range(len(all_templates_data)):

        # positional access, since indx runs over 0..len-1
        temo = all_templates_data.iloc[indx]

        l_da = len(ast.literal_eval(temo['da_segmented_text']))
        l_en = len(ast.literal_eval(temo['translated_text']))
        l_bl = len(ast.literal_eval(temo[column]))

        # `a != b != c` only compares neighbours (a != b and b != c), so it
        # misses e.g. l_da == l_en but l_bl different. Report any mismatch.
        if not (l_da == l_en == l_bl):
            print(column)
            print(temo)

In [None]:
import ipywidgets
# Apply extract_blame_from_paragraph_lookup to every template column to turn the
# model's label scores into binary per-sentence blame flags.
# The result for template k is stored in the new column 'blame_binary_temp_k'.

for i, column in enumerate(blame_columns):
    all_templates_data[f'blame_binary_temp_{i+1}'] = all_templates_data[column].swifter.apply(extract_blame_from_paragraph_lookup)

all_templates_data.head()

In [None]:
# Collect the blame probabilities of all sentences, one list per template,
# as input for the distribution plots (one subplot row per template).

all_blame_scores = []

for column in blame_columns:
    all_blame_scores.append(get_blame_scores(all_templates_data, blame_in_text_column = column))

In [None]:
# Histogram of blame probabilities per template (log-scaled and absolute counts)
vizualize_blame_prob(all_blame_scores)

In [None]:
# Get the percentage of sentences flagged as blame for each template and show it as a bar plot

plt.figure(figsize=(12, 6))

blame_percentage = []
abs_blame = []
template_name = []
# NOTE(review): range(1, 6) hard-codes five templates; it must stay in sync with len(blame_columns).
for i in range(1,6):
    temp_name = f'blame_binary_temp_{i}'
    percentage_blame, total_sent, total_blame= total_blame_percentage(all_templates_data[temp_name])

    blame_percentage.append(percentage_blame)
    abs_blame.append(total_blame)  # absolute counts are collected but not plotted in this cell
    template_name.append(temp_name)





plt.bar(template_name, blame_percentage)
plt.title('Percentage of sentences as blame')


plt.suptitle('Bar plot of blame percentage by template')

#for ax in axs.flat:
#    ax.set(xlabel='probability', ylabel='count')
plt.show()

In [None]:
#Get data and do vizualization of the overlapping blame
# How much of the total blame by all templates do 4/3/2/1 templates agree upon
# Extract this information on a sentence level df['agreement_degree] maybe in format paragraph_i = [0,0,4,3,2,0,0,1]


In [None]:
# For each template: {paragraph index: [positions of sentences flagged as blame]}
blame_indices_template_1 = get_rows_and_positions(all_templates_data, col='blame_binary_temp_1')
blame_indices_template_2 = get_rows_and_positions(all_templates_data, col='blame_binary_temp_2')
blame_indices_template_3 = get_rows_and_positions(all_templates_data, col='blame_binary_temp_3')
blame_indices_template_4 = get_rows_and_positions(all_templates_data, col='blame_binary_temp_4')
blame_indices_template_5 = get_rows_and_positions(all_templates_data, col='blame_binary_temp_5')

In [None]:
# One dictionary per template with the paragraph/sentence indices where blame was detected
dicts = [blame_indices_template_1, blame_indices_template_2, blame_indices_template_3, blame_indices_template_4, blame_indices_template_5]

# counts maps (paragraph, sentence) -> number of templates that flagged the sentence
counts = {}

for d in dicts:
    for paragraph, sentences in d.items():
        for sentence in sentences:
            key = (paragraph, sentence)
            # start at 0 for unseen pairs, then add this template's vote
            counts[key] = counts.get(key, 0) + 1

In [None]:
# Tally how many (paragraph, sentence) pairs were flagged by exactly 1, 2, ..., 5 templates
summary = dict.fromkeys(range(1, 6), 0)
for count in counts.values():
    summary[count] = summary[count] + 1

print("Pairs appearing in N dictionaries:")
for n, total in summary.items():
    print(f"{n}: {total}")

In [None]:

# Step 4: Group the (paragraph, sentence) pairs by the number of templates that flagged them
pairs_by_count = {i: [] for i in range(1, 6)}
for pair, count in counts.items():
    pairs_by_count[count].append(pair)

# Example: show all pairs that appear in exactly 5 dictionaries (flagged by every template)
print("\nPairs appearing in exactly 5 dictionaries:")
print(pairs_by_count[5])

# Step 5: Total number of unique (paragraph, sentence) pairs flagged by at least one template
n_unique_pairs = len(counts)
print(f"\nTotal unique (paragraph, sentence) pairs: {n_unique_pairs}")


In [None]:
# Share of all flagged (paragraph, sentence) pairs that exactly N templates agree on
percentages = []
for i in summary.values():
    # guard against division by zero when no sentence was flagged at all
    percentages.append((float(i)/n_unique_pairs)*100 if n_unique_pairs else 0.0)

amount_of_docts = summary.keys()

plt.figure()

# an explicit list is accepted as x positions by all matplotlib versions
plt.bar(list(amount_of_docts), percentages)

plt.title("Agreement between templates on sentences flagged as blame")
plt.xlabel("Number of templates assigning True")
plt.ylabel("Percentage of unique true labels")

plt.show()

In [None]:
# Persist the data including the new blame_binary_temp_* columns.
# NOTE: list-valued cells are stored as text in a CSV file.
all_templates_data.to_csv("/work/MarkusLundsfrydJensen#1865/Bachelor_project/data_ready_for_analysis_10_10.csv", index = False)

In [None]:
# Reload the saved data (replaces the in-memory frame; list-like columns presumably come back as strings).
all_templates_data = pd.read_csv("/work/MarkusLundsfrydJensen#1865/Bachelor_project/data_ready_for_analysis_10_10.csv")

In [None]:
# Quick visual check of the reloaded data
all_templates_data.head()

In [None]:
# Convert the (paragraph, sentence) pairs of each agreement level (flagged by exactly N templates)
# into {paragraph: [sentence positions]} so they can be passed to the extraction functions.
dict_indices_5_temps = list_to_dict(pairs_by_count[5])
dict_indices_4_temps = list_to_dict(pairs_by_count[4])
dict_indices_3_temps = list_to_dict(pairs_by_count[3])
dict_indices_2_temps = list_to_dict(pairs_by_count[2])
dict_indices_1_temps = list_to_dict(pairs_by_count[1])


In [None]:
# Now extract the Danish sentences for each agreement level:
# {paragraph: {sentence position: sentence}} for sentences flagged by exactly N templates

da_sent_5_temps = danish_sentences_with_blame_extraction(dict_indices_5_temps, all_templates_data, 'da_segmented_text')
da_sent_4_temps = danish_sentences_with_blame_extraction(dict_indices_4_temps, all_templates_data, 'da_segmented_text')
da_sent_3_temps = danish_sentences_with_blame_extraction(dict_indices_3_temps, all_templates_data, 'da_segmented_text')
da_sent_2_temps = danish_sentences_with_blame_extraction(dict_indices_2_temps, all_templates_data, 'da_segmented_text')
da_sent_1_temps = danish_sentences_with_blame_extraction(dict_indices_1_temps, all_templates_data, 'da_segmented_text')


In [None]:
# Load the table of Danish governments (start date, end date and governing parties)
regerings_data = pd.read_csv("/work/MarkusLundsfrydJensen#1865/Bachelor_project/danish_govs.csv")

# Parse the date columns so they can be compared with the debate dates
regerings_data["Start Date"] = pd.to_datetime(regerings_data["Start Date"], format="%Y-%m-%d")
regerings_data["End Date"]   = pd.to_datetime(regerings_data["End Date"], format="%Y-%m-%d")

# Parse the date in the debate data as well (overwrites the 'date' column in place)
all_templates_data["date"] = pd.to_datetime(all_templates_data["date"], format="%Y-%m-%d")

In [None]:
# Export the sentences flagged as blame by all five templates.
# Step 1: flatten the Danish sentences into Label Studio records and write the JSON file
convert_to_json_and_write(file_name = 'sentences_in_five_templates',sentences = da_sent_5_temps)

# Step 2: add speaker and party metadata to every record
json_append_meta_data(file_name = 'sentences_in_five_templates', data = all_templates_data)

# Step 3: add the preceding and succeeding sentence plus government information to every record
json_government_and_context(file_name = 'sentences_in_five_templates', data = all_templates_data, government_data = regerings_data)


In [None]:
# Export the sentences flagged as blame by exactly four templates.
# Step 1: flatten the Danish sentences into Label Studio records and write the JSON file
convert_to_json_and_write(file_name = 'sentences_in_four_templates',sentences = da_sent_4_temps)

# Step 2: add speaker and party metadata to every record
json_append_meta_data(file_name = 'sentences_in_four_templates', data = all_templates_data)

# Step 3: add the preceding and succeeding sentence plus government information to every record
json_government_and_context(file_name = 'sentences_in_four_templates', data = all_templates_data, government_data = regerings_data)


In [None]:
# Export the sentences flagged as blame by exactly three templates.
# Step 1: flatten the Danish sentences into Label Studio records and write the JSON file
convert_to_json_and_write(file_name = 'sentences_in_three_templates',sentences = da_sent_3_temps)

# Step 2: add speaker and party metadata to every record
json_append_meta_data(file_name = 'sentences_in_three_templates', data = all_templates_data)

# Step 3: add the preceding and succeeding sentence plus government information to every record
json_government_and_context(file_name = 'sentences_in_three_templates', data = all_templates_data, government_data = regerings_data)


In [None]:
# Export the sentences flagged as blame by exactly two templates.
# Step 1: flatten the Danish sentences into Label Studio records and write the JSON file
convert_to_json_and_write(file_name = 'sentences_in_two_templates',sentences = da_sent_2_temps)

# Step 2: add speaker and party metadata to every record
json_append_meta_data(file_name = 'sentences_in_two_templates', data = all_templates_data)

# Step 3: add the preceding and succeeding sentence plus government information to every record
json_government_and_context(file_name = 'sentences_in_two_templates', data = all_templates_data, government_data = regerings_data)


In [None]:
# Export the sentences flagged as blame by exactly one template.
# Step 1: flatten the Danish sentences into Label Studio records and write the JSON file
convert_to_json_and_write(file_name = 'sentences_in_one_templates',sentences = da_sent_1_temps)

# Step 2: add speaker and party metadata to every record
json_append_meta_data(file_name = 'sentences_in_one_templates', data = all_templates_data)

# Step 3: add the preceding and succeeding sentence plus government information to every record
json_government_and_context(file_name = 'sentences_in_one_templates', data = all_templates_data, government_data = regerings_data)


In [None]:
# Prepare merging of the per-agreement-level JSON files
# Base directory of all exported JSON files
base_dir = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files"

# Paths of the blame files, one per agreement level (flagged by exactly N templates)
five_temps = base_dir + "/sentences_in_five_templates.json"
four_temps = base_dir + "/sentences_in_four_templates.json"
three_temps = base_dir + "/sentences_in_three_templates.json"
two_temps = base_dir + "/sentences_in_two_templates.json"
one_temps = base_dir + "/sentences_in_one_templates.json"

In [None]:


# Build cumulative files: sentences flagged by at least 4, at least 3, at least 2 and at least 1 template(s)

merge_json_files(json_files = [five_temps, four_temps], output_file_path = base_dir + '/collapsed_sentences_5_and_4.json')
merge_json_files(json_files = [five_temps, four_temps, three_temps], output_file_path = base_dir + '/collapsed_sentences_5_and_4_and_3.json')
merge_json_files(json_files = [five_temps, four_temps, three_temps, two_temps], output_file_path = base_dir + '/collapsed_sentences_5_and_4_and_3_and_2.json')
merge_json_files(json_files = [five_temps, four_temps, three_temps, two_temps, one_temps], output_file_path = base_dir + '/collapsed_sentences_5_and_4_and_3_and_2_and_1.json')


In [None]:
# Now for the no-blame data

# Extract the sentences without blame, using the dictionaries of blame sentence indices



In [None]:
# All sentences that are NOT in da_sent_5_temps.
# NOTE(review): da_sent_5_temps only holds sentences flagged by exactly five templates, so this
# "no blame" set still contains sentences flagged by one to four templates - confirm this is intended.
sentences_no_blame_5_temps = danish_sentences_without_blame_extraction(blame_dict = da_sent_5_temps, data = all_templates_data, text_column = 'da_segmented_text')

In [None]:
# Export the sentences not flagged as blame by all five templates.
# Step 1: flatten the Danish sentences into Label Studio records and write the JSON file
convert_to_json_and_write(file_name = 'NO_BLAME_sentences_in_five_templates',sentences = sentences_no_blame_5_temps)

# Step 2: add speaker and party metadata to every record
json_append_meta_data(file_name = 'NO_BLAME_sentences_in_five_templates', data = all_templates_data)

# Step 3: add the preceding and succeeding sentence plus government information to every record
json_government_and_context(file_name = 'NO_BLAME_sentences_in_five_templates', data = all_templates_data, government_data = regerings_data)

In [None]:
# All sentences that are NOT in da_sent_4_temps.
# NOTE(review): da_sent_4_temps only holds sentences flagged by exactly four templates, so this
# "no blame" set still contains sentences flagged by one, two, three or five templates - confirm this is intended.
sentences_no_blame_4_temps = danish_sentences_without_blame_extraction(blame_dict = da_sent_4_temps, data = all_templates_data, text_column = 'da_segmented_text')

In [None]:
# Export the sentences not flagged as blame by exactly four templates.
# Step 1: flatten the Danish sentences into Label Studio records and write the JSON file
convert_to_json_and_write(file_name = 'NO_BLAME_sentences_in_four_templates',sentences = sentences_no_blame_4_temps)

# Step 2: add speaker and party metadata to every record
json_append_meta_data(file_name = 'NO_BLAME_sentences_in_four_templates', data = all_templates_data)

# Step 3: add the preceding and succeeding sentence plus government information to every record
json_government_and_context(file_name = 'NO_BLAME_sentences_in_four_templates', data = all_templates_data, government_data = regerings_data)

In [None]:
# All sentences that are NOT in da_sent_3_temps.
# NOTE(review): da_sent_3_temps only holds sentences flagged by exactly three templates, so this
# "no blame" set still contains sentences flagged by one, two, four or five templates - confirm this is intended.
sentences_no_blame_3_temps = danish_sentences_without_blame_extraction(blame_dict = da_sent_3_temps, data = all_templates_data, text_column = 'da_segmented_text')

In [None]:
# Export the sentences not flagged as blame by exactly three templates.
# Step 1: flatten the Danish sentences into Label Studio records and write the JSON file
convert_to_json_and_write(file_name = 'NO_BLAME_sentences_in_three_templates',sentences = sentences_no_blame_3_temps)

# Step 2: add speaker and party metadata to every record
json_append_meta_data(file_name = 'NO_BLAME_sentences_in_three_templates', data = all_templates_data)

# Step 3: add the preceding and succeeding sentence plus government information to every record
json_government_and_context(file_name = 'NO_BLAME_sentences_in_three_templates', data = all_templates_data, government_data = regerings_data)

In [None]:
# All sentences that are NOT in da_sent_2_temps.
# NOTE(review): da_sent_2_temps only holds sentences flagged by exactly two templates, so this
# "no blame" set still contains sentences flagged by one, three, four or five templates - confirm this is intended.
sentences_no_blame_2_temps = danish_sentences_without_blame_extraction(blame_dict = da_sent_2_temps, data = all_templates_data, text_column = 'da_segmented_text')

In [None]:
# Export the sentences not flagged as blame by exactly two templates.
# Step 1: flatten the Danish sentences into Label Studio records and write the JSON file
convert_to_json_and_write(file_name = 'NO_BLAME_sentences_in_two_templates',sentences = sentences_no_blame_2_temps)

# Step 2: add speaker and party metadata to every record
json_append_meta_data(file_name = 'NO_BLAME_sentences_in_two_templates', data = all_templates_data)

# Step 3: add the preceding and succeeding sentence plus government information to every record
json_government_and_context(file_name = 'NO_BLAME_sentences_in_two_templates', data = all_templates_data, government_data = regerings_data)

In [None]:
# All sentences that are NOT in da_sent_1_temps.
# NOTE(review): da_sent_1_temps only holds sentences flagged by exactly one template, so this
# "no blame" set still contains sentences flagged by two to five templates - confirm this is intended.
sentences_no_blame_1_temps = danish_sentences_without_blame_extraction(blame_dict = da_sent_1_temps, data = all_templates_data, text_column = 'da_segmented_text')

In [None]:
# Export the sentences not flagged as blame by exactly one template.
# Step 1: flatten the Danish sentences into Label Studio records and write the JSON file
convert_to_json_and_write(file_name = 'NO_BLAME_sentences_in_one_templates',sentences = sentences_no_blame_1_temps)

# Step 2: add speaker and party metadata to every record
json_append_meta_data(file_name = 'NO_BLAME_sentences_in_one_templates', data = all_templates_data)

# Step 3: add the preceding and succeeding sentence plus government information to every record
json_government_and_context(file_name = 'NO_BLAME_sentences_in_one_templates', data = all_templates_data, government_data = regerings_data)

In [None]:
# Merge the NO_BLAME files cumulatively: 5+4, 5+4+3, 5+4+3+2 and 5+4+3+2+1
# NOTE(review): merge_json_files simply concatenates the lists. Each NO_BLAME file is built from the
# complement of one agreement level, so the same sentence can occur in several input files and will
# then be duplicated in the merged output - confirm whether de-duplication is needed.


base_dir = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files"

# Paths of the no-blame files, one per agreement level
no_blame_five_temps = base_dir + "/NO_BLAME_sentences_in_five_templates.json"
no_blame_four_temps = base_dir + "/NO_BLAME_sentences_in_four_templates.json"
no_blame_three_temps = base_dir + "/NO_BLAME_sentences_in_three_templates.json"
no_blame_two_temps = base_dir + "/NO_BLAME_sentences_in_two_templates.json"
no_blame_one_temps = base_dir + "/NO_BLAME_sentences_in_one_templates.json"

merge_json_files(json_files = [no_blame_five_temps, no_blame_four_temps], output_file_path = base_dir + '/NO_BLAME_collapsed_sentences_5_and_4.json')
merge_json_files(json_files = [no_blame_five_temps, no_blame_four_temps, no_blame_three_temps], output_file_path = base_dir + '/NO_BLAME_collapsed_sentences_5_and_4_and_3.json')
merge_json_files(json_files = [no_blame_five_temps, no_blame_four_temps, no_blame_three_temps, no_blame_two_temps], output_file_path = base_dir + '/NO_BLAME_collapsed_sentences_5_and_4_and_3_and_2.json')
merge_json_files(json_files = [no_blame_five_temps, no_blame_four_temps, no_blame_three_temps, no_blame_two_temps, no_blame_one_temps], output_file_path = base_dir + '/NO_BLAME_collapsed_sentences_5_and_4_and_3_and_2_and_1.json')


In [None]:
'''
Afterwards, 50 random samples of sentences which had been classified as containing blame were collected.
These sentences were randomly drawn using Python's random.sample from the data file containing the merged blame sentences from all model templates.
This was deemed a sufficient step in order to gather a validation set with as few pre-made assumptions
regarding the model performance as possible. The concern was that the sentences which all templates classified as containing
blame could share some intrinsic feature of blame, which would not capture the entire spectrum of what blame can be.
By drawing the validation set from all detected sentences of blame, the aim was to make the validation set as representative of
blame in political speech as possible. However, due to the relatively small number of blame sentences as a proportion of the
total number of sentences, drawing from the pre-classified labels was a necessary step due to time constraints,
as an enormous amount of data would have to be gold-labelled if a true random sample were drawn from the entire dataset.
In this case, if the calculated percentage of sentences containing blame is somewhat accurate
(around 1%), 5,000 truly random samples would need to be manually labelled in order to achieve a validation
set with just 50 true labels.

In addition to the 50 random samples pre-classified as blame, 50 random samples pre-classified as not blame were drawn.

These were combined into a single set of 100 samples, which was passed to Label Studio for labelling. It is important to note that
the computationally pre-classified label (the decision by PolDebate) as either blame or no blame was hidden during labelling,
so as not to influence the author's decision on whether the sentence contained blame or not.
 
'''

In [None]:
# Draw 50 random samples for the blame part of the validation set
# NOTE(review): no random seed is set, so every run draws a different sample and overwrites the output file.

# Load the flattened JSON
# NOTE(review): this reads the cleaned training data file, not the merged blame file - confirm this is the intended source.
with open("/work/MarkusLundsfrydJensen#1865/Training_data/cleaned_training_data_1_2_3_4_5_temps.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Sample 50 random entries (raises ValueError if the file holds fewer than 50 entries)
sampled_data = random.sample(data, k=50)

# Mark every sampled entry as pre-classified blame: "Blame": 1
for entry in sampled_data:
    entry["Blame"] = 1

# Save the modified sampled data
with open("/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/blame_true_sampled_data.json", "w", encoding="utf-8") as f:
    json.dump(sampled_data, f, ensure_ascii=False, indent=2)


In [None]:
# Draw 50 random samples from the no-blame data
# NOTE(review): no random seed is set, so every run draws a different sample and overwrites the output file.

# Load the flattened JSON with the merged no-blame sentences
with open("/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/NO_BLAME_collapsed_sentences_5_and_4_and_3_and_2_and_1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Sample 50 random entries (raises ValueError if the file holds fewer than 50 entries)
sampled_data = random.sample(data, k=50)

# Mark every sampled entry as pre-classified no blame: "Blame": 0
for entry in sampled_data:
    entry["Blame"] = 0

# Save the modified sampled data
with open("/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/blame_false_sampled_data.json", "w", encoding="utf-8") as f:
    json.dump(sampled_data, f, ensure_ascii=False, indent=2)

In [None]:
# Combine the blame and no-blame samples into one file for labelling (blame entries first, not shuffled)
true_blame_sample_path = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/blame_true_sampled_data.json"
false_blame_sample_path = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/blame_false_sampled_data.json"
path_output = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/Gold_Gold_label_data.json"

merge_json_files(json_files = [true_blame_sample_path, false_blame_sample_path], output_file_path = path_output)

In [None]:
#Do the above again

In [None]:
# Second draw: 50 random samples from the merged blame data (all agreement levels)
# NOTE(review): no random seed is set, so every run draws a different sample and overwrites the output file.
# Load the flattened JSON with the merged blame sentences
with open("/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/collapsed_sentences_5_and_4_and_3_and_2_and_1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Sample 50 random entries (raises ValueError if the file holds fewer than 50 entries)
sampled_data = random.sample(data, k=50)

# Mark every sampled entry as pre-classified blame: "Blame": 1
for entry in sampled_data:
    entry["Blame"] = 1

# Save the modified sampled data
with open("/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/v2_blame_true_sampled_data.json", "w", encoding="utf-8") as f:
    json.dump(sampled_data, f, ensure_ascii=False, indent=2)


In [None]:
# Second draw: 50 random samples from the no-blame data
# NOTE(review): no random seed is set, so every run draws a different sample and overwrites the output file.

# Load the flattened JSON with the merged no-blame sentences
with open("/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/NO_BLAME_collapsed_sentences_5_and_4_and_3_and_2_and_1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Sample 50 random entries (raises ValueError if the file holds fewer than 50 entries)
sampled_data = random.sample(data, k=50)

# Mark every sampled entry as pre-classified no blame: "Blame": 0
for entry in sampled_data:
    entry["Blame"] = 0

# Save the modified sampled data
with open("/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/v2_blame_false_sampled_data.json", "w", encoding="utf-8") as f:
    json.dump(sampled_data, f, ensure_ascii=False, indent=2)

In [None]:
# Combine the second blame and no-blame samples into one file for labelling (blame entries first, not shuffled)
true_blame_sample_path = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/v2_blame_true_sampled_data.json"
false_blame_sample_path = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/v2_blame_false_sampled_data.json"
path_output = "/work/MarkusLundsfrydJensen#1865/Bachelor_project/json_files/v2_Gold_Gold_label_data.json"

merge_json_files(json_files = [true_blame_sample_path, false_blame_sample_path], output_file_path = path_output)

In [None]:
#For a third time randomly sample data, but this time from the correct dataset

import json
import random

# Dedicated, seeded generator: re-running the cell reproduces the same sample
# without touching the global random state used by other cells.
# NOTE: the seeded draw differs from any sample produced by earlier, unseeded runs.
rng = random.Random(42)

# Load JSON file (explicit UTF-8 so Danish characters are read correctly on any platform)
with open('/work/MarkusLundsfrydJensen#1865/Training_data/cleaned_training_data_1_2_3_4_5_temps.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Filter entries by label
label_1_entries = [entry for entry in data if entry.get('label') == 1]
label_0_entries = [entry for entry in data if entry.get('label') == 0]

# Ensure there are enough entries in both categories
if len(label_1_entries) < 50:
    raise ValueError(f"Not enough entries with label=1. Only found {len(label_1_entries)}.")
if len(label_0_entries) < 50:
    raise ValueError(f"Not enough entries with label=0. Only found {len(label_0_entries)}.")

# Randomly sample 50 entries from each
sampled_1 = rng.sample(label_1_entries, 50)
sampled_0 = rng.sample(label_0_entries, 50)

# Combine both samples
combined_sample = sampled_1 + sampled_0

# Shuffle the combined sample so labels are mixed
rng.shuffle(combined_sample)

# Save the sample to a new JSON file; ensure_ascii=False keeps Danish characters
# readable, consistent with the other JSON files written by this notebook
with open('/work/MarkusLundsfrydJensen#1865/annotation_data_v3.json', 'w', encoding='utf-8') as f:
    json.dump(combined_sample, f, ensure_ascii=False, indent=4)
