In [50]:
import json
import os
import csv
import subprocess
import re

In [51]:
def collect_json_files(base_folder, list):
    """Collects JSON file paths from the specified folder structure."""
    for i in range(40):  # wiki_pages_0 to wiki_pages_39
        for letter in ['AA', 'AB', 'AC', 'AD', 'AE']:  # Subfolder names
            folder_path = os.path.join(base_folder, f"wiki_pages_{i}", letter)
            for j in range(100):  # wiki_00 to wiki_99
                json_file = f"wiki_{j:02d}.json"  # Format to wiki_00, wiki_01, ..., wiki_99
                json_file_path = os.path.join(folder_path, json_file)
                if os.path.isfile(json_file_path):  # Check if file exists
                    list.append(json_file_path)

In [52]:
# !bert-score -r /original_content.txt -c /summary_output.txt --lang en 
#read original content and summary output
# with open('Documents/llama_3.1_8B/original_content.txt', 'r') as file:
#     original_content = file.read()


# with open('Documents/llama_3.1_8B/summary_output.txt', 'r') as file:
#     summary_output = file.read()



# !bert-score -r /Calculations/original_content.txt -c /Calculations/summary_output.txt --lang en


In [53]:
def run_bert(title):
    try:
        # Step 1: Run the bert-score command and capture the output
        command = "bert-score -r /Calculations/original_content.txt -c /Calculations/summary_output.txt --lang en"
        process = subprocess.run(command, shell=True, capture_output=True, text=True)

        # Step 2: Extract Precision (P), Recall (R), and F1 score using regex
        output = process.stdout
        print("Command output:\n", output)  # Log the command output for debugging

        # Use regex to find P, R, and F1 in the output
        precision_match = re.search(r"P:\s([0-9.]+)", output)
        recall_match = re.search(r"R:\s([0-9.]+)", output)
        f1_match = re.search(r"F1:\s([0-9.]+)", output)

        if precision_match and recall_match and f1_match:
            precision = precision_match.group(1)
            recall = recall_match.group(1)
            f1_score = f1_match.group(1)

            # Step 3: Read existing CSV, update row with matching title
            updated_rows = []
            title_found = False

            # Read the existing CSV file and update the corresponding row
            with open('bert_score.csv', 'r', newline='') as file:
                reader = csv.reader(file)
                header = next(reader)  # Save header
                updated_rows.append(header)  # Add the header to updated rows

                for row in reader:
                    if row[1] == title:  # Assuming Document Title is in the second column (index 1)
                        # Ensure the row has enough columns (at least 7)
                        while len(row) < 7:
                            row.append('')  # Append empty columns if needed
                        
                        # Update Precision, Recall, and F1-Score
                        row[4], row[5], row[6] = precision, recall, f1_score
                        title_found = True
                    updated_rows.append(row)

            # If the title wasn't found, log a message
            if not title_found:
                print(f"Title '{title}' not found in CSV.")

            # Step 4: Write the updated rows back to the CSV
            with open('bert_score.csv', 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerows(updated_rows)

            print(f"Updated for {title}: Precision={precision}, Recall={recall}, F1-Score={f1_score}")
        else:
            print("Failed to extract precision, recall, or F1 scores from the output.")
    except Exception as e:
        print(f"Error while running BERT score or updating CSV: {e}")


In [54]:
summary_queue = []
original_queue = []
summary_folder = 'SummariesJson'
original_folder = 'CleanDataJson'

In [55]:
# Collect file paths
collect_json_files(summary_folder, summary_queue)

# for each file in the summary, add it to the original queue but with CleanDataJson instead of SummariesJson
for file in summary_queue:
    original_queue.append(file.replace(summary_folder, original_folder))  

In [56]:
# Process each pair of corresponding files
for i in range(len(summary_queue)):  # Adjust range based on the number of files
    summary_file = summary_queue[i]
    original_file = original_queue[i]

    # Open and load the entire summary file as a JSON array
    with open(summary_file, 'r') as f_summary:
        try:
            summary_data = json.load(f_summary)  # Load JSON array in one go
        except json.JSONDecodeError as e:
            print(f"Error loading summary JSON: {e}")
            continue

    # Open the original file and read it line by line
    with open(original_file, 'r') as f_original:
        for original_line in f_original:
            try:
                original = json.loads(original_line)
            except json.JSONDecodeError as e:
                print(f"Error loading original JSON: {e}")
                continue

            # Find the corresponding summary object (by title or ID, adjust as needed)
            corresponding_summary = next((s for s in summary_data if s['title'] == original['title']), None)
            if corresponding_summary:
                title = corresponding_summary['title']
                summary_content = corresponding_summary.get('text', '')
                doc_content = original.get('text', '')

                print(f"Title: {title}")

                # Create the directory if it doesn't exist
                os.makedirs('/home/group02-s24/Documents/llama_3.1_8B/Calculations', exist_ok=True)

                # Now proceed to write the file
                with open('/home/group02-s24/Documents/llama_3.1_8B/Calculations/original_content.txt', 'w') as f:
                    f.write(doc_content)
                with open('/home/group02-s24/Documents/llama_3.1_8B/Calculations/summary_output.txt', 'w') as f:
                    f.write(summary_content)

                # Run BERT score script
                run_bert(title)
            else:
                print(f"Summary not found for {original['title']}")


Title: AccessibleComputing
Command output:
 roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.45.2)_fast-tokenizer P: 0.960337 R: 0.957664 F1: 0.958999

Updated for AccessibleComputing: Precision=0.960337, Recall=0.957664, F1-Score=0.958999
Title: Aberdeen (disambiguation)
Command output:
 roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.45.2)_fast-tokenizer P: 0.960337 R: 0.957664 F1: 0.958999

Updated for Aberdeen (disambiguation): Precision=0.960337, Recall=0.957664, F1-Score=0.958999
Title: Aruba
Command output:
 roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.45.2)_fast-tokenizer P: 0.960337 R: 0.957664 F1: 0.958999

Updated for Aruba: Precision=0.960337, Recall=0.957664, F1-Score=0.958999
Title: Asterism
Command output:
 roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.45.2)_fast-tokenizer P: 0.960337 R: 0.957664 F1: 0.958999

Updated for Asterism: Precision=0.960337, Recall=0.957664, F1-Score=0.958999
Title: American Film Institute
Command output:
 roberta-large_L17_