In [31]:
import json
import os
from langchain_ollama import OllamaLLM
import threading
from queue import Queue
from threading import Lock
import csv
import subprocess
import re

In [20]:
# Module-level placeholder. summarization() assigns its own local variable of
# the same name, so nothing in the visible code reads or updates this global.
length_of_doc = 0
# Shared work queue of JSON file paths: filled by collect_json_files() and
# consumed by the summarization() worker threads.
file_queue = Queue() #queue with all the json files paths

# Collect JSON Files
### Collects the paths of all the JSON files into `file_queue`

In [21]:
def collect_json_files(base_folder, queue):
    """Enqueue the path of every expected JSON file that exists on disk.

    Candidate paths have the form
    ``<base_folder>/wiki_pages_<0..39>/<AA..AE>/wiki_<00..99>.json`` and are
    probed in that nesting order (page folder, then subfolder, then file
    number). Each candidate that is a regular file is put onto ``queue``.

    Args:
        base_folder: Root directory containing the ``wiki_pages_<i>`` folders.
        queue: Object with a ``put`` method that receives the found paths.
    """
    subfolders = ('AA', 'AB', 'AC', 'AD', 'AE')
    candidates = (
        os.path.join(base_folder, f"wiki_pages_{page_idx}", subfolder, f"wiki_{file_idx:02d}.json")
        for page_idx in range(40)        # wiki_pages_0 .. wiki_pages_39
        for subfolder in subfolders
        for file_idx in range(100)       # wiki_00.json .. wiki_99.json
    )
    for candidate in candidates:
        # Only paths that actually exist as files are queued
        if os.path.isfile(candidate):
            queue.put(candidate)

In [22]:
# Root folder that is scanned for input JSON files (relative to the working directory)
base_folder = 'CleanDataJson'
# Fill file_queue with the path of every matching file found under base_folder
collect_json_files(base_folder, file_queue)

# Save CSV
### Saves the extracted data to a CSV file to be used in the BERTScore evaluation

In [23]:
# save_csv() is called concurrently from the summarization worker threads;
# this lock makes "decide on header + append row" a single atomic step.
_csv_write_lock = threading.Lock()


def save_csv(doc_file, title, length_of_doc, len_of_summary):
    """Append one row of document statistics to ``summary_length.csv``.

    The header row is written first when the file does not exist yet or is
    empty. Fields are quoted only when needed, so titles containing commas
    stay in a single column.

    Args:
        doc_file: Path of the source document.
        title: Title of the document.
        length_of_doc: Length of the original document text.
        len_of_summary: Length of the generated summary.
    """
    # Define the CSV file name
    csv_file = 'summary_length.csv'

    with _csv_write_lock:
        # Decide BEFORE opening: opening in append mode creates the file, so an
        # existence check made afterwards is always true.
        needs_header = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0

        with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file, quoting=csv.QUOTE_MINIMAL)

            if needs_header:
                writer.writerow(['Document Path', 'Document Title', 'Document Length', 'Summary Length'])

            # Write the document information with proper quoting to handle commas in titles
            writer.writerow([doc_file, title, length_of_doc, len_of_summary])

# Model Implementation

In [24]:
# Calculate the maximum output length
context_length = 100000

# Initialize the Ollama language model instance
model = OllamaLLM(
    model="llama3.1:8b",                        # Specify the model
    temperature=0.7,                            # Adjust creativity/randomness
    top_p=0.9,                                  # Use nucleus sampling
    top_k=50,                                   # Limit to top-k tokens
    repeat_penalty=1.2,                         # Penalize token repetition
    num_ctx = 50000
)

In [25]:
def model_process(document):
    """Summarize ``document`` with the module-level ``model``, chunk by chunk.

    The document is sliced into consecutive pieces of ``context_length``
    characters. Each piece has the summary returned for the previous piece
    appended to it and is sent to the model together with the system message
    and the summarization prompt.

    Args:
        document: Text to summarize.

    Returns:
        The model's response for the last chunk, or an empty string when
        ``document`` is empty (no chunk is sent to the model).
    """
    system_message = "You are an efficient text summarizer. "

    prompt = """You are an efficient and accurate text summarizer that ensures each sentence is complete. Summarize the following document in clear, concise paragraphs without using bullet points or lists. Ensure the summary flows naturally, avoids breaking the content into individual items, and provides a coherent narrative. The output must follow this exact structure:

    First, write the word 'Summary:' followed by the summary of the document.

    """

    summary = ""
    for start in range(0, len(document), context_length):
        # Current slice followed directly by the summary of the previous slice
        chunk = document[start:start + context_length] + summary
        summary = model.invoke(f"{system_message}\n\n{prompt}\n\nDocument: {chunk}")
    return summary

# Summarization
### Reads each JSON object from every JSON file in `file_queue` so that it can be summarized

In [26]:
def _summarize_file(file_name, base_output_folder):
    """Summarize every JSON object (one per line) in ``file_name`` and save the results."""
    # The output path depends only on the input file, so compute it once.
    relative_path = os.path.relpath(file_name, start='CleanDataJson')
    output_file_path = os.path.join(base_output_folder, relative_path)
    output_file_path = output_file_path.replace('CleanDataJson', 'SummariesJson')

    json_data = []  # Collect all objects for the final output file

    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # blank lines carry no JSON object

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                # A malformed line is reported and skipped; the rest of the file is still processed
                print(f"Error decoding JSON in file {file_name}: {e}")
                continue

            text_content = data.get('text', '')  # Extract the text from each object
            title = data.get('title', '')
            print(f"title {title}")
            length_of_doc = len(text_content)

            # Replace the original text with its summary
            summary = model_process(text_content)
            data['text'] = summary

            # Save details in the CSV file
            save_csv(file_name, title, length_of_doc, len(summary))

            json_data.append(data)  # Collect updated data for output

            print(f"output_file_path {output_file_path}")

            # Ensure the directories exist
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

            # Rewrite the output after every object so that the summaries
            # produced so far are on disk if the run is interrupted.
            with open(output_file_path, 'w', encoding='utf-8') as json_output_file:
                json.dump(json_data, json_output_file, ensure_ascii=False, indent=4)

            print(f"Processed and saved summary for file {file_name} to {output_file_path}")


def summarization(base_output_folder):
    """Worker loop: take file paths from ``file_queue`` until it is empty and summarize each file.

    For every JSON object in a file, the ``text`` field is replaced by its
    summary, a statistics row is appended via ``save_csv``, and the objects
    processed so far are written as a JSON list to the mirrored path under
    ``base_output_folder``. Errors while processing a file are printed and the
    worker moves on to the next file.

    Args:
        base_output_folder: Root folder that receives the summarized files.
    """
    from queue import Empty  # only `Queue` is imported at module level

    while True:
        try:
            # get_nowait() rather than empty() followed by get(): with several
            # workers another thread can take the last item between the two
            # calls, and a blocking get() would then wait forever.
            file_name = file_queue.get_nowait()
        except Empty:
            break

        print(f"Processing file {file_name}")
        try:
            _summarize_file(file_name, base_output_folder)
        except Exception as e:
            print(f"Error processing file {file_name}: {e}")
        finally:
            # Always balance the get() so that file_queue.join() cannot hang
            file_queue.task_done()



# Threading

In [27]:
base_output_folder = 'SummariesJson'

num_threads = 40  # You can adjust this based on your system

# Build one worker per thread slot, each running summarization() on the shared queue
threads = [
    threading.Thread(target=summarization, args=(base_output_folder,))
    for _ in range(num_threads)
]

# Launch every worker
for t in threads:
    t.start()

# Block until every worker has returned
for t in threads:
    t.join()

# summarization(base_output_folder)

Processing file CleanDataJson/wiki_pages_0/AA/wiki_00.json
title AccessibleComputing
output_file_path SummariesJson/wiki_pages_0/AA/wiki_00.json
Processed and saved summary for file CleanDataJson/wiki_pages_0/AA/wiki_00.json to SummariesJson/wiki_pages_0/AA/wiki_00.json
title Aberdeen (disambiguation)
output_file_path SummariesJson/wiki_pages_0/AA/wiki_00.json
Processed and saved summary for file CleanDataJson/wiki_pages_0/AA/wiki_00.json to SummariesJson/wiki_pages_0/AA/wiki_00.json
title Aruba
output_file_path SummariesJson/wiki_pages_0/AA/wiki_00.json
Processed and saved summary for file CleanDataJson/wiki_pages_0/AA/wiki_00.json to SummariesJson/wiki_pages_0/AA/wiki_00.json
title Asterism
output_file_path SummariesJson/wiki_pages_0/AA/wiki_00.json
Processed and saved summary for file CleanDataJson/wiki_pages_0/AA/wiki_00.json to SummariesJson/wiki_pages_0/AA/wiki_00.json
title American Film Institute
output_file_path SummariesJson/wiki_pages_0/AA/wiki_00.json
Processed and saved su

KeyboardInterrupt: 

# BertScore

In [108]:
# !bert-score -r /bertScoreTestingOrg/wiki_0.txt -c /bertScoreTestingSumm/summary_output0.txt --lang en
# !bert-score -r /bertScoreTestingSumm/original_content.txt -c /bertScoreTestingSumm/summary_output.txt --lang en

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.45.2)_fast-tokenizer P: 0.967943 R: 0.967100 F1: 0.967521
