In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from tqdm import tqdm

# Load the T5 tokenizer and model from one shared checkpoint id, so the two
# can never point at different checkpoints after an edit.
MODEL_NAME = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


KeyboardInterrupt: 

In [None]:
def finalize_summary_with_t5(summary: str, prompt_section: str, *, max_input_tokens: int = 512, **kwargs) -> str:
    """
    Finalizes the summary by running the rough draft through T5 one more time.

    The draft is wrapped in an "Identify Key Information" instruction, tokenized,
    and passed to the module-level `model`; the decoded output is returned.

    Args:
    - summary (str): The rough draft summary to be refined.
    - prompt_section (str): Section prompt. Accepted for interface compatibility;
      it is not used when building the refinement prompt.
    - max_input_tokens (int): Truncation limit for the tokenized refinement prompt.
      Kept separate from the generation `max_length` in kwargs, which limits the
      *output* length and must not shorten the input.
    - kwargs: Additional parameters forwarded unchanged to the model's generate function.

    Returns:
    - str: The refined summary. The draft is returned unchanged when it is empty
      or when generation raises.
    """
    # Nothing to refine: do not ask the model to generate from the bare instruction.
    if not summary or not summary.strip():
        return summary

    refine_prompt = f"Identify Key Information from the following text: {summary}"

    # Truncate the INPUT to the input budget only; kwargs["max_length"] is a
    # generation setting and is deliberately not consulted here.
    inputs = tokenizer(refine_prompt, max_length=max_input_tokens, truncation=True, return_tensors="pt")

    try:
        # Generate the refined summary with parameters from **kwargs
        summary_ids = model.generate(inputs.input_ids, **kwargs)
        final_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        # Best effort: report the failure and fall back to the unrefined draft.
        print(f"Error during final refinement: {e}")
        final_summary = summary

    return final_summary

def get_summary_with_t5(write_up: str, prompt_section: str, window_size: int = 512, overlap: int = 50, max_summary_tokens: int = 200, **kwargs) -> str:
    """
    Summarizes the welfare scheme write-up for the given prompt section using T5.

    The write-up is tokenized once and walked with a sliding window. Each window's
    model input is: the section prompt, the tail of the summary accumulated so far,
    and as many document tokens as still fit in `window_size`. The per-window
    summaries are concatenated and passed to `finalize_summary_with_t5`.

    Args:
    - write_up (str): Full write-up of the welfare scheme.
    - prompt_section (str): The section prompt, e.g., "Beneficiary and Problem Statement".
    - window_size (int): The maximum number of tokens for each model input.
    - overlap (int): The number of document tokens shared by consecutive windows.
    - max_summary_tokens (int): Maximum tokens of the accumulated summary carried
      into each window's prompt (0 carries none).
    - kwargs: Additional parameters for the model's generate function; also
      forwarded to `finalize_summary_with_t5`.

    Returns:
    - str: The finalized summary for the specified section.

    Raises:
    - ValueError: If `write_up` or `prompt_section` is empty, if `window_size` is
      not positive, or if `overlap` / `max_summary_tokens` is negative.
    """
    if not write_up or not prompt_section:
        raise ValueError("Write-up and prompt section must not be empty.")
    if window_size <= 0 or overlap < 0 or max_summary_tokens < 0:
        raise ValueError("window_size must be positive; overlap and max_summary_tokens must be non-negative.")

    # Document tokens only: without special tokens the list length equals the
    # amount of real text, so the end-of-document check below is exact.
    total_tokens = tokenizer(write_up, add_special_tokens=False)["input_ids"]
    total_len = len(total_tokens)

    accumulated_summary = ""
    start_idx = 0

    # Process in sliding windows to handle long write-ups
    while start_idx < total_len:
        # Keep only the tail of the running summary. `[-0:]` would keep the whole
        # list, so a zero budget is handled explicitly.
        running_ids = tokenizer(accumulated_summary)["input_ids"]
        tail_ids = running_ids[-max_summary_tokens:] if max_summary_tokens > 0 else []
        truncated_summary = tokenizer.decode(tail_ids, skip_special_tokens=True)

        prompt_text = (
            f"{prompt_section}.\n"
            f"Summary so far: {truncated_summary}\n"
            f"summarize: "
        )

        # Whatever the prompt does not consume is the budget for document tokens.
        prompt_tokens = tokenizer(prompt_text)["input_ids"]
        available_chunk_size = window_size - len(prompt_tokens)
        if available_chunk_size <= 0:
            # A non-positive budget would slice from the wrong end or never
            # advance; stop and finalize what has been gathered so far.
            print(f"Window of {window_size} tokens leaves no room for text after the prompt; stopping at token index {start_idx}.")
            break

        end_idx = start_idx + available_chunk_size
        chunk = total_tokens[start_idx:end_idx]
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)

        model_input = prompt_text + chunk_text

        # Re-tokenizing the joined text can differ slightly from the budgeted
        # count, so truncation stays on as a safety net.
        inputs = tokenizer(model_input, max_length=window_size, truncation=True, return_tensors="pt")

        try:
            generated_ids = model.generate(inputs.input_ids, **kwargs)
            summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            accumulated_summary += " " + summary
        except Exception as e:
            # Best effort: report the failed window and carry on with the next one.
            print(f"Error generating summary for chunk starting at token index {start_idx}: {e}")

        # This window already reached the end of the document; another pass
        # would only re-summarize the overlap tail.
        if end_idx >= total_len:
            break

        # Always move forward by at least one token, even when the overlap is as
        # large as (or larger than) the space left for document text.
        start_idx += max(1, available_chunk_size - overlap)

    final_summary = finalize_summary_with_t5(accumulated_summary.strip(), prompt_section, **kwargs)
    return final_summary


def summarize_scheme_with_t5(write_up: str, **generate_kwargs) -> dict:
    """
    Generates summaries for each of the three sections using T5:
    'Beneficiary and Problem Statement', 'Application Process and Benefits', 'Outcome and Impact'.

    Args:
    - write_up (str): Full write-up of the welfare scheme.
    - generate_kwargs: Optional overrides merged over the default settings
      (max_length=80, min_length=20, num_beams=5, length_penalty=1.2,
      early_stopping=True) and forwarded to `get_summary_with_t5`.

    Returns:
    - dict: Maps each section name to the summary returned by
      `get_summary_with_t5` for that section's prompt.
    """
    # Section name -> instruction handed to get_summary_with_t5 as its prompt.
    # NOTE(review): the last prompt starts with a space in the original; it is
    # kept byte-for-byte here - confirm whether that is intentional.
    section_prompts_t5 = {
        "Beneficiary and Problem Statement": "Identify the key beneficiaries of the welfare scheme and the problems they face.",
        "Application Process and Benefits": "Identify the application process and benefits provided by the scheme.",
        "Outcome and Impact": " Identify the outcomes and impact of the scheme on its beneficiaries."
    }

    # Defaults first, caller overrides last.
    generation_settings = {
        "max_length": 80,
        "min_length": 20,
        "num_beams": 5,
        "length_penalty": 1.2,
        "early_stopping": True,
    }
    generation_settings.update(generate_kwargs)

    summaries = {}
    for section, prompt in tqdm(section_prompts_t5.items(), desc="Summarizing Sections"):
        summaries[section] = get_summary_with_t5(write_up, prompt, **generation_settings)

    return summaries


In [None]:
import os

# Folder scanned for source write-ups; passed as `input_dir` to
# summarize_and_save at the end of this cell.
input_directory = "data"
# Folder that receives the per-section summary files.
output_directory = "scheme_writeups_T5"

# Create the output folder up front; exist_ok=True keeps re-running this cell safe.
os.makedirs(output_directory, exist_ok=True)

def summarize_and_save(input_dir, output_dir):
    """
    Summarizes every `.txt` write-up in `input_dir` and saves one file per section.

    Each write-up is read as UTF-8 and passed to `summarize_scheme_with_t5`. Every
    returned (section, summary) pair is written to
    `<output_dir>/<write-up stem>_<section>.txt`.

    Args:
    - input_dir: Directory containing the `.txt` write-ups; other files are skipped.
    - output_dir: Directory for the summary files; created if it does not exist.

    Returns:
    - None. Results are written to disk and progress is printed.
    """
    # Do not depend on a separate cell having created the target folder.
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir order is arbitrary; sorting makes runs and logs reproducible.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(input_dir, filename)
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(file_path, "r", encoding="utf-8") as file:
            write_up = file.read()

        print(f"Welfare scheme write-up from {filename} loaded successfully.")

        summaries = summarize_scheme_with_t5(write_up)

        stem = os.path.splitext(filename)[0]
        for section, summary in summaries.items():
            summary_filename = f"{stem}_{section}.txt"
            summary_file_path = os.path.join(output_dir, summary_filename)

            with open(summary_file_path, "w", encoding="utf-8") as summary_file:
                summary_file.write(summary)

            print(f"Summary for {section} saved as {summary_filename}.")

# Run the batch: summarize every write-up in `input_directory` and write the
# per-section files into `output_directory`.
summarize_and_save(input_directory, output_directory)

In [None]:
import os

directory = 'scheme_writeups_T5'

# Swap every space in each entry name for an underscore, renaming on disk
# only the entries whose name actually changes.
for filename in os.listdir(directory):
    new_filename = '_'.join(filename.split(' '))

    old_file, new_file = (
        os.path.join(directory, name) for name in (filename, new_filename)
    )

    # Names without spaces map onto themselves; leave those entries alone.
    if old_file == new_file:
        continue

    os.rename(old_file, new_file)
    print(f'Renamed: "{old_file}" to "{new_file}"')