In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com




In [2]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com




In [4]:
# Load the welfare scheme write-up from a text file
file_path = "AAY.txt"  # Ensure the file is in the same directory

with open(file_path, "r") as file:
    write_up = file.read()

print("Welfare scheme write-up loaded successfully.")


Welfare scheme write-up loaded successfully.


In [5]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from tqdm import tqdm  # Import tqdm for progress tracking

# Load the T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-large")  # You can use other versions like t5-base or t5-large as well
model = T5ForConditionalGeneration.from_pretrained("t5-large")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
def finalize_summary_with_t5(summary: str, prompt_section: str, **kwargs) -> str:
    """
    Finalizes the summary by using T5 to refine and remove redundancies, ensuring fluency.
    
    Args:
    - summary (str): The rough draft summary to be refined.
    - prompt_section (str): Section prompt to provide context.
    - kwargs: Additional parameters for the model's generate function.
    
    Returns:
    - str: The finalized, fluent summary.
    """
    # Prepare the refinement prompt for T5
    # refine_prompt = (
    #     f"Refine and condense the following summary for clarity, completeness, "
    #     f"and to remove redundant information for the section: {prompt_section}.\n"
    #     f"Summary: {summary}"
    # )
    # refine_prompt = (
    #     f"Remove repeating sentences  and finish incomplete sentences from the following text\n"
    #     f"text: {summary}"
    # )
    refine_prompt = (
        f"Identify Key Information from the following "
        f"text: {summary}"
    )
    
    # Tokenize and ensure input fits within max length
    inputs = tokenizer(refine_prompt, max_length=kwargs.get("max_length", 512), truncation=True, return_tensors="pt")

    try:
        # Generate the refined summary with parameters from **kwargs
        summary_ids = model.generate(inputs.input_ids, **kwargs)
        final_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during final refinement: {e}")
        final_summary = summary  # Fall back to the draft summary without further processing

    # return f"Original Summary : {summary}\n Refined_summary : {final_summary}"
    return final_summary

def get_summary_with_t5(write_up: str, prompt_section: str, window_size: int = 512, overlap: int = 50, max_summary_tokens: int = 200, **kwargs) -> str:
    """
    Summarizes the welfare scheme write-up based on the specified prompt section using T5.
    This function uses a sliding window approach with dynamic chunk sizing to handle long texts, accumulating the summary as it progresses.

    Args:
    - write_up (str): Full write-up of the welfare scheme.
    - prompt_section (str): The section prompt, e.g., "Beneficiary and Problem Statement".
    - window_size (int): The maximum size for each model input.
    - overlap (int): The number of tokens to overlap between segments.
    - max_summary_tokens (int): Maximum tokens to retain in the accumulated summary for each input.
    - kwargs: Additional parameters for the model's generate function.

    Returns:
    - str: The combined generated summary for the specified section.
    """
    if not write_up or not prompt_section:
        raise ValueError("Write-up and prompt section must not be empty.")
    
    # Tokenize the entire write-up
    total_tokens = tokenizer(write_up)["input_ids"]
    print(f"Total tokens: {len(total_tokens)}")  # Debugging line
    
    accumulated_summary = ""
    summaries = []
    start_idx = 0  # To keep track of the position in the tokenized write-up

    # Process in sliding windows to handle long write-ups
    while start_idx < len(total_tokens):
        # Truncate accumulated summary to last max_summary_tokens tokens to fit within the model input
        truncated_summary = tokenizer.decode(
            tokenizer(accumulated_summary)["input_ids"][-max_summary_tokens:], 
            skip_special_tokens=True
        )
        
        # Prepare the prompt and summary context
        prompt_text = (
            f"{prompt_section}.\n"
            f"Summary so far: {truncated_summary}\n"
            f"summarize: "
        )
        
        # Calculate remaining tokens for the current chunk
        prompt_tokens = tokenizer(prompt_text)["input_ids"]
        available_chunk_size = window_size - len(prompt_tokens)
        
        # Extract the chunk with the available size
        chunk = total_tokens[start_idx:start_idx + available_chunk_size]
        if len(chunk) == 0:
            break

        # Decode the chunk back to text
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        
        # Combine prompt and chunk text
        model_input = prompt_text + chunk_text
        
        # Tokenize the combined input for the model
        inputs = tokenizer(model_input, max_length=window_size, truncation=True, return_tensors="pt")

        try:
            # Generate the summary for the current chunk
            summary_ids = model.generate(inputs.input_ids, **kwargs)
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            
            # Update the accumulated summary and add to the summaries list for debugging
            accumulated_summary += " " + summary
            summaries.append(summary)
        
        except Exception as e:
            print(f"Error generating summary for chunk starting at token index {start_idx}: {e}")
        
        # Move start index for the next chunk, applying overlap
        start_idx += available_chunk_size - overlap
    
    # Final accumulated summary
    final_summary = finalize_summary_with_t5(accumulated_summary.strip(), prompt_section,**kwargs)
    return final_summary


def summarize_scheme_with_t5(write_up: str) -> dict:
    """
    Generates summaries for each of the three sections using T5:
    'Beneficiary and Problem Statement', 'Application Process and Benefits', 'Outcome and Impact'.
    
    Args:
    - write_up (str): Full write-up of the welfare scheme.

    Returns:
    - dict: A dictionary containing summaries for each section.
    """
    sections = [
        "Beneficiary and Problem Statement",
        "Application Process and Benefits",
        "Outcome and Impact"
    ]
    
    section_prompts_t5 = {
        f"Beneficiary and Problem Statement": "Identify the key beneficiaries of the welfare scheme and the problems they face.",
        "Application Process and Benefits": "Identify the application process and benefits provided by the scheme.",
        "Outcome and Impact": " Identify the outcomes and impact of the scheme on its beneficiaries."
    }
    
    summaries = {}
    # Use tqdm to show progress for summarizing each section
    for section, prompt in tqdm(section_prompts_t5.items(), desc="Summarizing Sections"):
        summaries[section] = get_summary_with_t5(write_up, prompt, max_length=80, min_length=20, num_beams=5, length_penalty=1.2, early_stopping=True)

    return summaries


In [44]:
# Generate summaries
summaries = summarize_scheme_with_t5(write_up)

# Display the summaries
for section, summary in summaries.items():
    print(f"{section}:\n{summary}\n")

Summarizing Sections:   0%|          | 0/3 [00:00<?, ?it/s]

Total tokens: 875


Summarizing Sections:  33%|███▎      | 1/3 [04:18<08:37, 258.63s/it]

Total tokens: 875


Summarizing Sections:  67%|██████▋   | 2/3 [07:40<03:45, 225.07s/it]

Total tokens: 875


Summarizing Sections: 100%|██████████| 3/3 [11:17<00:00, 225.89s/it]

Beneficiary and Problem Statement:
Original Summary : Antyodaya Anna Yojana (AAY) is a flagship food security program in india . it aims to provide highly subsidized food grains to the poorest of the poor . each eligible household can receive up to 35 kg of food grains per month . Antyodaya Anna Yojana (AAY) is a flagship food security program in india . it aims to provide highly subsidized food grains to the poorest of the poor . each eligible household can receive up to 35 kg of food grains per month . food grains, AAY plays a significant role in improving nutritional standards and alleviating poverty . it aims to provide highly subsidized food grains to the poorest of the poor . each eligible household can receive up to 35 kg of food grains per month .
 Refined_summary : Antyodaya Anna Yojana (AAY) is a flagship food security program in india . it aims to provide highly subsidized food grains to the poorest households . each eligible household can receive up to 35 kg of food grains 


