In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from tqdm import tqdm

# Load the T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-large")
model = T5ForConditionalGeneration.from_pretrained("t5-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [2]:
def finalize_summary_with_t5(summary: str, prompt_section: str, **kwargs) -> str:
    """
    Finalizes the summary by using T5 to refine and remove redundancies, ensuring fluency.

    Args:
    - summary (str): The rough draft summary to be refined.
    - prompt_section (str): Section prompt to provide context.
    - kwargs: Additional parameters for the model's generate function.

    Returns:
    - str: The finalized, fluent summary.
    """
    refine_prompt = (
        f"Identify Key Information from the following "
        f"text: {summary}"
    )

    # Tokenize and ensure input fits within max length
    inputs = tokenizer(refine_prompt, max_length=kwargs.get("max_length", 512), truncation=True, return_tensors="pt")

    try:
        # Generate the refined summary with parameters from **kwargs
        summary_ids = model.generate(inputs.input_ids, **kwargs)
        final_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during final refinement: {e}")
        final_summary = summary

    return final_summary

def get_summary_with_t5(write_up: str, prompt_section: str, window_size: int = 512, overlap: int = 50, max_summary_tokens: int = 200, **kwargs) -> str:
    """
    Summarizes the welfare scheme write-up based on the specified prompt section using T5.
    This function uses a sliding window approach with dynamic chunk sizing to handle long texts, accumulating the summary as it progresses.

    Args:
    - write_up (str): Full write-up of the welfare scheme.
    - prompt_section (str): The section prompt, e.g., "Beneficiary and Problem Statement".
    - window_size (int): The maximum size for each model input.
    - overlap (int): The number of tokens to overlap between segments.
    - max_summary_tokens (int): Maximum tokens to retain in the accumulated summary for each input.
    - kwargs: Additional parameters for the model's generate function.

    Returns:
    - str: The combined generated summary for the specified section.
    """
    if not write_up or not prompt_section:
        raise ValueError("Write-up and prompt section must not be empty.")

    # Tokenize the entire write-up
    total_tokens = tokenizer(write_up)["input_ids"]

    accumulated_summary = ""
    summaries = []
    start_idx = 0

    # Process in sliding windows to handle long write-ups
    while start_idx < len(total_tokens):
        truncated_summary = tokenizer.decode(
            tokenizer(accumulated_summary)["input_ids"][-max_summary_tokens:],
            skip_special_tokens=True
        )

        prompt_text = (
            f"{prompt_section}.\n"
            f"Summary so far: {truncated_summary}\n"
            f"summarize: "
        )

        prompt_tokens = tokenizer(prompt_text)["input_ids"]
        available_chunk_size = window_size - len(prompt_tokens)

        chunk = total_tokens[start_idx:start_idx + available_chunk_size]
        if len(chunk) == 0:
            break

        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)

        model_input = prompt_text + chunk_text

        inputs = tokenizer(model_input, max_length=window_size, truncation=True, return_tensors="pt")

        try:
            summary_ids = model.generate(inputs.input_ids, **kwargs)
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

            accumulated_summary += " " + summary
            summaries.append(summary)

        except Exception as e:
            print(f"Error generating summary for chunk starting at token index {start_idx}: {e}")

        start_idx += available_chunk_size - overlap

    final_summary = finalize_summary_with_t5(accumulated_summary.strip(), prompt_section,**kwargs)
    return final_summary


def summarize_scheme_with_t5(write_up: str) -> dict:
    """
    Generates summaries for each of the three sections using T5:
    'Beneficiary and Problem Statement', 'Application Process and Benefits', 'Outcome and Impact'.

    Args:
    - write_up (str): Full write-up of the welfare scheme.

    Returns:
    - dict: A dictionary containing summaries for each section.
    """
    sections = [
        "Beneficiary and Problem Statement",
        "Application Process and Benefits",
        "Outcome and Impact"
    ]

    section_prompts_t5 = {
        f"Beneficiary and Problem Statement": "Identify the key beneficiaries of the welfare scheme and the problems they face.",
        "Application Process and Benefits": "Identify the application process and benefits provided by the scheme.",
        "Outcome and Impact": " Identify the outcomes and impact of the scheme on its beneficiaries."
    }

    summaries = {}
    for section, prompt in tqdm(section_prompts_t5.items(), desc="Summarizing Sections"):
        summaries[section] = get_summary_with_t5(write_up, prompt, max_length=80, min_length=20, num_beams=5, length_penalty=1.2, early_stopping=True)

    return summaries


In [3]:
import os

input_directory = "data"
output_directory = "scheme_writeups_T5"

os.makedirs(output_directory, exist_ok=True)

def summarize_and_save(input_dir, output_dir):
    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_dir, filename)
            with open(file_path, "r") as file:
                write_up = file.read()

            print(f"Welfare scheme write-up from {filename} loaded successfully.")

            summaries = summarize_scheme_with_t5(write_up)

            for section, summary in summaries.items():
                summary_filename = f"{os.path.splitext(filename)[0]}_{section}.txt"
                summary_file_path = os.path.join(output_dir, summary_filename)

                with open(summary_file_path, "w") as summary_file:
                    summary_file.write(summary)

                print(f"Summary for {section} saved as {summary_filename}.")

summarize_and_save(input_directory, output_directory)

Welfare scheme write-up from MGNREGA.txt loaded successfully.


Summarizing Sections: 100%|██████████| 3/3 [05:38<00:00, 112.68s/it]


Summary for Beneficiary and Problem Statement saved as MGNREGA_Beneficiary and Problem Statement.txt.
Summary for Application Process and Benefits saved as MGNREGA_Application Process and Benefits.txt.
Summary for Outcome and Impact saved as MGNREGA_Outcome and Impact.txt.
Welfare scheme write-up from PMGSY.txt loaded successfully.


Summarizing Sections: 100%|██████████| 3/3 [05:09<00:00, 103.28s/it]


Summary for Beneficiary and Problem Statement saved as PMGSY_Beneficiary and Problem Statement.txt.
Summary for Application Process and Benefits saved as PMGSY_Application Process and Benefits.txt.
Summary for Outcome and Impact saved as PMGSY_Outcome and Impact.txt.
Welfare scheme write-up from PMMVY.txt loaded successfully.


Summarizing Sections: 100%|██████████| 3/3 [07:13<00:00, 144.44s/it]


Summary for Beneficiary and Problem Statement saved as PMMVY_Beneficiary and Problem Statement.txt.
Summary for Application Process and Benefits saved as PMMVY_Application Process and Benefits.txt.
Summary for Outcome and Impact saved as PMMVY_Outcome and Impact.txt.
Welfare scheme write-up from PMAY.txt loaded successfully.


Summarizing Sections: 100%|██████████| 3/3 [07:29<00:00, 149.86s/it]


Summary for Beneficiary and Problem Statement saved as PMAY_Beneficiary and Problem Statement.txt.
Summary for Application Process and Benefits saved as PMAY_Application Process and Benefits.txt.
Summary for Outcome and Impact saved as PMAY_Outcome and Impact.txt.
Welfare scheme write-up from NRLM.txt loaded successfully.


Summarizing Sections: 100%|██████████| 3/3 [07:34<00:00, 151.54s/it]


Summary for Beneficiary and Problem Statement saved as NRLM_Beneficiary and Problem Statement.txt.
Summary for Application Process and Benefits saved as NRLM_Application Process and Benefits.txt.
Summary for Outcome and Impact saved as NRLM_Outcome and Impact.txt.
Welfare scheme write-up from PMSBY.txt loaded successfully.


Summarizing Sections: 100%|██████████| 3/3 [05:58<00:00, 119.36s/it]


Summary for Beneficiary and Problem Statement saved as PMSBY_Beneficiary and Problem Statement.txt.
Summary for Application Process and Benefits saved as PMSBY_Application Process and Benefits.txt.
Summary for Outcome and Impact saved as PMSBY_Outcome and Impact.txt.
Welfare scheme write-up from NSAP.txt loaded successfully.


Summarizing Sections: 100%|██████████| 3/3 [05:14<00:00, 104.93s/it]


Summary for Beneficiary and Problem Statement saved as NSAP_Beneficiary and Problem Statement.txt.
Summary for Application Process and Benefits saved as NSAP_Application Process and Benefits.txt.
Summary for Outcome and Impact saved as NSAP_Outcome and Impact.txt.
Welfare scheme write-up from ABPMJAY.txt loaded successfully.


Summarizing Sections: 100%|██████████| 3/3 [07:27<00:00, 149.31s/it]


Summary for Beneficiary and Problem Statement saved as ABPMJAY_Beneficiary and Problem Statement.txt.
Summary for Application Process and Benefits saved as ABPMJAY_Application Process and Benefits.txt.
Summary for Outcome and Impact saved as ABPMJAY_Outcome and Impact.txt.
Welfare scheme write-up from AAY.txt loaded successfully.


Summarizing Sections: 100%|██████████| 3/3 [06:30<00:00, 130.27s/it]


Summary for Beneficiary and Problem Statement saved as AAY_Beneficiary and Problem Statement.txt.
Summary for Application Process and Benefits saved as AAY_Application Process and Benefits.txt.
Summary for Outcome and Impact saved as AAY_Outcome and Impact.txt.
Welfare scheme write-up from DDUGKY.txt loaded successfully.


Summarizing Sections:   0%|          | 0/3 [01:04<?, ?it/s]


KeyboardInterrupt: 

In [2]:
import os

directory = 'scheme_writeups_T5'

for filename in os.listdir(directory):
    new_filename = filename.replace(' ', '_')

    old_file = os.path.join(directory, filename)
    new_file = os.path.join(directory, new_filename)

    if old_file != new_file:
        os.rename(old_file, new_file)
        print(f'Renamed: "{old_file}" to "{new_file}"')

Renamed: "scheme_writeups_T5/AAY_Beneficiary and Problem Statement.txt" to "scheme_writeups_T5/AAY_Beneficiary_and_Problem_Statement.txt"
Renamed: "scheme_writeups_T5/NRLM_Application Process and Benefits.txt" to "scheme_writeups_T5/NRLM_Application_Process_and_Benefits.txt"
Renamed: "scheme_writeups_T5/ABPMJAY_Outcome and Impact.txt" to "scheme_writeups_T5/ABPMJAY_Outcome_and_Impact.txt"
Renamed: "scheme_writeups_T5/AAY_Outcome and Impact.txt" to "scheme_writeups_T5/AAY_Outcome_and_Impact.txt"
Renamed: "scheme_writeups_T5/ABPMJAY_Application Process and Benefits.txt" to "scheme_writeups_T5/ABPMJAY_Application_Process_and_Benefits.txt"
Renamed: "scheme_writeups_T5/AAY_Application Process and Benefits.txt" to "scheme_writeups_T5/AAY_Application_Process_and_Benefits.txt"
Renamed: "scheme_writeups_T5/ABPMJAY_Beneficiary and Problem Statement.txt" to "scheme_writeups_T5/ABPMJAY_Beneficiary_and_Problem_Statement.txt"
Renamed: "scheme_writeups_T5/NRLM_Beneficiary and Problem Statement.txt" t

In [13]:
import os
import nltk
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import precision_recall_fscore_support

# Download necessary NLTK resources
nltk.download('punkt')  # For word tokenization

def calculate_bleu(reference_texts, candidate_text):
    """Calculate BLEU score between reference texts and a candidate text."""
    bleu_score = sentence_bleu(reference_texts, candidate_text)
    return bleu_score

# Define folders containing reference and candidate text files
reference_folder = 'scheme_writeups_gpt'
candidate_folder = 'scheme_writeups_T5'

# Iterate through each file in the reference folder
for filename in os.listdir(reference_folder):
    if filename.endswith('.txt'):
        reference_file_path = os.path.join(reference_folder, filename)
        candidate_file_path = os.path.join(candidate_folder, filename)

        # Check if the candidate file exists
        if os.path.exists(candidate_file_path):
            # Read and tokenize reference texts
            with open(reference_file_path, 'r') as ref_file:
                reference_text = ref_file.readlines()
                reference_text = [nltk.word_tokenize(line.strip()) for line in reference_text]

            # Read and tokenize candidate text
            with open(candidate_file_path, 'r') as cand_file:
                candidate_text = cand_file.readlines()
                candidate_text = [nltk.word_tokenize(line.strip()) for line in candidate_text]

            # Calculate BLEU score
            bleu = calculate_bleu(reference_text, candidate_text[0])
            print(f'BLEU Score for {filename}: {bleu}')
        else:
            print(f'Candidate file for {filename} does not exist.')

Candidate file for MGNREGA_Beneficiary_and_Problem_Statement.txt does not exist.
Candidate file for MGNREGA_Application_Process_and_Benefits.txt does not exist.
BLEU Score for ABPMJAY_Outcome_and_Impact.txt: 0.22217193823708134
BLEU Score for NRLM_Application_Process_and_Benefits.txt: 1.7959576944393866e-78
BLEU Score for AAY_Beneficiary_and_Problem_Statement.txt: 0.15384298044693662
BLEU Score for NRLM_Beneficiary_and_Problem_Statement.txt: 2.239238178682016e-78
BLEU Score for NRLM_Outcome_and_Impact.txt: 0.17777857862828253
BLEU Score for ABPMJAY_Application_Process_and_Benefits.txt: 0.23077003876352636
BLEU Score for AAY_Application_Process_and_Benefits.txt: 7.788709901982841e-79
BLEU Score for AAY_Outcome_and_Impact.txt: 0.13076807930794349
BLEU Score for ABPMJAY_Beneficiary_and_Problem_Statement.txt: 0.21668197249881999
Candidate file for DDUGKY_Beneficiary_and_Problem_Statement.txt does not exist.
Candidate file for DDUGKY_Application_Process_and_Benefits.txt does not exist.
Cand

[nltk_data] Downloading package punkt to /home/harsh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import os
from rouge_score import rouge_scorer

def calculate_rouge(reference_text, candidate_text):
    """Calculate ROUGE score between reference text and a candidate text."""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text, candidate_text)
    return scores

reference_folder = 'scheme_writeups_gpt'
candidate_folder = 'scheme_writeups_T5'

for filename in os.listdir(reference_folder):
    if filename.endswith('.txt'):
        reference_file_path = os.path.join(reference_folder, filename)
        candidate_file_path = os.path.join(candidate_folder, filename)

        
        if os.path.exists(candidate_file_path):
            
            with open(reference_file_path, 'r') as ref_file:
                reference_text = ref_file.read().strip()

            with open(candidate_file_path, 'r') as cand_file:
                candidate_text = cand_file.read().strip()

            scores = calculate_rouge(reference_text, candidate_text)

            print(f'ROUGE Scores for {filename}:')
            for key in scores:
                print(f'  {key}: {scores[key]}')
        else:
            print(f'Candidate file for {filename} does not exist.')

Candidate file for MGNREGA_Beneficiary_and_Problem_Statement.txt does not exist.
Candidate file for MGNREGA_Application_Process_and_Benefits.txt does not exist.
ROUGE Scores for ABPMJAY_Outcome_and_Impact.txt:
  rouge1: Score(precision=0.5142857142857142, recall=0.16666666666666666, fmeasure=0.2517482517482517)
  rouge2: Score(precision=0.2647058823529412, recall=0.08411214953271028, fmeasure=0.1276595744680851)
  rougeL: Score(precision=0.37142857142857144, recall=0.12037037037037036, fmeasure=0.1818181818181818)
ROUGE Scores for NRLM_Application_Process_and_Benefits.txt:
  rouge1: Score(precision=0.49056603773584906, recall=0.12807881773399016, fmeasure=0.20312500000000003)
  rouge2: Score(precision=0.15384615384615385, recall=0.039603960396039604, fmeasure=0.06299212598425197)
  rougeL: Score(precision=0.32075471698113206, recall=0.08374384236453201, fmeasure=0.13281249999999997)
ROUGE Scores for AAY_Beneficiary_and_Problem_Statement.txt:
  rouge1: Score(precision=0.3584905660377358