In [None]:
from IPython.display import clear_output
!pip install -q triton bitsandbytes accelerate hf_xet
!pip install -q evaluate bert_score rouge_score git+https://github.com/google-research/bleurt.git
clear_output()

In [None]:
import gc
import time

import evaluate
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel


In [None]:
import nltk

# Download the NLTK resources this notebook asks for. `sent_tokenize`
# (imported below) is what LongT5Inference._semantic_split uses to break
# a document into sentences.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)
del _nltk_resource  # keep the notebook namespace free of the loop variable

from nltk.tokenize import sent_tokenize

In [None]:
# --- Model and data locations -------------------------------------------------
DATA_CSV = "/kaggle/input/scisummnet-corpus/scisumm.csv"  # CSV read by ScisummnetDataset
BASE_MODEL = "google/long-t5-tglobal-base"  # base checkpoint for the model and tokenizer
REPO_NAME = "Mels22/longt5-scisummnet"  # adapter repository loaded on top of BASE_MODEL

# --- Token budgets --------------------------------------------------------------
CHUNK_SIZE = 8_192  # max tokens per input chunk
OVERLAP_SIZE = 512  # tokens shared between consecutive chunks
MAX_TARGET_LENGTH = 512  # max tokens for a reference summary / generated summary

# --- Batching -------------------------------------------------------------------
BATCH_SIZE = 4  # batch size used when mapping the dataset

In [None]:
class ScisummnetDataset:
    """Load a text/summary CSV and turn it into chunked, tokenized model inputs.

    The CSV is read with pandas and wrapped in a Hugging Face ``Dataset``. It
    must provide a ``text`` and a ``summary`` column. Each document is
    tokenized into overlapping chunks of ``chunk_size`` tokens; every chunk is
    paired with the (padded, ``-100``-masked) tokenized summary of its document.
    """

    # Columns exposed as torch tensors after preprocessing.
    MODEL_COLUMNS = ["input_ids", "attention_mask", "labels"]

    def __init__(
        self,
        path,
        tokenizer,
        chunk_size=CHUNK_SIZE,
        overlap=OVERLAP_SIZE,
        max_target_length=MAX_TARGET_LENGTH,
    ):
        """
        Args:
            path: Location of the CSV file passed to ``pd.read_csv``.
            tokenizer: Callable tokenizer exposing ``pad_token_id``.
            chunk_size: Maximum number of tokens per input chunk.
            overlap: Token stride shared between consecutive chunks.
            max_target_length: Maximum number of tokens kept for a summary.
        """
        df = pd.read_csv(path)
        self.hf_dataset = Dataset.from_pandas(df)
        self.tokenizer = tokenizer
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.max_target_length = max_target_length

    def _process_data_to_model_inputs(self, batch):
        """Tokenize one batch of rows into per-chunk model inputs.

        Args:
            batch: Mapping with parallel ``"text"`` and ``"summary"`` sequences.

        Returns:
            Dict of ``int64`` arrays (``input_ids``, ``attention_mask``,
            ``labels``) with one row per chunk. A document that overflows
            ``chunk_size`` contributes several rows, so the output can have
            more rows than the input batch.
        """
        all_input_ids, all_attention_masks, all_labels = [], [], []
        pad_token_id = self.tokenizer.pad_token_id

        for text, summary in zip(batch["text"], batch["summary"]):
            tokenized_inputs = self.tokenizer(
                text,
                return_overflowing_tokens=True,
                stride=self.overlap,
                truncation=True,
                max_length=self.chunk_size,
                padding="max_length",
            )
            tokenized_outputs = self.tokenizer(
                summary,
                truncation=True,
                max_length=self.max_target_length,
                padding="max_length",
            )

            # The labels depend only on the summary, so build them once per
            # document instead of once per chunk. Pad positions become -100.
            labels = [
                -100 if token == pad_token_id else token
                for token in tokenized_outputs["input_ids"]
            ]

            for input_ids, attention_mask in zip(
                tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"]
            ):
                all_input_ids.append(input_ids)
                all_attention_masks.append(attention_mask)
                all_labels.append(labels)

        return {
            "input_ids": np.array(all_input_ids, dtype=np.int64),
            "attention_mask": np.array(all_attention_masks, dtype=np.int64),
            "labels": np.array(all_labels, dtype=np.int64),
        }

    def _prepare_split(self, split):
        """Tokenize one split and expose only the model columns as torch tensors."""
        processed = split.map(
            self._process_data_to_model_inputs,
            batched=True,
            batch_size=BATCH_SIZE,
            # Chunking changes the number of rows, so every source column has
            # to be dropped; a leftover column would no longer line up with
            # the chunked output.
            remove_columns=split.column_names,
        )
        processed.set_format(
            type="torch",
            columns=self.MODEL_COLUMNS,
            output_all_columns=False,  # make sure only required tensors are kept
        )
        return processed

    def get_data(self, test_size=0.1, seed=None):
        """Split the dataset and tokenize both parts.

        Args:
            test_size: Share of rows placed in the validation split.
            seed: Optional seed forwarded to ``train_test_split``; leave as
                ``None`` for a different random split on every call.

        Returns:
            A tuple ``(splits, val_ds)`` where ``splits`` is
            ``{"train": ..., "val": ...}`` holding the tokenized datasets and
            ``val_ds`` is the raw (untokenized) validation split.
        """
        split_data = self.hf_dataset.train_test_split(test_size=test_size, seed=seed)
        train_ds = split_data["train"]
        val_ds = split_data["test"]

        prepared = {
            "train": self._prepare_split(train_ds),
            "val": self._prepare_split(val_ds),
        }
        return prepared, val_ds

In [None]:
class LongT5Inference:
    """Summarize long documents with a LongT5 base model plus a PEFT adapter.

    Texts that fit into ``chunk_size`` tokens are summarized directly. Longer
    texts are split on sentence boundaries into overlapping chunks, each chunk
    is summarized, and the concatenated chunk summaries are summarized once
    more. ``evaluate`` scores the pipeline on a dataset of text/summary records.
    """

    # Metric names that the "all" shortcut of ``evaluate`` expands to.
    ALL_METRICS = ["rouge", "bertscore", "bleurt", "meteor", "time"]

    def __init__(
        self,
        chunk_size=CHUNK_SIZE,
        overlap=OVERLAP_SIZE,
        max_gen_len=MAX_TARGET_LENGTH,
        batch_size=4,
    ):
        """
        Args:
            chunk_size: Maximum number of tokens per chunk fed to the model.
            overlap: Approximate number of tokens repeated between chunks.
            max_gen_len: ``max_new_tokens`` used for generation.
            batch_size: Number of chunks generated per forward batch.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = overlap
        self.max_len = max_gen_len
        self.batch_size = batch_size
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
        self.model = PeftModel.from_pretrained(base_model, REPO_NAME)
        self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
        self.model = self.model.to(self.device)
        self.model.eval()
        # Metric objects are loaded lazily by _load_metrics.
        self.rouge = self.bert_score = self.bleurt = self.meteor = None

    @staticmethod
    def _mean(values):
        """Arithmetic mean of ``values``; ``nan`` when there is nothing to average."""
        values = list(values)
        return sum(values) / len(values) if values else float("nan")

    def _semantic_split(self, text):
        """Split ``text`` into sentence-aligned chunks of at most ``chunk_size`` tokens.

        Consecutive chunks share trailing sentences worth roughly
        ``chunk_overlap`` tokens. A single sentence longer than ``chunk_size``
        still becomes its own chunk (it is truncated later by the tokenizer).

        Returns:
            List of non-empty chunk strings, in document order.
        """
        chunks = []
        current_chunk = []  # sentences of the chunk being assembled
        current_lengths = []  # token count of each sentence in current_chunk
        current_length = 0

        for sentence in sent_tokenize(text):
            sentence_length = len(self.tokenizer.tokenize(sentence))

            if current_length + sentence_length <= self.chunk_size:
                current_chunk.append(sentence)
                current_lengths.append(sentence_length)
                current_length += sentence_length
                continue

            # The sentence does not fit: close the current chunk. The chunk is
            # empty only when the very first sentence already exceeds the
            # budget; emitting "" in that case would waste a generation call.
            if current_chunk:
                chunks.append(" ".join(current_chunk))

            overlap_sentences, overlap_lengths, total_overlap = [], [], 0
            if self.chunk_overlap > 0:
                # Walk backwards over the closed chunk, reusing the cached
                # token counts, until enough overlap has been collected.
                for prev_sentence, prev_length in zip(
                    reversed(current_chunk), reversed(current_lengths)
                ):
                    overlap_sentences.insert(0, prev_sentence)
                    overlap_lengths.insert(0, prev_length)
                    total_overlap += prev_length
                    if total_overlap >= self.chunk_overlap:
                        break
                # Never let carried-over context crowd out the new sentence:
                # drop the oldest overlap sentences until the chunk fits.
                while (
                    overlap_sentences
                    and total_overlap + sentence_length > self.chunk_size
                ):
                    overlap_sentences.pop(0)
                    total_overlap -= overlap_lengths.pop(0)

            current_chunk = overlap_sentences + [sentence]
            current_lengths = overlap_lengths + [sentence_length]
            current_length = total_overlap + sentence_length

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def _batch_summarize(self, texts):
        """Generate one summary per entry of ``texts``, ``batch_size`` at a time.

        Args:
            texts: List of strings; each is truncated to ``chunk_size`` tokens.

        Returns:
            List of stripped summary strings, aligned with ``texts``.
        """
        summaries = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start : start + self.batch_size]

            inputs = self.tokenizer(
                batch,
                truncation=True,
                max_length=self.chunk_size,
                padding="max_length",
                return_tensors="pt",
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_new_tokens=self.max_len,
                    do_sample=False,
                    num_beams=2,
                    no_repeat_ngram_size=3,
                    repetition_penalty=2.0,
                    length_penalty=1.0,
                    early_stopping=True,
                )

            decoded = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
            summaries.extend(summary.strip() for summary in decoded)

            # Release the batch tensors before the next iteration.
            del inputs, outputs
            torch.cuda.empty_cache()

        return summaries

    def infer(self, text):
        """Summarize ``text``, chunking it first when it exceeds ``chunk_size`` tokens.

        Returns:
            The summary string.
        """
        gc.collect()
        torch.cuda.empty_cache()

        if len(self.tokenizer.tokenize(text)) <= self.chunk_size:
            # Short enough to summarize in a single pass.
            return self._batch_summarize([text])[0]

        # Summarize each sentence-aligned chunk, then summarize the summaries.
        chunk_summaries = self._batch_summarize(self._semantic_split(text))
        combined_summary = " ".join(chunk_summaries)
        return self._batch_summarize([combined_summary])[0]

    def _load_metrics(self, metric_keys):
        """Load the requested ``evaluate`` metrics that have not been loaded yet."""
        if "rouge" in metric_keys and self.rouge is None:
            self.rouge = evaluate.load("rouge")
        if "bertscore" in metric_keys and self.bert_score is None:
            self.bert_score = evaluate.load("bertscore")
        if "bleurt" in metric_keys and self.bleurt is None:
            self.bleurt = evaluate.load(
                "bleurt",
                module_type="metric",
            )
        if "meteor" in metric_keys and self.meteor is None:
            self.meteor = evaluate.load("meteor")

    def evaluate(self, dataset, metric_keys=["all"]):
        """Run ``infer`` over ``dataset`` and score predictions against references.

        Args:
            dataset: Iterable of records with ``"text"`` and ``"summary"`` keys.
                It does not need to support ``len()``.
            metric_keys: A metric name or a sequence of names out of ``rouge``,
                ``bertscore``, ``bleurt``, ``meteor``, ``time``; ``all``
                selects every one of them. The default list is never mutated.

        Returns:
            Dict mapping metric names to scores. Averages are ``nan`` when the
            dataset yielded no records.

        Raises:
            ValueError: If no metric was requested (``None`` / ``"none"``).
        """
        no_metric_message = "⚠️ Please define the metric to calculate on: `rouge`, `bertscore`, `bleurt`, `meteor`, `time`, or `all`"
        if metric_keys is None:
            raise ValueError(no_metric_message)
        keys = [metric_keys] if isinstance(metric_keys, str) else list(metric_keys)
        if keys == ["none"]:
            raise ValueError(no_metric_message)
        if "all" in keys:
            keys = list(self.ALL_METRICS)

        self._load_metrics(keys)
        gc.collect()
        torch.cuda.empty_cache()

        predictions, references = [], []
        total_time = 0.0
        for record in tqdm(dataset, desc="Evaluate"):
            references.append(record["summary"])
            # perf_counter is monotonic, so it is the right clock for intervals.
            start_time = time.perf_counter()
            predictions.append(self.infer(record["text"]))
            total_time += time.perf_counter() - start_time

            gc.collect()
            torch.cuda.empty_cache()

        results = {}
        if "rouge" in keys:
            rouge_result = self.rouge.compute(
                predictions=predictions, references=references, use_stemmer=True
            )
            results.update({f"rouge_{k}": v for k, v in rouge_result.items()})
        if "bertscore" in keys:
            bert_result = self.bert_score.compute(
                predictions=predictions, references=references, lang="en"
            )
            # Only the list-valued entries are per-sample scores to average.
            results.update(
                {
                    f"bertscore_{k}": self._mean(v)
                    for k, v in bert_result.items()
                    if isinstance(v, list)
                }
            )
        if "bleurt" in keys:
            bleurt_result = self.bleurt.compute(
                predictions=predictions, references=references
            )
            results["bleurt"] = self._mean(bleurt_result["scores"])
        if "meteor" in keys:
            meteor_result = self.meteor.compute(
                predictions=predictions, references=references
            )
            results.update(meteor_result)
        if "time" in keys:
            # Count processed records rather than calling len(dataset), which
            # is unavailable for length-less iterables and zero for empty ones.
            results["avg_inference_in_sec"] = (
                total_time / len(predictions) if predictions else float("nan")
            )
        return results

In [None]:
# Build the inference wrapper (loads the base model, the adapter and the
# tokenizer); generation batches are set to 8 here instead of the default 4.
longt5 = LongT5Inference(batch_size=8)

In [None]:
# Tokenize the SciSummNet CSV with the same tokenizer the inference model uses.
scisummnet = ScisummnetDataset(DATA_CSV, longt5.tokenizer)
# get_data() returns ({"train": ..., "val": ...}, raw validation split).
# Despite their names, `data_loader` is that dict of tokenized datasets and
# `val_df` is the untokenized validation Dataset (text/summary records).
data_loader, val_df = scisummnet.get_data()

In [None]:
scientific_paragraph = """
The vast and intricate field of neuroscience, dedicated to unraveling the mysteries of the nervous system, encompasses a multitude of disciplines, each contributing unique insights into the complexities of brain structure, function, and development. From the molecular level, where neurochemists explore the synthesis, release, and action of neurotransmitters, to the systems level, where neurophysiologists investigate the coordinated activity of neural circuits, neuroscience seeks to understand how the brain gives rise to behavior, cognition, and consciousness. The fundamental building block of the nervous system, the neuron, is a specialized cell characterized by its unique morphology, featuring a cell body (soma), dendrites that receive incoming signals, and an axon that transmits signals to other neurons or effector cells. The electrical signaling within neurons relies on the generation and propagation of action potentials, rapid changes in membrane potential driven by the precisely regulated opening and closing of ion channels, primarily voltage-gated sodium and potassium channels. These action potentials travel along the axon, and at the axon terminal, they trigger the release of neurotransmitters into the synaptic cleft, the narrow gap between neurons. Neurotransmitters, a diverse group of chemical messengers including acetylcholine, dopamine, serotonin, glutamate, and GABA, bind to specific receptors on the postsynaptic neuron, initiating a cascade of intracellular signaling events that can either excite or inhibit the postsynaptic neuron, thereby influencing its likelihood of firing an action potential. The precise balance of excitatory and inhibitory synaptic transmission is crucial for proper brain function, and disruptions in this balance have been implicated in a variety of neurological and psychiatric disorders. 
Synaptic plasticity, the ability of synapses to strengthen or weaken over time in response to changes in neural activity, is a fundamental mechanism underlying learning and memory. Long-term potentiation (LTP), a form of synaptic plasticity characterized by a persistent increase in synaptic strength following high-frequency stimulation, is widely studied as a cellular model of learning. Conversely, long-term depression (LTD), a decrease in synaptic strength following low-frequency stimulation, is thought to contribute to the forgetting of irrelevant information. The brain is organized into distinct anatomical regions, each with specialized functions. The cerebral cortex, the outermost layer of the brain, is responsible for higher-order cognitive functions such as language, memory, and reasoning. The cortex is divided into four lobes: the frontal lobe, involved in planning, decision-making, and motor control; the parietal lobe, responsible for sensory perception, spatial awareness, and attention; the temporal lobe, crucial for auditory processing, memory, and language comprehension; and the occipital lobe, dedicated to visual processing. Within each lobe, specific cortical areas are organized into functional maps, reflecting the precise representation of sensory or motor information. For example, the primary visual cortex in the occipital lobe contains a retinotopic map, where neighboring neurons respond to neighboring regions of the visual field. The motor cortex in the frontal lobe contains a somatotopic map, where different body parts are represented in a specific spatial arrangement. The intricate connectivity of the brain is organized into complex neural networks, allowing for the integration and processing of information across different brain regions. 
These networks can be studied using a variety of techniques, including tractography, which uses diffusion tensor imaging (DTI) to visualize white matter tracts, and functional connectivity analysis, which examines the temporal correlations in neural activity between different brain regions. The default mode network (DMN), a network of brain regions that is most active when a person is at rest and not engaged in any specific task, has been implicated in self-referential thought and mind-wandering. The salience network, involved in detecting and responding to salient stimuli, plays a crucial role in attention and cognitive control. The central executive network, responsible for working memory and cognitive flexibility, is essential for goal-directed behavior. The development of the nervous system is a remarkably complex and precisely orchestrated process, beginning with the formation of the neural tube during embryogenesis. Neural tube development is influenced by a complex interplay of genetic factors and signaling molecules, including sonic hedgehog (Shh), Wnt, and fibroblast growth factors (FGFs). Neurons are generated in specific regions of the developing brain through a process called neurogenesis, and they migrate to their final destinations, guided by a variety of cues, including cell adhesion molecules and extracellular matrix proteins. Axons grow and navigate to their target cells, forming synapses and establishing functional neural circuits. The formation of synapses is a highly regulated process, involving the precise matching of pre- and postsynaptic partners and the expression of specific synaptic adhesion molecules. The developing brain exhibits a high degree of plasticity, allowing it to adapt to environmental influences and experiences. Critical periods, specific time windows during development when the brain is particularly sensitive to certain types of input, are essential for the proper development of sensory and motor systems. 
For example, the visual system undergoes a critical period during early childhood, during which visual experience is necessary for the development of normal visual acuity. The study of brain disorders provides valuable insights into the functions of the nervous system. Neurological disorders, such as Alzheimer's disease, Parkinson's disease, and stroke, result from damage to or dysfunction of specific brain regions or neural circuits. Alzheimer's disease, a progressive neurodegenerative disorder, is characterized by the accumulation of amyloid plaques and neurofibrillary tangles in the brain, leading to neuronal loss and cognitive decline. Parkinson's disease, another neurodegenerative disorder, is caused by the loss of dopamine-producing neurons in the substantia nigra, resulting in motor symptoms such as tremor, rigidity, and bradykinesia. Stroke, a sudden interruption of blood flow to the brain, can cause permanent brain damage and a variety of neurological deficits, depending on the affected brain region. Psychiatric disorders, such as depression, schizophrenia, and anxiety disorders, are complex mental health conditions that involve disruptions in brain function and neural circuitry. Depression is characterized by persistent sadness, loss of interest, and changes in sleep and appetite. Schizophrenia, a severe mental disorder, is associated with hallucinations, delusions, and disorganized thinking. Anxiety disorders, such as generalized anxiety disorder and panic disorder, involve excessive fear and worry. The treatment of neurological and psychiatric disorders often involves a combination of pharmacological interventions, which target specific neurotransmitter systems or signaling pathways, and behavioral therapies, which aim to modify maladaptive thought patterns and behaviors. 
Deep brain stimulation (DBS), a neurosurgical procedure that involves implanting electrodes in specific brain regions to deliver electrical stimulation, has shown promise in the treatment of Parkinson's disease, essential tremor, and other movement disorders. Transcranial magnetic stimulation (TMS), a non-invasive technique that uses magnetic fields to stimulate or inhibit neural activity, is being investigated as a potential treatment for depression, anxiety, and other psychiatric disorders. The ongoing development of new neurotechnologies is revolutionizing our ability to study and manipulate the brain. Optogenetics, a technique that uses light to control the activity of genetically modified neurons, allows for precise control of neural circuits and has provided valuable insights into the neural basis of behavior. CLARITY, a technique that renders brain tissue transparent, allows for the visualization of neural circuits in three dimensions. Brain-computer interfaces (BCIs), which allow for direct communication between the brain and external devices, hold promise for restoring function in individuals with paralysis and other neurological conditions. The ethical implications of these new neurotechnologies are a subject of ongoing debate and discussion, raising important questions about privacy, autonomy, and the potential for misuse. The future of neuroscience holds immense promise for advancing our understanding of the brain and developing new treatments for neurological and psychiatric disorders. Continued research into the molecular, cellular, and systems-level mechanisms of brain function, coupled with the development of innovative neurotechnologies, will undoubtedly lead to groundbreaking discoveries and transformative advances in the years to come. The study of consciousness, one of the most profound and challenging problems in neuroscience, continues to captivate researchers from various disciplines. 
Defining consciousness, identifying its neural correlates, and understanding its evolutionary origins remain fundamental questions. Various theories of consciousness have been proposed, including the global workspace theory, which posits that consciousness arises from the broadcasting of information across a global network of brain regions, and the integrated information theory, which proposes that consciousness is related to the complexity and integration of information within a system. The search for the neural correlates of consciousness (NCC), the minimal set of neural events sufficient for a conscious experience, is a central focus of research in this area. Studies using neuroimaging techniques, such as fMRI and EEG, have identified several brain regions and neural networks that are thought to play a role in consciousness, including the thalamocortical system, the frontoparietal network, and the insula. The role of attention in consciousness is also a subject of intense investigation. Attention, the process of selectively focusing on certain aspects of the environment while ignoring others, is closely linked to conscious awareness. It has been proposed that attention is necessary, but not sufficient, for consciousness. Further research is needed to fully elucidate the relationship between attention and consciousness. The evolution of consciousness is another intriguing area of study. It is thought that consciousness evolved gradually over millions of years, perhaps as a way to integrate information and make more complex decisions. Comparative studies of brain structure and function in different species can provide insights into the evolutionary origins of consciousness. The development of artificial consciousness is a long-term goal of artificial intelligence research. While current AI systems can perform many tasks that once required human intelligence, they do not possess the subjective experience of consciousness. 
Creating a truly conscious artificial intelligence would require a deep understanding of the neural mechanisms underlying consciousness. The ethical implications of artificial consciousness are profound, raising questions about the rights and moral status of conscious machines. The study of the mind-brain relationship, the philosophical problem of how mental states are related to physical states in the brain, remains a central challenge in neuroscience and philosophy. Dualism, the view that the mind and brain are separate entities, has been a long-standing philosophical position, but it faces challenges from neuroscience, which has shown that mental states are closely correlated with brain activity. Materialism, the view that the mind is a product of the brain and that there is only one kind of substance, matter, is the dominant view in neuroscience today. However, materialism still faces challenges in explaining the subjective nature of conscious experience. The development of new neurophilosophical approaches, which integrate insights from neuroscience, philosophy, and psychology, may be necessary to resolve the mind-brain problem. The field of neuroethics is emerging as an important area of study, addressing the ethical, legal, and social implications of neuroscience research and its applications. Neuroethics encompasses a wide range of topics, including the ethics of brain interventions, the responsible use of neurotechnologies, and the impact of neuroscience on our understanding of free will, moral responsibility, and personal identity. The use of neurotechnologies to enhance cognitive abilities, such as memory and attention, raises ethical questions about fairness, access, and the potential for creating a "cognitive divide." The development of brain-reading technologies, which can decode mental states from brain activity, raises concerns about privacy and the potential for misuse. 
The debate over free will and moral responsibility has been reinvigorated by neuroscience research, which has shown that brain activity precedes conscious decisions. Some neuroscientists argue that this finding undermines the traditional concept of free will, while others maintain that free will is compatible with our current understanding of the brain. The implications of neuroscience for our understanding of personal identity are also being explored. Studies of patients with brain damage or neurodegenerative diseases have shown that changes in brain function can lead to profound changes in personality and self-awareness. This raises questions about the nature of the self and its relationship to the brain. The future of neuroscience will undoubtedly be shaped by interdisciplinary collaborations, bringing together researchers from diverse fields such as biology, psychology, computer science, mathematics, physics, and engineering. The development of new tools and techniques, the sharing of data and resources, and the open exchange of ideas will be essential for accelerating progress in this exciting and rapidly evolving field. The ultimate goal of neuroscience is to understand the brain in its entirety, from its molecular components to its complex functions, and to use this knowledge to improve human health and well-being.
"""

# Smoke test: time one end-to-end summarization of the sample text above.
%time longt5.infer(scientific_paragraph)

In [None]:
# Score the model on the held-out SciSummNet validation records using the
# default metric set ("all").
# NOTE(review): get_data() above is called without a seed, so this validation
# set changes between runs.
longt5.evaluate(val_df)

In [None]:
# Stream the arXiv test split so records can be consumed with `.take(...)`
# instead of loading the whole split up front.
# NOTE(review): trust_remote_code=True lets the dataset repository's loading
# script run locally — confirm the source is trusted.
arxiv = load_dataset(
    "armanc/scientific_papers",
    name="arxiv",
    trust_remote_code=True,
    streaming=True,
    split="test",
)


def convert_ARXIV_dataset(hf_dataset, X):
    """Materialize the first ``X`` records of ``hf_dataset`` as a text/summary Dataset.

    Args:
        hf_dataset: Dataset exposing ``take``; its records must provide the
            ``"article"`` and ``"abstract"`` fields.
        X: Number of records to take.

    Returns:
        A ``Dataset`` with a ``"text"`` column (the articles) and a
        ``"summary"`` column (the abstracts).
    """
    pairs = [(row["article"], row["abstract"]) for row in hf_dataset.take(X)]
    columns = {
        "text": [article for article, _ in pairs],
        "summary": [abstract for _, abstract in pairs],
    }
    return Dataset.from_dict(columns)

In [None]:
# Score the model on 50 records taken from the streamed arXiv test split,
# using the default metric set ("all").
longt5.evaluate(convert_ARXIV_dataset(arxiv, 50))