<a href="https://colab.research.google.com/github/MayankKhoria2007/CSESA-PS5-SUMMARIZER/blob/main/SUMMARIZERT5HYBRIDWITHTEXTRANK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Presumably stops transformers from pulling in TensorFlow; it is set here, ahead of the
# first transformers import further down — TODO confirm the installed version honors it.
os.environ["TRANSFORMERS_NO_TF"]="1"

In [None]:
import nltk
import torch
import numpy as np

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer



In [None]:
# Fetch every NLTK resource the tokenizing / lemmatizing helpers in this notebook rely on.
for nltk_resource in ('punkt_tab', 'wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(nltk_resource)


In [None]:
from datasets import load_dataset
# Load configuration "a" of the BigPatent dataset, limited to the first 6% of its train split.
ds=load_dataset("NortheasternUniversity/big_patent","a",split="train[:6%]")

In [None]:
from transformers import AutoTokenizer
# Checkpoint identifier; the tokenizer is loaded from it right here.
model_name="t5-small"
tokenizer=AutoTokenizer.from_pretrained(model_name)

In [None]:
# Task prefix prepended to every source document. T5 task prefixes end with ": " so the
# instruction is tokenized separately from the first word of the document.
prefix="summarize: "
def preprocess(examples):
  """Tokenize a batch of patent records for seq2seq training.

  Args:
    examples: batch mapping with a "description" list (source documents) and an
      "abstract" list (target summaries).

  Returns:
    The tokenizer output for the prefixed descriptions (truncated to 512 tokens) with an
    extra "labels" entry holding the abstract token ids (truncated to 128 tokens).

  No padding is applied here: sequences are left at their natural length so the data
  collator can pad each batch dynamically and fill label padding with -100. Padding the
  labels to max_length with real pad ids would make the loss count padding positions.
  """
  sources=[prefix+text for text in examples["description"]]

  model_inputs=tokenizer(sources,max_length=512,truncation=True)
  labels=tokenizer(text_target=examples["abstract"],max_length=128,truncation=True)
  model_inputs["labels"]=labels["input_ids"]
  return model_inputs
# Hold out 20% of the raw records for the final generation-based evaluation. The split is
# seeded so the held-out set is the same on every run.
raw_dataset=ds.train_test_split(test_size=0.2,seed=42)
eval_dataset=raw_dataset["test"]
# Tokenize ONLY the training portion. Tokenizing all of `ds` and re-splitting it at random
# would put held-out evaluation records into the training data.
toeknized_ds=raw_dataset["train"].map(preprocess,batched=True,remove_columns=ds.column_names)

In [None]:

# Split the tokenized data into train / validation parts (80/20) with a fixed seed.
# After the split the name holds a DatasetDict (a dict subclass), which has no
# train_test_split method, so the guard keeps this cell safe to re-run.
if not isinstance(toeknized_ds,dict):
  toeknized_ds=toeknized_ds.train_test_split(test_size=0.2,seed=42)


In [None]:
!pip install evaluate
!pip install rouge_score

In [None]:
import evaluate
import numpy as np
# ROUGE metric object; compute_metrics below calls its compute() method.
rouge=evaluate.load("rouge")
def compute_metrics(eval_pred):
  """Compute ROUGE scores and mean generated length for a batch of predictions.

  Args:
    eval_pred: pair (predictions, labels) of token-id arrays. `predictions` may also be
      a tuple whose first element holds the token ids.

  Returns:
    Dict of metric name -> value rounded to 4 decimals, including "gen_len" (mean number
    of non-pad tokens per prediction).
  """
  predictions,labels=eval_pred
  # Some models return extra outputs next to the generated ids; keep only the ids.
  if isinstance(predictions,tuple):
    predictions=predictions[0]
  pad_id=tokenizer.pad_token_id
  # -100 marks padding and is not a valid token id, so swap it for the pad id before
  # decoding; this applies to predictions as well as labels.
  predictions=np.where(np.asarray(predictions)!=-100,predictions,pad_id)
  labels=np.where(np.asarray(labels)!=-100,labels,pad_id)
  decoded_preds=tokenizer.batch_decode(predictions,skip_special_tokens=True)
  decoded_labels=tokenizer.batch_decode(labels,skip_special_tokens=True)
  result=rouge.compute(predictions=decoded_preds,references=decoded_labels,use_stemmer=True)
  result["gen_len"]=np.mean(np.count_nonzero(predictions!=pad_id,axis=1))
  return {k:round(float(v),4) for k,v in result.items()}

In [None]:
from transformers import DataCollatorForSeq2Seq
# Pads each batch to its longest sequence and fills label padding with -100.
# The `model` argument expects a model object, not a checkpoint name string, so the
# string that used to be passed here is dropped.
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer,padding="longest",label_pad_token_id=-100)

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the seq2seq model weights for the checkpoint named by model_name.
model=AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Every training hyperparameter lives in one mapping so it can be inspected or logged
# before the arguments object is built.
training_config = {
    "output_dir": "./t5-bigpatent",
    "eval_strategy": "epoch",
    "logging_steps": 100,
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "save_total_limit": 2,
    "num_train_epochs": 4,
    "predict_with_generate": False,
    "remove_unused_columns": False,
    "report_to": "none",
    "fp16": False,
}
training_args = Seq2SeqTrainingArguments(**training_config)

# No compute_metrics callback is supplied to the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=toeknized_ds["train"],
    eval_dataset=toeknized_ds["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=None,
)

trainer.train()

In [None]:
!pip install bert_score
bert=evaluate.load("bertscore")

In [None]:
# Write the model and its tokenizer into the same directory so they can be reloaded together.
save_dir = "./t5_bigpatent_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


In [None]:
# Shared lemmatizer instance and English stop-word set, built once at module level and
# read by lemmatize_sentence.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def lemmatize_sentence(sentence):
    """Return `sentence` reduced to space-joined lemmas.

    The sentence is lower-cased and tokenized; tokens that are not purely alphabetic or
    that appear in `stop_words` are discarded, and the rest are lemmatized.
    """
    kept = []
    for token in word_tokenize(sentence.lower()):
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)


In [None]:
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
def textrank_extractive_lemmatized(text, k=7):
    """Select the `k` highest-ranked sentences of `text` with TextRank.

    Sentences are lemmatized, embedded with TF-IDF, linked by cosine similarity and
    scored with PageRank. The chosen sentences are returned in their original document
    order, joined by single spaces.

    Args:
        text: document to extract from.
        k: maximum number of sentences to keep.

    Returns:
        The extracted sentences as one string; "" when `text` contains no sentences.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to rank; TF-IDF would raise on an empty corpus.
        return ""
    if len(sentences) <= k:
        # Every sentence would be selected anyway, so skip the ranking work.
        return " ".join(sentences)

    processed = [lemmatize_sentence(s) for s in sentences]
    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        tfidf = vectorizer.fit_transform(processed)
    except ValueError:
        # Empty vocabulary (e.g. only numbers / stop words): fall back to the lead sentences.
        return " ".join(sentences[:k])

    similarity_matrix = cosine_similarity(tfidf)
    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    # Rank by (score, sentence text, position) descending, then restore document order.
    ranked_indices = sorted(
        range(len(sentences)),
        key=lambda i: (scores[i], sentences[i], i),
        reverse=True,
    )
    selected = sorted(ranked_indices[:k])
    return " ".join(sentences[i] for i in selected)




In [None]:

from transformers import pipeline

# Build the abstractive stage from the fine-tuned checkpoint saved on disk; run it on the
# first CUDA device when one is available, otherwise on CPU.
trained_model_dir = "./t5_bigpatent_model"
pipeline_device = 0 if torch.cuda.is_available() else -1
summarizer = pipeline(
    "summarization",
    model=trained_model_dir,
    tokenizer=trained_model_dir,
    device=pipeline_device,
)

def hybrid_summarizer_pipeline(text, k=7, max_chars=1500):
    """Summarize `text` with an extractive TextRank pass followed by the seq2seq model.

    Args:
        text: document to summarize.
        k: number of sentences the extractive stage keeps.
        max_chars: character cap applied to the extracted text before generation.

    Returns:
        The generated summary string, or "" when the extractive stage yields no text.
    """
    # Extractive phase
    extracted_text = textrank_extractive_lemmatized(text, k=k)[:max_chars]
    if not extracted_text.strip():
        # Nothing to feed the model; avoid generating from the bare prefix.
        return ""

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        torch.cuda.empty_cache()
    # Abstractive phase using pipeline. The module-level `prefix` is reused so the prompt
    # has exactly the form the model was fine-tuned on; truncation=True guards against
    # inputs whose token count exceeds the model limit despite the character cap.
    with torch.no_grad():
        summary = summarizer(
            prefix + extracted_text,
            max_length=150,
            min_length=40,
            num_beams=2,
            do_sample=False,
            truncation=True,
        )
    if cuda_available:
        torch.cuda.empty_cache()
    return summary[0]["summary_text"]



# Generate summaries for (at most) the first 200 held-out examples and collect them
# next to their reference abstracts. The count is clamped so a smaller evaluation split
# does not make select() fail with an out-of-range index.
predictions = []
references = []
num_eval_examples = min(200, len(eval_dataset))

for ex in eval_dataset.select(range(num_eval_examples)):
    predictions.append(hybrid_summarizer_pipeline(ex["description"]))
    references.append(ex["abstract"])



In [None]:
# evaluate's bertscore metric returns a dict with per-example score lists under
# "precision", "recall" and "f1" (plus a "hashcode" entry), not a (P, R, F1) tuple of
# tensors, so the scores are read by key and averaged with numpy.
bert_results = bert.compute(predictions=predictions, references=references,lang='en')
print(f"BERTScore Precision:{np.mean(bert_results['precision']):.4f}")
print(f"BERTScore Recall:{np.mean(bert_results['recall']):.4f}")
print(f"BERTScore F1:{np.mean(bert_results['f1']):.4f}")


In [None]:
# Smoke test: run the hybrid summarizer on a short hand-written paragraph and print the result.
t="Artificial intelligence systems are increasingly being integrated into modern technological solutions across multiple industries. These systems rely on advanced machine learning algorithms that enable computers to learn patterns from large volumes of data and make intelligent decisions. In recent years, deep learning techniques based on neural networks have shown significant improvements in tasks such as image recognition, natural language processing, and speech analysis. However, training deep learning models requires substantial computational resources, including high-performance processors and graphical processing units. The growing size of datasets and model parameters has made scalability a major challenge. To address this issue, researchers have proposed optimization techniques such as model compression, distributed training, and hybrid processing pipelines. Hybrid systems combine multiple approaches to improve efficiency and performance. For example, in text summarization, extractive methods can be used to identify the most relevant portions of a document, while abstractive models generate concise and fluent summaries. This combination reduces input length while preserving essential information. Such hybrid approaches are particularly useful for processing long technical documents, including patents and research articles. By integrating statistical methods with neural models, hybrid systems achieve better accuracy, faster processing, and improved adaptability to real-world applications."
print(hybrid_summarizer_pipeline(t))