In [1]:
import spacy

# Load SpaCy's English language model
nlp = spacy.load("en_core_web_sm")

In [2]:
from nltk.corpus import wordnet
import re
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import re
from collections import defaultdict

# Function to detect sections based on common cues
def break_into_sections(article_text):
    # Define patterns for section identification
    section_patterns = {
        "Introduction/Background": [
            r"\b(Introduction|Background)\b",
            r"(important|significant role|little is known|crucial for|aim of this study)"
        ],
        "Methods": [
            r"\b(Methods|Materials and Methods|Experimental Setup|Protocol)\b",
            r"(we treated|we measured|used to|step-by-step|procedure|performed)"
        ],
        "Results/Findings": [
            r"\b(Results|Findings|Data)\b",
            r"(we observed|shows that|resulted in|analysis of|as shown in|Figure \d)"
        ],
        "Discussion/Conclusion": [
            r"\b(Discussion|Conclusion|Summary)\b",
            r"(these findings|suggests that|future work|implications|this highlights|broader context)"
        ]
    }

    # Split the article into paragraphs
    paragraphs = article_text.split("\n")

    # Create a dictionary to store sections
    sections = defaultdict(list)

    # Classify each paragraph based on the section patterns
    for paragraph in paragraphs:
        classified = False
        for section, patterns in section_patterns.items():
            for pattern in patterns:
                if re.search(pattern, paragraph, re.IGNORECASE):
                    sections[section].append(paragraph.strip())
                    classified = True
                    break
            if classified:
                break
        if not classified:
            sections["Unclassified"].append(paragraph.strip())

    return sections

# Example usage
article_text = """
Introduction
Retinoic acid (RA) plays a significant role in development. However, its impact on kidney segmentation remains unclear.

Methods
Embryos were treated with varying concentrations of RA to assess dose-dependent effects. We measured kidney segments using fluorescence microscopy.

Results
RA treatment resulted in fewer kidney segments, while cdx mutations led to patterning defects. Figure 2 shows the data supporting this observation.

Discussion
These results highlight the interplay between RA signaling and cdx genes in kidney development. This suggests potential pathways for further investigation.
"""

sections = break_into_sections(article_text)

# Display the output
for section, content in sections.items():
    print(f"\n=== {section} ===")
    print("\n".join(content))

In [3]:
from datasets import load_dataset

ds1 = load_dataset("bogdancazan/wikilarge-text-simplification") 
ds2 = load_dataset("rahular/simple-wikipedia")
ds3 = load_dataset("dongqi-me/SciNews")
ds4 = load_dataset("pszemraj/scientific_lay_summarisation-plos-norm")

In [4]:
# Convert to Pandas DataFrames (assuming the default split is 'train')
df1 = ds1['train'].to_pandas()
df2 = ds2['train'].to_pandas()
df3 = ds3['train'].to_pandas()
df4 = ds4['train'].to_pandas()


In [5]:
def clean_text(text):
    """
    Cleans the input text by removing unwanted symbols and normalizing formatting.
    """
    # Remove unwanted symbols like brackets, special characters
    text = text.replace("‘","'")
    text = re.sub(r"\[.*?\]", "", text)  # Matches [anything inside]
    text = re.sub(r"\{.*?\}", "", text)  # Matches {anything inside}
    # Remove unwanted hyphens (not between two words)
    text = re.sub(r"(?<!\w)-|-(?!\w)", "", text)  # Removes '-' unless between two word characters
    
    text = re.sub(r"(?<!\d)%", "", text)  # Remove standalone %
    
    text = re.sub(r"[^\w\s.,!?;:%'()]", "", text)  # Keep alphanumeric, punctuation, and parentheses
    
    
    
    # Remove any brackets  () with 'Figure' inside
    text = re.sub(r"\(\s*Figure.*?\)", "", text, flags=re.IGNORECASE)  # Matches (Figure ...)

    # Remove any brackets () with 'Table' inside
    text = re.sub(r"\(\s*Table.*?\)", "", text, flags=re.IGNORECASE)  # Matches (Table ...)
    
    # Remove spaces before dots (e.g., "word .") 
    text = re.sub(r"\s+\.", ".", text)
    
    text = re.sub(r"\s+", " ", text)  # Replace multiple spaces with a single space
    return text.strip()

# Apply cleaning to both 'source' and 'target' columns
def preprocess_cleaning(examples):
    examples["article"] = [clean_text(text) for text in examples["article"]]
    return examples
# Clean the dataset
cleaned_dataset = ds4.map(preprocess_cleaning, batched=True)

In [6]:
# Check the first cleaned example
print(ds4["train"][14737])
print(cleaned_dataset["train"][14737])

{'article': 'One of the most powerful techniques for studying the function of a gene is to disrupt the expression of that gene using genetic engineering strategies such as targeted recombination or viral integration of gene trap cassettes. The tremendous utility of these tools was recognized this year with the awarding of the Nobel Prize in Physiology or Medicine to Capecchi, Evans, and Smithies for their pioneering work in targeted recombination mutagenesis in mammals. Another noteworthy discovery made nearly a decade ago was the identification of a novel class of non-coding genes called microRNAs. MicroRNAs are among the largest known classes of regulatory elements with more than 1000 predicted to exist in the mouse genome. Over 50% of known microRNAs are located within introns of coding genes. Given that currently about half of the genes in mouse have been knocked out, we investigated the possibility that intronic microRNAs may have been coincidentally deleted or disrupted in some o

In [52]:
print(df1.columns)
print(df2.columns)
print(df3.columns)
print(df4.columns)

Index(['Normal', 'Simple'], dtype='object')
Index(['text'], dtype='object')
Index(['Topic', 'News_Title', 'Citation', 'Paper_URL', 'News_URL',
       'Paper_Body', 'News_Body', 'DOI'],
      dtype='object')
Index(['article', 'summary', 'section_headings', 'keywords', 'year', 'title',
       'article_length', 'summary_length'],
      dtype='object')


In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import openai

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')

# Set your OpenAI API key (if using GPT)
openai.api_key = "your_openai_api_key"

# Step 1: Tokenize the Text
def tokenize_text(text):
    sentences = sent_tokenize(text)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    return sentences, tokenized_sentences

# Step 2: Simplify Words Using Context (Word Simplification)
def simplify_words(sentences):
    simplified_sentences = []
    
    # Load a Hugging Face model for word-level simplifications (e.g., T5)
    tokenizer = AutoTokenizer.from_pretrained("t5-small")
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
    
    for sentence in sentences:
        inputs = tokenizer(f"Simplify: {sentence}", return_tensors="pt", max_length=512, truncation=True)
        outputs = model.generate(inputs.input_ids, max_length=512, num_beams=5, early_stopping=True)
        simplified_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
        simplified_sentences.append(simplified_sentence)
    
    return simplified_sentences

# Step 3: Simplify Sentences Using a Model Trained on a Good Dataset
def simplify_sentences(sentences):
    # Load a pre-trained model fine-tuned for sentence simplification
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
    
    simplified_sentences = []
    for sentence in sentences:
        inputs = tokenizer(f"Simplify this sentence: {sentence}", return_tensors="pt", max_length=512, truncation=True)
        outputs = model.generate(inputs.input_ids, max_length=512, num_beams=5, early_stopping=True)
        simplified_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
        simplified_sentences.append(simplified_sentence)
    
    return simplified_sentences

# Step 4: Simplify the Entire Text Using SimpleWiki and SciNews Dataset
def simplify_text_with_gpt(sentences):
    simplified_paragraphs = []
    
    for sentence in sentences:
        # GPT prompt for further simplification
        prompt = (
            f"Here is a scientific sentence: \"{sentence}\". "
            "Rewrite it as if explaining to a non-expert audience, keeping it simple and engaging."
        )
        
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an expert at simplifying scientific content."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=150,
        )
        simplified_text = response['choices'][0]['message']['content'].strip()
        simplified_paragraphs.append(simplified_text)
    
    return " ".join(simplified_paragraphs)

# Main Pipeline
def simplify_scientific_article(article_text):
    # Step 1: Tokenize text into sentences and words
    sentences, _ = tokenize_text(article_text)
    
    # Step 2: Simplify words based on context
    word_simplified_sentences = simplify_words(sentences)
    
    # Step 3: Simplify sentences using a good model
    sentence_simplified_sentences = simplify_sentences(word_simplified_sentences)
    
    # Step 4: Simplify the entire text for non-expert audiences using GPT-4
    final_simplified_text = simplify_text_with_gpt(sentence_simplified_sentences)
    
    return final_simplified_text

# Example Usage
if __name__ == "__main__":
    # Input scientific article text
    scientific_text = """
    The experiment demonstrates that utilizing a controlled magnetic field can enhance the efficiency of ion propulsion in spacecraft, achieving velocities previously unattainable. 
    This novel approach may revolutionize interplanetary travel by significantly reducing fuel requirements.
    """
    
    # Simplify the article
    simplified_article = simplify_scientific_article(scientific_text)
    print("Simplified Article:\n", simplified_article)


In [29]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
# import for metrics

import sacrebleu
from easse.sari import corpus_sari
# Needs jve to use.
import language_tool_python
tool = language_tool_python.LanguageTool('en-US')
# alternative:
import textstat

from bert_score import score

from rouge_score import rouge_scorer

Longer sequences
With Transformer models, there is a limit to the lengths of the sequences we can pass the models. Most models handle sequences of up to 512 or 1024 tokens, and will crash when asked to process longer sequences. There are two solutions to this problem:

Use a model with a longer supported sequence length.
Truncate your sequences.
Models have different supported sequence lengths, and some specialize in handling very long sequences. Longformer is one example, and another is LED. If you’re working on a task that requires very long sequences, we recommend you take a look at those models.

Otherwise, we recommend you truncate your sequences by specifying the max_sequence_length parameter:

Copied
sequence = sequence[:max_sequence_length]

In [None]:
from transformers import AutoTokenizer
def tokenize_function(example):
    return tokenizer(
        example["article"], 
        example["summary"], 
        truncation=True,
        max_length=10000,
        padding="max_length"
        )

checkpoint = "google/long-t5-tglobal-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_datasets = cleaned_dataset.map(tokenize_function, batched=True)

tokenized_datasets


Map:   0%|          | 0/24773 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(['article', 'summary', 'section_headings', 'keywords', 'year', 'title','article_length', 'summary_length'])
tokenized_datasets

In [None]:
from transformers import Trainer, TrainingArguments
model = AutoModelForSeq2SeqLM.from_pretrained("google/long-t5-tglobal-base")
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Fine-tune the model
trainer.train()

In [28]:
# # Set your OpenAI API key (if using GPT)
# openai.api_key = "your_openai_api_key"

# Step 1: Tokenize the Text
def tokenize_text(text):
    sentences = sent_tokenize(text)
    sentences = [sentence.replace("\n","") for sentence in sentences]
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    
    return sentences, tokenized_sentences

# Step 2: Simplify Words Using Context (Word Simplification)
def simplify_words(sentences):
    simplified_sentences = []
    
    # Load a Hugging Face model for word-level simplifications (e.g., T5)
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

    for sentence in sentences:
        inputs = tokenizer(f"Simplify words: {sentence}", return_tensors="pt", max_length=256, truncation=True)
        outputs = model.generate(inputs.input_ids, max_length=128, num_beams=5, early_stopping=True)
        
        simplified_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
        simplified_sentences.append(simplified_sentence)
    
    return simplified_sentences

# Step 3: Simplify Sentences Using a Model Trained on a Good Dataset
def simplify_sentences(sentences):
    # Load a pre-trained model fine-tuned for sentence simplification
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

    simplified_sentences = []
    for sentence in sentences:
        inputs = tokenizer(f"Simplify this sentence: {sentence}", return_tensors="pt", max_length=256, truncation=True)
        outputs = model.generate(inputs.input_ids, max_length=128, num_beams=5, early_stopping=True)
        simplified_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
        simplified_sentences.append(simplified_sentence)
    
    return simplified_sentences

def simlify_text(sentences):
    # Load model directly
    tokenizer = AutoTokenizer.from_pretrained("liamcripwell/pgdyn-simp")
    model = AutoModelForSeq2SeqLM.from_pretrained("liamcripwell/pgdyn-simp")

    simplified_sentences = []
    for sentence in sentences:
        # Tokenize the input sentence
        inputs = tokenizer.encode(sentence, return_tensors="pt", truncation=True)
        
        # Generate simplified text
        outputs = model.generate(inputs, max_length=1024, num_beams=5, early_stopping=True)
        
        # Decode the output tokens to a string
        simplified_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        simplified_sentences.append(simplified_text)
        
    return " ".join(simplified_sentences)


# # Step 4: Simplify the Entire Text Using SimpleWiki and SciNews Dataset
# def simplify_text_with_gpt(sentences):
#     simplified_paragraphs = []
    
#     for sentence in sentences:
#         # GPT prompt for further simplification
#         prompt = (
#             f"Here is a scientific sentence: \"{sentence}\". "
#             "Rewrite it as if explaining to a non-expert audience, keeping it simple and engaging."
#         )
        
#         response = openai.ChatCompletion.create(
#             model="gpt-4",
#             messages=[
#                 {"role": "system", "content": "You are an expert at simplifying scientific content."},
#                 {"role": "user", "content": prompt},
#             ],
#             max_tokens=150,
#         )
#         simplified_text = response['choices'][0]['message']['content'].strip()
#         simplified_paragraphs.append(simplified_text)
    
#     return " ".join(simplified_paragraphs)

# Main Pipeline
def simplify_scientific_article(article_text):
    
    # Step 1: Tokenize text into sentences and words
    sentences, _ = tokenize_text(article_text)
    # Step 2: Simplify words based on context
    word_simplified_words = simplify_words(sentences)
    print(" ".join(word_simplified_words))
    # Step 3: Simplify sentences using a good model
    sentence_simplified_sentences = simplify_sentences(word_simplified_words)
    print(" ".join(sentence_simplified_sentences))
    # # Step 4: Simplify the entire text for non-expert audiences using GPT-4
    # final_simplified_text = simplify_text_with_gpt(sentence_simplified_sentences)
    final_simplified_text = simlify_text(sentence_simplified_sentences)
    return final_simplified_text


In [27]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")


CUDA available: False
CUDA version: None
Device name: No GPU detected


In [None]:
# Input scientific article text
scientific_text = """
The experiment demonstrates that utilizing a controlled magnetic field can enhance the efficiency of ion propulsion in spacecraft, achieving velocities previously unattainable. 
This novel approach may revolutionize interplanetary travel by significantly reducing fuel requirements.
"""

# Simplify the article
simplified_article = simplify_scientific_article(scientific_text)
print("Simplified Article:\n", simplified_article)

# Simplify no pipeline
sentences, _ = tokenize_text(scientific_text)
simplified_article2 = simlify_text(sentences)
print(simplified_article2)

In [96]:
scientific_text = cleaned_dataset['train']['article'][14737]
references_text = cleaned_dataset['train']['summary'][14737]
# Simplify the article
simplified_article = simplify_scientific_article(scientific_text)
print("Simplified Article:\n", simplified_article)
# Simplify no pipeline
sentences, _ = tokenize_text(scientific_text)
simplified_article2 = simlify_text(sentences)
print(simplified_article2)


One of the most powerful techniques for studying the function of a gene is to disrupt the expression of that gene using genetic engineering strategies such as targeted recombination or viral integration of gene trap cassettes. The tremendous utility of these tools was recognized this year with the awarding of the Nobel Prize in Physiology or Medicine to Capecchi, Evans, and Smithies for their pioneering work in targeted recombination mutagenesis in mammals. Another noteworthy discovery made nearly a decade ago was the identification of a novel class of noncoding genes called microRNAs. MicroRNAs are among the largest known classes of regulatory elements with more than 1000 predicted to exist in the mouse genome. Over 50% of known microRNAs are located within introns of coding genes. Given that currently about half of the genes in mouse have been knocked out, we investigated the possibility that intronic microRNAs may have been coincidentally deleted or disrupted in some of these mouse 

explain metrics:
BLEU: 0: No n-gram matches between the candidate and the reference.
      1: Perfect match between the candidate and the reference.
      0.1 - 0.6: Typically seen in machine-generated texts. A higher score means better quality, but values depend on the task and how many references are available.
SARI: SARI score ranges from 0 to 1.
      0 means the simplification was poor, with a lot of added complexity or unnecessary changes.
      1 means the simplification was ideal, retaining meaning and improving readability without unnecessary deletions or additions.
FKGL: 1-5: Text is very easy to read, suited for young children.
      6-8: Easy to read, suitable for middle school students.
      9-12: Suitable for high school students.
      13-16: Complex text, for college students.
      17+: Very complex, suitable for graduate-level readers or specialists in a field.
BERTscore: 1.0: Perfect match between candidate and reference (high semantic similarity).
           0.5 - 0.9: Good similarity, but may still miss some semantic elements or have slight differences.
           0.0 - 0.4: Low similarity, indicating significant differences in meaning.


In [97]:
# Tokenizing the sentences for original and simplified texts
input_sentences, _ = tokenize_text(scientific_text)
reference_sentences, _ = tokenize_text(references_text)
output1_sentences, _ = tokenize_text(simplified_article)
output2_sentences, _ = tokenize_text(simplified_article2)

print("Input Sentences:", input_sentences)
print("Reference Sentences:", reference_sentences)
print("Output 1 Sentences:", output1_sentences)
print("Output 2 Sentences:", output2_sentences)

# Calculate BLEU (sentence-level)
bleu_score1 = sacrebleu.corpus_bleu(output1_sentences, [input_sentences]).score
bleu_score2 = sacrebleu.corpus_bleu(output2_sentences, [input_sentences]).score
print(f"BLEU score1: {bleu_score1}")
print(f"BLEU score2: {bleu_score2}")


# # Calculate SARI (sentence-level comparison) needs to be with the same amount of sentences!
# sari_score1 = corpus_sari(orig_sents=input_sentences, sys_sents=output1_sentences, refs_sents=[reference_sentences])
# sari_score2 = corpus_sari(orig_sents=input_sentences, sys_sents=output2_sentences, refs_sents=[reference_sentences])
# print(f"SARI score1: {sari_score1}")
# print(f"SARI score2: {sari_score2}")


# Calculate FKGL score
fkgl_score1 = textstat.flesch_kincaid_grade(simplified_article)
fkgl_score2 = textstat.flesch_kincaid_grade(simplified_article2)

print(f"FKGL score1: {fkgl_score1}")
print(f"FKGL score2: {fkgl_score2}")


# # Calculate BERTScore needs to be with the same amount of sentences!
# P1, R1, F1_1 = score(output1_sentences, reference_sentences, lang="en", rescale_with_baseline=True)
# P2, R2, F1_2 = score(output2_sentences, reference_sentences, lang="en", rescale_with_baseline=True)
# print(f"BERTScore (F1)1: {F1_1.mean().item()}")
# print(f"BERTScore (F1)2: {F1_2.mean().item()}")


# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute ROUGE scores
scores = scorer.score(" ".join(reference_sentences)," ".join(output1_sentences) )

# Print the scores
print("ROUGE-1 (unigrams):", scores['rouge1'])
print("ROUGE-2 (bigrams):", scores['rouge2'])
print("ROUGE-L (longest common subsequence):", scores['rougeL'])

# Compute ROUGE scores
scores = scorer.score(" ".join(reference_sentences)," ".join(output2_sentences) )

# Print the scores
print("ROUGE-1 (unigrams):", scores['rouge1'])
print("ROUGE-2 (bigrams):", scores['rouge2'])
print("ROUGE-L (longest common subsequence):", scores['rougeL'])

Input Sentences: ['One of the most powerful techniques for studying the function of a gene is to disrupt the expression of that gene using genetic engineering strategies such as targeted recombination or viral integration of gene trap cassettes.', 'The tremendous utility of these tools was recognized this year with the awarding of the Nobel Prize in Physiology or Medicine to Capecchi, Evans, and Smithies for their pioneering work in targeted recombination mutagenesis in mammals.', 'Another noteworthy discovery made nearly a decade ago was the identification of a novel class of noncoding genes called microRNAs.', 'MicroRNAs are among the largest known classes of regulatory elements with more than 1000 predicted to exist in the mouse genome.', 'Over 50% of known microRNAs are located within introns of coding genes.', 'Given that currently about half of the genes in mouse have been knocked out, we investigated the possibility that intronic microRNAs may have been coincidentally deleted or

In [99]:
print(len(simplified_article))
print(len(simplified_article2))
print(len(scientific_text))

2968
3122
5407


In [100]:
simplified_article

" One of the most powerful ways for studying the working of a gene is to stop the expression of that gene.  This year, the Nobel Prize in Physiology or Medicine was given to Capecchi, Evans, and Smithies for their pioneering work in targeted recombination in mammals.  Another important discovery made nearly 10 years ago was the discovery of a new class of noncoding genes.  There are more than 1000 predicted to exist in the mouse genome.  More than half of these genes are located within computer coding genes.  Many of the genes in mouse have been knocked out.  We have searched for cases where a microhet was located within or near a loci, or biological community.  Our results draw attention to the need for careful planning in future studies to minimize the accidental disruption of micropronAs.  Future studies may need to be reexamined to determine if loss of a microrecord helps to cause other problems.  In the mouse, changing a gene usually happens using a gene trap or targeted homologou

In [1]:
from transformers import pipeline

# Load a summarization pipeline
summarizer = pipeline("summarization", model="google/pegasus-xsum")

def create_headline(text):
    summary = summarizer(text, max_length=20, min_length=5, do_sample=False)
    return summary[0]['summary_text']

def structure_as_newspaper(simplified_text):
    headline = create_headline(simplified_text)
    return f"HEADLINE: {headline}\n\n{simplified_text}"

print(structure_as_newspaper(simplified_article))

  from .autonotebook import tqdm as notebook_tqdm
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


NameError: name 'simplified_article' is not defined

In [1]:
import cv2

cap = cv2.VideoCapture(0)

# Check if the camera is opened successfully
if not cap.isOpened():
    print("Error: Could not open webcam.")
    exit()

# Set a known working resolution (e.g., 640x480)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

while True:
    ret, frame = cap.read()
    
    if not ret:
        print("Error: Failed to grab frame.")
        break

    cv2.imshow('Webcam Feed', frame)

    # Exit the loop on pressing 'q'
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


ModuleNotFoundError: No module named 'cv2'