In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import nltk
from nltk.tokenize import sent_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# our first task is to find meaningful sentences/para from blog

> lets test with sample blog



In [2]:
nltk.download('punkt')

def split_into_meaningful_paragraphs(text, min_sentences=3, max_sentences=10, similarity_threshold=0.3):
    #split the text into sentences
    sentences = sent_tokenize(text)
    #initialize paragraphs
    paragraphs = []
    current_paragraph = []

    #TF-IDF vectorizer for calculating sentence similarity
    vectorizer = TfidfVectorizer()
    for i, sentence in enumerate(sentences):
        current_paragraph.append(sentence)

        #check if we have enough sentences to form a paragraph
        if len(current_paragraph) >= min_sentences:
            #calculate similarity between the current sentence and the next one
            if i + 1 < len(sentences):
                current_vec = vectorizer.fit_transform([' '.join(current_paragraph), sentences[i+1]])
                similarity = cosine_similarity(current_vec[0], current_vec[1])[0][0]

                #if similarity is low or we've reached max sentences, start a new paragraph
                if similarity < similarity_threshold or len(current_paragraph) >= max_sentences:
                    paragraphs.append(' '.join(current_paragraph))
                    current_paragraph = []

    #add any remaining sentences as the last paragraph
    if current_paragraph:
        paragraphs.append(' '.join(current_paragraph))

    return paragraphs

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
summary = """The Future of AI in Healthcare
Introduction: AI's Revolutionary Impact on Healthcare
Artificial Intelligence (AI) is poised to revolutionize the healthcare industry. From diagnosis to treatment planning, AI technologies are being integrated into various aspects of medical care. This integration promises to improve patient outcomes, reduce costs, and enhance the efficiency of healthcare systems worldwide.
Early Detection and Diagnosis
One of the most promising applications of AI in healthcare is in early detection and diagnosis of diseases. Machine learning algorithms can analyze medical images, such as X-rays, MRIs, and CT scans, with remarkable accuracy. These AI systems can often detect subtle abnormalities that might be overlooked by human radiologists, leading to earlier diagnoses and potentially life-saving interventions.
Personalized Treatment Plans
AI is also making significant strides in developing personalized treatment plans. By analyzing vast amounts of patient data, including genetic information, medical history, and lifestyle factors, AI algorithms can recommend tailored treatment options. This approach, known as precision medicine, allows healthcare providers to offer more effective and targeted therapies, minimizing side effects and improving overall patient care.
Drug Discovery and Development
The pharmaceutical industry is leveraging AI to accelerate drug discovery and development processes. Machine learning models can predict how potential drug compounds will interact with biological targets, significantly reducing the time and cost associated with traditional drug development methods. This could lead to faster development of new treatments for a wide range of diseases.
Administrative Efficiency and Cost Reduction
Beyond clinical applications, AI is also being used to streamline administrative tasks in healthcare settings. Natural language processing and machine learning algorithms can automate tasks such as medical coding, billing, and appointment scheduling. This not only reduces the administrative burden on healthcare professionals but also helps to minimize errors and improve overall efficiency.
Challenges and Ethical Considerations
While the potential benefits of AI in healthcare are immense, there are also significant challenges and ethical considerations to address. Issues such as data privacy, algorithmic bias, and the need for human oversight in AI-driven decision-making processes must be carefully managed. Ensuring that AI technologies are developed and implemented responsibly will be crucial for maintaining public trust and maximizing the benefits of these innovations.
Conclusion: A Collaborative Future
The future of AI in healthcare is not about replacing human medical professionals, but rather about augmenting their capabilities. By combining the analytical power of AI with the experience and intuition of healthcare providers, we can create a more effective, efficient, and patient-centered healthcare system. As AI continues to evolve, its impact on healthcare will undoubtedly grow, ushering in a new era of medical innovation and improved patient care."""

In [5]:
structured_blog = split_into_meaningful_paragraphs(summary)
print(structured_blog)

['But I also blame the people of the United States who can’t put their plans on hold for a little longer in order to remain safe, and for the health and well-being of others. Consistently masking up, only going out when absolutely necessary, and avoiding crowds goes a long way towards ending this pandemic. Every time I think people can‘t amaze me further with their stupidity, I’m sadly mistaken.', 'As of today, there have been 13.7m cases of the novel coronavirus, with almost 270k deaths. We’ve been stuck at home since March, invading one another’s space, constantly under each other. We even avoided a small Thanksgiving get-together to ensure we remained virus free.', 'But for all that is holy, stay home this holiday season and plan the vacations for when it’ll be safer. I know it sounds crazy, but it‘s true. We have a lot of people who are sick, and we’re not the only ones who are.', 'And we have a great deal of friends and family that are sick. And that’d be great if we could just st

# Lets structure the paras

In [6]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [3]:
import pandas as pd
from datasets import Dataset, load_metric, load_dataset
from transformers import LEDTokenizer, LEDForConditionalGeneration
import torch

sample_paragraph = "Virat kohli is an inspiration to many people around the world"
data = [sample_paragraph]
df = pd.DataFrame(data, columns=['Paragraph'])
df["Paragraph"][0]
df_test = Dataset.from_pandas(df)
df_test

Dataset({
    features: ['Paragraph'],
    num_rows: 1
})

In [5]:
tokenizer = LEDTokenizer.from_pretrained("/content/drive/MyDrive/checkpoint-100")
model = LEDForConditionalGeneration.from_pretrained("/content/drive/MyDrive/checkpoint-100").to("cuda").half()

def generate_answer(batch):
  inputs_dict = tokenizer(batch["Paragraph"], padding="max_length", max_length=512, return_tensors="pt", truncation=True)
  input_ids = inputs_dict.input_ids.to("cuda")
  attention_mask = inputs_dict.attention_mask.to("cuda")
  global_attention_mask = torch.zeros_like(attention_mask)

  predicted_abstract_ids = model.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
  batch["generated_heading"] = tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
  return batch

result = df_test.map(generate_answer, batched=True, batch_size=2)
result['generated_heading']

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Input ids are automatically padded from 512 to 1024 to be a multiple of `config.attention_window`: 1024


['Virat kohli']

# Yeah our heading generator working !!

In [10]:
def structure_blog(blog):
    paragraphs = split_into_meaningful_paragraphs(blog) # Assuming this function exists and works as expected
    structured_blog = []
    for paragraph in paragraphs:
        # Create a batch dictionary for the current paragraph
        batch = {"Paragraph": paragraph}
        heading = generate_answer(batch)["generated_heading"][0] # Access the generated heading
        structured_blog.append(f"## {heading}\n\n{paragraph}")
    return "\n\n".join(structured_blog)

In [13]:
s = structure_blog(summary)
print(s)

## The Future of AI in Healthcare

The Future of AI in Healthcare
Introduction: AI's Revolutionary Impact on Healthcare
Artificial Intelligence (AI) is poised to revolutionize the healthcare industry. From diagnosis to treatment planning, AI technologies are being integrated into various aspects of medical care. This integration promises to improve patient outcomes, reduce costs, and enhance the efficiency of healthcare systems worldwide. Early Detection and Diagnosis
One of the most promising applications of AI in healthcare is in early detection and diagnosis of diseases.

## Artificial Intelligence (AI) is making significant strides in developing personalized treatment plans.

Machine learning algorithms can analyze medical images, such as X-rays, MRIs, and CT scans, with remarkable accuracy. These AI systems can often detect subtle abnormalities that might be overlooked by human radiologists, leading to earlier diagnoses and potentially life-saving interventions. Personalized Treat

spliting done :) heading done :) Lets Move on to the questions generation

We'll gather all the functions generate questions for an entirely new text.


# Main

In [68]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import nltk
from nltk.tokenize import sent_tokenize

# Download necessary NLTK data
nltk.download('punkt', quiet=True)

class QuestionGenerator:
    def __init__(self, model_name="valhalla/t5-base-qg-hl"):
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def generate_questions(self, text, num_questions=5):
        sentences = sent_tokenize(text)
        questions = []

        for sentence in sentences:
            inputs = self.tokenizer.encode_plus(
                f"generate question: {sentence}",
                max_length=512,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            ).to(self.device)

            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=64,
                num_return_sequences=1,
                num_beams=4,
                early_stopping=True
            )

            question = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            questions.append({"question": question, "answer": sentence})

            if len(questions) >= num_questions:
                break

        return questions[:num_questions]

def main():
    text = """
    Forrest Gump is a 1994 American comedy-drama film directed by Robert Zemeckis and written by Eric Roth.
    It is based on the 1986 novel of the same name by Winston Groom and stars Tom Hanks, Robin Wright, Gary Sinise,
    Mykelti Williamson and Sally Field. The story depicts several decades in the life of Forrest Gump (Hanks),
    a slow-witted but kind-hearted man from Alabama who witnesses and unwittingly influences several defining
    historical events in the 20th century United States. The film differs substantially from the novel.
    """

    qg = QuestionGenerator()
    questions = qg.generate_questions(text, num_questions=5)

    for i, qa in enumerate(questions, 1):
        print(f"Question {i}:")
        print(f"Q: {qa['question']}")
        print(f"A: {qa['answer']}\n")

if __name__ == "__main__":
    main()

Question 1:
Q: What is the name of the film that starred Robert Zemeckis?
A: 
    Forrest Gump is a 1994 American comedy-drama film directed by Robert Zemeckis and written by Eric Roth.

Question 2:
Q: What is the name of the movie that stars Tom Hanks?
A: It is based on the 1986 novel of the same name by Winston Groom and stars Tom Hanks, Robin Wright, Gary Sinise, 
    Mykelti Williamson and Sally Field.

Question 3:
Q: What is the name of Gump's character?
A: The story depicts several decades in the life of Forrest Gump (Hanks), 
    a slow-witted but kind-hearted man from Alabama who witnesses and unwittingly influences several defining 
    historical events in the 20th century United States.

Question 4:
Q: What is the main difference between the film and the novel?
A: The film differs substantially from the novel.



# HERE WE GO! ALL IN ONE INFERENCE

In [69]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, LEDTokenizer, LEDForConditionalGeneration
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import Dataset
import pandas as pd

# Download necessary NLTK data
nltk.download('punkt', quiet=True)

class BlogProcessor:
    def __init__(self):
        self.qg_model_name = "valhalla/t5-base-qg-hl"
        self.qg_tokenizer = T5Tokenizer.from_pretrained(self.qg_model_name)
        self.qg_model = T5ForConditionalGeneration.from_pretrained(self.qg_model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.qg_model.to(self.device)

        # Initialize heading generation model
        self.hg_tokenizer = LEDTokenizer.from_pretrained("/content/drive/MyDrive/checkpoint-100")
        self.hg_model = LEDForConditionalGeneration.from_pretrained("/content/drive/MyDrive/checkpoint-100").to(self.device).half()

    def split_into_meaningful_paragraphs(self, text, min_sentences=3, max_sentences=10, similarity_threshold=0.3):
        sentences = sent_tokenize(text)
        paragraphs = []
        current_paragraph = []
        vectorizer = TfidfVectorizer()

        for i, sentence in enumerate(sentences):
            current_paragraph.append(sentence)

            if len(current_paragraph) >= min_sentences:
                if i + 1 < len(sentences):
                    current_vec = vectorizer.fit_transform([' '.join(current_paragraph), sentences[i+1]])
                    similarity = cosine_similarity(current_vec[0], current_vec[1])[0][0]

                    if similarity < similarity_threshold or len(current_paragraph) >= max_sentences:
                        paragraphs.append(' '.join(current_paragraph))
                        current_paragraph = []

        if current_paragraph:
            paragraphs.append(' '.join(current_paragraph))

        return paragraphs

    def generate_heading(self, paragraph):
        df = pd.DataFrame([paragraph], columns=['Paragraph'])
        dataset = Dataset.from_pandas(df)

        inputs_dict = self.hg_tokenizer(paragraph, padding="max_length", max_length=512, return_tensors="pt", truncation=True)
        input_ids = inputs_dict.input_ids.to(self.device)
        attention_mask = inputs_dict.attention_mask.to(self.device)
        global_attention_mask = torch.zeros_like(attention_mask)

        predicted_heading_ids = self.hg_model.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
        heading = self.hg_tokenizer.decode(predicted_heading_ids[0], skip_special_tokens=True)

        return heading

    def generate_questions(self, text, num_questions=5):
        sentences = sent_tokenize(text)
        questions = []

        for sentence in sentences:
            inputs = self.qg_tokenizer.encode_plus(
                f"generate question: {sentence}",
                max_length=512,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            ).to(self.device)

            outputs = self.qg_model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=64,
                num_return_sequences=1,
                num_beams=4,
                early_stopping=True
            )

            question = self.qg_tokenizer.decode(outputs[0], skip_special_tokens=True)
            questions.append({"question": question, "answer": sentence})

            if len(questions) >= num_questions:
                break

        return questions[:num_questions]

    def process_blog(self, summary):
        # Split into paragraphs
        paragraphs = self.split_into_meaningful_paragraphs(summary)

        structured_blog = []
        all_questions = []

        for paragraph in paragraphs:
            # Generate heading
            heading = self.generate_heading(paragraph)

            # Generate questions
            questions = self.generate_questions(paragraph, num_questions=2)

            structured_blog.append(f"## {heading}\n\n{paragraph}")
            all_questions.extend(questions)

        # Combine everything
        final_blog = "\n\n".join(structured_blog)
        final_blog += "\n\n## Questions for Review\n"
        for i, qa in enumerate(all_questions, 1):
            final_blog += f"\n{i}. Q: {qa['question']}\n   A: {qa['answer']}\n"

        return final_blog

def main():
    summary = """
    The Future of AI in Healthcare
    Introduction: AI's Revolutionary Impact on Healthcare
    Artificial Intelligence (AI) is poised to revolutionize the healthcare industry. From diagnosis to treatment planning, AI technologies are being integrated into various aspects of medical care. This integration promises to improve patient outcomes, reduce costs, and enhance the efficiency of healthcare systems worldwide.
    Early Detection and Diagnosis
    One of the most promising applications of AI in healthcare is in early detection and diagnosis of diseases. Machine learning algorithms can analyze medical images, such as X-rays, MRIs, and CT scans, with remarkable accuracy. These AI systems can often detect subtle abnormalities that might be overlooked by human radiologists, leading to earlier diagnoses and potentially life-saving interventions.
    Personalized Treatment Plans
    AI is also making significant strides in developing personalized treatment plans. By analyzing vast amounts of patient data, including genetic information, medical history, and lifestyle factors, AI algorithms can recommend tailored treatment options. This approach, known as precision medicine, allows healthcare providers to offer more effective and targeted therapies, minimizing side effects and improving overall patient care.
    Drug Discovery and Development
    The pharmaceutical industry is leveraging AI to accelerate drug discovery and development processes. Machine learning models can predict how potential drug compounds will interact with biological targets, significantly reducing the time and cost associated with traditional drug development methods. This could lead to faster development of new treatments for a wide range of diseases.
    Administrative Efficiency and Cost Reduction
    Beyond clinical applications, AI is also being used to streamline administrative tasks in healthcare settings. Natural language processing and machine learning algorithms can automate tasks such as medical coding, billing, and appointment scheduling. This not only reduces the administrative burden on healthcare professionals but also helps to minimize errors and improve overall efficiency.
    Challenges and Ethical Considerations
    While the potential benefits of AI in healthcare are immense, there are also significant challenges and ethical considerations to address. Issues such as data privacy, algorithmic bias, and the need for human oversight in AI-driven decision-making processes must be carefully managed. Ensuring that AI technologies are developed and implemented responsibly will be crucial for maintaining public trust and maximizing the benefits of these innovations.
    Conclusion: A Collaborative Future
    The future of AI in healthcare is not about replacing human medical professionals, but rather about augmenting their capabilities. By combining the analytical power of AI with the experience and intuition of healthcare providers, we can create a more effective, efficient, and patient-centered healthcare system. As AI continues to evolve, its impact on healthcare will undoubtedly grow, ushering in a new era of medical innovation and improved patient care.
    """

    processor = BlogProcessor()
    processed_blog = processor.process_blog(summary)
    print(processed_blog)

if __name__ == "__main__":
    main()

## Read more about AI in healthcare.


    The Future of AI in Healthcare
    Introduction: AI's Revolutionary Impact on Healthcare
    Artificial Intelligence (AI) is poised to revolutionize the healthcare industry. From diagnosis to treatment planning, AI technologies are being integrated into various aspects of medical care. This integration promises to improve patient outcomes, reduce costs, and enhance the efficiency of healthcare systems worldwide. Early Detection and Diagnosis
    One of the most promising applications of AI in healthcare is in early detection and diagnosis of diseases.

## Artificial Intelligence (AI) is revolutionizing medical imaging.

Machine learning algorithms can analyze medical images, such as X-rays, MRIs, and CT scans, with remarkable accuracy. These AI systems can often detect subtle abnormalities that might be overlooked by human radiologists, leading to earlier diagnoses and potentially life-saving interventions. Personalized Treatment Plans
    AI 