In [1]:
# Install required packages
!pip install torch transformers networkx pandas requests beautifulsoup4 spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import torch
from transformers import AutoTokenizer
from transformers import T5ForConditionalGeneration
import networkx as nx
import pandas as pd
import requests
from bs4 import BeautifulSoup
import spacy
import re
from tqdm import tqdm
import matplotlib.pyplot as plt

In [3]:
def process_text_for_rebel(text):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.split()) > 5]
    return sentences

In [4]:
def initialize_rebel():
    """Initialize REBEL model and tokenizer"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load REBEL specific model and tokenizer
    model_name = "Babelscape/rebel-large"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

    return device, tokenizer, model

def process_text_for_rebel(text):
    """Prepare text for REBEL by splitting into appropriate chunks"""
    # REBEL works best with sentence-level extraction
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)

    # Get sentences and clean them
    sentences = []
    for sent in doc.sents:
        # Clean the sentence
        clean_sent = re.sub(r'\s+', ' ', sent.text).strip()
        if len(clean_sent.split()) > 5:  # Only process sentences with more than 5 words
            sentences.append(clean_sent)

    return sentences

def extract_rebel_relations(sentence, tokenizer, model, device):
    inputs = tokenizer([sentence], return_tensors="pt", max_length=512, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model.generate(**inputs, max_length=256, num_beams=3, num_return_sequences=1)

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    relations = []

    for triple in decoded_output.split(')('):
        triple = triple.replace('(', '').replace(')', '')
        parts = triple.split(';')
        if len(parts) == 3:
            relations.append({
                'subject': parts[0].strip(),
                'relation': parts[1].strip(),
                'object': parts[2].strip()
            })

    return relations

def fetch_webpage_content(url):
    """Fetch and clean content from a webpage"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()

        # Extract text content
        text = soup.get_text(separator=' ')
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

def build_knowledge_graph(relations):
    """Build a NetworkX graph from extracted relations"""
    G = nx.DiGraph()

    for rel in relations:
        subject = rel['subject']
        relation = rel['relation']
        obj = rel['object']

        # Add nodes and edge
        G.add_node(subject)
        G.add_node(obj)
        G.add_edge(subject, obj, relation=relation)

    return G

def visualize_graph(G):
    """Visualize the knowledge graph"""
    plt.figure(figsize=(15, 10))
    pos = nx.spring_layout(G, k=1, iterations=50)

    nx.draw_networkx_nodes(G, pos, node_size=2000, node_color='lightblue')
    nx.draw_networkx_edges(G, pos, edge_color='gray', arrows=True)
    nx.draw_networkx_labels(G, pos, font_size=8)

    edge_labels = nx.get_edge_attributes(G, 'relation')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6)

    plt.title("Parkinson's Disease Knowledge Graph")
    plt.axis('off')
    plt.tight_layout()
    plt.show()

In [5]:
def main():
    device, tokenizer, model = initialize_rebel()
    all_relations = []

    urls = [
        "https://www.ninds.nih.gov/health-information/disorders/parkinsons-disease",
        "https://www.apdaparkinson.org/living-with-parkinsons-disease/treatment-medication/medication/"
    ]

    for url in urls:
        content = fetch_webpage_content(url)
        if content:
            sentences = process_text_for_rebel(content)
            for sentence in tqdm(sentences, desc=f"Processing {url}"):
                relations = extract_rebel_relations(sentence, tokenizer, model, device)
                all_relations.extend(relations)

    relations_df = pd.DataFrame(all_relations)
    print("\nExtracted Relations:")
    print(relations_df)

    if not relations_df.empty:
        G = build_knowledge_graph(all_relations)
        visualize_graph(G)

        print(f"\nKnowledge Graph Statistics:")
        print(f"Number of nodes: {G.number_of_nodes()}")
        print(f"Number of edges: {G.number_of_edges()}")
        return G, relations_df
    else:
        print("No relations extracted. Check your input or model configuration.")
        return None, relations_df

if __name__ == "__main__":
    G, relations_df = main()

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

You are using a model of type bart to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at Babelscape/rebel-large and are newly initialized: ['decoder.block.0.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.0.layer.0.SelfAttention.q.weight', 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'decoder.block.0.layer.0.SelfAttention.v.weight', 'decoder.block.0.layer.0.layer_norm.weight', 'decoder.block.0.layer.1.EncDecAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.o.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.0.layer.1.EncDecAttention.v.weight', 'decoder.block.0.layer.1.layer_norm.weight', 'decoder.block.0.layer.2.DenseReluDense.wi.weight', 'decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.0.layer.2.layer_norm.weight', 'decoder.block.1.layer.0.SelfAttention.k.weight', 'decoder.block.1.layer.0.SelfAttention.o.weight', 'decoder.block.1.layer.0.SelfAttentio


Extracted Relations:
Empty DataFrame
Columns: []
Index: []
No relations extracted. Check your input or model configuration.





2nd method

In [None]:
def initialize_rebel():
    """Initialize REBEL model and tokenizer with debug info"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    model_name = "Babelscape/rebel-large"
    print(f"Loading tokenizer and model from: {model_name}")

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
        print("Successfully loaded model and tokenizer")
        return device, tokenizer, model
    except Exception as e:
        print(f"Error loading model: {e}")
        raise

def extract_rebel_relations(text, tokenizer, model, device):
    """Extract relations with detailed debugging"""
    try:
        # Format input text according to REBEL requirements
        input_text = f"extract triplets: {text}"
        print(f"\nProcessing text: {input_text}")

        # Tokenize with debug info
        encoding = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True
        )
        print(f"Input token count: {len(encoding['input_ids'][0])}")

        # Move input to device
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        # Generate with more detailed parameters
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=256,
            num_beams=5,
            num_return_sequences=3,
            early_stopping=True
        )
        print(f"Generated {len(outputs)} sequences")

        # Decode outputs with debug info
        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        print("\nDecoded outputs:")
        for idx, output in enumerate(decoded_outputs):
            print(f"Output {idx + 1}: {output}")

        # Extract relations
        relations = []
        for output in decoded_outputs:
            try:
                # Split on specific markers used by REBEL
                triplets = output.split('</s>')[0].split('<s>')
                print(f"\nProcessing triplets: {triplets}")

                for triplet in triplets:
                    if triplet.strip():
                        # Split on relation markers
                        parts = triplet.split('<triplet>')
                        if len(parts) > 1:
                            rel_parts = parts[1].split('<subj>')[1].split('<obj>')
                            if len(rel_parts) > 1:
                                subject = rel_parts[0].strip()
                                predicate_obj = rel_parts[1].split('<pred>')
                                if len(predicate_obj) > 1:
                                    relation = predicate_obj[1].strip()
                                    obj = predicate_obj[0].strip()

                                    relations.append({
                                        'subject': subject,
                                        'relation': relation,
                                        'object': obj
                                    })
                                    print(f"Extracted relation: {subject} - {relation} - {obj}")

            except Exception as e:
                print(f"Error processing output: {e}")
                continue

        return relations

    except Exception as e:
        print(f"Error in relation extraction: {e}")
        return []

def main():
    # Test with a very simple, explicit example
    sample_text = "Levodopa treats Parkinson's disease. The drug causes nausea as a side effect."
    print("\nInitializing REBEL...")
    device, tokenizer, model = initialize_rebel()

    print("\nExtracting relations...")
    relations = extract_rebel_relations(sample_text, tokenizer, model, device)

    print("\nExtracted Relations:")
    if relations:
        relations_df = pd.DataFrame(relations)
        print(relations_df)
    else:
        print("No relations extracted")

    return relations

if __name__ == "__main__":
    # Enable debug mode for transformers
    import logging
    logging.getLogger("transformers").setLevel(logging.DEBUG)

    relations = main()

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Babelscape--rebel-large/snapshots/44eb6cb4585df284ce6c4d6a7013f83fe473c052/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Babelscape--rebel-large/snapshots/44eb6cb4585df284ce6c4d6a7013f83fe473c052/merges.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Babelscape--rebel-large/snapshots/44eb6cb4585df284ce6c4d6a7013f83fe473c052/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--Babelscape--rebel-large/snapshots/44eb6cb4585df284ce6c4d6a7013f83fe473c052/added_tokens.json
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--Babelscape--rebel-large/snapshots/44eb6cb4585df284ce6c4d6a7013f83fe473c052/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Babelscape--rebel-large/snapshots/44eb6cb4585df284ce6c4d6a701


Initializing REBEL...
Using device: cuda
Loading tokenizer and model from: Babelscape/rebel-large


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Babelscape--rebel-large/snapshots/44eb6cb4585df284ce6c4d6a7013f83fe473c052/config.json
You are using a model of type bart to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.
Model config T5Config {
  "_name_or_path": "model/rebel-large",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout": 0.1,
  "dropout_rate": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "en

Successfully loaded model and tokenizer

Extracting relations...

Processing text: extract triplets: Levodopa treats Parkinson's disease. The drug causes nausea as a side effect.
Input token count: 24
Generated 3 sequences

Decoded outputs:
Output 1: ysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysisysis Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professional Professi

In [None]:
!pip install transformers pandas networkx requests beautifulsoup4



3rd method

In [None]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import pandas as pd
import networkx as nx
import nltk
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch

# Download necessary NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def initialize_extractor():
    """Initialize the REBEL model with proper configuration"""
    # Correct device detection
    device = 0 if torch.cuda.is_available() else -1
    return pipeline(
        'text2text-generation',
        model='Babelscape/rebel-large',
        tokenizer='Babelscape/rebel-large',
        max_length=512,
        device=device  # Use corrected device detection
    )


def scrape_website(url):
    """Scrape and clean website content"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()

        # Extract paragraphs
        paragraphs = soup.find_all('p')
        text = ' '.join(p.get_text() for p in paragraphs)

        return text.strip()
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""

def chunk_text(text, max_length=512):
    """Break text into smaller chunks"""
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())
        if current_length + sentence_length > max_length:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def extract_triplets(text_chunk, extractor):
    """Extract triplets from a chunk of text"""
    try:
        # Prepare input text
        input_text = f"extract triplets: {text_chunk}"

        # Generate triplets
        extracted_text = extractor(
            input_text,
            max_length=512,
            num_return_sequences=1,
            do_sample=False
        )[0]['generated_text']

        # Parse triplets
        triplets = []
        for line in extracted_text.split('\n'):
            if '<triplet>' in line:
                try:
                    # Handle different possible formats
                    if '|' in line:
                        # Format: <triplet> subject | relation | object
                        parts = line.split('|')
                        if len(parts) == 3:
                            subject = parts[0].replace('<triplet>', '').strip()
                            relation = parts[1].strip()
                            object_ = parts[2].strip()
                            triplets.append((subject, relation, object_))
                    else:
                        # Alternative format with XML-like tags
                        line = line.replace('<triplet>', '').replace('</triplet>', '')
                        if '<subj>' in line and '<obj>' in line and '<pred>' in line:
                            subj_start = line.find('<subj>') + 6
                            subj_end = line.find('</subj>')
                            pred_start = line.find('<pred>') + 6
                            pred_end = line.find('</pred>')
                            obj_start = line.find('<obj>') + 5
                            obj_end = line.find('</obj>')

                            if all(x >= 0 for x in [subj_start, subj_end, pred_start, pred_end, obj_start, obj_end]):
                                subject = line[subj_start:subj_end].strip()
                                relation = line[pred_start:pred_end].strip()
                                object_ = line[obj_start:obj_end].strip()
                                triplets.append((subject, relation, object_))
                except Exception as e:
                    print(f"Error parsing triplet line: {e}")
                    continue

        return triplets
    except Exception as e:
        print(f"Error extracting triplets: {e}")
        return []

def build_knowledge_graph(triplets):
    """Build a NetworkX graph from triplets"""
    G = nx.DiGraph()
    for subject, relation, object_ in triplets:
        G.add_edge(subject, object_, relation=relation)
    return G

def visualize_graph(G):
    """Visualize the knowledge graph"""
    plt.figure(figsize=(15, 10))
    pos = nx.spring_layout(G, k=1, iterations=50)

    nx.draw_networkx_nodes(G, pos, node_size=2000, node_color='lightblue')
    nx.draw_networkx_edges(G, pos, edge_color='gray', arrows=True)
    nx.draw_networkx_labels(G, pos, font_size=8)

    edge_labels = nx.get_edge_attributes(G, 'relation')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6)

    plt.title("Parkinson's Disease Knowledge Graph")
    plt.axis('off')
    plt.tight_layout()
    plt.show()

In [None]:
def main():
    # Initialize the extractor
    extractor = initialize_extractor()

    # List of URLs
    urls = [
        "https://www.ninds.nih.gov/health-information/disorders/parkinsons-disease",
        "https://www.apdaparkinson.org/living-with-parkinsons-disease/treatment-medication/medication/",
        "https://my.clevelandclinic.org/health/treatments/parkinsons-disease-medications",
        "https://www.mayoclinic.org/diseases-conditions/parkinsons-disease/diagnosis-treatment/drc-20376062"
    ]

    all_triplets = []

    # Process each URL
    for url in tqdm(urls, desc="Processing URLs"):
        print(f"\nProcessing {url}")
        text = scrape_website(url)
        if text:
            # Break text into chunks
            chunks = chunk_text(text)
            print(f"Split into {len(chunks)} chunks")

            # Process each chunk
            for chunk in tqdm(chunks, desc="Processing chunks"):
                triplets = extract_triplets(chunk, extractor)
                all_triplets.extend(triplets)

    # Build and visualize knowledge graph
    if all_triplets:
        print("\nBuilding knowledge graph...")
        kg = build_knowledge_graph(all_triplets)

        print(f"\nKnowledge Graph Statistics:")
        print(f"Number of nodes: {kg.number_of_nodes()}")
        print(f"Number of edges: {kg.number_of_edges()}")

        # Create DataFrame
        df = pd.DataFrame(all_triplets, columns=['Subject', 'Relation', 'Object'])
        print("\nExtracted Triplets:")
        print(df)

        # Visualize
        visualize_graph(kg)

        # Save results
        #df.to_csv("parkinsons_drug_triplets.csv", index=False)
        #nx.write_gexf(kg, "parkinsons_drug_kg.gexf")

        return kg, df
    else:
        print("No triplets extracted")
        return None, None

if __name__ == "__main__":
    kg, df = main()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Babelscape--rebel-large/snapshots/44eb6cb4585df284ce6c4d6a7013f83fe473c052/config.json
Model config BartConfig {
  "_name_or_path": "Babelscape/rebel-large",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 0,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_


Processing https://www.ninds.nih.gov/health-information/disorders/parkinsons-disease
Split into 6 chunks



Processing chunks:   0%|          | 0/6 [00:00<?, ?it/s][AGenerate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 4,
  "pad_token_id": 1
}


Processing chunks:  17%|█▋        | 1/6 [00:00<00:04,  1.17it/s][AGenerate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 4,
  "pad_token_id": 1
}


Processing chunks:  33%|███▎      | 2/6 [00:01<00:02,  1.44it/s][AGenerate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 4,
  "pad_token_id": 1
}


Processing chunks:  50%|█████     | 3/6 [00:02<00:02,  1.25it/s][AGenerate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max


Processing https://www.apdaparkinson.org/living-with-parkinsons-disease/treatment-medication/medication/
Split into 1 chunks



Processing chunks:   0%|          | 0/1 [00:00<?, ?it/s][AGenerate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 4,
  "pad_token_id": 1
}


Processing chunks: 100%|██████████| 1/1 [00:00<00:00,  1.91it/s]
Processing URLs:  50%|█████     | 2/4 [00:05<00:04,  2.45s/it]


Processing https://my.clevelandclinic.org/health/treatments/parkinsons-disease-medications
Split into 3 chunks



Processing chunks:   0%|          | 0/3 [00:00<?, ?it/s][AGenerate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 4,
  "pad_token_id": 1
}


Processing chunks:  33%|███▎      | 1/3 [00:01<00:03,  1.60s/it][AGenerate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 4,
  "pad_token_id": 1
}


Processing chunks:  67%|██████▋   | 2/3 [00:02<00:01,  1.18s/it][AGenerate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 4,
  "pad_token_id": 1
}


Processing chunks: 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]
Processing URLs:  75%|███████▌  | 3/4 [00:09<00:03,  3.24s/it]


Processing https://www.mayoclinic.org/diseases-conditions/parkinsons-disease/diagnosis-treatment/drc-20376062
Split into 5 chunks



Processing chunks:   0%|          | 0/5 [00:00<?, ?it/s][AYou seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 4,
  "pad_token_id": 1
}


Processing chunks:  20%|██        | 1/5 [00:00<00:03,  1.05it/s][AGenerate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 4,
  "pad_token_id": 1
}


Processing chunks:  40%|████      | 2/5 [00:01<00:02,  1.02it/s][AGenerate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 4,
  "pad_token_id": 1
}


Processing chunks:  60%|██████    | 3/5 [00:02<00:01,  1.20it/s][AGenerate config GenerationConfig 

No triplets extracted



