In [55]:
# Imports and BeautifulSoup Extraction
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# URL and headers for request
url = "https://www.ninds.nih.gov/health-information/disorders/parkinsons-disease"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

In [56]:
# Fetch page content
response = requests.get(url, headers=headers)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract titles and paragraphs
    article_content = []
    current_title = None
    for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
        if element.name in ['h1', 'h2', 'h3']:
            current_title = element.get_text(strip=True)
        elif element.name == 'p' and current_title:
            article_content.append({"title": current_title, "paragraph": element.get_text(strip=True)})

    # Print the extracted content
    print("Extracted Content:")
    for item in article_content:
        print(f"Title: {item['title']}")
        print(f"Paragraph: {item['paragraph']}")
        print("----")

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")
    article_content = []


Extracted Content:
Title: What is Parkinson’s disease?
Paragraph: Parkinson's disease is a progressive movement disorder of the nervous system. It causes nerve cells (neurons) in parts of the brain to weaken, become damaged, and die, leading to symptoms that include problems with movement, tremor, stiffness, and impaired balance. As symptoms progress, people with Parkinson’s disease (PD) may have difficulty walking, talking, or completing other simple tasks.
----
Title: Parkinson’s and the brain
Paragraph: Although many brain areas are affected in Parkinson’s disease, the most common symptoms result from the loss of neurons in an area near the base of the brain called the substantia nigra. The neurons in this area produce dopamine. Dopamine is the chemical messenger that transmits  signals in the brain to produce smooth, purposeful movement. Studies have shown that most people with PD have lost 60 to 80% or more of the dopamine-producing cells in the substantia nigra by the time sympto

In [57]:
# Filter paragraphs based on specific titles and words
target_titles = ["Treating PD"]  # titles to focus on
keywords = ["Carbidopa-Levodopa", "levodopa", "Dopamine agonists", "MAO-B inhibitors", "COMT inhibitors", "Anticholinergics", "Amantadine"]  # Add specific keywords you want to focus on

# Filter paragraphs by target titles
filtered_content = [item for item in article_content if item['title'] in target_titles]

In [58]:
filtered_paragraphs = []
for item in filtered_content:
    paragraph = item['paragraph']
    # Check if any keyword exists in the paragraph
    if any(keyword.lower() in paragraph.lower() for keyword in keywords):
        filtered_paragraphs.append(item)

# Debug: Print filtered paragraphs
for item in filtered_paragraphs:
    print(f"Title: {item['title']}")
    print(f"Paragraph: {item['paragraph']}\n")

Title: Treating PD
Paragraph: Carbidopa-Levodopa.The cornerstone of PD therapy is a medication called levodopa (also known as L-dopa). Levodopa can reduce the movement-related symptoms of PD, but it does not replace lost nerve cells or stop its progression. Nerve cells can use levodopa to make dopamine and replenish the brain's reduced supply. People cannot simply take dopamine pills because dopamine does not easily cross theblood-brain barrier, a protective lining of cells inside blood vessels that regulate the transport of oxygen, glucose, medications, and other substances in the brain. People with PD are given levodopa combined with another substance called carbidopa. When added to levodopa, carbidopa prevents the conversion of levodopa into dopamine except for in the brain. This stops or diminishes the side effects of  excess dopamine in the bloodstream, such as nausea. Carbidopa-levodopa is often very successful at reducing or eliminating the tremors and other motor symptoms of PD

In [59]:
!pip install torch



In [60]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the REBEL Model
model_name = "Babelscape/rebel-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Function to extract triplets
def extract_triplets_with_rebel(paragraph):
    inputs = tokenizer(paragraph, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=512)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Raw model output: {decoded_output}")  # Debugging line
    return decoded_output



# Function for post-processing triplets
def process_rebel_output(decoded_output):
    # Step 1: Tokenize raw output
    tokens = tokenize_output(decoded_output)
    print(f"Tokens: {tokens}")  # Debugging

    # Step 2: Group tokens into triplets
    triplets = group_into_triplets(tokens)
    print(f"Grouped Triplets: {triplets}")  # Debugging

    # Step 3: Validate and clean triplets
    cleaned_triplets = validate_triplets(triplets)
    print(f"Cleaned Triplets: {cleaned_triplets}")  # Debugging

    return cleaned_triplets


def clean_triplets(triplets):
    cleaned_triplets = []
    for subject, predicate, obj in triplets:
        # Ensure subject, predicate, and object are non-empty
        if subject and predicate and obj:
            # Ignore triplets with invalid predicates
            if predicate.lower() not in ["unknown", "undefined"]:
                cleaned_triplets.append((subject, predicate, obj))
        else:
            print(f"Ignored triplet: {subject}, {predicate}, {obj}")  # Debugging
    return cleaned_triplets



def clean_and_parse_triplets(decoded_output):
    triplets = process_rebel_output(decoded_output)  # Parse triplets
    cleaned_triplets = clean_triplets(triplets)  # Clean parsed triplets
    return cleaned_triplets


In [61]:
def remove_duplicate_triplets(triplets):
    return list(set(triplets))  # Remove duplicates


In [62]:
def tokenize_output(decoded_output):
    # Split by double spaces or other noticeable patterns
    tokens = [token.strip() for token in decoded_output.split("  ") if token.strip()]
    return tokens


In [63]:
def group_into_triplets(tokens):
    triplets = []
    for i in range(0, len(tokens) - 2, 3):  # Iterate in steps of 3
        try:
            subject = tokens[i]
            predicate = tokens[i + 1]
            obj = tokens[i + 2]
            triplets.append((subject, predicate, obj))
        except IndexError:
            print(f"Incomplete triplet at end of tokens: {tokens[i:]}")
    return triplets


In [64]:
def validate_triplets(triplets):
    valid_triplets = []
    for subject, predicate, obj in triplets:
        if predicate == "treats" and any(term in obj.lower() for term in ["levodopa", "carbidopa"]):
            # Skip invalid "treats" relationships
            print(f"Ignored invalid triplet: {subject}, {predicate}, {obj}")
            continue
        valid_triplets.append((subject, predicate, obj))
    return valid_triplets


In [65]:
def refine_treatment_relationships(triplets):
    refined_triplets = []
    for subject, predicate, obj in triplets:
        if predicate == "levodopa" and obj == "drug used for treatment":
            refined_triplets.append(("levodopa", "treats", "Parkinson's disease"))
        elif predicate == "motor symptoms of PD":
            refined_triplets.append((subject, "treats", obj))  # Standardize "motor symptoms of PD"
        else:
            refined_triplets.append((subject, predicate, obj))
    return refined_triplets


In [66]:
def standardize_predicates(triplets):
    predicate_mapping = {
        "levodopa": "treats",
        "motor symptoms of PD": "treats",
        "drug used for treatment": "treats",
        "has role": "has role"
    }
    standardized_triplets = []
    for subject, predicate, obj in triplets:
        predicate = predicate_mapping.get(predicate, predicate)  # Map predicate if applicable
        standardized_triplets.append((subject, predicate, obj))
    return standardized_triplets


In [67]:
def fix_truncated_objects(triplets):
    refined_triplets = []
    for subject, predicate, obj in triplets:
        if obj.endswith("r"):  # Handle truncated objects
            obj = obj.rstrip("r").strip()
        if obj:  # Ensure the object is not empty after cleanup
            refined_triplets.append((subject, predicate, obj))
        else:
            print(f"Ignored triplet with incomplete object: {subject}, {predicate}, {obj}")
    return refined_triplets


In [68]:
all_triples = []

for item in filtered_paragraphs:
    paragraph = item['paragraph']
    print(f"Processing paragraph: {paragraph}")

    # Step 1: Extract raw triplets
    try:
        decoded_output = extract_triplets_with_rebel(paragraph)
        print("Decoded Output:", decoded_output)
        triplets = clean_and_parse_triplets(decoded_output)
        print("Parsed Triplets Before Refinement:", triplets)
    except Exception as e:
        print(f"Error processing paragraph: {paragraph}")
        print(f"Exception: {e}")
        continue

    # Step 2: Fix truncated objects
    triplets = fix_truncated_objects(triplets)
    print("After Fixing Truncated Objects:", triplets)

    # Step 3: Refine relationships
    triplets = refine_treatment_relationships(triplets)
    print("After Refining Relationships:", triplets)

    # Step 4: Standardize predicates
    triplets = standardize_predicates(triplets)
    print("After Standardizing Predicates:", triplets)

    # Step 5: Remove redundant triplets
    triplets = remove_duplicate_triplets(triplets)
    print("After Removing Duplicates:", triplets)

    # Extend final results
    all_triples.extend(triplets)




Processing paragraph: Carbidopa-Levodopa.The cornerstone of PD therapy is a medication called levodopa (also known as L-dopa). Levodopa can reduce the movement-related symptoms of PD, but it does not replace lost nerve cells or stop its progression. Nerve cells can use levodopa to make dopamine and replenish the brain's reduced supply. People cannot simply take dopamine pills because dopamine does not easily cross theblood-brain barrier, a protective lining of cells inside blood vessels that regulate the transport of oxygen, glucose, medications, and other substances in the brain. People with PD are given levodopa combined with another substance called carbidopa. When added to levodopa, carbidopa prevents the conversion of levodopa into dopamine except for in the brain. This stops or diminishes the side effects of  excess dopamine in the bloodstream, such as nausea. Carbidopa-levodopa is often very successful at reducing or eliminating the tremors and other motor symptoms of PD during 

In [69]:
for triple in all_triples:
    print(f"Subject: {triple[0]}, Predicate: {triple[1]}, Object: {triple[2]}")

Subject: levodopa, Predicate: treats, Object: Parkinson's disease
Subject: Parkinson's disease, Predicate: medical condition treated, Object: carbidopa
Subject: carbidopa, Predicate: treats, Object: levodopa
Subject: psychosis, Predicate: hallucinations, Object: has effect
Subject: side effect, Predicate: potentially serious, Object: subclass of
Subject: apomorphine, Predicate: Dopamine agonist, Object: subject has role
Subject: ropinirole, Predicate: Dopamine agonist, Object: subject has role
Subject: pramipexole, Predicate: Dopamine agonist, Object: subject has role
Subject: rotigotine, Predicate: Dopamine agonist, Object: subject has role
Subject: rasagiline, Predicate: treats, Object: medical condition treated
Subject: motor symptoms of PD, Predicate: rasagiline, Object: drug used for treatment
Subject: tolcapone, Predicate: COMT inhibitor, Object: subject has role
Subject: opicapone, Predicate: COMT inhibitor, Object: subject has role
Subject: entacapone, Predicate: COMT inhibitor