# MedlinePlus

In [6]:
import os
import requests
from bs4 import BeautifulSoup
import time

# Base URL of the MedlinePlus encyclopedia articles; the zero-padded article ID
# and ".htm" are appended to it when a page is requested.
base_url = 'https://medlineplus.gov/ency/article/'

# Directory that receives one text file per scraped article
save_dir = './data/medlineplus/'

# Create the output directory up front (no error if it already exists)
os.makedirs(save_dir, exist_ok=True)

def get_disease_info(article_id, timeout=10):
    """
    Download one MedlinePlus encyclopedia article and extract its title and
    first section.

    :param article_id: Article ID as it appears in the URL (e.g. '000001')
    :param timeout: Seconds to wait for the server before giving up
    :return: Dict with 'name' and 'definition' keys, or None when the page
             could not be fetched or carries no usable <h1> title
    """
    disease_url = f'{base_url}{article_id}.htm'

    try:
        # Without a timeout a single stalled connection would hang the whole crawl
        response = requests.get(disease_url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching disease info from {disease_url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # The title doubles as the output filename, so a page without one is
    # skipped rather than crashing with AttributeError on a missing <h1>.
    title_tag = soup.find('h1')
    disease_name = title_tag.text.strip() if title_tag is not None else ''
    if not disease_name:
        print(f"No title found at {disease_url}; skipping")
        return None
    print(f"Fetching info for: {disease_name}")  # Debugging print

    # The first <div class="section"> on the page is taken as the definition
    definition_section = soup.find('div', class_='section')
    definition = definition_section.text.strip() if definition_section else "No definition found."

    return {
        'name': disease_name,
        'definition': definition
    }

def save_disease_info(disease_info):
    """
    Write one article to '<save_dir>/<sanitised name>.txt'.

    :param disease_info: Dict with 'name' and 'definition' entries
    """
    title = disease_info['name']

    # Keep only letters, digits, spaces and underscores for the filename,
    # then drop any trailing whitespace that is left over.
    kept_chars = [ch for ch in title if ch.isalnum() or ch == ' ' or ch == '_']
    safe_stem = "".join(kept_chars).rstrip()
    filepath = os.path.join(save_dir, f"{safe_stem}.txt")

    with open(filepath, 'w', encoding='utf-8') as out:
        print(f"Name: {title}", file=out)
        print(f"Definition: {disease_info['definition']}", file=out)

    print(f"Saved: {filepath}")

def main(start_id=0, end_id=1000, delay=1):
    """
    Crawl a range of MedlinePlus article IDs and save every article that was
    retrieved successfully.

    :param start_id: First numeric article ID to request (inclusive)
    :param end_id: Numeric article ID to stop before (exclusive)
    :param delay: Seconds to pause after each request
    """
    for article_id in range(start_id, end_id):
        article_id_str = f'{article_id:06}'  # Zero-pad to match the article ID format
        disease_info = get_disease_info(article_id_str)
        if disease_info:
            save_disease_info(disease_info)
        time.sleep(delay)  # Add a delay to avoid overloading the server

# Start the crawl when this code is executed directly (not when imported)
if __name__ == "__main__":
    main()


Error fetching disease info from https://medlineplus.gov/ency/article/000000.htm: 404 Client Error: Not Found for url: https://medlineplus.gov/ency/article/000000.htm
Fetching info for: Broken bone
Saved: ./data/medlineplus/Broken bone.txt
Error fetching disease info from https://medlineplus.gov/ency/article/000002.htm: 404 Client Error: Not Found for url: https://medlineplus.gov/ency/article/000002.htm
Error fetching disease info from https://medlineplus.gov/ency/article/000003.htm: 404 Client Error: Not Found for url: https://medlineplus.gov/ency/article/000003.htm
Error fetching disease info from https://medlineplus.gov/ency/article/000004.htm: 404 Client Error: Not Found for url: https://medlineplus.gov/ency/article/000004.htm
Fetching info for: Allergic reactions
Saved: ./data/medlineplus/Allergic reactions.txt
Fetching info for: Traumatic amputation
Saved: ./data/medlineplus/Traumatic amputation.txt
Fetching info for: Breathing difficulties - first aid
Saved: ./data/medlineplus/B

# Data Uploading

## Upload a file

In [18]:
# Data uploader
import boto3

def upload_to_s3(file_name, bucket, object_name=None, aws_access_key_id=None, aws_secret_access_key=None, region_name=None):
    """
    Upload a file to an S3 bucket using put_object

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified, file_name is used
    :param aws_access_key_id: AWS access key ID
    :param aws_secret_access_key: AWS secret access key
    :param region_name: AWS region name
    :return: True if file was uploaded, False if S3 rejected the request
    :raises OSError: If file_name cannot be opened for reading
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Create an S3 client
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name=region_name
    )

    # Upload the file. A service-side rejection is reported and turned into
    # the documented False return instead of propagating to the caller.
    try:
        with open(file_name, 'rb') as file_data:
            s3_client.put_object(Bucket=bucket, Key=object_name, Body=file_data)
    except s3_client.exceptions.ClientError as e:
        print(f"Failed to upload {file_name} to {bucket}/{object_name}: {e}")
        return False

    print(f"File {file_name} uploaded to {bucket}/{object_name}")
    return True

# Example usage
import os

file_name = "./data/medlineplus/CPR  adult and child after onset of puberty.txt"  # Local file path
bucket_name = "data-chunking-us"
object_name = f"raw/{file_name.split('/')[-1]}" # S3 object name

# Read the credentials from the environment instead of embedding them in the
# notebook; os.getenv returns None for a variable that is not set.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
region_name = 'us-east-1'  # e.g., 'us-west-1'

upload_to_s3(file_name, bucket_name, object_name, aws_access_key_id, aws_secret_access_key, region_name)


File ./data/medlineplus/CPR  adult and child after onset of puberty.txt uploaded to data-chunking-us/raw/CPR  adult and child after onset of puberty.txt


True

## Upload files to S3

In [12]:
import boto3
import os
from dotenv import load_dotenv

# Read the AWS settings from environment variables so that no secrets are
# stored in the notebook. load_dotenv() is called first -- presumably to pick
# up a local .env file; confirm against the python-dotenv documentation.
load_dotenv()
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
region_name = os.getenv('REGION_NAME')

def upload_to_s3(file_name, bucket, object_name=None, aws_access_key_id=None, aws_secret_access_key=None, region_name=None):
    """
    Upload a file to an S3 bucket using put_object

    Spaces in the object name are replaced with underscores before uploading.

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified, file_name is used
    :param aws_access_key_id: AWS access key ID
    :param aws_secret_access_key: AWS secret access key
    :param region_name: AWS region name
    :return: True if file was uploaded, False if S3 rejected the request
    :raises OSError: If file_name cannot be opened for reading
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Remove spaces from the file name
    object_name = object_name.replace(' ', '_')

    # Create an S3 client
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name=region_name
    )

    # Upload the file. A service-side rejection is reported and turned into
    # the documented False return instead of propagating to the caller.
    try:
        with open(file_name, 'rb') as file_data:
            s3_client.put_object(Bucket=bucket, Key=object_name, Body=file_data)
    except s3_client.exceptions.ClientError as e:
        print(f"Failed to upload {file_name} to {bucket}/{object_name}: {e}")
        return False

    print(f"File {file_name} uploaded to {bucket}/{object_name}")
    return True

def upload_directory_to_s3(directory, bucket, aws_access_key_id, aws_secret_access_key, region_name, prefix="raw/"):
    """
    Upload all files in a directory (including subdirectories) to an S3 bucket

    Each object name is the prefix followed by the file's path relative to
    ``directory``, with spaces replaced by underscores. Keeping the relative
    path stops files that share a name in different subdirectories from
    overwriting each other in the bucket.

    :param directory: Directory containing files to upload
    :param bucket: Bucket to upload to
    :param aws_access_key_id: AWS access key ID
    :param aws_secret_access_key: AWS secret access key
    :param region_name: AWS region name
    :param prefix: Text put in front of every object name
    """
    for root, _dirs, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            # S3 object names use '/' regardless of the local path separator
            relative_path = os.path.relpath(file_path, directory).replace(os.sep, '/')
            object_name = f"{prefix}{relative_path.replace(' ', '_')}"  # S3 object name without spaces
            upload_to_s3(file_path, bucket, object_name, aws_access_key_id, aws_secret_access_key, region_name)

# Example usage: upload every file found under the scraped-data directory,
# using the credentials and region read from the environment.
directory = "./data/medlineplus/"  # Local directory path
bucket_name = "data-chunking-us"

upload_directory_to_s3(directory, bucket_name, aws_access_key_id, aws_secret_access_key, region_name)


File ./data/medlineplus/Idiopathic hypersomnia.txt uploaded to data-chunking-us/raw/Idiopathic_hypersomnia.txt
File ./data/medlineplus/Hydatidiform mole.txt uploaded to data-chunking-us/raw/Hydatidiform_mole.txt
File ./data/medlineplus/Sinusitis.txt uploaded to data-chunking-us/raw/Sinusitis.txt
File ./data/medlineplus/Peripheral neuropathy.txt uploaded to data-chunking-us/raw/Peripheral_neuropathy.txt
File ./data/medlineplus/Multiple system atrophy  cerebellar subtype.txt uploaded to data-chunking-us/raw/Multiple_system_atrophy__cerebellar_subtype.txt
File ./data/medlineplus/Autoimmune disorders.txt uploaded to data-chunking-us/raw/Autoimmune_disorders.txt
File ./data/medlineplus/Dementia.txt uploaded to data-chunking-us/raw/Dementia.txt
File ./data/medlineplus/Diabetic ketoacidosis.txt uploaded to data-chunking-us/raw/Diabetic_ketoacidosis.txt
File ./data/medlineplus/Culturenegative endocarditis.txt uploaded to data-chunking-us/raw/Culturenegative_endocarditis.txt
File ./data/medline