In [12]:
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
# from requests.packages import urllib3

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import  WebBaseLoader
import tiktoken
from tqdm import tqdm

# Search terms for the Mayo Clinic site search. Each value is interpolated
# verbatim into the query string below, so it must already be URL-encoded
# (hence 'Periodontal%20disease').
diseases = [
    'Hypertension', 'Hyperlipidemia', 'Obesity', 'Diabetes', 'Cataract',
    'Dementia', 'Rhinitis', 'Gastritis', 'Periodontal%20disease', 'Hemorrhoids',
    'Alopecia'
    ]

# Browser-like User-Agent sent with every search request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Collect the result links from search-result pages 1..10 of every term.
# Duplicates are expected here; `urls` is a plain accumulating list.
urls = []
for disease in tqdm(diseases):
    for num in range(1, 11):
        parent_url = f'https://www.mayoclinic.org/search/search-results?q={disease}&page={num}'
        try:
            # Without a timeout a single stalled connection would block the
            # whole crawl indefinitely.
            res = requests.get(parent_url, headers=headers, timeout=10)
            # Skip error pages explicitly instead of parsing their HTML.
            res.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Best effort: report the failing search page and keep going.
            print(f'Search page {parent_url} failed: {e}')
            continue
        soup = BeautifulSoup(res.content, 'html.parser')
        # Result links are the <a class="azsearchlink"> anchors; ignore any
        # anchor that has no href attribute.
        hrefs = [a['href'] for a in soup.find_all('a', class_='azsearchlink') if 'href' in a.attrs]
        urls += hrefs

# Shared tiktoken encoder, created once and reused by token_length().
tokenizer = tiktoken.get_encoding('cl100k_base')


def token_length(text):
    """Return how many cl100k_base tokens ``text`` encodes to."""
    return len(tokenizer.encode(text))

# Text splitter: chunk_size and chunk_overlap are measured with
# token_length, i.e. in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=30,
    length_function=token_length,
)

# Drop duplicate links while keeping first-seen order. list(set(...)) would
# order the URLs differently on every kernel restart (string hashing is
# randomized per process), making all_articles and anything saved from it
# irreproducible between runs.
urls = list(dict.fromkeys(urls))

# Extra keyword arguments handed to WebBaseLoader for its HTTP requests;
# the timeout keeps one unresponsive page from stalling the loop.
requests_kwargs = {
    'timeout': 10,
}

# Load every page and split it into chunks, accumulating all chunks in one list.
all_articles = []
for url in tqdm(urls):
    try:
        docs = WebBaseLoader(url, requests_kwargs=requests_kwargs).load_and_split(text_splitter)
    except requests.exceptions.RequestException as e:
        # Best effort: report the failing page and continue with the rest.
        print(f'URL {url} 처리 중 오류 발생: {e}')
        continue
    all_articles += docs
all_articles[0]



100%|██████████| 11/11 [01:09<00:00,  6.29s/it]
100%|██████████| 1032/1032 [13:08<00:00,  1.31it/s]


Document(metadata={'source': 'https://www.mayoclinic.org/diseases-conditions/acne/doctors-departments/ddc-20368049', 'title': 'Acne - Doctors and departments - Mayo Clinic', 'language': 'en'}, page_content='Acne - Doctors and departments - Mayo Clinic\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThis content does not have an English version.This content does not have an Arabic version.')

In [13]:
# Total number of entries accumulated in all_articles.
len(all_articles)

10654

In [14]:
import json

# Reduce every entry of all_articles to a plain dict holding its text and
# metadata so the whole collection can be written out as JSON.
mayoclinic_diseases_info = [
    {'page_content': doc.page_content, 'metadata': doc.metadata}
    for doc in all_articles
]

# Write the records to disk, indented by 4 spaces.
with open('mayoclinic_diseases_info.json', 'w') as json_file:
    json.dump(mayoclinic_diseases_info, json_file, indent=4)