In [None]:
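# Background on running asyncio code inside a notebook whose event loop is already running:
# https://github.com/slackapi/python-slack-sdk/issues/429
# Uncomment to install the nest_asyncio package used below if it is missing:
# !pip install nest_asyncio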

In [None]:
from functools import reduce
from operator import add
from lxml import etree
import aiohttp
from bs4 import BeautifulSoup
import asyncio

# Base URL of the whocc.no ATC/DDD index; an ATC code is appended as the `code` query value.
ATC_DDD_URL = "https://www.whocc.no/atc_ddd_index/?code="
import nest_asyncio

# NOTE(review): presumably patches asyncio so that `loop.run_until_complete(...)` can be
# called while the notebook kernel's own event loop is running — confirm against the
# nest_asyncio documentation.
nest_asyncio.apply()

In [None]:
def _wash_a_element(a_element):
    """Flatten one anchor element into a plain dictionary.

    Parameters
    ----------
    a_element
        Object exposing a ``text`` attribute and a ``get("href")`` method
        that returns the link target as a string.

    Returns
    -------
    dict
        ``code_name`` – the text after the last ``=`` in the part of the href
        that precedes the first ``&``;
        ``href`` – the link target with its first character removed;
        ``context`` – the anchor's text.
    """
    label = a_element.text
    # Drop the leading character of the link target.
    trimmed_href = a_element.get("href")[1:]
    # Isolate the first query parameter, then keep only its value.
    first_param = a_element.get("href").partition("&")[0]
    code = first_param.rpartition("=")[2]
    return {"code_name": code, "href": trimmed_href, "context": label}


def _wash_text(atc_code, name, ddd, u, adm_r, note):
    """Extract the text of the six cells of one table row.

    Each argument is a cell element exposing ``.text``. ``name`` is also
    iterated: when it has child elements, the text of its first child is used
    instead of its own text.

    Returns
    -------
    tuple
        ``(atc_code, name, ddd, u, adm_r, note)`` as text values; any of them
        is ``None`` when the corresponding element has no text.
    """
    code_text = atc_code.text
    nested = list(name)
    # Prefer the first child of the name cell when the cell wraps its content.
    name_text = (nested[0] if nested else name).text
    return code_text, name_text, ddd.text, u.text, adm_r.text, note.text


def get_table_to_dict(parsing):
    """Convert the first ``ul table`` of a parsed page into a list of row dicts.

    Parameters
    ----------
    parsing
        Parsed document exposing ``select_one`` (a BeautifulSoup object in
        this notebook).

    Returns
    -------
    list of dict
        One dict per data row, keyed by the text of the header-row cells.
        An empty list is returned when no table is found under ``body``.

    Notes
    -----
    Non-breaking spaces and ``<sup>`` tags are stripped from the markup before
    it is re-parsed with lxml. Rows whose code and name cells are both empty
    inherit the code and name of the most recent row that had both.
    """
    markup = str(parsing.select_one('ul table'))
    for fragment in ("\xa0", "<sup>", "</sup>"):
        markup = markup.replace(fragment, "")

    table_node = etree.HTML(markup).find("body/table")
    if table_node is None:
        return []

    rows = list(table_node)
    headers = [cell.text for cell in rows[0]]

    records = []
    last_code, last_name = "", ""
    for row in rows[1:]:
        code, name, *remaining = _wash_text(*row)
        if code is None and name is None:
            # Continuation row: reuse the identifiers of the previous full row.
            code, name = last_code, last_name
        elif code is not None and name is not None:
            # Full row: remember its identifiers for later continuation rows.
            last_code, last_name = code, name
        records.append(dict(zip(headers, [code, name, *remaining])))
    return records


async def fetch_list_job(link, session, level):
    """Download one index page and extract the entries relevant to ``level``.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    session
        Object whose ``get(link)`` is an async context manager yielding a
        response with an awaitable ``text()`` (an ``aiohttp.ClientSession``
        in this notebook).
    level : int
        ``5`` parses the page's table via ``get_table_to_dict``. Any other
        value collects the ``b a`` anchors: level 1 keeps only the first one,
        other levels skip the first ``level - 1`` anchors.

    Returns
    -------
    list of dict
    """
    async with session.get(link) as response:
        soup = BeautifulSoup(await response.text(), 'lxml')
        if level == 5:
            return get_table_to_dict(soup)

        anchors = [_wash_a_element(anchor) for anchor in soup.select('b a')]
        if level == 1:
            return anchors[:level]
        return anchors[level - 1:]


async def doing_job(code_list, level):
    """Fetch the index page of every code in ``code_list`` concurrently.

    Parameters
    ----------
    code_list : iterable
        Codes appended to ``ATC_DDD_URL`` to form one URL each.
    level : int
        Passed through to ``fetch_list_job`` for every page.

    Returns
    -------
    list
        One ``fetch_list_job`` result per code, in the order of ``code_list``.
    """
    links = []
    for code in code_list:
        links.append(f"{ATC_DDD_URL}{code}")

    # A single session is shared by all requests and closed once they finish.
    async with aiohttp.ClientSession() as http:
        jobs = []
        for link in links:
            jobs.append(asyncio.create_task(fetch_list_job(link, http, level)))
        results = await asyncio.gather(*jobs)
    return results


In [None]:
# Event loop shared by the scraping cells below, which drive it with
# `loop.run_until_complete(...)`.
loop = asyncio.get_event_loop()
import string


In [None]:
%%time
# Level 1: request the page of every letter A-Z and concatenate the per-page
# results (each page contributes at most its first `b a` link) into one list.
atc_1 = reduce(
    add,
    loop.run_until_complete(doing_job(code_list=string.ascii_uppercase, level=1)),
)

In [None]:
%%time
# Level 2: request the same A-Z pages again, this time keeping every `b a`
# link after the first one, and concatenate the per-page lists.
atc_2 = reduce(
    add,
    loop.run_until_complete(doing_job(code_list=string.ascii_uppercase, level=2)),
)
# Codes that seed the level-3 requests.
atc_2_code = [entry["code_name"] for entry in atc_2]

In [None]:
%%time
# Level 3: request one page per level-2 code and concatenate the per-page lists.
# The explicit `[]` initial value keeps `reduce` from raising TypeError when
# `atc_2_code` is empty (nothing was scraped at level 2); `atc_3` is then `[]`.
atc_3 = reduce(add, loop.run_until_complete(doing_job(atc_2_code, 3)), [])
# Codes that seed the level-4 requests.
atc_3_code = [a["code_name"] for a in atc_3]

In [None]:
%%time
# Level 4: request one page per level-3 code and concatenate the per-page lists.
# The explicit `[]` initial value keeps `reduce` from raising TypeError when
# `atc_3_code` is empty; `atc_4` is then `[]`.
atc_4 = reduce(add, loop.run_until_complete(doing_job(atc_3_code, 4)), [])
# Codes that seed the level-5 requests.
atc_4_code = [a["code_name"] for a in atc_4]

In [None]:
%%time
# Level 5: request one page per level-4 code; each page yields the row dicts of
# its table. The explicit `[]` initial value keeps `reduce` from raising
# TypeError when `atc_4_code` is empty; `atc_5` is then `[]`.
atc_5 = reduce(add, loop.run_until_complete(doing_job(atc_4_code, 5)), [])

In [None]:
import pandas as pd

In [None]:
def _export_level(level, records):
    """Build a DataFrame from ``records`` and write it to Excel and CSV.

    Parameters
    ----------
    level : int
        ATC level number, used in the output file names ``ATC_L<level>``.
    records : list of dict
        Scraped rows for that level.

    Returns
    -------
    pandas.DataFrame
        The frame that was written.

    Notes
    -----
    ``encoding`` is intentionally not passed to ``to_excel``: pandas deprecated
    that keyword in 1.5 and removed it in 2.0, where passing it raises
    ``TypeError``. The CSV export keeps its explicit UTF-8 encoding.
    The ``./demo_data/excel`` and ``./demo_data/csv`` directories are expected
    to exist already; this helper does not create them.
    """
    frame = pd.DataFrame(records)
    frame.to_excel(f'./demo_data/excel/ATC_L{level}.xlsx', index=False)
    frame.to_csv(f'./demo_data/csv/ATC_L{level}.csv', index=False, encoding="UTF-8")
    return frame


# One frame per ATC level; the names are kept so later cells can use them.
atc_1_df = _export_level(1, atc_1)
atc_2_df = _export_level(2, atc_2)
atc_3_df = _export_level(3, atc_3)
atc_4_df = _export_level(4, atc_4)
atc_5_df = _export_level(5, atc_5)


