In [193]:
import base64

import aiohttp
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm

In [194]:
# Base URL of the site; retrieve_webtoon() prepends it to each relative href.
url = "https://comic.naver.com"

# Fetch the listing page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of parsing an error page into an empty list of links.
res = requests.get(url + "/webtoon/creation?view=list", timeout=30)
res.raise_for_status()
bs = BeautifulSoup(res.content, 'html.parser')

# Collect the href of every <a> that carries both an href and a title
# attribute inside each <div class="section">.
hrefs = [
    webtoon['href']
    for section in bs.find_all('div', {'class': "section"})
    for webtoon in section.find_all('a', {'href': True, 'title': True})
]

In [200]:
async def _fetch_base64(src, session):
    """Download ``src`` through ``session`` and return its body as base64 text."""
    async with session.get(src) as response:
        payload = await response.read()
    return base64.b64encode(payload).decode()


async def retrieve_webtoon(href, session):
    """Scrape one webtoon page and download its thumbnail and episode images.

    Args:
        href: Path of the webtoon page; it is appended to the module-level
            ``url`` to build the request address.
        session: Object whose ``get()`` result is used as an async context
            manager (an ``aiohttp.ClientSession`` in this notebook).

    Returns:
        A tuple ``(title, authors, genre, thumbnail, images)``:
        ``authors`` has ``' / '`` separators replaced by ``','``,
        ``thumbnail`` is the base64 text of the first ``<img>`` on the page,
        and ``images`` is the comma-joined base64 text of every titled
        ``<img>`` inside the ``viewList`` table.

    Raises:
        AttributeError / TypeError: when an expected element is not present in
            the page, because the lookup result is used without a None check.
    """
    async with session.get(url + href) as response:
        page_html = await response.read()

    # The body is fully read above, so parsing happens after the response has
    # been released rather than while it is still held open.
    webtoon_bs = BeautifulSoup(page_html, 'html.parser')

    title = webtoon_bs.find('span', {'class': "title"}).text
    authors = webtoon_bs.find('span', {'class': "wrt_nm"}).text.strip().replace(' / ', ',')

    # The second comma-separated entry is taken as the genre (presumably the
    # first one is a format label - confirm against the live page). Fall back
    # to the only entry instead of raising IndexError, which would abort the
    # whole gather over a single oddly formatted page.
    genre_parts = webtoon_bs.find('span', {'class': "genre"}).text.split(', ')
    genre = genre_parts[1] if len(genre_parts) > 1 else genre_parts[0]

    src = webtoon_bs.find('img')['src']

    episodes_lst = webtoon_bs.find('table', {'class': "viewList"})
    images_src = [image['src'] for image in episodes_lst.find_all('img', {'title': True})]

    thumbnail = await _fetch_base64(src, session)

    # Episode images are fetched one after another, preserving page order.
    images = [await _fetch_base64(image_src, session) for image_src in images_src]

    return title, authors, genre, thumbnail, ','.join(images)

In [201]:
async with aiohttp.ClientSession() as session:
    ret = await tqdm.gather(*[retrieve_webtoon(href, session) for href in hrefs])

100%|██████████| 1808/1808 [02:26<00:00, 12.37it/s] 


In [210]:
# One row per scraped webtoon; the column order mirrors the tuple returned by
# retrieve_webtoon().
df = pd.DataFrame(
    data=ret,
    columns=['Title', 'Authors', 'Genre', 'Thumbnail', 'Images'],
)

In [211]:
# Korean genre label -> English label used in the exported table.
genre_kr2eng = {
    '로맨스': "Romance",
    '판타지': "Fantasy",
    '드라마': "Drama",
    '스포츠': "Sports",
    '개그': "Comedy",
    '스릴러': "Thriller",
    '액션': "Action",
    '일상': "Slice of Life",
    '감성': "Heartwarming",
    '무협/사극': "Historical"
}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that form is chained assignment, which warns in recent
# pandas and leaves df untouched under Copy-on-Write. Assigning also makes the
# cell safe to re-run.
df['Genre'] = df['Genre'].replace(genre_kr2eng)

In [209]:
# Write the table to disk without the index column.
output_path = 'korean_webtoons.csv'
df.to_csv(output_path, index=False)