In [4]:
user_agent = ''
# Type your user agent in ""
# your user agent can be found here: https://www.whatismybrowser.com/detect/what-is-my-user-agent/

In [5]:
from bs4 import BeautifulSoup as bs
from collections import Counter
from konlpy.tag import Hannanum
from tqdm import tqdm
import aiohttp
import asyncio
import random
import requests
import time
import re
import matplotlib.pyplot as plt

In [9]:
import backoff

@backoff.on_exception(backoff.expo,
                      aiohttp.ClientError,
                      max_tries=8,
                      giveup=lambda e: e.status == 403)
async def fetch(session, url):
    async with session.get(url) as response:
        response.raise_for_status()
        return await response.text()


async def scrape_page(i, session):
    pg = str(i)
    try:
        url = f"https://cafe.naver.com/ArticleList.nhn?search.clubid=11262350&search.boardtype=L&search.totalCount=151&search.cafeId=11262350&search.page={pg}"
        html = await fetch(session, url)
        soup = bs(html, "lxml")
        articles = []
        parsed_datas = soup.find_all("a", {"class": "article"})
        if not parsed_datas:
            print(f"No articles found on page {pg}")
            return []
        for data in parsed_datas:
            article_text = str(data).strip().replace("\n", " ").replace("    ", " ")
            article_contents = re.findall(r'</span>(.*?)</a>', article_text)
            if article_contents:
                articles.append(article_contents[0].strip())
        await asyncio.sleep(random.uniform(1, 5))  # Polite delay between requests
        print(f"Page {pg} scraped successfully with {len(articles)} articles.")
        return articles
    except Exception as e:
        return []

async def main():
    all_articles = []
    async with aiohttp.ClientSession() as session:
        tasks = [scrape_page(i, session) for i in tqdm(range(1, 115))]
        results = await asyncio.gather(*tasks)
        for articles in results:
            all_articles.extend(articles)

    if all_articles:
        print(f"Total articles scraped: {len(all_articles)}")
        with open('text.txt', 'w', encoding='utf-8') as file:
            for article in all_articles:
                file.write(article + '\n')
    else:
        print("No articles were scraped.")

loop = asyncio.get_event_loop()
if loop.is_running():
    asyncio.ensure_future(main())
else:
    loop.run_until_complete(main())

100%|█████████████████████████████████████████████████████████████████████████████| 114/114 [00:00<00:00, 56720.13it/s]


Page 1049 scraped successfully with 8 articles.
Page 414 scraped successfully with 1 articles.
Page 836 scraped successfully with 0 articles.
Page 839 scraped successfully with 0 articles.
Page 805 scraped successfully with 0 articles.
Page 828 scraped successfully with 0 articles.
Page 802 scraped successfully with 1 articles.
Page 803 scraped successfully with 0 articles.
Page 799 scraped successfully with 1 articles.
Page 831 scraped successfully with 0 articles.
Page 804 scraped successfully with 0 articles.
Page 815 scraped successfully with 0 articles.
Page 807 scraped successfully with 0 articles.
Page 827 scraped successfully with 0 articles.
Page 814 scraped successfully with 0 articles.
Page 801 scraped successfully with 1 articles.
Page 816 scraped successfully with 0 articles.
Page 806 scraped successfully with 0 articles.
Page 819 scraped successfully with 0 articles.
Page 1121 scraped successfully with 8 articles.
Page 821 scraped successfully with 0 articles.
Page 568 sc

In [None]:
hannanum = Hannanum()
words = []

for article in articles:
    nouns = hannanum.nouns(article)
    words+=nouns

print(len(words))

In [None]:
counter = Counter(words)

In [None]:
from wordcloud import WordCloud

In [None]:
img = WordCloud(font_path = r"", # Type your font's location
    background_color="white",
    height = 1000,
    width = 1000).generate_from_frequencies(counter)
plt.imshow(img)