In [None]:
%matplotlib inline

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

import random
import pytagcloud
import webbrowser
from konlpy.tag import Okt
from collections import Counter
from IPython.display import Image

# Fetch the namu.wiki "RecentChanges" page and pull out the rows of its
# first table; each row is later scanned for a link to a document.
source_url = "https://namu.wiki/RecentChanges"

# Without a timeout, requests waits indefinitely on a stalled connection.
# raise_for_status() reports an HTTP error response here, rather than
# letting it surface below as an AttributeError on a missing <table>.
req = requests.get(source_url, timeout=10)
req.raise_for_status()
html = req.content
soup = BeautifulSoup(html, 'lxml')
contents_table = soup.find(name='table')
table_body = contents_table.find(name="tbody")
table_rows = table_body.find_all(name="tr")  # this part differs from page to page

# Build the absolute URL of the first link found in the first cell of every
# table row, ignoring links whose URL contains 'png'.
page_url_base = "https://namu.wiki"
page_urls = []
for table_row in table_rows:
    cells = table_row.find_all('td')
    if not cells:
        # A row without <td> cells has no link to follow; indexing [0]
        # here would raise IndexError.
        continue
    first_link = cells[0].find('a')
    href = first_link.get('href') if first_link is not None else None
    if not href:
        # No anchor, or an anchor without an href attribute.
        continue
    page_url = page_url_base + href
    if 'png' not in page_url:
        page_urls.append(page_url)

# Remove duplicates while keeping first-seen order, so the list of pages to
# crawl is the same from run to run for the same input (a set has no
# guaranteed order).
page_urls = list(dict.fromkeys(page_urls))


# Download every collected page and extract three text fields from its
# <article> element: the first <h1> (title), the first <ul> (category) and
# the concatenated text of all "wiki-paragraph" divs (content_text).
columns = ['title','category','content_text']

# Rows are gathered in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and calling it per row
# copied the whole frame on every iteration.
rows = []

for page_url in page_urls:
    req = requests.get(page_url, timeout=10)
    html = req.content
    soup = BeautifulSoup(html, 'lxml')
    contents_table = soup.find(name="article")
    if contents_table is None:
        # No <article> element on this page, so there is nothing to
        # extract; skip it instead of failing on None.
        continue

    # find() returns None when the tag is absent, whereas find_all(...)[0]
    # raises IndexError and never reaches the None checks below.
    title = contents_table.find('h1')
    category = contents_table.find('ul')
    content_paragraphs = contents_table.find_all(name="div", attrs={"class":"wiki-paragraph"})

    row_title = title.text.replace('\n', " ") if title is not None else ""
    row_category = category.text.replace("\n", " ") if category is not None else ""
    content_corpus_list = [
        paragraph.text.replace("\n", " ") for paragraph in content_paragraphs
    ]

    rows.append([row_title, row_category, "".join(content_corpus_list)])

df = pd.DataFrame(rows, columns=columns)

def text_cleaning(text):
    """Return ``text`` with every non-Hangul character removed.

    Only characters in the ranges ㄱ-ㅣ and 가-힣 are kept; everything
    else (including whitespace, digits, Latin letters and punctuation) is
    deleted, so the surviving characters are joined without separators.
    """
    return re.sub('[^ㄱ-ㅣ가-힣]+', '', text)
# Reduce each text column to Hangul-only characters.
for column_name in ('title', 'category', 'content_text'):
    df[column_name] = df[column_name].apply(text_cleaning)

# Concatenate every row of a column into a single corpus string per column.
title_corpus = "".join(df['title'])
category_corpus = "".join(df['category'])
content_corpus = "".join(df['content_text'])

# Load the stopword list: one entry per line, surrounding whitespace removed.
with open("korean_stopwords.txt", encoding= 'utf8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

# Extract nouns from the title corpus and count how often each one occurs.
nouns_tagger = Okt()
nouns = nouns_tagger.nouns(title_corpus)
count = Counter(nouns)

# Keep only nouns longer than one character that are not stopwords.
# Both conditions are applied in a single pass over `count`: filtering in
# two steps while iterating `count` the second time re-inserted the
# single-character nouns with a count of 0, which could then show up in
# most_common(). A set makes each stopword lookup constant-time.
stopword_set = set(stopwords)
remove_char_counter = Counter({
    word: frequency
    for word, frequency in count.items()
    if len(word) > 1 and word not in stopword_set
})

# Render the 40 most frequent remaining nouns as a word cloud image.
ranked_tags = remove_char_counter.most_common(40)
taglist = pytagcloud.make_tags(ranked_tags, maxsize=40)
pytagcloud.create_tag_image(taglist,'title_wordcloud.jpg', size=(400, 400), fontname='Nanum Gothic', rectangular=False)

# Last expression of the cell: displays the generated image in the notebook.
Image(filename = 'title_wordcloud.jpg')

