In [None]:
# Install the third-party packages used by this notebook into the kernel's
# environment (only needs to run once per environment).
# NOTE(review): versions are unpinned — consider pinning them for reproducible runs.
%pip install feedparser pandas matplotlib tqdm konlpy wordcloud

In [None]:
import feedparser
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from konlpy.tag import Okt
from collections import Counter
from wordcloud import WordCloud
from PIL import Image

# Single shared KoNLPy Okt tagger instance; reused later via `okt.nouns(...)`
# to extract nouns from the collected feed text.
okt = Okt()

In [None]:
# Load the RSS feed specification table. Later cells read its "publisher" and
# "categories" columns and unpack each row into four values
# (publisher, title, categories, url) — presumably the column order; verify
# against dataset/feed_specs.csv.
feed_df = pd.read_csv("dataset/feed_specs.csv")
feed_df

In [None]:
# Distinct publisher names; used later to strip publisher names out of the
# noun counts.
publishers = set(feed_df["publisher"].unique())

# Distinct category labels. A feed may carry several labels joined by "|", so
# split each value and merge every piece into the set. `split("|")` returns a
# one-element list when there is no separator and handles any number of
# labels (the previous two-way unpack failed on three or more).
# Missing values are dropped first because NaN is a float and cannot be split.
categories = set()
for cate in feed_df["categories"].dropna().unique():
    categories.update(cate.split("|"))

categories

In [None]:
category = "entertainment"  # modify here!

# Feeds whose "categories" value contains the selected category.
# NOTE: the variable name `economy_feed_df` is kept for compatibility with any
# later cells; it holds the feeds for whatever `category` is set to above.
# regex=False -> plain substring match (no regex metacharacter surprises);
# na=False    -> rows with a missing category are excluded instead of producing
#                an NA mask that cannot be used for boolean indexing.
economy_feed_df = feed_df[
    feed_df["categories"].str.contains(category, regex=False, na=False)
]

# Accumulators for the headline and summary text of every fetched entry.
titles = []
descriptions = []

# Each row is unpacked positionally as (publisher, title, categories, url).
# The unused fields get underscore-prefixed names so the loop does not
# overwrite the notebook-level `categories` set (or `publisher` / `title`).
for _, (_publisher, _feed_title, _feed_categories, feed_url) in tqdm(
    economy_feed_df.iterrows(), total=len(economy_feed_df)
):
    feed = feedparser.parse(feed_url)
    for entry in feed["entries"]:
        # Not every RSS entry carries both fields; fall back to an empty
        # string so one sparse entry does not abort the whole crawl.
        titles.append(entry.get("title", ""))
        descriptions.append(entry.get("description", ""))

In [None]:
# Collapse all collected headlines and summaries into one corpus string.
title = " ".join(titles)
description = " ".join(descriptions)
target = " ".join([title, description])

# Count the nouns found by the Okt tagger, ignoring single-character tokens.
extracted_nouns = okt.nouns(target)
counts = Counter(noun for noun in extracted_nouns if len(noun) > 1)

# preprocessing
# Remove publisher names and boilerplate words so they do not dominate the
# ranking; `pop(..., None)` is a no-op when the word was never counted.
for stop_word in [*publishers, "기자"]:
    counts.pop(stop_word, None)

counts.most_common(20)

In [None]:
# Rendering inputs.
# NOTE(review): the font path is Windows-specific; on other platforms point it
# at a locally installed font that contains Hangul glyphs.
font_path = r"C:\Windows\Fonts\malgun.ttf"
# Mask image, converted to an array, that constrains where words are drawn.
mask = np.array(Image.open("img/cloud.png"))

# WordCloud settings gathered in one place. Per the wordcloud documentation,
# `width` / `height` are ignored when a mask is supplied (the mask's shape is
# used instead); they are kept here unchanged.
wc_options = dict(
    font_path=font_path,
    background_color="#07061700",
    mask=mask,
    width=2000,
    height=1600,
    max_words=300,
)
wc = WordCloud(**wc_options)
wc.generate_from_frequencies(counts)

# Draw the cloud on an explicit figure/axes pair with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 8))
ax.imshow(wc, interpolation="bilinear")
ax.axis("off")
plt.show()