## 이슈트래커 키워드 추출하기

In [3]:
import os
import re
from collections import Counter

from dotenv import load_dotenv

# Load variables from a .env file into the process environment, then read the
# Google Cloud project id (None when the variable is not set).
load_dotenv()
project_id = os.getenv('GOOGLE_CLOUD_PROJECT_ID')

In [4]:
# 스파크 설정
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, udf

# Build (or reuse) the Spark session for this notebook, attaching the
# BigQuery connector jar through the `spark.jars` setting.
spark = (
    SparkSession.builder
    .appName('issue-tracker')
    .config('spark.jars', '../spark-3.3-bigquery-0.32.0.jar')
    .getOrCreate()
)

23/08/12 10:59:41 WARN Utils: Your hostname, HwangonJang-MacBookPro.local resolves to a loopback address: 127.0.0.1; using 172.30.1.29 instead (on interface en0)
23/08/12 10:59:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/08/12 10:59:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [12]:
# timestamp udf 설정
from datetime import datetime, timedelta

@udf('string')
def datetime_to_string(dt):
    """Format a timestamp, shifted back nine hours, as ``YYYY-MM-DD HH:MM:SS``.

    Args:
        dt: A ``datetime`` value, or ``None`` when the column value is NULL.

    Returns:
        The formatted string, or ``None`` for a NULL input so that the row
        yields NULL instead of raising ``TypeError`` and failing the job.
    """
    if dt is None:
        return None
    # NOTE(review): the nine-hour shift presumably converts KST (UTC+9) to
    # UTC -- confirm against how the source table stores its timestamps.
    return (dt - timedelta(hours=9)).strftime('%Y-%m-%d %H:%M:%S')

@udf('string')
def datetime_to_string_minus(min):
    """Return the current time minus ``min`` minutes as ``YYYY-MM-DD HH:MM:SS``.

    The clock is read with ``datetime.now()`` (naive local time) on every
    call, so the result changes between invocations.

    Args:
        min: Number of minutes to subtract from the current time. The name
            shadows the ``min`` builtin inside this function; it is kept
            unchanged so existing callers are unaffected.

    Returns:
        The formatted timestamp string.
    """
    shifted = datetime.now() - timedelta(minutes=min)
    return shifted.strftime('%Y-%m-%d %H:%M:%S')

# Make both UDFs callable from Spark SQL under their Python names.
spark.udf.register(name="datetime_to_string", f=datetime_to_string)
spark.udf.register(name="datetime_to_string_minus", f=datetime_to_string_minus)

23/08/12 11:04:36 WARN SimpleFunctionRegistry: The function datetime_to_string replaced a previously registered function.


<function __main__.datetime_to_string_minus(min)>

In [6]:
# Load the `news.news` table of the configured project through the BigQuery
# connector and expose it to Spark SQL as the temp view "news".
df = (
    spark.read
    .format('bigquery')
    .load(f'{project_id}.news.news')
)

df.createOrReplaceTempView("news")

In [7]:
# Truncate a time down to the start of its N-minute bucket
def truncate_seconds(dt, interval_minutes):
    """Round ``dt`` down to the start of its ``interval_minutes`` bucket.

    Args:
        dt: The ``datetime`` to truncate.
        interval_minutes: Bucket width in minutes.

    Returns:
        A copy of ``dt`` whose minute is the largest multiple of
        ``interval_minutes`` not exceeding the original minute, with seconds
        and microseconds set to zero.
    """
    completed_intervals = int(dt.minute / interval_minutes)
    return dt.replace(
        minute=completed_intervals * interval_minutes,
        second=0,
        microsecond=0,
    )


interval_minutes = 10  # bucket width in minutes

In [32]:
# Collect the news written during the last day.
# Both timestamp columns are passed through the datetime_to_string UDF and
# parsed back with to_timestamp; the WHERE clause keeps rows whose converted
# article_written_at lies in [now - 60 * 24 minutes, now).
query = """
SELECT
    id,
    title,
    to_timestamp(datetime_to_string(article_written_at)) as article_written_at,
    to_timestamp(datetime_to_string(scraped_at)) as scraped_at,
    category
FROM
    news
WHERE
    to_timestamp(datetime_to_string(article_written_at)) >= to_timestamp(datetime_to_string_minus(60 * 24))
    AND to_timestamp(datetime_to_string(article_written_at)) < to_timestamp(datetime_to_string_minus(0))
"""

# Run the Spark SQL query
issue_df = spark.sql(query)

In [33]:
# Lazy execution: the query is only computed when a value is actually needed,
# which is what this show() call requests.
issue_df.show()

[Stage 10:>                                                         (0 + 1) / 1]

+----------+---------------------------------+-------------------+-------------------+--------+
|        id|                            title| article_written_at|         scraped_at|category|
+----------+---------------------------------+-------------------+-------------------+--------+
|0000613995|   [날씨] 주말 전국 가끔 비…동...|2023-08-12 10:51:01|2023-08-12 11:00:00|    과학|
|0000037844|    치매까지?... '코골이'가 유...|2023-08-12 10:25:01|2023-08-12 10:30:00|    과학|
|0000018679|      '벤츠 잡은' 렉서스 ES300...|2023-08-12 10:31:01|2023-08-12 10:40:00|    과학|
|0004924612|  "여권도 타버린 듯" 하와이 덮...|2023-08-12 10:22:12|2023-08-12 10:30:00|    사회|
|0002182713|   “결혼할 사이잖아, 돈 줘” 부...|2023-08-12 10:32:03|2023-08-12 10:40:00|    사회|
|0000754778|탯줄 달린 신생아 종이봉투에 넣...|2023-08-12 10:49:01|2023-08-12 10:50:00|    사회|
|0003781294|수상한 걸음걸이 본 베테랑 경찰...|2023-08-12 10:44:01|2023-08-12 10:50:00|    사회|
|0012027923| 여당, 이재명 '안면인식장애'에...|2023-08-12 10:36:10|2023-08-12 10:40:00|    정치|
|0002402424|   '전 정부 탓'과 '카르텔 타파...|2023-08-1

                                                                                

In [36]:
# Expose the one-day result to Spark SQL as the temp view "issue_single_day".
issue_df.createOrReplaceTempView("issue_single_day")

# Collect the '정치' (politics) category news written during the last hour
query = """
SELECT
    id,
    title,
    article_written_at,
    scraped_at,
    category
FROM
    issue_single_day
WHERE
    article_written_at >= to_timestamp(datetime_to_string_minus(60))
    AND article_written_at < to_timestamp(datetime_to_string_minus(0))
    AND category == '정치'
"""

# Run the Spark SQL query
issue_hour_df = spark.sql(query)

In [37]:
# Print the last-hour politics rows.
issue_hour_df.show()

[Stage 12:>                                                         (0 + 1) / 1]

+----------+--------------------------------+-------------------+-------------------+--------+
|        id|                           title| article_written_at|         scraped_at|category|
+----------+--------------------------------+-------------------+-------------------+--------+
|0012027923|여당, 이재명 '안면인식장애'에...|2023-08-12 10:36:10|2023-08-12 10:40:00|    정치|
|0002402424|  '전 정부 탓'과 '카르텔 타파...|2023-08-12 10:31:01|2023-08-12 10:40:00|    정치|
+----------+--------------------------------+-------------------+-------------------+--------+



                                                                                

In [41]:
# 데이터 전처리
import pandas as pd

# Bring only the columns needed for keyword extraction into pandas.
pd_issue_df = issue_df.select('title', 'category').toPandas()
pd_issue_hour_df = issue_hour_df.select('title', 'category').toPandas()

# One-day rows first, followed by three copies of the last-hour rows, with a
# fresh 0..n-1 index.
# NOTE(review): the repetition presumably gives recent headlines extra weight
# in the word counts -- confirm the intent.
issues = pd.concat([pd_issue_df, *[pd_issue_hour_df] * 3], ignore_index=True)

                                                                                

In [42]:
# Preview the first five rows of the combined frame.
issues.head(5)

Unnamed: 0,title,category
0,"""당근 3개로 피부톤을 바꿔?""...난리난 '태닝법'의 진짜 효과는?",과학
1,치매까지?... '코골이'가 유발하는 질병 3,과학
2,"'벤츠 잡은' 렉서스 ES300h, 소리 없이 강했다",과학
3,"""여권도 타버린 듯"" 하와이 덮친 산불…신혼여행 예약자들도 발동동",사회
4,"“결혼할 사이잖아, 돈 줘” 부자 행세로 7억원 받아챙긴 30대 실형",사회


In [43]:
from konlpy.tag import Okt

# Create the Okt tagger object.
# NOTE(review): the recorded run of this cell ended with
# "FileNotFoundError: JVM DLL not found" -- presumably raised while starting
# the JVM for Okt because the Java path on that machine is wrong; confirm.
okt = Okt()

# Shortest run of text enclosed in square brackets, brackets included.
_BRACKETED_TEXT = re.compile(r'\[.*?\]')


def remove_bracket_text(title):
    """Return ``title`` with every ``[...]`` segment removed.

    Only the bracketed segments are deleted; whitespace around them is left
    untouched.

    Args:
        title: The headline string to clean.

    Returns:
        The headline without its square-bracketed segments.
    """
    return _BRACKETED_TEXT.sub('', title)

# Merge every title of the '정치' (politics) category into one string and
# extract its nouns.
# .copy() detaches the filtered rows from `issues`, so the column assignment
# below writes to an independent frame instead of a slice of the original.
category_politics = issues[issues['category'] == '정치'].copy()
# Drop the bracketed newspaper name from each title.
category_politics['title'] = category_politics['title'].apply(remove_bracket_text)

all_titles = ' '.join(category_politics['title'])
tokens_const = okt.nouns(all_titles)

max_words = 20

# Noun frequencies. most_common() returns (word, count) pairs ordered from the
# highest count down; words with equal counts keep first-seen order.
const_cnt = Counter(tokens_const)
result = const_cnt.most_common(max_words)

print(result)

FileNotFoundError: [Errno 2] JVM DLL not found: /usr/local/opt/liquibase/.install4j/jre.bundle/Contents/Home/lib/libjli.dylib
