In [None]:
import json
import os
from collections import Counter

import numpy as np
import pandas as pd
from konlpy.tag import Okt
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:
# Build the monthly data set: read every JSON-lines file in the crawler
# output directory and stack the records into a single DataFrame.
NEWS_DIR = '../news-crawler/it/'
NEWS_COLUMNS = ['id', 'title', 'content', 'date', 'like']

json_list = os.listdir(NEWS_DIR)
# os.listdir returns entries in arbitrary order; sort so the row order (and
# therefore anything order-dependent downstream) is reproducible across runs.
json_files = sorted(file for file in json_list if file.endswith('.json'))

# Collect one frame per line and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all rows on every iteration.
frames = []
for file_name in json_files:
    # utf-8-sig strips a leading BOM if the file has one; `with` guarantees
    # the handle is closed even if a line fails to parse.
    with open(os.path.join(NEWS_DIR, file_name), "r", encoding='utf-8-sig') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank / trailing empty lines instead of crashing in json.loads.
                continue
            frames.append(pd.DataFrame(json.loads(line), columns=NEWS_COLUMNS))

if frames:
    # ignore_index gives every article a unique row label instead of repeating
    # each per-line frame's own 0..n index.
    df = pd.concat(frames, ignore_index=True)
else:
    # No input files: keep the expected columns so later cells fail clearly
    # on "no rows" rather than on a missing column.
    df = pd.DataFrame(columns=NEWS_COLUMNS)

In [None]:
# # 주차별 데이터로 생성하기
# with open("../news-crawler/it/2021-11-07-20.json", "r", encoding='utf-8-sig') as f:
#     tmp = json.load(f)
# df = pd.DataFrame(json.loads(line), columns=['id', 'title', 'content', 'date', 'like'])

In [None]:
# Okt tagger instance, reused by later cells.
# Other available Okt calls, for reference:
#   okt.analyze  - phrase analysis
#   okt.morphs   - morpheme analysis
#   okt.nouns    - noun extraction
#   okt.pos      - morpheme analysis with POS tagging
okt = Okt()

# Extract the nouns of every article body (tqdm shows progress) and keep
# them next to the article as a new 'nouns' column.
noun_list = [okt.nouns(article_body) for article_body in tqdm(df['content'])]
df['nouns'] = noun_list
print(df.head())

In [None]:
# Treat each document as its bag of nouns and join them into one
# space-separated string, which is the input format TfidfVectorizer expects.
text = [" ".join(noun) for noun in df['nouns']]

# min_df=5 ignores terms that occur in fewer than 5 documents;
# ngram_range=(1,5) builds features from 1- to 5-token sequences.
tfidf_vectorizer = TfidfVectorizer(min_df = 5, ngram_range=(1,5))
# fit_transform learns the vocabulary and vectorizes in a single pass over the
# corpus instead of tokenizing it twice with separate fit() / transform() calls.
# toarray() already returns a dense ndarray, so no extra np.array() copy is made.
vector = tfidf_vectorizer.fit_transform(text).toarray()

# Density-based clustering using cosine distance between TF-IDF vectors.
model = DBSCAN(eps=0.3,min_samples=6, metric = "cosine")
# One cluster label per document, in row order.
result = model.fit_predict(vector)
df['result'] = result

In [None]:
# Discard rows labelled -1 or 0, which this analysis treats as noise / garbage
df = df[~df['result'].isin([-1, 0])]

# Map every remaining cluster label to the list of titles of its articles
cluster_dict = {
    label: df.loc[df['result'] == label, 'title'].tolist()
    for label in df['result'].unique().tolist()
}

# Rank clusters by size so the hottest topics (most articles) come first
sorted_cluster_dict = sorted(
    cluster_dict.items(),
    key=lambda entry: len(entry[1]),
    reverse=True,
)



In [None]:
# Summary of the hottest clusters:
#   final_dict['cluster<rank>'] = {'keyword': [(noun, count), ...up to 3],
#                                  'data': [one record per article]}
final_dict = {}

# Summarize the 10 largest clusters
for idx, (cluster_num, titles) in enumerate(sorted_cluster_dict[:10]):
    # Select the cluster's rows once; ids and likes do not depend on the
    # individual title, so there is no need to re-filter df for every title.
    cluster_rows = df[df['result'] == cluster_num]
    id_list = cluster_rows['id'].tolist()
    like_list = cluster_rows['like'].tolist()

    # Collect the nouns of every title in the cluster
    keywords_list = []
    for title in titles:
        keywords_list.extend(okt.nouns(title))

    # Top 3 (keyword, count) pairs. Counter counts in a single pass and breaks
    # ties by first occurrence, so the result is stable between runs (iterating
    # a set of strings is not).
    final_keyword = Counter(keywords_list).most_common(3)

    # One record per article, carrying id, like count and title for sorting and linking
    final_data = []
    for article_id, title, like in zip(id_list, titles, like_list):
        if isinstance(article_id, dict):
            # Copy so the dict stored in df is not mutated in place.
            record = dict(article_id)
        else:
            # Scalar ids cannot take item assignment; wrap them instead.
            # NOTE(review): the 'id' key name is an assumption - confirm
            # against whatever consumes final_dict.
            record = {'id': article_id}
        record['like'] = like
        record['title'] = title
        final_data.append(record)

    final_dict[f'cluster{idx}'] = {
        'keyword': final_keyword,
        'data': final_data,
    }

In [None]:
# Bare expression: shows the assembled per-cluster summary as this cell's output.
final_dict