In [None]:
import json
import os
from collections import Counter

import numpy as np
import pandas as pd
from konlpy.tag import Okt
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm


# --- Load -------------------------------------------------------------------
# Every *.json file in the crawler directory is read line by line; each line is
# parsed as one JSON document and turned into a DataFrame restricted to the
# columns listed below.
news_dir = '../news-crawler'
record_columns = ['id', 'title', 'content', 'date', 'like']

json_list = os.listdir(news_dir)
# os.listdir() returns entries in arbitrary order. Sorting fixes the row order,
# which in turn keeps DBSCAN's cluster numbering stable between runs.
json_files = sorted(file for file in json_list if file.endswith('.json'))
data = []  # not used by any visible cell; kept so the name stays defined

frames = []
for i in json_files:
    # `with` closes the handle; the original leaked one open file per JSON file.
    with open(os.path.join(news_dir, i), "r", encoding='utf-8-sig') as json_file:
        for line in json_file:
            if not line.strip():
                # Blank (e.g. trailing) lines are not JSON documents; skip them
                # instead of letting json.loads raise.
                continue
            frames.append(pd.DataFrame(json.loads(line), columns=record_columns))

if not frames:
    raise ValueError(f"no JSON records found under {news_dir!r}")

# Concatenate once (growing a DataFrame inside the loop is quadratic) and
# renumber the rows so that index labels are unique across files/lines.
df = pd.concat(frames, ignore_index=True)

# --- Tokenise ---------------------------------------------------------------
# Okt also offers morphs() (morpheme split) and pos() (POS tagging); only noun
# extraction is used here.
okt = Okt()

# okt.nouns() returns the list of nouns found in one article body.
noun_list = [okt.nouns(content) for content in tqdm(df['content'])]
df['nouns'] = noun_list
print(df.head())

# --- Vectorise --------------------------------------------------------------
# TfidfVectorizer expects one string per document, so each noun list is joined
# back into a space-separated string.
text = [" ".join(noun) for noun in df['nouns']]

# Terms must occur in at least 5 documents; n-grams of 1 to 5 nouns are used.
tfidf_vectorizer = TfidfVectorizer(min_df = 5, ngram_range=(1,5))
# fit_transform() learns the vocabulary and encodes the corpus in one pass.
# NOTE: .toarray() materialises a dense matrix, which can be large.
vector = tfidf_vectorizer.fit_transform(text).toarray()

# --- Cluster ----------------------------------------------------------------
# DBSCAN with cosine distance; rows labelled -1 are noise (no cluster).
model = DBSCAN(eps=0.3,min_samples=6, metric = "cosine")
result = model.fit_predict(vector)
df['result'] = result

In [None]:
# Preview the first rows, now including the `nouns` and `result` columns added above.
df.head()

In [None]:
# Step 1: drop the rows DBSCAN labelled -1 (noise, i.e. not part of any cluster).
df2 = df.loc[df['result'].ne(-1)]
# Step 2: additionally drop cluster 0.
# NOTE(review): the reason cluster 0 is excluded is not visible here -- confirm.
df3 = df2.loc[df2['result'].ne(0)]
df3.head()

In [None]:
# Map each remaining cluster label to the list of article titles it contains.
# Labels are visited in order of first appearance in df3.
cluster_dict = {
    label: df3.loc[df3['result'] == label, 'title'].tolist()
    for label in df3['result'].unique().tolist()
}

In [None]:
# Rank clusters by number of titles, largest first. Despite the name, the
# result is a list of (cluster_label, titles) pairs rather than a dict.
# Sorting ascending on the negated size is stable, so equally sized clusters
# keep their cluster_dict order.
sorted_cluster_dict = sorted(
    cluster_dict.items(),
    key=lambda pair: -len(pair[1]),
)

In [None]:
def _top_keywords(titles, top_n=3):
    """Return the ``top_n`` most frequent nouns across ``titles``.

    Each title is tokenised with the notebook-level ``okt`` tagger and the
    nouns are tallied over the whole cluster. Ties are broken by first
    appearance (``Counter.most_common`` ordering), so the result no longer
    depends on set iteration order and is reproducible across kernel restarts.
    """
    noun_counts = Counter()
    for title in titles:
        noun_counts.update(okt.nouns(title))
    return [keyword for keyword, _count in noun_counts.most_common(top_n)]


final_dict = {}

# Report only the ten largest clusters; idx is the size rank (0 = largest).
for idx, (cluster_num, titles) in enumerate(sorted_cluster_dict[:10]):
    # Select the cluster's rows once. The original re-ran both filters for
    # every title and used `text` as its loop variable, overwriting the corpus
    # list of the same name built in the first cell.
    cluster_rows = df3[df3['result'] == cluster_num]
    id_list = cluster_rows['id'].tolist()
    like_list = cluster_rows['like'].tolist()

    # Build one record per article. Each `id` value is assumed to be a mapping
    # (the original assigned keys on it) -- TODO confirm against the crawler
    # output. It is copied here so the objects stored in the DataFrame are not
    # mutated, and the builtin `id` is no longer shadowed.
    final = [
        {**doc_id, 'like': like, 'title': title}
        for doc_id, title, like in zip(id_list, titles, like_list)
    ]

    final_dict[f"cluster{idx}"] = {
        'keyword': _top_keywords(titles),
        'data': final,
    }

In [None]:
# Display the assembled mapping: "cluster<rank>" -> {'keyword': [...], 'data': [...]}.
final_dict