In [1]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans

In [2]:
with open('./processed_song.json', 'r', encoding='UTF8') as f:
    data = json.load(f)
df = pd.json_normalize(data)

In [4]:
content = df['tags'].tolist()

In [9]:
n_clusters = 10
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(content)

kmeans = KMeans(n_clusters=n_clusters).fit(X)

labels = kmeans.labels_
centers = kmeans.cluster_centers_
df['labels'] = labels

In [14]:
def get_cluster_details(cluster_model, cluster_data, feature_names,
                       cluster_num, top_n_features=10):
    cluster_details = {}
    center_feature_idx = cluster_model.cluster_centers_.argsort()[:,::-1]
    
    for cluster_num in range(cluster_num):
        cluster_details[cluster_num] = {}
        cluster_details[cluster_num]['cluster'] = cluster_num
        
        top_ftr_idx = center_feature_idx[cluster_num, :top_n_features]
        top_ftr = [feature_names[idx] for idx in top_ftr_idx]
        top_ftr_val = cluster_model.cluster_centers_[cluster_num, top_ftr_idx].tolist()
        
        cluster_details[cluster_num]['top_features'] = top_ftr
        cluster_details[cluster_num]['top_featrues_value'] = top_ftr_val
        filenames = cluster_data[cluster_data['labels']==cluster_num]['tags']
        filenames = filenames.values.tolist()
        cluster_details[cluster_num]['filenames'] = filenames
    
    return cluster_details
feature_names = vectorizer.get_feature_names_out()
details = get_cluster_details(cluster_model=kmeans, cluster_data=df, feature_names=feature_names, cluster_num=n_clusters)

In [16]:
for key, value in details.items():
    print(key, value['top_features'])

0 ['감성', '기분전환', '인디', '발라드', '드라이브', '휴식', '카페', '힙합', '겨울', '아이돌']
1 ['회상', '추억', '이별', '슬픔', '발라드', '새벽', '감성', '힐링', '잔잔한', '사랑']
2 ['운동', '댄스', '아이돌', '스트레스', '드라이브', '기분전환', '신나는', 'kpop', '걸그룹', '케이팝']
3 ['설렘', '사랑', '달달', '연애', '고백', '데이트', '달달한', '두근두근', '벚꽃', '카페']
4 ['시원한', '여름', '청량한', '더위', '신나는', '여행', '트로피컬', '드라이브', '기분전환', '댄스']
5 ['어쿠스틱', '인디', '감성', '카페', '가을', '잔잔한', '휴식', '기타', '사랑', 'boywithluv']
6 ['새벽', '잔잔한', '감성', '휴식', '힐링', '인디', '카페', '새벽감성', '위로', '발라드']
7 ['설렘', '사랑', '힐링', '휴식', '까페', '기분전환', '새벽', '잔잔한', '인디', '여행']
8 ['이별', '슬픔', '발라드', '감성', '새벽', '비오는날', '눈물', '사랑', '외로움', '잔잔한']
9 ['알앤비', '힙합', '감성힙합', '소울', 'rnb', '감성', '드라이브', '새벽', '그루브', '트렌디']


In [18]:
for i in range(0, 10):
    print(F"Cluster {i}: {len(df[df['labels'] == i])}")

Cluster 0: 7765
Cluster 1: 1906
Cluster 2: 1759
Cluster 3: 1776
Cluster 4: 455
Cluster 5: 867
Cluster 6: 2464
Cluster 7: 1475
Cluster 8: 1917
Cluster 9: 1587
