# ✅**Cluster_to_File(완료).ipynb - 클러스터링 결과 저장** 
---
### 5만개의 Document를 K-means Clustering을 사용해 총 100개로 군집화
* 각 Document는 Cluster 별 Labeling 결과 저장 ('clustering_news.csv')
* 각 Cluster별 top_n_features 결과 저장 ('clustering_feature.csv')


> Used Method, Skills 
* Data PreProcessing
* K-means Clustering

In [None]:
# Colab-only: mount Google Drive so the project files referenced below under
# /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

import csv,re 
import pandas as pd

In [None]:
# Stop words: every whitespace-separated token of stopword.txt, kept as a list
# because the same object is later handed to TfidfVectorizer(stop_words=...).
stopword_path = '/content/drive/MyDrive/DataMining TeamProject/stopword.txt'
with open(stopword_path, 'r') as f:
    # split() with no separator already treats newlines as whitespace.
    STOP_WORDS = f.read().split()

# Data Preprocessing
def preprocessing(sentence, stop_words=None):
    """Strip non-letter characters and stop words from one document.

    Every character that is not Hangul (complete syllables or standalone
    jamo) or an ASCII letter is replaced by a space. The text is then split
    on whitespace, stop words are dropped, and the remaining words are
    re-joined with single spaces.

    Args:
        sentence: Raw document text. A missing value (``None`` or the float
            NaN that pandas uses for an empty cell) is treated as an empty
            document instead of crashing ``re.sub``.
        stop_words: Optional iterable of words to drop. Defaults to the
            module-level ``STOP_WORDS``.

    Returns:
        The cleaned text as one space-separated string.
    """
    # NaN is the only float that is not equal to itself.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ''
    if stop_words is None:
        stop_words = STOP_WORDS
    # A set makes each membership test O(1); scanning the list for every
    # word of every document is needlessly slow.
    excluded = set(stop_words)
    letters_only = re.sub(r'[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]', ' ', sentence)
    # Explicit filtering was added because stop words were otherwise not
    # being removed.
    return ' '.join(word for word in letters_only.split() if word not in excluded)

In [None]:
def _read_title_and_content(csv_path):
    """Read one EUC-KR news CSV and keep its 3rd and 4th columns as title/content."""
    frame = pd.read_csv(csv_path, encoding='euc-kr')
    frame = frame.iloc[:, [2, 3]]
    frame.columns = ['title', 'content']
    return frame


# One DataFrame per news category, each reduced to ('title', 'content').
it = _read_title_and_content('/content/drive/MyDrive/DataMining TeamProject/NewsData/IT과학_202111.csv')
politics = _read_title_and_content('/content/drive/MyDrive/DataMining TeamProject/NewsData/정치_202111.csv')
world = _read_title_and_content('/content/drive/MyDrive/DataMining TeamProject/NewsData/세계_202111.csv')
culture = _read_title_and_content('/content/drive/MyDrive/DataMining TeamProject/NewsData/생활문화_202111.csv')
society = _read_title_and_content('/content/drive/MyDrive/DataMining TeamProject/NewsData/사회_202111.csv')
economy = _read_title_and_content('/content/drive/MyDrive/DataMining TeamProject/NewsData/경제_202111.csv')

# Stack all categories into a single corpus with a fresh 0..n-1 index.
category_frames = [it, politics, world, culture, society, economy]
news = pd.concat(category_frames, ignore_index=True)

# 'content_cleand' (sic): the column name is kept as-is because the whole
# frame, including this column, is later written to clustering_news.csv.
news['content_cleand'] = news['content'].apply(preprocessing)
content = news['content_cleand'].tolist()

In [None]:
# K-means Clustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans

n_clusters = 100
# Fixed seed: KMeans initialisation is random, so without it every run
# yields different labels and the saved CSV files cannot be regenerated.
RANDOM_SEED = 42

# TF-IDF vectors of the cleaned documents; STOP_WORDS is also handed to the
# vectorizer so it filters them at tokenisation time.
vectorizer = TfidfVectorizer(stop_words=STOP_WORDS)
X = vectorizer.fit_transform(content)

# Scale every document vector to unit L2 norm (normalize's default).
# NOTE(review): TfidfVectorizer's default norm='l2' should already do this,
# which would make the call a harmless no-op - confirm before removing.
X = normalize(X)

kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_SEED).fit(X)

# Per-document cluster ids and the cluster centroids in TF-IDF space.
labels = kmeans.labels_
centers = kmeans.cluster_centers_

In [None]:
# Save every article together with its cluster label ('clustering_news.csv')
news['labels'] = labels
news_csv_path = '/content/drive/MyDrive/DataMining TeamProject/clustering_news.csv'
news.to_csv(news_csv_path, encoding='euc-kr')
# Echo the fitted estimator as a quick confirmation of the settings used.
print(kmeans)

In [None]:
# Extract the key words (10 by default) of every cluster
def get_cluster_details(cluster_model, cluster_data, feature_names,
                       cluster_num, top_n_features=10):
    """Summarise each cluster by its heaviest centroid features and member titles.

    Args:
        cluster_model: Fitted model exposing ``cluster_centers_`` as a 2-D
            array indexed as ``[cluster, feature]``.
        cluster_data: DataFrame with a 'labels' column (cluster id of each
            row) and a 'title' column.
        feature_names: Sequence mapping a feature column index to its term.
        cluster_num: Number of clusters to describe; ids run from 0 to
            ``cluster_num - 1``.
        top_n_features: How many of the highest-weighted features to keep
            per cluster.

    Returns:
        A dict keyed by cluster id. Each value is a dict with the keys
        'cluster' (the id), 'top_features' (terms, heaviest first),
        'top_featrues_value' (their centroid weights; the misspelled key is
        kept for backward compatibility) and 'filenames' (titles of the
        rows labelled with that cluster).
    """
    centers = cluster_model.cluster_centers_
    # For every centroid, feature indices ordered from largest to smallest weight.
    ranked_feature_idx = centers.argsort()[:, ::-1]

    cluster_details = {}
    # A dedicated loop name is used so the cluster_num parameter is not
    # overwritten while iterating.
    for cluster_id in range(cluster_num):
        top_idx = ranked_feature_idx[cluster_id, :top_n_features]
        member_titles = cluster_data.loc[cluster_data['labels'] == cluster_id, 'title']
        cluster_details[cluster_id] = {
            'cluster': cluster_id,
            'top_features': [feature_names[idx] for idx in top_idx],
            'top_featrues_value': centers[cluster_id, top_idx].tolist(),
            'filenames': member_titles.tolist(),
        }

    return cluster_details

# Collects one top-feature list per cluster, in the order the clusters are
# iterated; later written out to clustering_feature.csv.
clustering_feature = []


def save_cluster_details(cluster_details):
    """Append every cluster's 'top_features' list to ``clustering_feature``.

    Args:
        cluster_details: Mapping of cluster id to a detail dict that has a
            'top_features' entry.

    Returns:
        None. The module-level ``clustering_feature`` list is extended in
        place, so calling this twice appends the features twice.
    """
    # Only the detail dicts are needed, so iterate the values directly.
    clustering_feature.extend(
        detail['top_features'] for detail in cluster_details.values()
    )

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    # The replacement returns an ndarray; convert it to keep a plain list of str.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Take the cluster count from the fitted model so it cannot drift from the
# value KMeans was actually trained with.
cluster_details = get_cluster_details(cluster_model=kmeans,
                                      cluster_data=news,
                                      feature_names=feature_names,
                                      cluster_num=kmeans.n_clusters,
                                      top_n_features=20)
save_cluster_details(cluster_details)
print(clustering_feature)

In [None]:
# Save the top features of every cluster ('clustering_feature.csv')
feature_csv_path = '/content/drive/MyDrive/DataMining TeamProject/clustering_feature.csv'
with open(feature_csv_path, 'w', encoding='euc-kr', newline='') as f:
    writer = csv.writer(f)
    # Each row is the (position, feature_list) pair produced by enumerate, so
    # the file has two columns: the cluster's position and the str() form of
    # its whole feature list.
    # NOTE(review): one feature per column may have been intended - confirm
    # with whatever reads this file before changing the format.
    writer.writerows(enumerate(clustering_feature))