In [1]:
import pandas as pd
import glob, os
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_colwidth', 700)

In [2]:
# FILE_PATH 디렉토리 내에 있는 모든 *.data 파일 불러오기
FILE_PATH = "./topics"
all_files = glob.glob(os.path.join(FILE_PATH, "*.data"))
all_files

['./topics\\accuracy_garmin_nuvi_255W_gps.txt.data',
 './topics\\bathroom_bestwestern_hotel_sfo.txt.data',
 './topics\\battery-life_amazon_kindle.txt.data',
 './topics\\battery-life_ipod_nano_8gb.txt.data',
 './topics\\battery-life_netbook_1005ha.txt.data',
 './topics\\buttons_amazon_kindle.txt.data',
 './topics\\comfort_honda_accord_2008.txt.data',
 './topics\\comfort_toyota_camry_2007.txt.data',
 './topics\\directions_garmin_nuvi_255W_gps.txt.data',
 './topics\\display_garmin_nuvi_255W_gps.txt.data',
 './topics\\eyesight-issues_amazon_kindle.txt.data',
 './topics\\features_windows7.txt.data',
 './topics\\fonts_amazon_kindle.txt.data',
 './topics\\food_holiday_inn_london.txt.data',
 './topics\\food_swissotel_chicago.txt.data',
 './topics\\free_bestwestern_hotel_sfo.txt.data',
 './topics\\gas_mileage_toyota_camry_2007.txt.data',
 './topics\\interior_honda_accord_2008.txt.data',
 './topics\\interior_toyota_camry_2007.txt.data',
 './topics\\keyboard_netbook_1005ha.txt.data',
 './topics\\

In [3]:
# 파일 이름
file_names = []

# 내용
contents = []

for data_file in all_files:

  temp_df = pd.read_table(data_file, index_col=None, header=0, encoding='latin1')

  # 파일명 가져오기
  file_name = data_file.split("/")[-1]
  file_name = file_name.split(".")[0]

  # 파일명 리스트와 내용 리스트에 각각 내용을 저장
  file_names.append(file_name)
  contents.append(temp_df.to_string())

doc_df = pd.DataFrame({"filename" : file_names, "content": contents})
doc_df.head()

Unnamed: 0,filename,content
0,topics\accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi..."
1,topics\bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .\n0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep .\n1 ..."
2,topics\battery-life_amazon_kindle,"After I plugged it in to my USB hub on my computer to charge the battery the charging cord design is very clever !\n0 After you have paged tru a 500, page book one, page, at, a, time to get from Chapter 2 to Chapter 15, see how excited you are about a low battery and all the time it took to get there !\n1 ..."
3,topics\battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...
4,topics\battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 .\n0 Not to mention that as of now..."


# 문서의 군집
- 수집한 문서가 어떤 특징에 의해 묶여있는지를 확인
- 주제에 의해 군집

## 전처리

In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")

# 문장 한개에 대한 토큰화 처리 로직
def my_tokenizer(text):
  doc = nlp(text)
  return [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]

## Vectorization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    tokenizer=my_tokenizer, # tfidf에게 커스텀 토크나이저를 사용하도록 설정
    ngram_range=(1, 2),
    min_df = 0.05,
    max_df = 0.95
)

feature_vector = tfidf_vectorizer.fit_transform(doc_df['content'])
feature_vector

<51x5486 sparse matrix of type '<class 'numpy.float64'>'
	with 40085 stored elements in Compressed Sparse Row format>

## KMeans 군집

In [6]:
from sklearn.cluster import KMeans

# 3개의 주제로 이루어진 데이터니까, 군집도 3개로 만듦.
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=42)
km_cluster.fit(feature_vector)

In [7]:
# cluster 레이블 확인
cluster_labels = km_cluster.labels_
cluster_labels

array([2, 1, 0, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 1, 1, 1, 2, 2, 2, 0, 1, 1,
       2, 0, 1, 2, 0, 2, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 1, 1, 0, 2, 2,
       0, 1, 1, 2, 2, 0, 2])

In [8]:
# 중요한거임. centroid 확인
cluster_centroids = km_cluster.cluster_centers_
cluster_centroids

array([[0.00195373, 0.00195373, 0.00195373, ..., 0.        , 0.00071118,
        0.        ],
       [0.00285233, 0.00285233, 0.00285233, ..., 0.0017929 , 0.00741083,
        0.00143834],
       [0.00143939, 0.00143939, 0.00143939, ..., 0.        , 0.        ,
        0.        ]])

In [9]:
doc_df['cluster_label'] = cluster_labels
doc_df.sort_values(by='cluster_label')

Unnamed: 0,filename,content,cluster_label
41,topics\size_asus_netbook_1005ha,"A few other things I'd like to point out is that you must push the micro, sized right angle end of the ac adapter until it snaps in place or the battery may not charge .\n0 The full size right shift k...",0
34,topics\screen_garmin_nuvi_255W_gps,It is easy to read and when touching the screen it works great !\n0 and zoom out buttons on the 255w to the same side of the screen which makes it a bit easier .\n1 ...,0
26,topics\performance_netbook_1005ha,"The Eee Super Hybrid Engine utility lets users overclock or underclock their Eee PC's to boost performance or provide better battery life depending on their immediate requirements .\n0 In Super Performance mode CPU, Z shows the bus speed to increase up to 169 .\n1 One...",0
49,topics\video_ipod_nano_8gb,"I bought the 8, gig Ipod Nano that has the built, in video camera .\n0 Itunes has an on, line store, where you may purchase and download music and videos which will install onto the ipod .\n1 ...",0
23,topics\navigation_amazon_kindle,"In fact, the entire navigation structure has been completely revised , I'm still getting used to it but it's a huge step forward .\n0 ...",0
19,topics\keyboard_netbook_1005ha,", I think the new keyboard rivals the great hp mini keyboards .\n0 Since the battery life difference is minimum, the only reason to upgrade would be to get the better keyboard .\n1 The keyboard is now as good as t...",0
35,topics\screen_ipod_nano_8gb,"As always, the video screen is sharp and bright .\n0 2, inch screen and a glossy, polished aluminum finish that one CNET editor described as looking like a Christmas tree ornament .\n1 ...",0
12,topics\fonts_amazon_kindle,"Being able to change the font sizes is awesome !\n0 For whatever reason, Amazon decided to make the Font on the Home Screen ...",0
44,topics\speed_windows7,"Windows 7 is quite simply faster, more stable, boots faster, goes to sleep faster, comes back from sleep faster, manages your files better and on top of that it's beautiful to look at and easy to use .\n0 , faster about 20% to 30% faster at running applications than my Vista , seriously\n1 ...",0
10,topics\eyesight-issues_amazon_kindle,"It feels as easy to read as the K1 but doesn't seem any crisper to my eyes .\n0 the white is really GREY, and to avoid considerable eye, strain I had to refresh pages every other page .\n1 The dream has always been a portable electronic device that could hold a ton of reading material, automate subscriptions and fa...",0


## 군집별 핵심 단어(핵심 Feature) 추출 하기
- 센트로이드 구하기
- 값이 큰 인덱스를 내림차순으로 정렬(????)
    - argsort
- 클러스터 별로 핵심 단어를 데이터 프레임의 칼럼으로 부터 추출

In [10]:
# cluster_model : 군집 모델
# cluster_data : 데이터 프레임
# feature_names : 단어 목록
# cluster_num : 클러스터 개수
# top_n_features : 대표 단어 몇 개를 볼 것인지 결정

def get_cluster_details(cluster_model, cluster_data, feature_names, cluster_num, top_n_features=10):

  # top N feature, 파일 이름, feature tf-idf값
  cluster_details = {}

  # 군집의 센트로이드 좌표를 얻어내기
  cluster_centroid = cluster_model.cluster_centers_

  # centroid 좌표의 제일 큰 값이 그 군집의 핵심 단어
  center_features = cluster_centroid.argsort()[:, ::-1]

  #  각 군집 별 순환을 통해 핵심 단어, feature 값(tfidf값), 각 군집에 속해있는 파일이름 얻어내기
  for c_num in range(cluster_num):

    # 군집 별 데이터를 담아낼 준비
    cluster_details[c_num] = {}
    cluster_details[c_num]['cluster'] = c_num # 클러스터 번호 담기

    # 핵심 단어 목록 담아주기 (top N개)
    top_n_feature_indexes = center_features[c_num, :top_n_features]
    top_features = [ feature_names[idx] for idx in top_n_feature_indexes ]
    cluster_details[c_num]['top_features'] = top_features

    # 핵심이 되는 단어들의 tfidf값
    top_n_feature_values = cluster_centroid[c_num, top_n_feature_indexes].tolist()
    cluster_details[c_num]['top_feature_value'] = top_n_feature_values

    # 군집에 포함된 파일명 담아주기
    file_names = cluster_data[cluster_data['cluster_label'] == c_num]['filename']
    file_names = file_names.tolist()

    cluster_details[c_num]['filenames'] = file_names

  return cluster_details

In [11]:
cluster_detail_info = get_cluster_details(
    cluster_model = km_cluster,
    cluster_data = doc_df,
    feature_names = tfidf_vectorizer.get_feature_names_out(), # get_feature_names_out : tfidf의 단어 목록
    cluster_num=3
)

pd.DataFrame(cluster_detail_info)

Unnamed: 0,0,1,2
cluster,0,1,2
top_features,"[screen, battery, keyboard, battery life, life, size, video, button, page, font]","[room, hotel, service, staff, food, location, bathroom, clean, price, parking]","[interior, seat, mileage, comfortable, gas, direction, quality, gas mileage, voice, car]"
top_feature_value,"[0.19939472038967973, 0.18974291883188898, 0.10309011098139251, 0.1004631607195522, 0.0923613680238176, 0.07893394075660798, 0.07870749582379354, 0.07124999521248998, 0.06988191917634684, 0.06348869005986651]","[0.2603903466326337, 0.1974570079009183, 0.176013604793594, 0.14907587044745343, 0.12564322921473856, 0.1234215714853121, 0.07331443779984378, 0.0708373973440595, 0.060543124031785266, 0.055595358273447394]","[0.11152874279968289, 0.09400380424424805, 0.08719266422020207, 0.06206590600448321, 0.05991324643285342, 0.05853617179843467, 0.057923978104779034, 0.05648542188466271, 0.05426773306071173, 0.05304703812739652]"
filenames,"[topics\battery-life_amazon_kindle, topics\battery-life_ipod_nano_8gb, topics\battery-life_netbook_1005ha, topics\buttons_amazon_kindle, topics\eyesight-issues_amazon_kindle, topics\fonts_amazon_kindle, topics\keyboard_netbook_1005ha, topics\navigation_amazon_kindle, topics\performance_netbook_1005ha, topics\screen_garmin_nuvi_255W_gps, topics\screen_ipod_nano_8gb, topics\screen_netbook_1005ha, topics\size_asus_netbook_1005ha, topics\speed_windows7, topics\video_ipod_nano_8gb]","[topics\bathroom_bestwestern_hotel_sfo, topics\food_holiday_inn_london, topics\food_swissotel_chicago, topics\free_bestwestern_hotel_sfo, topics\location_bestwestern_hotel_sfo, topics\location_holiday_inn_london, topics\parking_bestwestern_hotel_sfo, topics\price_holiday_inn_london, topics\rooms_bestwestern_hotel_sfo, topics\rooms_swissotel_chicago, topics\room_holiday_inn_london, topics\service_bestwestern_hotel_sfo, topics\service_holiday_inn_london, topics\service_swissotel_hotel_chicago, topics\staff_bestwestern_hotel_sfo, topics\staff_swissotel_chicago]","[topics\accuracy_garmin_nuvi_255W_gps, topics\comfort_honda_accord_2008, topics\comfort_toyota_camry_2007, topics\directions_garmin_nuvi_255W_gps, topics\display_garmin_nuvi_255W_gps, topics\features_windows7, topics\gas_mileage_toyota_camry_2007, topics\interior_honda_accord_2008, topics\interior_toyota_camry_2007, topics\mileage_honda_accord_2008, topics\performance_honda_accord_2008, topics\price_amazon_kindle, topics\quality_toyota_camry_2007, topics\satellite_garmin_nuvi_255W_gps, topics\seats_honda_accord_2008, topics\sound_ipod_nano_8gb, topics\speed_garmin_nuvi_255W_gps, topics\transmission_toyota_camry_2007, topics\updates_garmin_nuvi_255W_gps, topics\voice_garmin_nuvi_255W_gps]"


# 문서 유사도
- 코사인 유사도 활용