In [3]:
import pandas as pd
import glob, os

# Directory that holds the Opinosis topic files (one '*.data' file per topic).
path = '../../../datasets/uci_opinion/OpinosisDataset1.0/topics/'


def topic_name_from_path(file_path):
    """Return the topic name encoded in a review file's path.

    The directory part is removed with os.path.basename so the host OS's own
    path separator is honoured (splitting on '/' leaves the directory attached
    when glob returns backslash-separated paths on Windows). Everything from
    the first '.' of the file name onward is then dropped.
    """
    return os.path.basename(file_path).split('.')[0]


all_files = glob.glob(os.path.join(path, '*.data'))
filename_list = []
opinion_text = []

for file_ in all_files:
    # header=0 treats each file's first line as the column label rather than
    # a data row; to_string() below still renders that label, so its text is
    # kept in the document string.
    df = pd.read_table(file_, index_col=None, header=0, encoding='latin1')

    filename_list.append(topic_name_from_path(file_))
    # One string per file: the whole frame rendered as text.
    opinion_text.append(df.to_string())

# One row per topic file: its name and the concatenated review text.
document_df = pd.DataFrame({'filename': filename_list, 'opinion_text': opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,food_swissotel_chicago,...
1,service_bestwestern_hotel_sfo,...
2,transmission_toyota_camry_2007,...
3,quality_toyota_camry_2007,...
4,battery-life_amazon_kindle,...


In [4]:
# Show the raw text stored for the row labelled 0.
document_df.loc[0, 'opinion_text']

"                                                                                                                                                                                                                                                                                                                                                                 The food for our event was delicious .\n0                                                                                                                                                                                                                                                                                                                         The food in the lounge was great and very fresh, , , salads, sandwiches etc .\n1                                                                                                                                                                                                                    

In [5]:
from nltk.stem import WordNetLemmatizer
import nltk
import string

# Translation table for str.translate: every punctuation code point maps to
# None, which deletes that character.
remove_punct_dict = dict.fromkeys(map(ord, string.punctuation))
# Single lemmatizer instance shared by every LemTokens call.
lemmar = WordNetLemmatizer()

def LemTokens(tokens):
    """Return a list with the lemmatized form of each token, in input order."""
    lemmas = []
    for tok in tokens:
        lemmas.append(lemmar.lemmatize(tok))
    return lemmas

def LemNormalize(text):
    """Normalize raw text into a list of lemmatized tokens.

    The text is lower-cased, stripped of the characters listed in
    remove_punct_dict, split with nltk.word_tokenize, and finally passed
    through LemTokens.
    """
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    words = nltk.word_tokenize(without_punct)
    return LemTokens(words)



In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, tokenized and lemmatized by LemNormalize.
tfidf_vect = TfidfVectorizer(
    tokenizer=LemNormalize,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.85,
)

# Learn the vocabulary and build the document-term matrix in one pass.
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])



In [8]:
from sklearn.cluster import KMeans

# Cluster the TF-IDF vectors into five groups; random_state is fixed so the
# run is repeatable.
km_cluster = KMeans(n_clusters=5, max_iter=10000, random_state=0).fit(feature_vect)
cluster_label, cluster_centers = km_cluster.labels_, km_cluster.cluster_centers_

In [10]:
# Attach each document's cluster id as a new column and preview the result.
document_df = document_df.assign(cluster_label=cluster_label)
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,food_swissotel_chicago,...,1
1,service_bestwestern_hotel_sfo,...,1
2,transmission_toyota_camry_2007,...,4
3,quality_toyota_camry_2007,...,4
4,battery-life_amazon_kindle,...,3


In [15]:
# Documents assigned to cluster 4, ordered by file name.
document_df.loc[document_df['cluster_label'].eq(4)].sort_values('filename')

Unnamed: 0,filename,opinion_text,cluster_label
17,comfort_honda_accord_2008,...,4
5,comfort_toyota_camry_2007,...,4
8,gas_mileage_toyota_camry_2007,...,4
37,interior_honda_accord_2008,...,4
24,interior_toyota_camry_2007,...,4
35,mileage_honda_accord_2008,...,4
15,performance_honda_accord_2008,...,4
3,quality_toyota_camry_2007,...,4
11,seats_honda_accord_2008,...,4
14,sound_ipod_nano_8gb,headphone jack i got a clear case for it a...,4


In [16]:
# Re-cluster the same TF-IDF vectors with three groups instead of five.
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0).fit(feature_vect)
cluster_label, cluster_centers = km_cluster.labels_, km_cluster.cluster_centers_

# Overwrite the earlier labels with the new assignment and list rows by cluster.
document_df = document_df.assign(cluster_label=cluster_label)
document_df.sort_values('cluster_label')

Unnamed: 0,filename,opinion_text,cluster_label
11,seats_honda_accord_2008,...,0
2,transmission_toyota_camry_2007,...,0
3,quality_toyota_camry_2007,...,0
35,mileage_honda_accord_2008,...,0
5,comfort_toyota_camry_2007,...,0
37,interior_honda_accord_2008,...,0
8,gas_mileage_toyota_camry_2007,...,0
17,comfort_honda_accord_2008,...,0
24,interior_toyota_camry_2007,...,0
15,performance_honda_accord_2008,...,0


In [17]:
# Centroid matrix of the fitted model: print its shape, then its values.
cluster_centers = km_cluster.cluster_centers_
print(cluster_centers.shape, cluster_centers, sep='\n')

(3, 4611)
[[0.         0.00084138 0.         ... 0.         0.         0.        ]
 [0.0104721  0.         0.         ... 0.00735716 0.         0.        ]
 [0.         0.00099499 0.00174637 ... 0.         0.00183397 0.00144581]]


In [22]:
def get_cluster_detail(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=10):
    """Summarise each cluster by its heaviest centroid features and its files.

    Parameters
    ----------
    cluster_model : object
        Fitted model exposing ``cluster_centers_`` as a 2-D array with one
        row per cluster.
    cluster_data : DataFrame
        Must contain the columns ``cluster_label`` and ``filename``.
    feature_names : sequence
        Indexed by centroid column position to obtain a feature's name.
    clusters_num : int
        Clusters ``0 .. clusters_num - 1`` are reported.
    top_n_features : int, default 10
        Number of highest-valued centroid features to keep per cluster.

    Returns
    -------
    dict
        Maps each cluster number to a dict with the keys ``'cluster'``,
        ``'top_features'``, ``'top_features_value'`` and ``'filenames'``.
    """
    centers = cluster_model.cluster_centers_
    # argsort is ascending; reversing the columns puts the largest value first.
    ranked_columns = centers.argsort()[:, ::-1]

    def summarise(k):
        # Column positions of the k-th centroid's largest components.
        top_idx = ranked_columns[k, :top_n_features]
        members = cluster_data.loc[cluster_data['cluster_label'] == k, 'filename']
        return {
            'cluster': k,
            'top_features': [feature_names[i] for i in top_idx],
            'top_features_value': centers[k, top_idx].tolist(),
            'filenames': members.values.tolist(),
        }

    return {k: summarise(k) for k in range(clusters_num)}

In [23]:
def print_cluster_details(cluster_details, max_filenames=7):
    """Print a short report for every cluster in ``cluster_details``.

    Parameters
    ----------
    cluster_details : dict
        Maps a cluster number to a dict that has at least the keys
        ``'top_features'`` and ``'filenames'`` (the structure returned by
        get_cluster_detail).
    max_filenames : int or None, default 7
        How many file names to show per cluster; ``None`` shows all of them.

    For each cluster this prints the cluster number, its top features, the
    leading file names, and a 100-character separator line.
    """
    for cluster_num, cluster_detail in cluster_details.items():
        print('Cluster {0}'.format(cluster_num))
        print('Top Features :', cluster_detail['top_features'])
        # Label reads "Reviews file name"; the earlier spelling was a typo.
        print('Reviews 파일명 :', cluster_detail['filenames'][:max_filenames])
        print('='*100)

In [25]:
# scikit-learn 1.0 deprecated get_feature_names() and 1.2 removed it in favour
# of get_feature_names_out(), which returns an ndarray. Convert it to a plain
# list of str so the printed feature lists look the same as before, and fall
# back to the old method on releases that predate the new one.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vect.get_feature_names()

# Build and print the per-cluster summary for the current 3-cluster model.
cluster_details = get_cluster_detail(cluster_model=km_cluster, cluster_data=document_df, feature_names=feature_names, clusters_num=3, top_n_features=10)
print_cluster_details(cluster_details)

Cluster 0
Top Features : ['interior', 'seat', 'mileage', 'comfortable', 'gas', 'quality', 'gas mileage', 'transmission', 'car', 'performance']
Reviews 파일멍 : ['transmission_toyota_camry_2007', 'quality_toyota_camry_2007', 'comfort_toyota_camry_2007', 'gas_mileage_toyota_camry_2007', 'seats_honda_accord_2008', 'sound_ipod_nano_8gb', 'performance_honda_accord_2008']
Cluster 1
Top Features : ['screen', 'battery', 'keyboard', 'battery life', 'life', 'kindle', 'direction', 'voice', 'size', 'map']
Reviews 파일멍 : ['battery-life_amazon_kindle', 'keyboard_netbook_1005ha', 'fonts_amazon_kindle', 'accuracy_garmin_nuvi_255W_gps', 'features_windows7', 'speed_windows7', 'updates_garmin_nuvi_255W_gps']
Cluster 2
Top Features : ['room', 'hotel', 'service', 'staff', 'food', 'location', 'bathroom', 'clean', 'price', 'parking']
Reviews 파일멍 : ['food_swissotel_chicago', 'service_bestwestern_hotel_sfo', 'rooms_swissotel_chicago', 'rooms_bestwestern_hotel_sfo', 'staff_swissotel_chicago', 'price_holiday_inn_lon