In [None]:
#open files
# Notebook configuration: the values below are placeholders to be replaced
# before running.
# `mode` is passed as the sheet name when the criteria sheet of the Excel
# workbook is read.
mode = 'B'
# `path` is concatenated directly with the file names (path+filename,
# path+stwfile), so it must end with a path separator.
path = 'dataset path'
# CSV file read with pandas; its 'complaint' and 'label' columns are used.
filename ='dataset file *.csv'
# Excel workbook; its 'stopwords' sheet and the sheet named by `mode` are read.
stwfile = 'stopwords file *.xlsx'

In [None]:
import re
import numpy as np
import pandas as pd
import scipy.sparse as ss
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
%matplotlib inline

import MeCab
from pprint import pprint
from tqdm import tqdm
from konlpy.tag import Mecab 
from konlpy.tag import *

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from scipy.sparse import diags

# MeCab-ko morphological analyzer; replace the placeholder string with the
# path of the MeCab-ko dictionary on this machine.
mc = Mecab(dicpath='The path of the MeCab-ko dictionary.') # The path of the MeCab-ko dictionary.

#load stopwords
# The stop words are the values of the 'space' column of the 'stopwords' sheet.
stwDF = pd.read_excel(path+stwfile, sheet_name='stopwords')
stwlist = stwDF.space.to_list()

class MyTokenizer:
    """Callable tokenizer that keeps selected parts of speech from a POS tagger.

    Parameters
    ----------
    tagger : object
        Any object exposing ``pos(sentence, flatten=True)`` that returns
        ``(word, tag)`` pairs (e.g. a konlpy ``Mecab`` instance).
    stopwords : collection of str, optional
        Words to drop from the output. When omitted, the notebook-level
        ``stwlist`` is looked up at call time.
    postags : iterable of str, optional
        POS tags to keep. Defaults to ``DEFAULT_POSTAGS``.
    """

    # Tags kept by default.
    DEFAULT_POSTAGS = ('NNP', 'NNG', 'VV', 'VA', 'SL', 'VV+ETN')

    def __init__(self, tagger, stopwords=None, postags=None):
        self.tagger = tagger
        self.stopwords = stopwords
        self.postags = tuple(postags) if postags is not None else self.DEFAULT_POSTAGS

    def __call__(self, sent):
        """Return the tokens of ``sent`` that pass the tag, length and stop-word filters.

        A token is kept when its tag is in ``self.postags``, it is longer than
        one character, and it is not a stop word.
        """
        # Fall back to the global stop-word list when none was injected.
        stopwords = stwlist if self.stopwords is None else self.stopwords
        # Tag exactly once, with the tagger this instance was built with.
        tagged = self.tagger.pos(sent, flatten=True)
        return [
            word
            for word, tag in tagged
            if tag in self.postags and len(word) > 1 and word not in stopwords
        ]

# Reuse the analyzer `mc` configured above rather than constructing a second
# Mecab instance from a hard-coded, machine-specific dictionary path.
my_tokenizer = MyTokenizer(mc)

In [None]:
#load dataset
readfile = pd.read_csv(path + filename, encoding='utf-8')

# Tokenize every entry of the 'complaint' column and join the kept tokens
# back into one space-separated string per document.
tokenized_input = []
for complaint in readfile['complaint']:
    tokenized_input.append(' '.join(my_tokenizer(complaint)))

In [None]:
#import and embedding both Criteria name and its keywords

# Each column of the sheet selected by `mode` is one criterion: the column
# header is the criterion name, the cells below it are its keywords.
criteriaDF = pd.read_excel(path + stwfile, sheet_name=mode)
criteriaDict = criteriaDF.to_dict()
criteria_name = [name for name in criteriaDict]
print("criteria_name: ",criteria_name)

# Keep every non-float cell of each column (float cells are skipped —
# presumably to drop the NaN values of empty cells).
criteria_keywords = [
    [cell for cell in column.values() if type(cell) is not float]
    for column in criteriaDict.values()
]

# class name + keywords
input_className = []
for name, keywords in zip(criteria_name, criteria_keywords):
    # The criterion name is appended to its own keyword list in place, so
    # criteria_keywords also ends up containing the name.
    keywords.append(name)
    input_className.append(' '.join(keywords))

print("Anchor word list: ", input_className)
print(len(input_className))

In [None]:
#label encoding
# Integer id of a criterion = its position in criteria_name.
criteria_num = list(range(len(criteria_name)))
print(criteria_num)

# criterion name -> integer id
encoded_dict = dict(zip(criteria_name, criteria_num))

# Ground-truth labels, one per row of the dataset.
y_label = readfile['label'].to_list()

encoded_dict


In [None]:
#count- vectorization
# (note on min_df: it is the minimum number of documents a word must appear
#  in; words that occur in fewer documents are excluded. It is not set here.)
# analyzer='char'

vectorizer = CountVectorizer(max_features=1000, binary=False,
                             ngram_range =(1,1), tokenizer= my_tokenizer, stop_words =stwlist)

doc_word = vectorizer.fit_transform(tokenized_input)

# Make sure the document-term matrix is stored in compressed sparse row format.
doc_word = ss.csr_matrix(doc_word)

# Vocabulary terms ordered by their column index.
idx2vocab = [vocab for vocab, idx in sorted(vectorizer.vocabulary_.items(), key=lambda x:x[1])]

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    dft_words = list(vectorizer.get_feature_names_out())
else:
    dft_words = list(vectorizer.get_feature_names())

# Drop purely numeric terms: keep only the non-digit columns and keep `words`
# aligned with the remaining columns of doc_word.
not_digit_inds = [ind for ind, word in enumerate(dft_words) if not word.isdigit()]
doc_word = doc_word[:, not_digit_inds]
words = [dft_words[ind] for ind in not_digit_inds]


print("doc_word's shape: ",doc_word.shape)
print("length of idx2vocab: ",len(idx2vocab), idx2vocab[:10])
print("len of dft_words",len(dft_words))
print("len of not_digit_inds",len(not_digit_inds))
print("len of words: ",len(words))

In [None]:
print(input_className)
#print anchor words

# One token list per criterion, obtained by running the tokenizer over the
# "keywords + criterion name" string of that criterion.
awlist = [my_tokenizer(anchor_text) for anchor_text in input_className]   #criteria_name input_className
print(awlist)


In [None]:
# weight calculation
from sklearn.feature_selection import mutual_info_classif

# Mutual information between every vocabulary column of doc_word and the class
# label of each document. The target must hold one label per row of doc_word,
# so the dataset labels (y_label) are used.
mi_scores = mutual_info_classif(doc_word, y_label, discrete_features='auto')

# `words` labels the columns of the digit-filtered doc_word, so it is the name
# list that lines up with the scores.
results = dict(zip(words, mi_scores))

# Terms and scores ordered from most to least informative.
sorting = sorted(results.items(), key = lambda x : x[1], reverse = True)
aw = [term for term, _ in sorting]
miscore = [score for _, score in sorting]

# Weight of every anchor word that is part of the vocabulary; consumed when the
# per-word weight vector is built.
# NOTE(review): the weighting formula (1 + MI, so anchor words are never
# weighted below the default of 1) is an assumption — confirm the intended one.
aw_weight = {}
for anchors in awlist:
    for anchor in anchors:
        if anchor in results:
            aw_weight[anchor] = 1.0 + results[anchor]

In [None]:
# Dense copy of the document-term matrix.
matrix = doc_word.toarray()
print(matrix.shape)
#get_feature_names()

# Every vocabulary word starts with weight 1; words listed in aw_weight get
# their own weight. Requires `aw_weight` (word -> weight mapping) to be
# defined by an earlier cell.
dft_words_weight = np.ones(len(words))

for anchor_word in aw_weight:
    if anchor_word not in words:
        continue
    dft_words_weight[words.index(anchor_word)] = aw_weight[anchor_word]

print(dft_words_weight.shape)

In [None]:
# Sparse diagonal matrix with one weight per vocabulary column.
weight_diag = diags(dft_words_weight, 0)
print(weight_diag.shape)

# Scale each column of the document-term matrix by its word weight. The
# product is computed on the sparse matrices directly, which avoids building a
# dense (n_words x n_words) diagonal matrix and a dense matrix product.
doc_word_w = ss.csr_matrix(doc_word @ weight_diag)
# Drop entries that became exactly zero so only non-zero values are stored.
doc_word_w.eliminate_zeros()
doc_word_w


## CorEx Topic Model
The main parameters of the CorEx topic model are:

- n_hidden: number of topics ("hidden" as in "hidden latent topics")
- words: words that label the columns of the doc-word matrix (optional)
- docs: document labels that label the rows of the doc-word matrix (optional)
- max_iter: number of iterations to run through the update equations (optional, defaults to 200)
- verbose: if verbose=1, then CorEx will print the topic TCs with each iteration
- seed: random number seed to use for model initialization (optional)

As shown in the example, 
- clusters gives the variable clusters for each hidden factor Y_j and 
- labels gives the labels for each sample for each Y_j. 
- Probabilistic labels can be accessed with p_y_given_x.

In [None]:
# CorEx-AnchorWords run
"""
Choosing anchor strength: 
the anchor strength controls how much weight CorEx puts towards maximizing the mutual information 
between the anchor words and their respective topics. 
Anchor strength should always be set at a value greater than 1, 
since setting anchor strength between 0 and 1 only recovers the unsupervised CorEx objective. 
Empirically, setting anchor strength from 1.5-3 seems to nudge the topic model towards the anchor words. 
Setting anchor strength greater than 5 is strongly enforcing that the CorEx topic model find 
a topic associated with the anchor words.

We encourage users to experiment with the anchor strength and determine what values are best for their needs.
"""

# ct provides the Corex model used below; vt is imported but not used in this cell.
import corextopic.corextopic as ct
import corextopic.vis_topic as vt 

# One hidden topic per anchor-word list. Note: this rebinds criteria_num to an
# integer (the label-encoding cell defined it as a list of ids).
criteria_num = len(awlist) #awlist
anchor_words = awlist  #awlist
# Anchor strength passed to fit(); see the note above on choosing this value.
anchor_strength_v = 10

# Fixed seed so the model initialization is repeatable.
anchoring_model= ct.Corex(n_hidden =criteria_num, seed=37) 
# Fit on the weighted document-term matrix; `words` labels its columns and the
# n-th anchor list is attached to the n-th topic.
anchoring_model.fit(doc_word_w, 
                    words=words,
                    anchors= anchor_words, 
                    anchor_strength=anchor_strength_v);


#Overall TC : Topic Correlation and Model selection :
print(anchoring_model.tcs.shape) # k_topics
print(np.sum(anchoring_model.tcs))
print("The overall total correlation is the sum of the total correlation per each topic.")
print("TC value= ", anchoring_model.tc)
print("For an anchored CorEx topic model, the topics are not sorted, and are outputted such that the anchored topics come first.")


### OUTPUT of CorEx

- text_files/groups.txt Lists the variables in each group.

- text_files/labels.txt Gives a column for each latent factor (in layer 1) and a row for each patient/sample. The entry is the value of the latent factor (0,…dim_hidden-1)

- text_files/cont_labels.txt Gives a continuous number to sort each patient with respect to each latent factor.

- relationships For each latent factor, it shows pairwise plots between the top genes in each group. Each point corresponds to a sample/patient and the color corresponds to the learned latent factor.

In [None]:
"""
 CorEx Attributes
    ----------
    
    labels : array, [n_samples, n_hidden]
        Label for each hidden unit for each sample.

    clusters : array, [n_visible]
        Cluster label for each input variable.

    p_y_given_x : array, [n_samples, n_hidden]
        p(y_j=1|x) for each sample.

    alpha : array-like, shape [n_hidden, n_visible]
        Adjacency matrix between input variables and hidden units. In range [0,1].

    mis : array, [n_hidden, n_visible]
        Mutual information between each (visible/observed) variable and hidden unit

    tcs : array, [n_hidden]
        X_G = a group of word types 
        Y = a topic to be learned 
        CorEx maximizes TC(X_Gj;Y_j) so that the latent topics explain as much of the dependence among the words in the documents as possible
        TC(X_Gj;Y_j) for each hidden unit

    tc : float
        Convenience variable = Sum_j tcs[j]

    tc_history : array
        Shows value of TC over the course of learning. Hopefully, it is converging.

    words : list of strings
        Feature names that label the corresponding columns of X
        

"""

# Pull the fitted model's attributes (described in the text above) into
# notebook-level names for the cells that follow.
tcs = anchoring_model.tcs
mis = anchoring_model.mis 
sign = anchoring_model.sign
alpha = anchoring_model.alpha
resLabels = anchoring_model.labels
p_y_given_x = anchoring_model.p_y_given_x
# Words of every topic as returned by get_topics().
ac_topics = anchoring_model.get_topics()
# String row labels "0", "1", ... — one per sample seen by the model.
row_label = list(map(str, range(anchoring_model.n_samples)))

# Shape overview of the extracted arrays.
print("tcs's shape:" ,tcs.shape)
print("mis's shape:" ,mis.shape)
print("sign's shape:" ,sign.shape)
print("alpha's shape:" ,alpha.shape)
print("resLabels'shape:", resLabels.shape)
print("p_y_given_x's shape:" ,p_y_given_x.shape)

In [None]:
"""
 topics, list or list of lists
            Each list is a topic. If only one topic is being queried, then only a
            single, non-nested list. Each topic list contains a series of
            3-tuples for the top N words. 
            1) The first entry is either the string or the column integer index of the word, depending on `print_words`
            and whether `words` is available. 
            2) The second entry is the mutual information (MI) of the word with the topic. 
            3) The third entry is the sign of correlation of the word with the topic. If it is
            positive (1), then the word's presence is informative for the topic.
            If it is negative (-1), then word's absence is informative for the
            topic
"""
# Collect the words of every topic, print them, and tabulate them together
# with the topic's total correlation and the criterion it stands for.
getTopicWordlist = {}
for n, topic in enumerate(ac_topics):
    # Take the first field of each entry rather than unpacking zip(*topic)
    # into three names: this also works for a topic without any words and for
    # entries that carry a different number of fields.
    topic_words = tuple(entry[0] for entry in topic)
    getTopicWordlist[n] = topic_words
    print('{}: '.format(n) + ', '.join(topic_words))

twDF = pd.DataFrame({'topicTemrs': list(getTopicWordlist.values()), 'tc': tcs})

# The n-th topic was anchored with the n-th criterion's anchor words.
twDF['ct'] = criteria_name


print(encoded_dict)

twDF

In [None]:

# For every sample pick the topic with the highest p(y_j=1|x).
ns, m = p_y_given_x.shape
print(ns, m)

# argmax is taken on the raw probabilities. Comparing log-probabilities that
# were rounded to three decimals through string formatting would turn
# near-equal probabilities into ties (resolved in favour of the lowest topic
# index) and emit warnings for log(0). Exact ties still resolve to the lowest
# index.
maxPlabel = np.argmax(np.asarray(p_y_given_x), axis=1).tolist()

print("Selected label or each sample",maxPlabel[:10])
print(len(maxPlabel))

In [None]:
# Translate predicted topic indices back to criterion names using the inverse
# of encoded_dict (name -> id). Indices without a matching name are skipped.
idx_to_name = {idx: name for name, idx in encoded_dict.items()}
pred_y_str = [idx_to_name[idx] for idx in maxPlabel if idx in idx_to_name]
print(len(pred_y_str))        

In [None]:
# Tokenized complaint, predicted criterion and dataset label side by side.
reClassifyingDF = pd.DataFrame({
    'complaint': tokenized_input,
    'y_pred': pred_y_str,
    'y_ans_str': y_label,
})
reClassifyingDF

In [None]:
# Only classification_report and precision_recall_fscore_support are used in
# this cell; `metrics` and precision_recall_curve are imported but unused here.
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve

# Compare the predicted criterion names against the dataset labels.
print(classification_report(y_label, pred_y_str))
# Same comparison, micro-averaged over all classes.
print(precision_recall_fscore_support(y_label, pred_y_str,  average='micro'))
print("TC value= ", anchoring_model.tc)

In [None]:
# English display names for the criteria, 19 entries per list.
# NOTE(review): presumably criteria_name_A belongs to sheet/mode 'A' and
# criteria_name_B to sheet/mode 'B', each listed in the same order as the
# classes they label — confirm against the workbook.
criteria_name_A =['Cabinet', 
                    'Gas and fire system', 
                    'Condensation',
                    'Structural defect',
                    'Heating system',
                    'Leakage',
                    'Paperwork',
                    'Woodframe work',
                    'Masonry',
                    'Flooring',
                    'Stonework',
                    'Finish of sink',
                    'Airconditioning system',
                    'Appliance',
                    'Sanitary and plumbing',
                    'Electrical system',
                    'Doors and windows',
                    'Tiling',
                    'Communication system']

criteria_name_B=['Opening', 'Condensation', 'Stability', 'Leakage', 'Step', 'Detachement', 'Poor surface', 'Uninstallation', 
                  'Corrosion', 'Misalignment', 'Contamination', 'Disconnection', 'Out of orders', 'On/Off defect', 'Poor joint', 
                  'Scratch', 'Caulking defect', 'Crack', 'Broken']



In [None]:
import seaborn as sns # used for plot interactive graph.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
%matplotlib inline
font = {'family' : 'Malgun Gothic',
        'size'   : 12}

plt.rc('font', **font)

# plt.rc('font', family='Malgun Gothic')

conf_mat = confusion_matrix(y_label, pred_y_str)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, cmap="Blues", fmt='d', annot_kws={"size": 10},
            xticklabels=criteria_name_B, 
            yticklabels=criteria_name_B)
plt.ylabel('True Label', size=12)
plt.xlabel('Predicted Label', size=12)
plt.title("Confusion Matrix ", size=15);


In [None]:
from sklearn.preprocessing import scale
from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt

# integer id -> criterion name
y_ans_dict = dict(enumerate(criteria_name))
print(y_ans_dict)

# Encode each dataset label as the id of the criterion with the same name.
# Labels that match no criterion name are skipped.
name_to_idx = {name: idx for idx, name in y_ans_dict.items()}
ans_idx = [name_to_idx[label] for label in y_label if label in name_to_idx]

reClassifyingDF['y_ans_idx'] = ans_idx 
reClassifyingDF

In [None]:
# Column-vector views of the true and predicted ids.
ans_idx= np.array(ans_idx).reshape(-1,1)
y_pred = maxPlabel
y_pred = np.array(y_pred).reshape(-1,1)
print(type(ans_idx))

# The silhouette functions expect a 1-D label vector with one entry per
# sample; passing the (n, 1) column would trigger a DataConversionWarning.
cluster_labels = y_pred.ravel()

# Silhouette coefficient of every sample: the topic probabilities are the
# feature space and the predicted topic is the cluster assignment.
score_samples = silhouette_samples(p_y_given_x, cluster_labels)
print(score_samples.shape)
print(score_samples)

reClassifyingDF['silhoutte_coeff'] = score_samples

print('silhouette Coefficient: {:.4f}'.format(silhouette_score(p_y_given_x, cluster_labels)))
