In [1]:
from pathlib import Path

import pandas as pd
from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.preprocessing import strip_tags
from gensim.utils import simple_preprocess
from lbl2vec import Lbl2Vec
from sklearn.metrics import f1_score

### Data Load

In [20]:
# Directory holding the Lbl2Vec input files. The default is still the original
# absolute location, but it now lives in ONE place: point DATA_DIR at your own
# copy of the data to run the notebook on another machine.
DATA_DIR = Path("C:\\Users\\yukir\\Documents\\Monicas_workspace\\Derma_v2\\Lbl2Vec\\data_derma\\Lbl2Vec_data")

# Train / validation sentences; the first spreadsheet column becomes the index.
train_org = pd.read_excel(DATA_DIR / "★★[0512]for_train.xlsx", index_col=0)
test_org = pd.read_excel(DATA_DIR / "★★[0512]for_val.xlsx", index_col=0)

# Label definitions (class index, class name, keyword string), stored as a ';'-separated CSV.
labels = pd.read_csv(DATA_DIR / "★★(0513)labels_v4.csv", sep=';')

# Turn each class's keyword string into a list of lowercase keywords.
# str.split() without an argument splits on any run of whitespace, so doubled,
# leading or trailing blanks cannot yield empty-string "keywords" (which
# split(' ') would pass on to Lbl2Vec and count in number_of_keywords).
labels['keywords'] = labels['keywords'].apply(lambda keyword_text: keyword_text.lower().split())

# get number of keywords for each class
labels['number_of_keywords'] = labels['keywords'].apply(len)

In [23]:
# Work on copies so the frames as loaded from disk stay untouched.
ag_train, ag_test = train_org.copy(), test_org.copy()

In [21]:
# Show the label table: one row per class with its keyword list and keyword count.
labels

Unnamed: 0,class_index,class_name,keywords,number_of_keywords
0,1,Reliability,"[진료, 치료, 설명, 처방, 시술, 주사, 효과, 제거, 가격, 해주시, 압출, ...",55
1,2,Responsiveness,"[시간, 대기, 예약, 바로, 금방, 접수, 주말, 점심시간, 오전, 일찍, 당일,...",22
2,3,Assurance,"[선생님, 의사, 직원, 원장님, 간호사, 관리, 실장, 안내, 전문의, 원장, 친...",59
3,4,Empathy,"[상담, 추천, 도움, 말씀, 사항, 권유, 주의, 응대, 대답, 이야기, 이해, ...",35
4,5,Tangible,"[병원, 시설, 근처, 내부, 데스크, 편이, 건물, 위치, 주차, 분위기, 의원,...",73


In [24]:
# Preview the first rows of the training data (class id and sentence).
ag_train.head()

Unnamed: 0,class,sentence
2634,4,주변에서 추천받고
211366,3,피부과 전문의 시고 정말 잘
222105,4,게 아주 만족했습니다 상담실장님 살짝 이것저것 권하셔서
147497,3,그것도 확인해보는 게
110297,2,1개씩 처방 가능하다고 하더라고요ᅲᅲ여러 개씩 해주는


### Tokenizing

In [25]:
def tokenize(doc):
    """Turn a raw document string into a list of tokens.

    ``strip_tags`` first removes meta tags from the text. ``simple_preprocess``
    then converts the document into a list of lowercase tokens, ignoring tokens
    that are too short (fewer than 2 characters) or too long (more than 15).
    It also removes numerical values as well as punctuation characters, and is
    called with ``deacc=True``.

    doc: document text string
    returns: the tokenized document
    """
    without_tags = strip_tags(doc)
    tokens = simple_preprocess(without_tags, deacc=True, min_len=2, max_len=15)
    return tokens

In [26]:
# Tag every row with the split it came from so the two splits can be told
# apart again after they have been combined.
ag_train = ag_train.assign(data_set_type='train')
ag_test = ag_test.assign(data_set_type='test')

# Stack train and test into one corpus with a fresh 0..n-1 index.
ag_full_corpus = pd.concat([ag_train, ag_test], ignore_index=True)

In [27]:
# Preview the combined corpus; the index now restarts at 0 and data_set_type marks the split.
ag_full_corpus.head()

Unnamed: 0,class,sentence,data_set_type
0,4,주변에서 추천받고,train
1,3,피부과 전문의 시고 정말 잘,train
2,4,게 아주 만족했습니다 상담실장님 살짝 이것저것 권하셔서,train
3,3,그것도 확인해보는 게,train
4,2,1개씩 처방 가능하다고 하더라고요ᅲᅲ여러 개씩 해주는,train


In [29]:
# Tokenize each sentence and wrap it in a TaggedDocument for Lbl2Vec training.
# The tag is the row's index label as a string.
def _to_tagged_doc(row):
    """Build the TaggedDocument for one corpus row."""
    return TaggedDocument(tokenize(row['sentence']), [str(row.name)])

ag_full_corpus['tagged_docs'] = ag_full_corpus.apply(_to_tagged_doc, axis=1)

In [30]:
# Preview the corpus with the new tagged_docs column (token list, [tag]).
ag_full_corpus.head()

Unnamed: 0,class,sentence,data_set_type,tagged_docs
0,4,주변에서 추천받고,train,"([주변에서, 추천받고], [0])"
1,3,피부과 전문의 시고 정말 잘,train,"([피부과, 전문의, 시고, 정말], [1])"
2,4,게 아주 만족했습니다 상담실장님 살짝 이것저것 권하셔서,train,"([아주, 만족했습니다, 상담실장님, 살짝, 이것저것, 권하셔서], [2])"
3,3,그것도 확인해보는 게,train,"([그것도, 확인해보는], [3])"
4,2,1개씩 처방 가능하다고 하더라고요ᅲᅲ여러 개씩 해주는,train,"([개씩, 처방, 가능하다고, 하더라고요ᅲᅲ여러, 개씩, 해주는], [4])"


In [31]:
# Inspect the tagged documents of the whole corpus (train and test rows).
ag_full_corpus.tagged_docs

0                                       ([주변에서, 추천받고], [0])
1                                 ([피부과, 전문의, 시고, 정말], [1])
2                ([아주, 만족했습니다, 상담실장님, 살짝, 이것저것, 권하셔서], [2])
3                                       ([그것도, 확인해보는], [3])
4                ([개씩, 처방, 가능하다고, 하더라고요ᅲᅲ여러, 개씩, 해주는], [4])
                                ...                        
167124       ([어려워요ㅋ, 카운터, 보시는, 분이, 기계적이라, 예약하는], [167124])
167125                      ([그래도, 친절한, 느낌이었습니다], [167125])
167126                      ([정도에요, 대기도, 많지, 않고], [167126])
167127    ([희미, 원장님, 진짜, 친절하고, 실력도, 좋으시고, 너무, 좋아요, 돈만, 있...
167128    ([이분은, 뭔가, 그중에서도, 비교적, 태도가, 좋으신, 느낌, 근데, 설명은, ...
Name: tagged_docs, Length: 167129, dtype: object

In [32]:
# Store the row index as a string in a doc_key column; it is the join key used
# when the predictions are merged back onto the corpus.
doc_keys = ag_full_corpus.index.astype(str)
ag_full_corpus['doc_key'] = doc_keys

In [33]:
# Attach the class name (and keyword count) of each document's class, then drop
# the raw 'class' id and the keyword lists, which are not needed on the corpus.
# validate='many_to_one' makes pandas raise a MergeError if `labels` ever holds
# a duplicated class_index; without it such a duplicate would silently
# duplicate documents (and their doc_key) and distort the later evaluation.
ag_full_corpus = (
    ag_full_corpus
    .merge(labels, left_on='class', right_on='class_index', how='left', validate='many_to_one')
    .drop(['class', 'keywords'], axis=1)
)

In [34]:
# Preview the corpus after the label merge (class_index, class_name, number_of_keywords added).
ag_full_corpus.head()

Unnamed: 0,sentence,data_set_type,tagged_docs,doc_key,class_index,class_name,number_of_keywords
0,주변에서 추천받고,train,"([주변에서, 추천받고], [0])",0,4,Empathy,35
1,피부과 전문의 시고 정말 잘,train,"([피부과, 전문의, 시고, 정말], [1])",1,3,Assurance,59
2,게 아주 만족했습니다 상담실장님 살짝 이것저것 권하셔서,train,"([아주, 만족했습니다, 상담실장님, 살짝, 이것저것, 권하셔서], [2])",2,4,Empathy,35
3,그것도 확인해보는 게,train,"([그것도, 확인해보는], [3])",3,3,Assurance,59
4,1개씩 처방 가능하다고 하더라고요ᅲᅲ여러 개씩 해주는,train,"([개씩, 처방, 가능하다고, 하더라고요ᅲᅲ여러, 개씩, 해주는], [4])",4,2,Responsiveness,22


### Train Lbl2Vec

In [37]:
# Configure the Lbl2Vec model.
# Note: the labelled class of the data set is not actually used here; the model
# is given only the per-class keyword lists and the training documents.
train_tagged_docs = ag_full_corpus.loc[ag_full_corpus['data_set_type']=='train', 'tagged_docs']
lbl2vec_model = Lbl2Vec(
    keywords_list=list(labels['keywords']),
    tagged_documents=train_tagged_docs,
    label_names=list(labels['class_name']),
    similarity_threshold=0.30,
    min_num_docs=100,
    epochs=10,
)

In [38]:
# Train the model. According to the log it emits, this first trains the document
# and word embeddings and then the label embeddings.
lbl2vec_model.fit()

2022-05-13 00:30:43,793 - Lbl2Vec - INFO - Train document and word embeddings
2022-05-13 00:32:33,312 - Lbl2Vec - INFO - Train label embeddings


### Predict

In [39]:
# Predict document<->label similarity scores for the documents the model was fitted on.
model_docs_lbl_similarities = lbl2vec_model.predict_model_docs()

2022-05-13 01:00:33,271 - Lbl2Vec - INFO - Get document embeddings from model
2022-05-13 01:00:33,367 - Lbl2Vec - INFO - Calculate document<->label similarities


In [40]:
# This may be because there is a lot of overlap between them.
# NOTE(review): presumably refers to overlapping class keywords as the cause of
# the near-identical per-label scores shown in the output; confirm with the author.
model_docs_lbl_similarities.head()

Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,Reliability,Responsiveness,Assurance,Empathy,Tangible
0,0,Tangible,0.932575,0.932574,0.932572,0.932569,0.932571,0.932575
1,1,Responsiveness,0.740074,0.740056,0.740074,0.740058,0.740072,0.740072
2,2,Empathy,0.913169,0.913162,0.913168,0.913167,0.913169,0.913158
3,3,Reliability,0.899278,0.899278,0.899275,0.899273,0.899274,0.899278
4,4,Empathy,0.95955,0.959539,0.959548,0.959545,0.95955,0.959548


In [None]:
# This may be because there is a lot of overlap between them.
# NOTE(review): this cell repeats the preceding In [40] cell verbatim; one of the two looks redundant.
model_docs_lbl_similarities.head()

Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,Reliability,Responsiveness,Assurance,Empathy,Tangible
0,0,Tangible,0.932575,0.932574,0.932572,0.932569,0.932571,0.932575
1,1,Responsiveness,0.740074,0.740056,0.740074,0.740058,0.740072,0.740072
2,2,Empathy,0.913169,0.913162,0.913168,0.913167,0.913169,0.913158
3,3,Reliability,0.899278,0.899278,0.899275,0.899273,0.899274,0.899278
4,4,Empathy,0.95955,0.959539,0.959548,0.959545,0.95955,0.959548


### Evaluation

In [41]:
# Join the predictions with the training rows of the corpus on doc_key so the
# predicted and the true topic label sit side by side.
train_rows = ag_full_corpus[ag_full_corpus['data_set_type']=='train']
evaluation_train = model_docs_lbl_similarities.merge(train_rows, on='doc_key')

In [42]:
# Micro-averaged F1 of the predicted label against the true class name (training split).
y_true_train = evaluation_train['class_name']
y_pred_train = evaluation_train['most_similar_label']
train_f1 = f1_score(y_true_train, y_pred_train, average='micro')
print('F1 score:', train_f1)

F1 score: 0.23305385817820093


### Predict

# predict similarity scores of new test documents (they were not used during Lbl2Vec training)
# NOTE(review): the same statement is repeated in the In [43] cell that follows; one of the two looks redundant.
new_docs_lbl_similarities = lbl2vec_model.predict_new_docs(tagged_docs=ag_full_corpus['tagged_docs'][ag_full_corpus['data_set_type']=='test'])

In [43]:
# Score the held-out test documents, which were not used during Lbl2Vec training.
test_tagged_docs = ag_full_corpus.loc[ag_full_corpus['data_set_type']=='test', 'tagged_docs']
new_docs_lbl_similarities = lbl2vec_model.predict_new_docs(tagged_docs=test_tagged_docs)

2022-05-13 01:40:40,171 - Lbl2Vec - INFO - Calculate document embeddings
2022-05-13 01:40:46,172 - Lbl2Vec - INFO - Calculate document<->label similarities


In [44]:
# Preview the similarity scores predicted for the test documents.
new_docs_lbl_similarities.head()

Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,Reliability,Responsiveness,Assurance,Empathy,Tangible
0,133703,Tangible,-0.002174,-0.002176,-0.002176,-0.002177,-0.002176,-0.002174
1,133704,Reliability,0.749737,0.749737,0.749716,0.749728,0.749713,0.749714
2,133705,Empathy,0.679435,0.679422,0.679434,0.679433,0.679435,0.679411
3,133706,Tangible,0.871421,0.871405,0.871416,0.871403,0.871413,0.871421
4,133707,Responsiveness,0.826436,0.826421,0.826436,0.82642,0.826433,0.826429


In [45]:
# Join the test predictions with the test rows of the corpus on doc_key so the
# predicted and the true topic label sit side by side.
test_rows = ag_full_corpus[ag_full_corpus['data_set_type']=='test']
evaluation_test = new_docs_lbl_similarities.merge(test_rows, on='doc_key')

In [46]:
# Micro-averaged F1 of the predicted label against the true class name (test split).
y_true_test = evaluation_test['class_name']
y_pred_test = evaluation_test['most_similar_label']
test_f1 = f1_score(y_true_test, y_pred_test, average='micro')
print('F1 score:', test_f1)

F1 score: 0.22078621432417878
