In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -U sentence-transformers
!pip install kiwipiepy
!pip install ipython-autotime

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [3]:
# 셀 별로 러닝타임 측정
%load_ext autotime

time: 311 µs (started: 2024-06-12 18:29:06 +00:00)


In [4]:
%cd /content/drive/MyDrive/Colab_Notebooks/elderly_disorder/experiment

/content/drive/MyDrive/Colab_Notebooks/elderly_disorder/experiment
time: 3.87 ms (started: 2024-06-12 18:29:06 +00:00)


In [5]:
import pandas as pd
import numpy as np
import torch
import re
import os
import ast
from kiwipiepy import Kiwi
from sentence_transformers import SentenceTransformer, util

# CUDA 캐시 지우기
torch.cuda.empty_cache()
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

# 각종 모델 정의
kiwi = Kiwi()  # Kiwi 초기화
word_vectors = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # 정신질환 특화 Sentence-Transformer 모델 로드

# 파일 로드
file_path = '../data/vocab_augmented.csv'
vocab_df = pd.read_csv(file_path)
# 열의 모든 데이터를 리스트로 변환
vocab_df['tokens'] = vocab_df['tokens'].apply(ast.literal_eval)

# 새로운 단어사전 파일 로드
new_vocab_path = '../data/new_vocab.csv'
new_vocab_df = pd.read_csv(new_vocab_path)

# 데이터 확인
vocab_df.head()

  from tqdm.autonotebook import tqdm, trange


True
Tesla T4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unnamed: 0,mental_disorder,tokens
0,obsessive_compulsive_disorder,"[예방, 고통, 사건, 이상, 중요, 강박, 물질, 활동, 반응, 현실, 억제, 효..."
1,PTSD(posttraumatic_stress_disorder),"[침해, 이상, 사건, 고통, 중요, 물질, 반응, 부정, 장기간, 가족, 효과, ..."
2,paranoid_personality_disorder,"[사건, 착취, 사유, 발언, 기인, 효과, 이용, 명성, 위협, 두려움, 과정, ..."
3,ADHD,"[침해, 주의, 학업, 활동, 집중력, 응답, 약속, 사항, 소지품, 과제, 정리,..."
4,bipolar_disorder,"[주의, 고통, 중요, 활동, 과민, 과대, 감소, 부주의, 자존감, 과대망상, 구..."


time: 22.5 s (started: 2024-06-12 18:29:06 +00:00)


In [6]:
'''
유사한 단어 찾기 및 단어사전에 추가
※문제점: 한자가 나오거나 너무 동떨어진 단어 추가 발생 => 한자 정규식 확인 및 유사도 일정 수치 이하 탈락시킴
'''

def is_hanja(word):
    # 한자 범위를 정규 표현식으로 정의
    hanja_range = re.compile(r'[\u4e00-\u9fff]+')
    return hanja_range.search(word) is not None

def augment_dictionary_with_similar_words(model, word_dictionary, new_vocab_tokens, top_n=3, similarity_threshold=0.7):
    augmented_dictionary = {}
    new_tokens_added = {}
    token_similar_words_added = {}  # 어떤 토큰에서 어떤 유사한 단어들이 추가되었는지 기록

    existing_tokens = set([token for sublist in word_dictionary['tokens'] for token in sublist])

    for disorder, tokens in zip(word_dictionary["mental_disorder"], word_dictionary["tokens"]):
        augmented_tokens = tokens.copy()
        new_tokens_for_disorder = []
        token_similar_words = {}  # 현재 질환에 대해 토큰별 추가된 유사한 단어들

        target_tags = ['NNG', 'NNP', 'NR', 'NP', 'VCN', 'MAG', 'XPN']

        for token in tokens:
            try:
                token_embedding = model.encode(token, convert_to_tensor=True)
                # 새로운 단어에 대해 임베딩 생성
                new_vocab_embeddings = model.encode(new_vocab_tokens, convert_to_tensor=True)
                cos_sim = util.pytorch_cos_sim(token_embedding, new_vocab_embeddings)
                similar_words_idx = torch.topk(cos_sim, top_n, largest=True).indices[0]
                similar_words = [new_vocab_tokens[idx] for idx in similar_words_idx]

                similar_words_added = []

                for idx, word in zip(similar_words_idx, similar_words):
                    if cos_sim[0][idx].item() >= similarity_threshold and not is_hanja(word) and word not in existing_tokens:
                        analyzed = kiwi.analyze(word)[0][0]  # 첫 번째 분석 결과
                        word_tags = [morph.tag for morph in analyzed]  # 형태소 태그 추출

                        # target_tags에 포함된 태그가 있는지 확인
                        if any(tag in target_tags for tag in word_tags) and word not in augmented_tokens:
                            augmented_tokens.append(word)
                            new_tokens_for_disorder.append(word)
                            similar_words_added.append(word)
                            existing_tokens.add(word)

                if similar_words_added:
                    token_similar_words[token] = similar_words_added

            except KeyError:
                continue

        augmented_dictionary[disorder] = augmented_tokens
        new_tokens_added[disorder] = new_tokens_for_disorder
        token_similar_words_added[disorder] = token_similar_words

    return augmented_dictionary, new_tokens_added, token_similar_words_added

# 새로운 단어사전에서 모든 토큰 추출
new_vocab_tokens = new_vocab_df['Word'].tolist()

############## 파라미터 수정
augmented_filtered_token_disorder, new_tokens_added, token_similar_words_added = augment_dictionary_with_similar_words(word_vectors, vocab_df, new_vocab_tokens, top_n=5, similarity_threshold=0.65)

# 어떤 토큰에서 어떤 유사한 단어들이 추가되었는지 출력
for disorder, tokens_info in token_similar_words_added.items():
    print(f"Disorder: {disorder}")
    for token, similar_words in tokens_info.items():
        print(f"  Token '{token}' added similar words: {similar_words}")

print()

Disorder: obsessive_compulsive_disorder
  Token '예방' added similar words: ['이제서야', '끊임없이', '상관없이', '슈퍼마켓', '생계유지']
  Token '고통' added similar words: ['공통', '공로', '공모', '공고']
  Token '사건' added similar words: ['사전', '선자', '선사', '선고']
  Token '이상' added similar words: ['이장', '인상', '이랑', '이성']
  Token '중요' added similar words: ['주역', '주원', '주연', '중역']
  Token '강박' added similar words: ['각방', '각각', '가방', '극장']
  Token '물질' added similar words: ['물기', '미술', '비주얼', '불길']
  Token '활동' added similar words: ['혈당', '합동', '출동', '홀랑']
  Token '반응' added similar words: ['방안', '반항', '방어', '반영']
  Token '현실' added similar words: ['행실', '병실', '분실']
  Token '억제' added similar words: ['언제', '업적', '어제', '억척']
  Token '효과' added similar words: ['효녀', '협의', '확', '화제']
  Token '충동' added similar words: ['충당', '명동', '충족']
  Token '하루' added similar words: ['후자', '하마', '합주']
  Token '감소' added similar words: ['고삼', '감사', '감성', '감시']
  Token '설명' added similar words: ['설령', '성별', '선명', '술병']
  Token '정리' added

time: 27min 33s (started: 2024-06-12 18:29:28 +00:00)
