In [4]:
# !pip install transformers
# !pip install torch

### 사전 준비물

1. 사전학습된(pretrained) BERT 모델
2. 질의응답 Dataset


### 진행과정

1. 사용자의 질문(query)을 입력받는다.
2. query를 pretrained BERT의 입력으로 넣어, query 문장에 해당하는 [CLS] 토큰의 hidden state를 얻는다.
3. 사전에 준비된 질의응답 Dataset에 존재하는 모든 질문들을 pretrained BERT의 입력으로 넣어, 각 질문에 해당하는 [CLS] 토큰의 hidden state를 얻는다.
4. query의 [CLS] hidden state와 각 질문의 [CLS] hidden state 간의 코사인 유사도를 구한다.
5. 가장 높은 코사인 유사도를 가진 질문의 답변을 반환시켜준다.
6. 위 과정 반복.

In [7]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer

In [8]:
# The same checkpoint name is used to load both the tokenizer and the model.
MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer, model = (
    AutoTokenizer.from_pretrained(MODEL_NAME),
    AutoModel.from_pretrained(MODEL_NAME),
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: 'BertModel' object has no attribute 'prarameters'

In [12]:
# `parameters` is referenced without being called, so this cell displays the
# bound-method repr (which embeds the module structure) rather than the
# parameter tensors themselves.
model.parameters

<bound method Module.parameters of BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropou

In [15]:
# Helper that extracts the [CLS] token embedding of a sentence
def get_cls_token(sent_A):
    """Return the last-layer hidden state at the first ([CLS]) token position.

    Relies on the notebook-level ``tokenizer`` and ``model``.

    Args:
        sent_A: A single sentence (str) or a list of sentences. Inputs longer
            than 128 tokens are truncated.

    Returns:
        numpy.ndarray with one row per input sentence (a single string yields
        exactly one row) and one column per hidden unit.
    """
    model.eval()  # inference mode (turns dropout off)
    tokenized_sent = tokenizer(
        sent_A,
        return_tensors="pt",
        # Pads to the longest sentence in the call so a list of sentences can
        # be stacked into one tensor; a single sentence is left unpadded.
        padding=True,
        truncation=True,
        add_special_tokens=True,
        max_length=128
    )
    with torch.no_grad():  # no gradients are needed for feature extraction
        # Forward exactly the fields this tokenizer produced. Indexing
        # 'token_type_ids' explicitly raises KeyError for tokenizers that do
        # not emit that field.
        outputs = model(**tokenized_sent)
    # Sequence position 0 is where the tokenizer places the [CLS] token.
    cls_hidden = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
    return cls_hidden

In [36]:
# Embed one sample user query, then show the vector followed by its shape.
query = 'PPL너무 심해'
query_cls_hidden = get_cls_token(query)
print(query_cls_hidden, query_cls_hidden.shape, sep='\n')

[[-3.86869609e-02 -7.38065392e-02  3.86235923e-01  1.14278890e-01
  -2.11086348e-01 -2.94315487e-01 -6.71451688e-02  1.03092641e-02
  -7.78711438e-02  2.09223986e-01 -3.94438207e-02 -1.54055029e-01
   1.01203151e-01  1.32729888e-01 -6.32130384e-01 -2.68081784e-01
  -1.92343533e-01  4.40752566e-01  5.27532250e-02  3.09119642e-01
   1.67338848e-02  3.55735272e-02 -2.29636699e-01  1.19728565e-01
   1.66454017e-01 -5.89520872e-01 -1.07969530e-01 -8.46345872e-02
   2.62847424e-01  2.48759627e-01  6.51723742e-02 -2.83377841e-02
  -1.40974820e-01  1.87241971e-01 -1.67921782e-01  6.53701052e-02
  -1.96701944e+00 -1.83534995e-01  8.72325525e-03 -2.04787806e-01
  -1.28635362e-01  3.11237909e-02 -1.25798076e-01  4.17607948e-02
  -1.24375537e-01  1.30921888e+00 -1.52322412e-01 -9.69932228e-02
   1.41810143e+00  1.11565560e-01 -5.71179800e-02 -6.77577496e-01
   2.09821135e-01 -1.57215750e+00  1.13460943e-01  1.55448437e-01
   2.14178205e-01 -5.80352060e-02  9.50742736e-02  1.57561526e-03
  -2.04569

In [20]:
# Load the question/answer corpus; the path is relative to the notebook's
# working directory.
data_df = pd.read_csv(
    '../Documents/git/Chatbot_data/ChatbotData.csv'
)
data_df

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0
...,...,...,...
11818,훔쳐보는 것도 눈치 보임.,티가 나니까 눈치가 보이는 거죠!,2
11819,훔쳐보는 것도 눈치 보임.,훔쳐보는 거 티나나봐요.,2
11820,흑기사 해주는 짝남.,설렜겠어요.,2
11821,힘든 연애 좋은 연애라는게 무슨 차이일까?,잘 헤어질 수 있는 사이 여부인 거 같아요.,2


In [32]:
# NOTE(review): `data_cls_hidden` is only assigned in a later cell (the loop
# that embeds every dataset question), so on a fresh kernel executed top to
# bottom this cell raises NameError. It should be moved below that cell.
type(data_cls_hidden)

numpy.ndarray

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Embed every question of the corpus, one get_cls_token call per question.
chatbot_Question = data_df['Q']
dataset_cls_hidden = [get_cls_token(question) for question in chatbot_Question]
# Stacking the per-question results adds a length-1 middle axis; drop it so
# each row is one question's [CLS] vector.
data_cls_hidden = np.array(dataset_cls_hidden).squeeze(axis=1)
# [CLS] vectors of the dataset questions, followed by the matrix shape
print(data_cls_hidden, data_cls_hidden.shape, sep='\n')

[[-9.83642936e-02 -2.87346616e-02 -6.73831880e-01 ...  7.08659530e-01
   6.99952990e-02  3.66942137e-01]
 [ 6.49078786e-02  7.18162060e-02 -1.40972048e-01 ...  1.38333037e-01
   1.65952072e-01  1.12921156e-01]
 [ 4.07803804e-04  2.28369981e-02  2.58352309e-01 ...  1.35420680e-01
   2.67340362e-01  9.57452580e-02]
 ...
 [ 1.53779164e-01 -2.82409340e-02 -8.10719132e-02 ...  2.28785023e-01
   1.65324360e-02  1.29616022e-01]
 [-4.51041684e-02  1.04225785e-01 -1.89268559e-01 ...  5.91341257e-01
   2.69855976e-01 -2.99957246e-02]
 [ 3.52815874e-02  3.98814678e-03  1.61026150e-01 ...  6.02967858e-01
   4.56987321e-03  4.15168941e-01]]
(11823, 768)


In [37]:
# Cosine similarity of the query vector against every dataset question vector:
# one row for the query, one column per dataset question.
cos_sim = cosine_similarity(X=query_cls_hidden, Y=data_cls_hidden)
print(cos_sim)

[[0.4756525  0.9124478  0.9389174  ... 0.93778825 0.91623616 0.87848544]]


In [None]:
# Persist the dataset embeddings so they need not be recomputed.
# numpy appends the ".npy" extension because the name has none.
np.save(file='data_cls_hidden_save', arr=data_cls_hidden)

In [38]:
# Position of the dataset question most similar to the query. cos_sim has a
# single row, so the argmax over the flattened array is the question position.
top_question = np.argmax(cos_sim)

print('나의 질문:',  query)
# argmax yields a position, not an index label: use positional .iloc so the
# lookup stays correct even if data_df's index is not the default 0..N-1 range.
print('저장된 답변:', data_df['A'].iloc[top_question])

나의 질문: PPL너무 심해
저장된 답변: 저도 해보고 싶네요.


In [39]:
!pip install fastapi

Collecting fastapi


[notice] A new release of pip is available: 23.1.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading fastapi-0.95.2-py3-none-any.whl (56 kB)
                                              0.0/57.0 kB ? eta -:--:--
     ---------------------------------------- 57.0/57.0 kB 3.1 MB/s eta 0:00:00
Collecting starlette<0.28.0,>=0.27.0 (from fastapi)
  Downloading starlette-0.27.0-py3-none-any.whl (66 kB)
                                              0.0/67.0 kB ? eta -:--:--
     ---------------------------------------- 67.0/67.0 kB 3.8 MB/s eta 0:00:00
Installing collected packages: starlette, fastapi
Successfully installed fastapi-0.95.2 starlette-0.27.0


In [41]:
!pip install "uvicorn[standard]"


[notice] A new release of pip is available: 23.1.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting uvicorn[standard]
  Downloading uvicorn-0.22.0-py3-none-any.whl (58 kB)
                                              0.0/58.3 kB ? eta -:--:--
     ---------------------------------------- 58.3/58.3 kB ? eta 0:00:00
Collecting httptools>=0.5.0 (from uvicorn[standard])
  Downloading httptools-0.5.0-cp39-cp39-win_amd64.whl (145 kB)
                                              0.0/145.1 kB ? eta -:--:--
     ---------------------------------------- 145.1/145.1 kB ? eta 0:00:00
Collecting watchfiles>=0.13 (from uvicorn[standard])
  Downloading watchfiles-0.19.0-cp37-abi3-win_amd64.whl (270 kB)
                                              0.0/270.9 kB ? eta -:--:--
     ------------------------------------- 270.9/270.9 kB 16.3 MB/s eta 0:00:00
Collecting websockets>=10.4 (from uvicorn[standard])
  Downloading websockets-11.0.3-cp39-cp39-win_amd64.whl (124 kB)
                                              0.0/124.7 kB ? eta -:--:--
     -------------------------------------- 12


[notice] A new release of pip is available: 23.1.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


ERROR: Could not find a version that satisfies the requirement request (from versions: none)
ERROR: No matching distribution found for request

[notice] A new release of pip is available: 23.1.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip
