# cpx챗봇 만들기

트랜스포머의 인코더 부분을 사용하여 한국어 문장 인코딩(벡터화)

In [1]:
!pip install -q sentence-transformers

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

## SentenceBERT를 이용한 문장 인코딩

https://huggingface.co/jhgan/ko-sroberta-multitask

In [3]:
# Load the sentence encoder identified by the Hugging Face model id below.
# `encoder` is reused by every later cell that needs an embedding.
encoder = SentenceTransformer('jhgan/ko-sroberta-multitask')

# Encoding a list of sentences yields one vector per sentence
# (two rows in the printed output).
sentences = ["안녕하세요?", "한국어 문장 임베딩을 위한 버트 모델입니다."]
embeddings = encoder.encode(sentences)

print(embeddings)

[[-0.37510476 -0.77338415  0.5927711  ...  0.57923514  0.3268347
  -0.65089625]
 [-0.09361722 -0.18191545 -0.19230829 ... -0.03165793  0.3041255
  -0.26793614]]


In [4]:
# A one-element list still produces a 2-D result (note the double brackets
# in the printed output): one row for the single sentence.
sentences = ["안녕하세요"]
embeddings = encoder.encode(sentences)

print(embeddings)

[[-1.78593785e-01 -5.61293781e-01  4.46276426e-01 -1.50603548e-01
  -3.76700237e-02 -7.62825370e-01  2.09378600e-01 -3.64589281e-02
   2.98453152e-01 -2.41037101e-01 -6.68550551e-01 -2.28057504e-01
  -6.40330195e-01  8.62435848e-02 -1.20845959e-01  3.24341953e-01
   2.35195234e-01 -1.59346774e-01  3.03968161e-01 -3.05746943e-01
   3.84012341e-01  4.27564055e-01  4.51850474e-01  6.87622249e-01
   1.16420828e-01  6.62420571e-01  2.50278503e-01 -1.45289108e-01
   7.07308590e-01 -2.81397942e-02 -6.45833910e-01 -4.65397947e-02
  -4.00333017e-01 -1.24737797e-02  3.64073932e-01  3.63886118e-01
  -3.57513815e-01 -4.03465748e-01 -3.31617177e-01 -1.97076231e-01
  -4.95101839e-01  4.62570116e-02  5.58891110e-02  5.01659393e-01
  -3.42604637e-01 -9.15144265e-01 -2.68441945e-01 -4.71062273e-01
  -1.06321611e-01 -1.18266203e-01 -3.35126445e-02  1.28225297e-01
   2.69816548e-01  6.53009117e-01 -2.77158648e-01 -1.02365449e-01
   1.48780435e-01 -2.40916967e-01  8.61789227e-01 -2.89628863e-01
   5.26642

## 데이터셋 로드

In [5]:
# Read the dialogue dataset straight from GitHub (needs network access).
# Columns shown below: 구분 (category), 유저 (user utterance), 챗봇 (reply),
# plus 'Unnamed: 0' -- presumably the index saved with the CSV; TODO confirm.
df = pd.read_csv('https://raw.githubusercontent.com/Garlic-Ryu/chatbot/master/cpxdatasetori.csv')

df.head()

Unnamed: 0,구분,유저,챗봇
0,자기소개/환자확인,안녕하세요,안녕하세요.
1,자기소개/환자확인,안녕하세요? 학생 한의사입니다.,안녕하세요.
2,자기소개/환자확인,환자분 성함과 나이 말씀해 주시겠어요?,"네, 저는 37세 김지훈입니다."
3,자기소개/환자확인,이름과 나이 말씀해 주시겠어요?,"네, 저는 37세 김지훈입니다."
4,자기소개/환자확인,이름과 나이가 어떻게 되세요?,"네, 저는 37세 김지훈입니다."


## 데이터 전처리

- 필요 없는 칼럼 제거 (현재는 비활성화: 불러온 데이터에 `Unnamed: 3` 칼럼이 없음)
- 챗봇 내용 없는 행 제거

In [7]:
# Disabled step, kept as a bare string literal rather than live code: it would
# drop an 'Unnamed: 3' column, which the frame shown by df.head() above does
# not contain. As the last expression in the cell, the string itself is echoed
# as the cell output.
'''
df = df.drop(columns=['Unnamed: 3'])

df.head()
'''

"\ndf = df.drop(columns=['Unnamed: 3'])\n\ndf.head()\n"

In [8]:
# Report the row count on either side of the missing-value filter.
print(df.shape[0])

# Discard every row that has a missing value in any column.
df = df.dropna()

print(df.shape[0])

1069
1069


## 챗봇의 원리

In [9]:
# User utterance stored under row label 0.
df.loc[0, '유저']

'안녕하세요'

In [10]:
# Encoding a bare string (not a list) returns a single 1-D vector,
# as the single-bracket array in the output shows.
encoder.encode(df.loc[0, '유저'])

array([-1.78593785e-01, -5.61293781e-01,  4.46276426e-01, -1.50603548e-01,
       -3.76700237e-02, -7.62825370e-01,  2.09378600e-01, -3.64589281e-02,
        2.98453152e-01, -2.41037101e-01, -6.68550551e-01, -2.28057504e-01,
       -6.40330195e-01,  8.62435848e-02, -1.20845959e-01,  3.24341953e-01,
        2.35195234e-01, -1.59346774e-01,  3.03968161e-01, -3.05746943e-01,
        3.84012341e-01,  4.27564055e-01,  4.51850474e-01,  6.87622249e-01,
        1.16420828e-01,  6.62420571e-01,  2.50278503e-01, -1.45289108e-01,
        7.07308590e-01, -2.81397942e-02, -6.45833910e-01, -4.65397947e-02,
       -4.00333017e-01, -1.24737797e-02,  3.64073932e-01,  3.63886118e-01,
       -3.57513815e-01, -4.03465748e-01, -3.31617177e-01, -1.97076231e-01,
       -4.95101839e-01,  4.62570116e-02,  5.58891110e-02,  5.01659393e-01,
       -3.42604637e-01, -9.15144265e-01, -2.68441945e-01, -4.71062273e-01,
       -1.06321611e-01, -1.18266203e-01, -3.35126445e-02,  1.28225297e-01,
        2.69816548e-01,  

## 모든 데이터셋 인코딩

In [11]:
# Encode every user utterance in one batched call instead of one model call
# per row; the encoder returns one vector per input sentence, in input order.
# NOTE(review): batched inference may differ from per-sentence encoding in the
# last few floating-point digits -- irrelevant for nearest-match lookup.
user_vectors = encoder.encode(df['유저'].tolist())

# Store each embedding as a plain list (same cell type as before). The Series
# is built on df.index so rows stay aligned even if dropna() left gaps in the
# index. No placeholder column is needed: this assignment creates it.
df['embedding'] = pd.Series([list(vec) for vec in user_vectors], index=df.index)

df.head()



Unnamed: 0,구분,유저,챗봇,embedding
0,자기소개/환자확인,안녕하세요,안녕하세요.,"[-0.17859378, -0.5612938, 0.44627643, -0.15060..."
1,자기소개/환자확인,안녕하세요? 학생 한의사입니다.,안녕하세요.,"[-0.1790539, -0.42837209, 0.5460607, -0.276322..."
2,자기소개/환자확인,환자분 성함과 나이 말씀해 주시겠어요?,"네, 저는 37세 김지훈입니다.","[0.11551518, -1.0544999, 0.020485755, 0.151953..."
3,자기소개/환자확인,이름과 나이 말씀해 주시겠어요?,"네, 저는 37세 김지훈입니다.","[0.1782463, -0.8947584, 0.23265925, -0.2111499..."
4,자기소개/환자확인,이름과 나이가 어떻게 되세요?,"네, 저는 37세 김지훈입니다.","[-0.04089392, -0.96877515, 0.2909906, -0.13000..."


## 예제 문장 입력

In [12]:
# Example query sentence; it is identical to the first user utterance in the
# dataset, so its best match further down scores 1.0.
text = '안녕하세요'

# Query vector compared against the stored embeddings in the next cell.
embedding = encoder.encode(text)

embedding

array([-1.78593785e-01, -5.61293781e-01,  4.46276426e-01, -1.50603548e-01,
       -3.76700237e-02, -7.62825370e-01,  2.09378600e-01, -3.64589281e-02,
        2.98453152e-01, -2.41037101e-01, -6.68550551e-01, -2.28057504e-01,
       -6.40330195e-01,  8.62435848e-02, -1.20845959e-01,  3.24341953e-01,
        2.35195234e-01, -1.59346774e-01,  3.03968161e-01, -3.05746943e-01,
        3.84012341e-01,  4.27564055e-01,  4.51850474e-01,  6.87622249e-01,
        1.16420828e-01,  6.62420571e-01,  2.50278503e-01, -1.45289108e-01,
        7.07308590e-01, -2.81397942e-02, -6.45833910e-01, -4.65397947e-02,
       -4.00333017e-01, -1.24737797e-02,  3.64073932e-01,  3.63886118e-01,
       -3.57513815e-01, -4.03465748e-01, -3.31617177e-01, -1.97076231e-01,
       -4.95101839e-01,  4.62570116e-02,  5.58891110e-02,  5.01659393e-01,
       -3.42604637e-01, -9.15144265e-01, -2.68441945e-01, -4.71062273e-01,
       -1.06321611e-01, -1.18266203e-01, -3.35126445e-02,  1.28225297e-01,
        2.69816548e-01,  

## 입력된 문장에 대해 유사도 계산

In [13]:
# Score the query against every stored embedding with a single vectorised
# call. cosine_similarity returns a (1, n_rows) matrix; row 0 holds the score
# for each dataset row in order, so it can be assigned positionally. This
# replaces one sklearn call per row and yields a plain float column.
df['similarity'] = cosine_similarity([embedding], df['embedding'].tolist())[0]

df.head()

Unnamed: 0,구분,유저,챗봇,embedding,similarity
0,자기소개/환자확인,안녕하세요,안녕하세요.,"[-0.17859378, -0.5612938, 0.44627643, -0.15060...",1.0
1,자기소개/환자확인,안녕하세요? 학생 한의사입니다.,안녕하세요.,"[-0.1790539, -0.42837209, 0.5460607, -0.276322...",0.458203
2,자기소개/환자확인,환자분 성함과 나이 말씀해 주시겠어요?,"네, 저는 37세 김지훈입니다.","[0.11551518, -1.0544999, 0.020485755, 0.151953...",0.416047
3,자기소개/환자확인,이름과 나이 말씀해 주시겠어요?,"네, 저는 37세 김지훈입니다.","[0.1782463, -0.8947584, 0.23265925, -0.2111499...",0.448215
4,자기소개/환자확인,이름과 나이가 어떻게 되세요?,"네, 저는 37세 김지훈입니다.","[-0.04089392, -0.96877515, 0.2909906, -0.13000...",0.471876


## 유사도가 가장 높은 답변을 채택

In [14]:
# Row label of the highest-scoring stored utterance, then the row itself.
best_idx = df['similarity'].idxmax()
answer = df.loc[best_idx]

# Print each reported field as "<label> <value>", in a fixed order.
report_fields = (
    ('구분', '구분'),
    ('유사한 질문', '유저'),
    ('챗봇 답변', '챗봇'),
    ('유사도', 'similarity'),
)
for label, column in report_fields:
    print(label, answer[column])

구분 자기소개/환자확인
유사한 질문 안녕하세요
챗봇 답변 안녕하세요.
유사도 1.0


## 챗봇

In [15]:
!pip install -q gradio

In [16]:
!pip install -U typing-extensions




In [None]:
import gradio as gr

def greet(user):
    """Return the stored chatbot reply that best matches *user*.

    The input text is encoded with the notebook-level ``encoder`` and compared
    by cosine similarity against the precomputed ``df['embedding']`` vectors.

    Args:
        user: The text typed by the user.

    Returns:
        The ``챗봇`` value of the dataset row whose user utterance is most
        similar to *user* (the first such row on ties).
    """
    embedding = encoder.encode(user)

    # One vectorised call scores the query against every stored embedding.
    # Keeping the scores in a local array avoids writing a scratch column
    # into the shared DataFrame on every request.
    scores = cosine_similarity([embedding], df['embedding'].tolist())[0]

    # argmax is positional, so index with iloc rather than by row label.
    return df['챗봇'].iloc[scores.argmax()]

# Minimal UI: one text box in, one text box out, backed by greet().
demo = gr.Interface(fn=greet, inputs="text", outputs="text")

# debug=True keeps the cell running so errors and logs stay visible;
# share=True publishes a temporary public URL (see the output below).
demo.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://e5c9b9b261f37fb0ce.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


In [None]:
import gradio as gr

def greet(user, history=None):
    """Answer *user* and append the exchange to the conversation history.

    The input text is encoded with the notebook-level ``encoder`` and compared
    by cosine similarity against the precomputed ``df['embedding']`` vectors.

    Args:
        user: The text typed by the user.
        history: Earlier ``[user, reply]`` pairs, or ``None`` to start a new
            conversation.

    Returns:
        A ``(history, history)`` tuple: the updated list of ``[user, reply]``
        pairs, once for the chatbot display and once for the state output.
    """
    # Build a fresh list on every call. A mutable default ([]) is created once
    # and would be shared by every call that omits `history`; copying also
    # leaves the caller's list untouched.
    history = [] if history is None else list(history)

    embedding = encoder.encode(user)

    # One vectorised call scores the query against every stored embedding.
    # Keeping the scores in a local array avoids writing a scratch column
    # into the shared DataFrame on every request.
    scores = cosine_similarity([embedding], df['embedding'].tolist())[0]

    # argmax is positional, so index with iloc rather than by row label.
    reply = df['챗봇'].iloc[scores.argmax()]

    history.append([user, reply])

    return history, history

# Chat UI: the "state" input/output pair carries the history list that greet()
# receives and returns; the "chatbot" output renders the [user, reply] pairs.
demo = gr.Interface(fn=greet, inputs=["text", "state"], outputs=["chatbot", "state"])

# share=True requests a public share link; debug=True keeps the cell running
# (see Gradio's message under the earlier launch cell).
demo.launch(debug=True, share=True)