# Hugging Face - pipline

In [40]:
!pip install transformers datasets xformers -q

## pipeline - 감정분석

In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
result = classifier(["We are very happy.", "We hope you don't hate it.", "I don't hate you", "I don't like you", "soso"])
print(result)

classifier("좋아")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998819828033447}, {'label': 'NEGATIVE', 'score': 0.5308590531349182}, {'label': 'POSITIVE', 'score': 0.9985570311546326}, {'label': 'NEGATIVE', 'score': 0.9986074566841125}, {'label': 'POSITIVE', 'score': 0.9838406443595886}]


[{'label': 'NEGATIVE', 'score': 0.6970565319061279}]

In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", "matthewburke/korean_sentiment")
classifier(['좋아', "별로야"])

## pipeline - 텍스트 생성

In [None]:
from transformers import pipeline

generator = pipeline("text-generation")
result = generator("once upon a time, ")
print(result[0]["generated_text"])

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


once upon a time,  the last-ditch effort might have been to try and bring it back. But it never worked out, and then a half century later, perhaps, even more so.
It would be a shame, then


In [None]:
from transformers import pipeline

generator = pipeline("text-generation", "skt/kogpt2-base-v2")
result = generator("옛날 옛적에")
print(result[0]['generated_text'])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


옛날 옛적에 지은 별채였다.
1980년 봄, 서울 동대문구 장안동에서 개나리 넝쿨째 굴러 떨어지는 소나무를 보았다.
이 넝쿨째 굴러 올라가야 한다는데 나는 그렇게 하지 못했다.
하지만 넝쿨이 굴


## pipeline - 이미지 분류 모델

In [None]:
img = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"

classifier = pipeline("image-classification")
result = classifier(img)
print(result)

from IPython.display import Image
Image(img)

In [None]:
caption = pipeline("image-to-text")
result = caption(img)
print(result)

No model was supplied, defaulted to ydshieh/vit-gpt2-coco-en and revision 65636df (https://huggingface.co/ydshieh/vit-gpt2-coco-en).
Using a pipeline without specifying a model name and revision in production is not recommended.
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


[{'generated_text': 'two birds are standing next to each other '}]


## pipeline - sound to text

In [None]:
from datasets import load_dataset

dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
print(dataset[0])

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', 'array': array([ 0.        ,  0.00024414, -0.00024414, ..., -0.00024414,
        0.        ,  0.        ]), 'sampling_rate': 8000}, 'transcription': 'I would like to set up a joint account with my partner', 'english_transcription': 'I would like to set up a joint account with my partner', 'intent_class': 11, 'lang_id': 4}


In [None]:
print(dataset[0]['path'])

/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav


In [None]:
from IPython.display import Audio
Audio(dataset[0]['path'])

In [None]:
recognizer = pipeline("automatic-speech-recognition")
result = recognizer(dataset[0]['path'])
print(result)

No model was supplied, defaulted to facebook/wav2vec2-base-960h and revision 55bb623 (https://huggingface.co/facebook/wav2vec2-base-960h).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

{'text': 'I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT'}


In [None]:
from datasets import load_dataset

dataset = load_dataset("PolyAI/minds14", name="ko-KR", split="train")
print(dataset[0])

Generating train split: 0 examples [00:00, ? examples/s]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/ko-KR~ATM_LIMIT/602bef265f67b421554f65e7.wav', 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/ko-KR~ATM_LIMIT/602bef265f67b421554f65e7.wav', 'array': array([0.00024414, 0.        , 0.        , ..., 0.00073242, 0.00048828,
       0.00048828]), 'sampling_rate': 8000}, 'transcription': 'app Manager 하고 싶은데 최대 금액이 얼마인지요', 'english_transcription': 'I want to do app manager, what is the maximum amount', 'intent_class': 3, 'lang_id': 8}


In [None]:
from IPython.display import Audio
Audio(dataset[0]['path'])

In [None]:
recognizer = pipeline("automatic-speech-recognition", "Hyuk/wav2vec2-korean-v2")
result = recognizer(dataset[0]['path'])
print(result)

# Gradio - 앱만들기
- 딥러닝 엔지니어가 앱을 쉽게 만들 수 있도록 도와주는 프레임워크

In [17]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.9.0-py3-none-any.whl (16.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.105.0-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.7.2 (from gradio)
  Downloading gradio_client-0.7.2-py3-none-any.whl (304 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.6/304.6 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 k

In [None]:
import gradio as gr

In [None]:
import gradio as gr

def greet(name):
    print(name)
    return "Hello " + name + "!"

demo = gr.Interface(fn=greet, inputs="text", outputs="text")

demo.launch(share=True, debug=True)

In [None]:
from transformers import pipeline
import gradio as gr

def greet(sentence):
    classifier = pipeline("sentiment-analysis", "matthewburke/korean_sentiment")
    result = classifier(sentence)
    d = {
        'LABEL_1': '긍정',
        'LABEL_0': '부정'
    }

    return f"{sentence}\n 이 문장이 {d[result[0]['label']]} 일 확률은 {result[0]['score'] * 100}%"

demo = gr.Interface(fn=greet, inputs="text", outputs="text")

demo.launch(share=True, debug=True)

In [None]:
import gradio as gr

def greet(name, is_morning, temperature):
    salutation = "Good morning" if is_morning else "Good evening"
    greeting = f"{salutation} {name}. It is {temperature} degrees today"
    celsius = (temperature - 32) * 5 / 9
    return greeting, round(celsius, 2)

demo = gr.Interface(
    fn=greet,
    inputs=["text", "checkbox", gr.Slider(0, 100)],
    outputs=["text", "number"],
)
demo.launch()


In [None]:
import gradio as gr

def greet(image):
    return image.rotate(180)

demo = gr.Interface(fn=greet, inputs=gr.Image(type='pil', source='webcam'), outputs="image")

demo.launch(share=True, debug=True)

## 챗봇

In [None]:
import gradio as gr

def chat(msg, history):
    history.append((msg, "안녕. 반가워"))
    print(history)
    return "", history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])
    msg.submit(chat, [msg, chatbot], [msg, chatbot])

demo.launch(debug=True)

In [None]:
from transformers import pipeline

chat = pipeline("text-generation", max_length=1000, model="EasthShin/Youth_Chatbot_Kogpt2-base")
result = chat("안녕")
print(result)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/143 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/110 [00:00<?, ?B/s]

[{'generated_text': '안녕에 대해 안심을 주는 사람이 있어 안심되시는군요.'}]


In [None]:
import gradio as gr
from transformers import pipeline

chat_model = pipeline("text-generation", max_length=1000, model="EasthShin/Youth_Chatbot_Kogpt2-base")

def chat(msg, history):
    prompt = [f"유저: {h[0]}\n챗봇: {h[1]}\n" for h in history]
    prompt = "\n".join(prompt) + f'\n유저: {msg}\n챗봇: '
    print(prompt)

    result = chat_model(prompt)
    history.append((msg, result[0]["generated_text"]))
    return "", history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])
    msg.submit(chat, [msg, chatbot], [msg, chatbot])

demo.launch(debug=True)

# Stable diffusion

In [None]:
!pip install -q transformers[sentencepiece] gradio openai diffusers accelerate

In [None]:
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
import torch

model_id = "stabilityai/stable-diffusion-2-1-base"

scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
pipe = StableDiffusionPipeline.from_pretrained(model_id,
                                               scheduler=scheduler,
                                               torch_dtype=torch.float16,
                                               revision="fp16")
pipe = pipe.to("cuda")

In [None]:
import matplotlib.pyplot as plt

image = pipe("rainbow pig").images[0]
plt.imshow(image)

# Open AI

In [7]:
!pip install openai



In [None]:
!pip install openai==0.28

In [None]:
import os
import openai

openai.api_key = ""

response = openai.Completion.create(
  model="text-davinci-003",
  prompt="옛날 옛적에",
  temperature=1,
  max_tokens=512,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0
)

print(response.choices[0].text)

In [59]:
import os
from openai import OpenAI

# openai.api_key=""

# # client = OpenAI(
# #     # This is the default and can be omitted
# #     api_key=os.environ.get(""),
# # )

# chat_completion = client.chat.completions.create(
#     model="gpt-3.5-turbo",
#     messages=[
#         {"role": "system", "content": "넌 이제부터 나의 오랜 친구야"},
#         {"role": "user", "content": "안녕 친구"},
#         {"role": "assistant", "content": "안녕! 너와 함께 시간을 보낼 수 있어서 정말 기쁘다. 꿈이나 고민, 얘기할 것이 있으면 언제든지 말해줘! 함께 즐거운 시간 보내자~"},
#         {"role": "user", "content": "나도 기뻐, 우리 언제보나 본지 오래되었네"},
#     ],
#     temperature=1,
#     max_tokens=256,
#     top_p=1,
#     frequency_penalty=0,
#     presence_penalty=0
# )
# print(chat_completion.choices[0].message['content'])

import openai

# optional; defaults to `os.environ['OPENAI_API_KEY']`
openai.api_key=""

# all client options can be configured just like the `OpenAI` instantiation counterpart


completion = openai.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "넌 이제부터 나의 오랜 친구야"},
        {"role": "user", "content": "안녕 친구"},
        {"role": "assistant", "content": "안녕! 너와 함께 시간을 보낼 수 있어서 정말 기쁘다. 꿈이나 고민, 얘기할 것이 있으면 언제든지 말해줘! 함께 즐거운 시간 보내자~"},
        {"role": "user", "content": "나도 기뻐, 우리 언제보나 본지 오래되었네"},
    ],
    temperature=1,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)
print(completion.choices[0].message.content)

맞아, 정말 오랜만이야. 우리 서로 어떤 일이 있었는지 얘기해봐야겠어. 만나서 이야기하면서 그 동안 놓친 소식들도 나눠볼까? 언제 가능해?


In [None]:
import os
from openai import OpenAI

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get(OPENAI_API_KEY=""),
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
)

In [None]:
import os
import openai

openai.api_key = ""

response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "넌 이제부터 나의 오랜 친구야"},
    {"role": "user", "content": "안녕 친구"},
    {"role": "assistant", "content": "안녕! 너와 함께 시간을 보낼 수 있어서 정말 기쁘다. 꿈이나 고민, 얘기할 것이 있으면 언제든지 말해줘! 함께 즐거운 시간 보내자~"},
    {"role": "user", "content": "나도 기뻐, 우리 언제보나 본지 오래되었네"},
  ],
  temperature=1,
  max_tokens=256,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0
)
print(response.choices[0].message['content'])

그렇지! 정말 오랜만이야. 언제든지 만나서 좋은 시간 보내자. 뭐 하고 싶은 게 있어? 영화 보기, 카페 가기, 산책하기 등등 어떤 것이든 상관없어. 너의 의견을 들어보자!


## 텍스트 생성 앱

In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-3.43.1-py3-none-any.whl (20.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.1/20.1 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.103.1-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.5.0 (from gradio)
  Downloading gradio_client-0.5.0-py3-none-any.whl (298 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.2/298.2 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio)
  Downloading httpx-0.24.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import gradio as gr
import openai

openai.api_key = ""

def 텍스트생성(prompt):
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.7,
        max_tokens=1024,
    )
    return response['choices'][0]['text'].strip()

demo = gr.Interface(fn=텍스트생성, inputs="text", outputs="text")
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://0dace65a95f3a7974d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
import gradio as gr
import openai

openai.api_key = ""

def 텍스트생성(prompt):
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"'{prompt}' 위의 문장을 영어로 번역해줘",
        temperature=0.7,
        max_tokens=1024,
    )
    return response['choices'][0]['text'].strip()

demo = gr.Interface(fn=텍스트생성, inputs="text", outputs="text")
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://68b97e23c1cd36dd70.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




## 챗봇

In [None]:
import gradio as gr

import os
import openai

openai.api_key = ""

messages = []

def chat(msg, history):
    messages.append({"role": "user", "content": msg})
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "system", "content": "넌 이제부터 나의 오랜 친구야"}, *messages[-10:]],
        temperature=1,
        max_tokens=1000
    )
    messages.append({"role": "assistant", "content": response.choices[0].message['content']})
    print(messages)

    history.append((msg, response.choices[0].message['content']))
    return "", history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    send = gr.Button("Send")
    clear = gr.ClearButton([msg, chatbot])

    msg.submit(chat, [msg, chatbot], [msg, chatbot])
    send.click(chat, [msg, chatbot], [msg, chatbot])

demo.launch(share=True)

## 피자가게 주문 받는 챗봇

In [None]:
import gradio as gr
import os
import openai

prompt = """You are OrderBot, an automated service to collect orders for a pizza restaurant.
You first greet the customer, then collects the order, and then asks if it's a pickup or delivery.
You wait to collect the entire order, then summarize it and check for a final time if the customer wants to add anything else.
If it's a delivery, you ask for an address. Finally you collect the payment.
Make sure to clarify all options, extras and sizes to uniquely identify the item from the menu.
You respond in a short, very conversational friendly style.
The menu includes
 pepperoni pizza 12.95, 10.00, 7.00
 cheese pizza 10.95, 9.25, 6.50
 eggplant pizza 11.95, 9.75, 6.75
 fries 4.50, 3.50
 greek salad 7.25
Toppings:
 extra cheese 2.00,
 mushrooms 1.50
 sausage 3.00
 canadian bacon 3.50
 AI sauce 1.50
 peppers 1.00
Drinks:
 coke 3.00, 2.00, 1.00
 sprite 3.00, 2.00, 1.00
 bottled water 5.00
"""

openai.api_key = ""

messages = []

def chat(msg, history):
    messages.append({"role": "user", "content": msg})
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "system", "content": prompt}, *messages[-10:]],
        temperature=1,
        max_tokens=1000
    )
    messages.append({"role": "assistant", "content": response.choices[0].message['content']})
    print(messages)

    history.append((msg, response.choices[0].message['content']))
    return "", history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    send = gr.Button("Send")
    clear = gr.ClearButton([msg, chatbot])

    msg.submit(chat, [msg, chatbot], [msg, chatbot])
    send.click(chat, [msg, chatbot], [msg, chatbot])

demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://768d6fbbf10e2822a3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
messages

# 토큰화

In [None]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

# 토큰화 함수
tokenizer = PreTrainedTokenizerFast.from_pretrained("taeminlee/kogpt2")
# 다음 단어 함수
model = GPT2LMHeadModel.from_pretrained("taeminlee/kogpt2")

Downloading (…)okenizer_config.json:   0%|          | 0.00/109 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.93M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


Downloading model.safetensors:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [None]:
text = "아들아 너는 계획이 다 있니? 아니 난 이따가 전화할께"
tokens = tokenizer.encode(text, return_tensors="pt")
print(len(tokens[0]))
print(tokens)

17
tensor([[ 2656, 47485,  1681, 47441, 10635,   148,   105, 47515, 47774,   491,
          1130,   104, 47701, 47448,  2948, 47558, 47796]])


In [None]:
outputs = model(tokens) # 모든 토큰들의 확률값
print(outputs[0].shape)
outputs = outputs[0][0, -1, :]
print(outputs)
print(outputs[605])
len(outputs)

torch.Size([1, 17, 50000])
tensor([  1.6416,  13.9180, -13.5638,  ...,  -4.4081,  -2.7817,  -1.4270],
       grad_fn=<SliceBackward0>)
tensor(5.9681, grad_fn=<SelectBackward0>)


50000

In [None]:
token = outputs.argmax(-1) # 가장 높은 확률의 토큰
decoded = tokenizer.decode(token)
print(token, decoded)

tensor(1) </s>


In [None]:
for t in range(600, 610):
    print(t, tokenizer.decode(t))


600 운데
601 되었다
602 최고
603 기에
604 항
605 있어
606 이라는
607 넘
608 폭
609 번


In [None]:
tokens

tensor([[ 2656, 47485,  1681, 47441, 10635,   148,   105, 47515, 47774,   491,
          1130,   104, 47701, 47448,  2948, 47558, 47796]])

In [None]:
"아들아 너는 계획이 다 있니? 아니 난 이따가 전화할께"
[ 2656, 47485,  1681, 47441, 10635,   148,   105, 47515, 47774,   491, 1130,   104, 47701, 47448,  2948, 47558, 47796]

input = [[2656., 47485.,  1681., 47441., 10635.]]
target = [47485.,  1681., 47441., 10635.,   148.]

outputs = model(tokens)
outputs[0].shape

torch.Size([1, 17, 50000])

In [None]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

# 토큰화 함수
tokenizer = PreTrainedTokenizerFast.from_pretrained("taeminlee/kogpt2")
# 다음 단어 함수
model = GPT2LMHeadModel.from_pretrained("taeminlee/kogpt2")

text = "아들아 너는 계획이 다"
tokens = tokenizer.encode(text, return_tensors="pt")
tokens

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


tensor([[ 2656, 47485,  1681, 47441, 10635,   148]])

In [None]:
outputs = model.generate(tokens)
print(outputs[0])
generated_text = tokenizer.decode(outputs[0])
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([ 2656, 47485,  1681, 47441, 10635,   148,   605,     1,     0,   155,
          872,  7892,  8274,   605,     1,     0,   155,   872,  7892,  8274])
아들아 너는 계획이 다 있어</s><s> 나 지금 집에 가고 있어</s><s> 나 지금 집에 가고


# vector DB

In [None]:
!pip install langchain openai huggingface_hub -q
!pip install sentence_transformers chromadb -q

In [None]:
examples = [
"""
    무하마드 알리와 앨런 튜링 중 누가 더 오래 살았나요?

    여기에 후속 질문이 필요한가요? 예.
    이어서 질문하세요: 무하마드 알리가 사망했을 때 그의 나이는 몇 살이었나요?
    중간 답변: 무하마드 알리는 사망 당시 74세였습니다.
    후속 질문: 앨런 튜링은 사망했을 때 몇 살이었나요?
    중급 정답: 앨런 튜링은 사망 당시 41세였습니다.
    최종 정답은 무하마드 알리
""", """
    크레이그리스트의 창립자는 언제 태어났나요?

    여기에 후속 질문이 필요한가요? 예.
    후속 질문입니다: 크레이그리스트의 창립자는 누구인가요?
    중간 답변: 크레이그리스트는 크레이그 뉴마크가 설립했습니다.
    후속 질문: 크레이그 뉴마크는 언제 태어났나요?
    중급 정답: 크레이그 뉴마크는 1952년 12월 6일에 태어났습니다.
    따라서 최종 정답은 1952년 12월 6일
""", """
    조지 워싱턴의 외할아버지는 누구였나요?

    여기에 후속 질문이 필요한가요? 예.
    계속하세요: 조지 워싱턴의 어머니는 누구였나요?
    중간 수준의 답입니다: 조지 워싱턴의 어머니는 메리 볼 워싱턴이었습니다.
    후속 질문: 메리 볼 워싱턴의 아버지는 누구였습니까?
    중급 정답: 메리 볼 워싱턴의 아버지는 조셉 볼이었습니다.
    최종 정답은 조셉 볼
""", """
    죠스와 카지노 로얄의 감독이 모두 같은 나라 출신인가요?

    여기에서 후속 질문이 필요한가요? 예.
    후속 질문이 필요합니다: 죠스의 감독은 누구인가요?
    중간 수준의 답변: 죠스의 감독은 스티븐 스필버그입니다.
    후속 질문: 스티븐 스필버그는 어디 출신인가요?
    중급 정답: 미국입니다.
    후속 질문: 카지노 로얄의 감독은 누구인가요?
    중급 정답: 카지노 로얄의 감독은 마틴 캠벨입니다.
    후속 질문: 마틴 캠벨은 어디 출신인가요?
    중급 답변: 뉴질랜드입니다.
    최종 정답은 아니요
"""
]

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

db = Chroma.from_texts(
    collection_name="sample",
    texts=examples,
    embedding=HuggingFaceEmbeddings(model_name="jhgan/ko-sbert-sts")
)

In [None]:
question = "메리 볼 워싱턴의 딸은 누구였나요?"
doc = db.similarity_search(question, k=1)
print(doc[0].page_content)


    조지 워싱턴의 외할아버지는 누구였나요?

    여기에 후속 질문이 필요한가요? 예.
    계속하세요: 조지 워싱턴의 어머니는 누구였나요?
    중간 수준의 답입니다: 조지 워싱턴의 어머니는 메리 볼 워싱턴이었습니다.
    후속 질문: 메리 볼 워싱턴의 아버지는 누구였습니까?
    중급 정답: 메리 볼 워싱턴의 아버지는 조셉 볼이었습니다.
    최종 정답은 조셉 볼



In [None]:
question = "무하마드 알리는 몇 살에 죽었나요?"
doc = db.similarity_search(question, k=1)
print(doc[0].page_content)


    무하마드 알리와 앨런 튜링 중 누가 더 오래 살았나요?

    여기에 후속 질문이 필요한가요? 예.
    이어서 질문하세요: 무하마드 알리가 사망했을 때 그의 나이는 몇 살이었나요?
    중간 답변: 무하마드 알리는 사망 당시 74세였습니다.
    후속 질문: 앨런 튜링은 사망했을 때 몇 살이었나요?
    중급 정답: 앨런 튜링은 사망 당시 41세였습니다.
    최종 정답은 무하마드 알리



In [None]:
question = "무하마드 알리는 몇 살에 죽었나요?"
doc = db.similarity_search(question, k=1)

# 프롬프트 엔지니어링
text = f"""<ref>{doc[0].page_content}</ref>
<ref>를 참고해서 답변합니다. 최종 정답만 말합니다.

질문: {question}
"""

print(text)

<ref>
    무하마드 알리와 앨런 튜링 중 누가 더 오래 살았나요?

    여기에 후속 질문이 필요한가요? 예.
    이어서 질문하세요: 무하마드 알리가 사망했을 때 그의 나이는 몇 살이었나요?
    중간 답변: 무하마드 알리는 사망 당시 74세였습니다.
    후속 질문: 앨런 튜링은 사망했을 때 몇 살이었나요?
    중급 정답: 앨런 튜링은 사망 당시 41세였습니다.
    최종 정답은 무하마드 알리
</ref>
<ref>를 참고해서 답변합니다. 최종 정답만 말합니다. 

질문: 무하마드 알리는 몇 살에 죽었나요?



In [None]:
import openai

openai.api_key = ""

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "system", "content": "질문에 답변합니다."},
              {"role": "user", "content": text}],
    temperature=1,
    max_tokens=1000
)
print(response.choices[0].message.content)

무하마드 알리는 74세에 죽었습니다.


In [None]:
import openai

openai.api_key = ""

question = "죠스를 만든 감독은 누구야?"
doc = db.similarity_search(question, k=1)

# 프롬프트 엔지니어링
text = f"""<ref>{doc[0].page_content}</ref>
<ref>를 참고해서 답변합니다. 최종 정답만 말합니다.
<ref>에서 참고할 수 없는 질문이라면, "참고 자료가 부족합니다."라고 대답합니다.

질문: {question}
"""

print(text)

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "system", "content": "질문에 답변합니다."},
              {"role": "user", "content": text}],
    temperature=1,
    max_tokens=1000
)
print(response.choices[0].message.content)

<ref>
    죠스와 카지노 로얄의 감독이 모두 같은 나라 출신인가요?

    여기에서 후속 질문이 필요한가요? 예.
    후속 질문이 필요합니다: 죠스의 감독은 누구인가요?
    중간 수준의 답변: 죠스의 감독은 스티븐 스필버그입니다.
    후속 질문: 스티븐 스필버그는 어디 출신인가요?
    중급 정답: 미국입니다.
    후속 질문: 카지노 로얄의 감독은 누구인가요?
    중급 정답: 카지노 로얄의 감독은 마틴 캠벨입니다.
    후속 질문: 마틴 캠벨은 어디 출신인가요?
    중급 답변: 뉴질랜드입니다.
    최종 정답은 아니요
</ref>
<ref>를 참고해서 답변합니다. 최종 정답만 말합니다. 
<ref>에서 참고할 수 없는 질문이라면, "참고 자료가 부족합니다."라고 대답합니다.

질문: 죠스를 만든 감독은 누구야?

죠스를 만든 감독은 스티븐 스필버그입니다.


# 남해 카페 챗봇

In [46]:
!wget https://github.com/blackdew/ml-tensorflow/raw/master/data/%EC%B9%B4%ED%8E%98-20230905T134359Z-001.zip

--2023-12-14 07:18:13--  https://github.com/blackdew/ml-tensorflow/raw/master/data/%EC%B9%B4%ED%8E%98-20230905T134359Z-001.zip
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/blackdew/ml-tensorflow/master/data/%EC%B9%B4%ED%8E%98-20230905T134359Z-001.zip [following]
--2023-12-14 07:18:14--  https://raw.githubusercontent.com/blackdew/ml-tensorflow/master/data/%EC%B9%B4%ED%8E%98-20230905T134359Z-001.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 128628 (126K) [application/zip]
Saving to: ‘카페-20230905T134359Z-001.zip’


2023-12-14 07:18:14 (8.47 MB/s) - ‘카페-20230905T134359Z-001.zip’ saved [1286

In [47]:
!unzip /content/카페-20230905T134359Z-001.zip
!ls /content/카페

Archive:  /content/카페-20230905T134359Z-001.zip
  inflating: 카페/카페 더 힐스.txt  
  inflating: 카페/CAFE IN 1035.txt  
  inflating: 카페/CAFE설리.txt   
  inflating: 카페/행복베이커리.txt  
  inflating: 카페/안화.txt       
  inflating: 카페/카페샌드.txt  
  inflating: 카페/카페공간비.txt  
  inflating: 카페/베를리너크라펜.txt  
  inflating: 카페/카페향기 Field cafe_.txt  
  inflating: 카페/남해의숲.txt  
  inflating: 카페/백년유자 2호점.txt  
  inflating: 카페/할로브리즈.txt  
  inflating: 카페/카페 하녹.txt  
  inflating: 카페/어쩌다남해.txt  
  inflating: 카페/높들.txt       
  inflating: 카페/카사비앙카 카페.txt  
  inflating: 카페/완벽한인생 설리스카이워크.txt  
  inflating: 카페/카페파니.txt  
  inflating: 카페/플로잉 클라우드.txt  
  inflating: 카페/모래위의발자국.txt  
  inflating: 카페/팥파이스.txt  
  inflating: 카페/배스킨라빈스 경남남해.txt  
  inflating: 카페/버디베어.txt  
  inflating: 카페/설빙 남해점.txt  
  inflating: 카페/오실재.txt    
  inflating: 카페/카페끌림.txt  
  inflating: 카페/카페아몬드.txt  
  inflating: 카페/안녕,남해 Cafe.txt  
  inflating: 카페/베를린성.txt  
  inflating: 카페/보물섬 유자빵.txt  
  inflating: 카페/로드17.txt     
  inflating: 카페/남해육쪽마늘빵카페.tx

In [49]:
import glob

cafes = []
files = glob.glob("/content/카페/*.txt")
for f in files:
    with open(f, "r") as f:
        cafes.append(f.read())

print(len(cafes))
print(cafes[0])

112

"카페굿데이"

카테고리: ['카페,디저트', '카페']
메뉴: 브루잉커피 6,000
전화: (010-9323-3265)
주소: 경상남도 남해군 남면 무지개로 380 (경상남도 남해군 남면 임포리 388-2) (127.8700947, 34.7460425)
디저트가 맛있어요, 대화하기 좋아요, 커피가 맛있어요, 음료가 맛있어요, 매장이 청결해요, 친절해요, 사진이 잘 나와요, 인테리어가 멋져요, 뷰가 좋아요, 주차하기 편해요

카페굿데이 카페의 분위기와 인테리어: 아기자기한 인테리어로 예쁘고 매력적임
카페굿데이 카페의 메뉴: 다양한 커피 메뉴와 직접 만든 디저트를 제공
카페굿데이 카페의 서비스 및 기타: 사장님의 친절한 서비스
카페굿데이 카페의 종합적인 평가: 남해 여행 시 들러야 할 가치가 있는 카페


In [51]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.0.350-py3-none-any.whl (809 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m809.1/809.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.2 (from langchain)
  Downloading langchain_community-0.0.3-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2,>=0.1 (from langchain)
  Downloading langchain_core-0.1.0-py3-none-any.whl (189 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.1/189.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Downloading langsmith-0.

In [None]:
!pip install sentence-transformers

In [None]:
!pip install chromadb

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

db = Chroma.from_texts(
    collection_name="namhae",
    texts=cafes,
    embedding=HuggingFaceEmbeddings(model_name="jhgan/ko-sbert-sts")
)

In [None]:
doc = db.similarity_search("커피가 맛있는 카페 추천", k=3)
for d in doc:
    print(d.page_content)

In [None]:
import random
question = "커피가 맛있는 카페 추천"
doc = db.similarity_search(question, k=10)

# 프롬프트 엔지니어링
doc = [f"<ref>{d.page_content}\n</ref>\n" for d in random.sample(doc, 3)]
ref = '\n'.join(doc)

text = f"""{ref}
<ref>를 참고해서 답변합니다. 최종 정답만 말합니다.
<ref>에서 참고할 수 없는 질문이라면, \"참고 자료가 부족합니다.\"라고 대답합니다.

질문: {question}
"""

print(text)

In [None]:
import openai

openai.api_key=""

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "system", "content": "질문에 답변합니다."},
              {"role": "user", "content": text}],
    temperature=0.7,
    max_tokens=1000
)
print(response.choices[0].message.content)

남해커피마을협동조합 카페와 어울림꿈뜨락협동조합 카페는 커피가 맛있다고 리뷰에 언급되어 있습니다. 따라서 이 두 곳을 추천할 수 있습니다.


In [None]:
import gradio as gr
import os
import openai
import random

prompt = """대한민국 남해군 카페 정보를 알려주는 챗봇.
만일, reference를 참고해야 하는 질문이면, reference를 참고해서 답변한다.
만밀, reference를 참고할 필요가 없는 질문이라면, reference를 참고하지 않고 답변한다.
만일, reference에 질문에 적합한 정보가 없다면, "답변을 위한 충분한 자료가 확보되지 않았습니다" 라고 답변한다. """

openai.api_key = ""

messages = []

def chat(question, history):
    doc = db.similarity_search(question, k=10)
    # 프롬프트 엔지니어링
    doc = [f"<ref>{d.page_content}\n</ref>\n" for d in random.sample(doc, 1)]
    ref = '\n'.join(doc)
    text = f"""{ref}
    <ref>를 참고해서 답변합니다. 최종 정답만 말합니다.

    질문: {question}"""

    print(text)
    messages.append({"role": "user", "content": text})
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "system", "content": prompt}, *messages[-10:]],
        temperature=1,
        max_tokens=4096
    )
    messages.append({"role": "assistant", "content": response.choices[0].message['content']})
    print(messages)

    history.append((question, response.choices[0].message['content']))
    return "", history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    send = gr.Button("Send")
    clear = gr.ClearButton([msg, chatbot])

    msg.submit(chat, [msg, chatbot], [msg, chatbot])
    send.click(chat, [msg, chatbot], [msg, chatbot])

demo.launch(share=True, debug=True)

In [38]:
!pip install gradio

