# 듣고 질문에 답하기 유형 데이터 만들기

- 질문 생성 만들기
- 질문에 대한 오디오 만들기


## 질문 생성 Chain

In [2]:
import json
from typing import List

from tqdm.notebook import tqdm
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser, CommaSeparatedListOutputParser
from langchain.pydantic_v1 import BaseModel, Field
from langchain.schema import HumanMessage, AIMessage, StrOutputParser
import pandas as pd
import os

In [34]:
os.environ["OPENAI_API_KEY"] = "Your API Key"
model = ChatOpenAI(model="gpt-3.5-turbo")

### 질문 주제 샘플링하기

In [4]:
csv_parser = CommaSeparatedListOutputParser()

In [5]:
csv_format_instruction = csv_parser.get_format_instructions()

In [12]:
subjet_prompt_template = PromptTemplate.from_template(template="영어 시험에 나올 법한 일상적인 주제를 단어 형식으로 만들어줘.\n{format_instruction}",
                                                      partial_variables={"format_instruction": csv_format_instruction})

In [13]:
subject_chain = subjet_prompt_template | model | csv_parser

In [14]:
subject_chain.invoke({})

['family',
 'school',
 'work',
 'hobbies',
 'food',
 'travel',
 'health',
 'technology',
 'environment',
 'relationships']

In [15]:
csv_parser.invoke(model.invoke(f"영어 시험에 나올 법한 일상적인 주제를 단어 형식으로 만들어줘.\n{csv_format_instruction}"))

['family',
 'school',
 'friends',
 'hobbies',
 'sports',
 'music',
 'movies',
 'travel',
 'food',
 'technology']

In [16]:
subject_list = subject_chain.invoke({})

In [17]:
subject_list

['family',
 'school',
 'work',
 'hobbies',
 'food',
 'travel',
 'health',
 'technology',
 'environment',
 'entertainment']

In [18]:
subject_list = subject_list[:4]

In [26]:
subject_list

['family', 'school', 'work', 'hobbies']

### 질문 만들기

In [85]:
model = ChatOpenAI(model="gpt-4-1106-preview")

In [86]:
# Please create a different type from the previous questions {prev_questions}

In [103]:
template = """\
# 이전 질문들 {prev_questions}
- 이전 질문들과는 다른 유형으로 만들어줘
- 영어 시험에 나올 법한 {input} 주제에 관한 쉬운 질문 하나 만들어줘.
- 상대방과 연관지어 만들어줘
- 한 문장만 만들어줘
- 여러 예시 만들지마
- 영어로"""


question_prompt_template = PromptTemplate.from_template(template=template)

In [104]:
question_chain = question_prompt_template | model | StrOutputParser()

In [105]:
question_list = []
for subject in tqdm(subject_list):
    question_list.append(question_chain.invoke({"input": subject, "prev_questions": question_list}))
    # question_list.append(question_chain.invoke({"input": subject}))

  0%|          | 0/4 [00:00<?, ?it/s]

In [106]:
question_list

['How many siblings do you have?',
 'What is your favorite subject in school and why?',
 'How do you think having siblings might influence your teamwork skills in the workplace?',
 'Can you tell me about a hobby of yours that helps you relax after a long day?']

## 질문에 대한 오디오 파일 만들기

In [107]:
from openai import OpenAI

In [108]:
client = OpenAI()

In [109]:
def gen_speech_file(text, output_file_path):
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy", # alloy, echo, fable, onyx, nova, and shimmer
        input=text
    )
    response.stream_to_file(output_file_path)

In [110]:
# !mkdir -p ./data/speaking__listen_and_answer

In [111]:
save_dir = "./data/speaking__listen_and_answer"

In [112]:
question_list

['How many siblings do you have?',
 'What is your favorite subject in school and why?',
 'How do you think having siblings might influence your teamwork skills in the workplace?',
 'Can you tell me about a hobby of yours that helps you relax after a long day?']

In [113]:
record_list = []

for i, q in tqdm(enumerate(question_list), total=len(question_list)):
    output_file_path = f"{save_dir}/question_{i}.wav"
    gen_speech_file(q, output_file_path)

    record = {"question": q, "audio_file_path": output_file_path}
    record_list.append(record)

  0%|          | 0/4 [00:00<?, ?it/s]

  response.stream_to_file(output_file_path)


In [114]:
df = pd.DataFrame(record_list)
df

Unnamed: 0,question,audio_file_path
0,How many siblings do you have?,./data/speaking__listen_and_answer/question_0.wav
1,What is your favorite subject in school and why?,./data/speaking__listen_and_answer/question_1.wav
2,How do you think having siblings might influen...,./data/speaking__listen_and_answer/question_2.wav
3,Can you tell me about a hobby of yours that he...,./data/speaking__listen_and_answer/question_3.wav


In [115]:
df.to_csv(f"{save_dir}/question_and_audio.csv", index=False)

In [116]:
from IPython.display import Audio

In [117]:
Audio(f"{save_dir}/question_1.wav")