<a href="https://colab.research.google.com/github/Jr-Einstein/3D-Model/blob/main/GPT-4-Voice-Chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT-4 Voice Chat, based on Whisper and ChatGPT (GPT-4)

[By Brain Assistant](https://assistant.ruoguedu.com)

With this assistant, you can chat with ChatGPT by voice using the Whisper API. Open it in [Colab](https://colab.research.google.com/github/davideuler/awesome-assistant-api/blob/main/GPT-4-Voice-Chat.ipynb).

A demo from:
[Awesome Assistant API.](https://github.com/davideuler/awesome-assistant-api)

The tools and core technologies used here:

- **Whisper** — speech recognition (ASR) API, together with the OpenAI text-to-speech (TTS) API
- **GPT-4** — Chat Completions API


In [1]:
!pip install openai whisper requests pydub

Collecting whisper
  Downloading whisper-1.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Building wheels for collected packages: whisper
  Building wheel for whisper (setup.py) ... [?25l[?25hdone
  Created wheel for whisper: filename=whisper-1.1.10-py3-none-any.whl size=41120 sha256=33fd89b7f1798849a9e68856ea2b9cce4dfa11db9b40748adc53f6567c07a6ab
  Stored in directory: /root/.cache/pip/wheels/21/65/ee/4e6672aabfa486d3341a39a04f8f87c77e5156149299b5a7d0
Successfully built whisper
Installing collected packages: pydub, whisper
Successfully installed pydub-0.25.1 whisper-1.1.10


In [2]:
import os
# setup OpenAI api_key
import getpass

# Reuse a key that is already present in the environment; otherwise ask for
# one interactively (input is hidden) and keep it for the rest of the session.
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY') or getpass.getpass(
    "Enter the OpenAI API Key(which starts with sk-): "
)


Enter the OpenAI API Key(which starts with sk-): ··········


In [3]:
from openai import OpenAI

import time
import base64
import requests

# for colab camera
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode

# Shared OpenAI client, built from the key stored in the environment.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))


# all imports
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
from io import BytesIO

from pydub import AudioSegment

# JavaScript injected into the cell output. It defines `record(ms)`, which
# captures `ms` milliseconds of microphone audio with MediaRecorder and
# resolves to a base64 data URL of the recorded blob. The microphone tracks
# are stopped once recording ends so the browser releases the device, and all
# per-recording state is kept in local `const`s instead of implicit globals.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = () => resolve(reader.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.onstop = async () => {
    stream.getTracks().forEach(track => track.stop())
    const blob = new Blob(chunks)
    const text = await b2text(blob)
    resolve(text)
  }
  recorder.start()
  await sleep(time)
  recorder.stop()
})
"""



class AudioApp:
    """Voice-chat helper: records microphone audio in Colab and calls OpenAI.

    Recording relies on the module-level ``RECORD`` JavaScript snippet and the
    Colab ``output.eval_js`` bridge. Chat requests are sent with ``requests``
    using the API key given at construction time, while text-to-speech goes
    through the module-level ``client``.
    """

    CHAT_COMPLETIONS_URL = "https://api.openai.com/v1/chat/completions"

    def __init__(self, api_key):
        """Store the OpenAI API key used for the chat-completion requests."""
        self.api_key = api_key

    def _record_bytes(self, sec):
        """Record ``sec`` seconds from the microphone and return the decoded bytes.

        Injects ``RECORD`` into the cell output, runs ``record(ms)`` in the
        browser and base64-decodes the payload of the data URL it returns.
        """
        display(Javascript(RECORD))
        data_url = output.eval_js('record(%d)' % (sec * 1000))
        # A data URL has the form "data:<mime>;base64,<payload>"; keep the payload.
        return b64decode(data_url.split(',')[1])

    def _post_chat(self, payload, timeout):
        """POST ``payload`` to the chat-completions endpoint and return the JSON body.

        The decoded body is returned as-is, including error payloads.
        """
        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
        return requests.post(
            self.CHAT_COMPLETIONS_URL, headers=headers, json=payload, timeout=timeout
        ).json()

    def record_raw(self, sec=3):
        """Record ``sec`` seconds and return the audio as a pydub ``AudioSegment``."""
        return AudioSegment.from_file(BytesIO(self._record_bytes(sec)))

    def record_wav(self, sec=5, output_audio="output.wav"):
        """Record ``sec`` seconds, save them as WAV and return the file opened for reading.

        The browser recording is an encoded container, not raw PCM, so it is
        decoded by format detection (as ``record_raw`` does) before being
        re-exported as WAV.

        Args:
            sec: recording length in seconds.
            output_audio: path of the WAV file to write.

        Returns:
            The WAV file opened in ``"rb"`` mode; the caller must close it.
        """
        audio = AudioSegment.from_file(BytesIO(self._record_bytes(sec)))
        # export() hands back its own open handle; close it before reopening.
        audio.export(output_audio, format="wav").close()
        return open(output_audio, "rb")

    def record_save(self, sec=3, output_audio="audio.wav"):
        """Record ``sec`` seconds and write the recorded bytes to ``output_audio``.

        The bytes are written exactly as the browser produced them (no
        transcoding), whatever extension ``output_audio`` carries.

        Returns:
            ``output_audio``, the path that was written.
        """
        recorded = self._record_bytes(sec)
        with open(output_audio, 'wb') as f:
            f.write(recorded)
        return output_audio

    def send_text_request(self, text, model="gpt-4-1106-preview", timeout=120):
        """Send ``text`` as a single user message and return the decoded JSON reply.

        Args:
            text: the user message.
            model: chat model name to request.
            timeout: seconds to wait for the HTTP response before giving up.
        """
        payload = {
            "model": model,
            "messages": [{
                "role": "user",
                "content": text
            }]
        }
        return self._post_chat(payload, timeout)

    def send_vision_request(self, image_data, timeout=120):
        """Ask the vision model to describe an image and return the decoded JSON reply.

        Args:
            image_data: base64 text of the image; it is embedded in a
                ``data:image/jpeg;base64,`` URL.
            timeout: seconds to wait for the HTTP response before giving up.
        """
        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": "Tell me about this image. Limit your response to 100 words."},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}},
                ]
            }],
            "max_tokens": 300
        }
        return self._post_chat(payload, timeout)

    def save_response_as_audio(self, response_data, audio_output="output.mp3"):
        """Synthesise the first reply in ``response_data`` to speech and return its text.

        Args:
            response_data: decoded chat-completion JSON.
            audio_output: path of the audio file to write.

        Returns:
            The reply text that was spoken.

        Raises:
            KeyError: if ``response_data`` has no ``'choices'`` entry; the
                message includes the payload's ``'error'`` entry, if any.
        """
        if isinstance(response_data, dict) and 'choices' not in response_data:
            raise KeyError(
                "choices (chat response has no 'choices'; API error: %r)"
                % (response_data.get('error'),)
            )
        reply_text = response_data['choices'][0]['message']['content']
        response = client.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=reply_text,
        )
        response.stream_to_file(audio_output)
        return reply_text

    def openai_api_calculate_cost(self, usage, model="gpt-4-1106-preview"):
        """Print and return the cost in USD of one request.

        Args:
            usage: mapping with ``'prompt_tokens'`` and ``'completion_tokens'``;
                ``'total_tokens'`` is used for the printout when present,
                otherwise the sum of the two is shown.
            model: key into the pricing table below.

        Returns:
            The total cost as a float.

        Raises:
            ValueError: if ``model`` is not in the pricing table.
        """
        # Prices are in USD per 1,000 tokens.
        pricing = {
            'gpt-3.5-turbo-1106': {
                'prompt': 0.001,
                'completion': 0.002,
            },
            'gpt-4-8k': {
                'prompt': 0.03,
                'completion': 0.06,
            },
            'gpt-4-32k': {
                'prompt': 0.06,
                'completion': 0.12,
            },
            'gpt-4-1106-preview': {
                'prompt': 0.01,
                'completion': 0.03,
            },
            'gpt-4-1106-vision-preview': {
                'prompt': 0.01,
                'completion': 0.03,
            },
            # The model name send_vision_request actually uses.
            'gpt-4-vision-preview': {
                'prompt': 0.01,
                'completion': 0.03,
            },
        }

        if model not in pricing:
            raise ValueError("Invalid model specified")
        model_pricing = pricing[model]

        prompt_tokens = usage['prompt_tokens']
        completion_tokens = usage['completion_tokens']
        total_tokens = usage.get('total_tokens', prompt_tokens + completion_tokens)

        prompt_cost = prompt_tokens * model_pricing['prompt'] / 1000
        completion_cost = completion_tokens * model_pricing['completion'] / 1000
        total_cost = prompt_cost + completion_cost

        print(f"\nTokens used:  {prompt_tokens:,} prompt + {completion_tokens:,} completion = {total_tokens:,} tokens")
        print(f"Total cost for {model}: ${total_cost:.4f}\n")

        return total_cost




Now, let's run the application: record your voice from the microphone, transcribe it with Whisper, send the text to GPT-4, save the response to an MP3 audio file, and play the audio.

In [8]:
# Now, let's run the application

from IPython.display import Audio
from IPython.display import display
import numpy as np
import time

audio_app = AudioApp(api_key=client.api_key)


if __name__ == '__main__':
    print("Waiting for you to say something to ChatGPT by voice...")

    # record_wav returns an open file; the with-block closes it as soon as the
    # transcription upload has finished.
    with audio_app.record_wav(3) as audio_content:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_content,
        )

    print("You:%s" % transcript.text)

    # Check the recognised text rather than the response object, so that
    # silence does not send an empty prompt to the chat model.
    if transcript.text and transcript.text.strip():
        response_data = audio_app.send_text_request(transcript.text)
        interpretation = audio_app.save_response_as_audio(response_data, audio_output="output.mp3")
        print("Response in text:%s" % interpretation)
        total_cost = audio_app.openai_api_calculate_cost(response_data['usage'])

        # Play the spoken reply in the cell output.
        display(Audio("output.mp3", autoplay=True))
        time.sleep(2)
    else:
        print("No speech was recognised, so nothing was sent to ChatGPT.")


Waiting for you to say something to ChatGPT by voice...


<IPython.core.display.Javascript object>

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}