<a href="https://colab.research.google.com/github/NeoClassicalRibbon/ai-vc-assistant/blob/main/ai_vc_assistant_proto_public.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install dependencies

In [None]:
!pip install openai
!pip install pydub
!pip install mutagen
!git clone https://github.com/NeoClassicalRibbon/ai-vc-assistant.git

### Install VOICEVOX

In [None]:
%%shell
# Download and install VOICEVOX core 0.14.1, the matching ONNX Runtime 1.13.1
# and the Open JTalk dictionary into the current directory.
# Every step tolerates files left behind by a previous run of this cell.
RELEASES_VOICEVOX="https://github.com/VOICEVOX/voicevox_core/releases/download/0.14.1"
RELEASES_ONNXRUNTIME="https://github.com/microsoft/onnxruntime/releases/download/v1.13.1"

# Choose the CUDA builds when nvidia-smi runs successfully, the CPU builds otherwise.
if nvidia-smi; then
  echo "GPU Cuda"
  FILE_VOICEVOX="voicevox_core-0.14.1+cuda-cp38-abi3-linux_x86_64.whl"
  FILE_ONNXRUNTIME="onnxruntime-linux-x64-gpu-1.13.1.tgz"
else
  echo "CPU"
  FILE_VOICEVOX="voicevox_core-0.14.1+cpu-cp38-abi3-linux_x86_64.whl"
  FILE_ONNXRUNTIME="onnxruntime-linux-x64-1.13.1.tgz"
fi

wget -nc "${RELEASES_ONNXRUNTIME}/${FILE_ONNXRUNTIME}"
tar zxvf "${FILE_ONNXRUNTIME}"
# Expose the shared library in the working directory. The archive unpacks into
# a directory named after the file minus ".tgz"; -f replaces a link created by
# an earlier run instead of failing with "File exists".
ln -sf "${FILE_ONNXRUNTIME%.tgz}/lib/libonnxruntime.so.1.13.1"

wget -nc "${RELEASES_VOICEVOX}/${FILE_VOICEVOX}"
pip install "${FILE_VOICEVOX}"

# -nc avoids downloading a duplicate "*.tar.gz.1" copy on re-run.
wget -nc https://jaist.dl.sourceforge.net/project/open-jtalk/Dictionary/open_jtalk_dic-1.11/open_jtalk_dic_utf_8-1.11.tar.gz
tar xzvf open_jtalk_dic_utf_8-1.11.tar.gz

### Initial setup

Assign your OpenAI API key to `openai.api_key` in the cell below.

In [2]:
import openai
from mutagen.mp3 import MP3
from mutagen.wave import WAVE
from IPython.display import Audio, display, clear_output
import time
import sys
import os
import requests
import json

# Directory that receives the synthesized response audio.
# makedirs(exist_ok=True) keeps this cell re-runnable: os.mkdir raises
# FileExistsError the second time the cell is executed in the same runtime.
os.makedirs("/content/output", exist_ok=True)

### SET YOUR OPENAI API KEY ###
# Preferred: provide the key through the OPENAI_API_KEY environment variable so
# it is never saved inside the notebook. Alternatively, replace the '' fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY") or ''

### Define character

In [None]:
# Registry of selectable personas. Each entry pairs the path of a settings
# (prompt) file with the VOICEVOX speaker id used to voice the reply.
characters = dict(
  ojosama=dict(
    settings="/content/ai-vc-assistant/settings_ojosama.txt",
    speaker=0,  # Shikoku Metan (Amaama)
  ),
  neko=dict(
    settings="/content/ai-vc-assistant/settings_nekonyan.txt",
    speaker=43,  # Ouka Miko (Normal)
  ),
)

# Key of the persona to use for this run.
character_name = "ojosama"  # Select Character
character = characters[character_name]

### Setup input audio

In [None]:
# %%script false --no-raise-error

# Sample recordings shipped with the cloned repository.
audio_files = ["/content/ai-vc-assistant/引くこと覚えろカス.wav",
               "/content/ai-vc-assistant/頑張ってじゃねぇよ。おめえも頑張んだよ！！.wav",
               "/content/ai-vc-assistant/頑張ってじゃねぇよ。おめえも頑張んだよ！！_original.mp3"]

# Recording that is played back and later sent for transcription.
audio_path = audio_files[2]

# Get audio length
# mutagen reader class for each supported extension; both expose `.info.length`.
audio_readers = {".mp3": MP3, ".wav": WAVE}
# Lower-case the extension so "clip.MP3" / "clip.WAV" are accepted as well.
file_extension = os.path.splitext(audio_path)[1].lower()
if file_extension not in audio_readers:
  # Fail with an explanation instead of a bare sys.exit(), which stops the
  # cell without telling the reader what went wrong.
  raise ValueError(
    "Unsupported audio format " + repr(file_extension)
    + " for " + repr(audio_path) + "; expected .mp3 or .wav"
  )
audio_length = audio_readers[file_extension](audio_path).info.length

# Play input file
audio = Audio(audio_path, autoplay=True)
display(audio)
# Block for the clip duration plus a small margin before the cell finishes.
time.sleep(audio_length + 0.5)

### Call the Whisper API to transcribe the audio

In [None]:
# %%script false --no-raise-error

# Send the selected recording to the "whisper-1" model and keep the text
# field of the returned transcript.
with open(audio_path, mode="rb") as recording:
  transcript = openai.Audio.transcribe("whisper-1", recording)

transcript_text = transcript["text"]
print(f"Transcript: {transcript_text}")

### Read character settings prompt

In [None]:
# %%script false --no-raise-error

# Load the settings text of the selected character; it is used later as the
# system message of the chat request.
settings_file_path = character["settings"]

# Decode explicitly as UTF-8 rather than the locale default, so the result does
# not depend on the runtime's locale.
# NOTE(review): assumes the settings files are UTF-8 encoded (presumably
# Japanese text) - confirm against the repository.
with open(settings_file_path, encoding="utf-8") as f:
    settings = f.read()
print(settings)

### Send character settings and transcript to ChatGPT and get a response

In [None]:
# %%script false --no-raise-error

# Build the conversation: the character settings as the system message and the
# transcript, wrapped in Japanese quotation brackets, as the user message.
# The quoted form lives in its own variable; rewriting `transcript_text` in
# place would nest another pair of brackets each time this cell is re-run.
quoted_transcript = "「" + transcript_text + "」"
messages = [
  {'role': 'system', 'content': settings},
  {'role': 'user', 'content': quoted_transcript},
]

# Send to ChatGPT
result = openai.ChatCompletion.create(
  model='gpt-3.5-turbo',
  messages=messages
)
# The reply text of the first returned choice.
response_text = result['choices'][0]['message']['content']

print("ChatGPT: " + response_text)

### Generate speech with VOICEVOX

In [None]:
# %%script false --no-raise-error

from pathlib import Path
from pprint import pprint
from voicevox_core import VoicevoxCore, METAS

# pprint(METAS) # print characters params

# Synthesis engine, pointed at the Open JTalk dictionary directory
# (path is relative to the current working directory).
core = VoicevoxCore(open_jtalk_dict_dir=Path("open_jtalk_dic_utf_8-1.11"))

# Voice id of the selected character and the file that receives the audio.
speaker = character["speaker"]
wav_file_name = f"/content/output/response-{character_name}.wav"

# Generate audio
# Load the speaker's model only if the core does not report it as loaded.
if not core.is_model_loaded(speaker):
    core.load_model(speaker)
wave_bytes = core.tts(response_text, speaker)
with open(wav_file_name, mode="wb") as wav_out:
    wav_out.write(wave_bytes)

# Play response file
audio = Audio(wav_file_name, autoplay=True)
display(audio)
