
# 🗣🎙️CosyVoice Zero-Shot Voice Cloning 🎙️
### Credit
[CosyVoice Github](https://github.com/FunAudioLLM/CosyVoice) <br>
[Hugging Face Space](https://huggingface.co/spaces/modelscope/CosyVoice-300M) <br>
[CosyVoice-300M Model Download](https://www.modelscope.cn/models/iic/CosyVoice-300M)
<br>

In [None]:
#@title Automatic Install CosyVoice, Download Model & AUTO-RESTART (Cancel ```Restart Runtime``` Pop UP)
# Setup cell: clones the CosyVoice repository, downloads the CosyVoice-300M
# weights, installs the Python requirements and finally kills the current
# process (the "AUTO-RESTART" named in the title).
import os
import shutil
!pip install modelscope

# root_path="." #for windows/kaggle/mac
# root_path=os.getcwd() #else use this one
root_path="/content" #if you are not running this on google colab comment this

base_path=f"{root_path}"
os.chdir(base_path)
# Remove any earlier checkout so the clone below starts from a clean directory.
if os.path.exists(f"{base_path}/CosyVoice"):
  shutil.rmtree(f"{base_path}/CosyVoice")
  print(f"Deleting Old {base_path}/CosyVoice")
!git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
os.chdir(f"{base_path}/CosyVoice")

#Downloading Model using git clone is very slow
# !git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
from modelscope import snapshot_download
snapshot_download('iic/CosyVoice-300M', local_dir=f'{base_path}/CosyVoice/pretrained_models/CosyVoice-300M')

!pip install -r requirements.txt
!pip install matcha-tts
# Write /usr/lib64-nvidia/ into an ld.so.conf.d entry and refresh the linker cache.
!echo /usr/lib64-nvidia/ >/etc/ld.so.conf.d/libcuda.conf; ldconfig
from IPython.display import clear_output
clear_output()
import time
time.sleep(5)
import os
# Send signal 9 to this very process; the notebook continues from the next cell.
os.kill(os.getpid(), 9)

Don't panic if Google Colab gets disconnected; just continue running from the next cell.

In [None]:
#@title <-- Tap this if you play on Mobile { display-mode: "form" }

%%html
<b>Press play on the music player to keep the tab alive, then run the cell below</b><br/>
<audio src="https://raw.githubusercontent.com/KoboldAI/KoboldAI-Client/main/colab/silence.m4a" controls></audio>

In [None]:
#@title import CosyVoice model
%cd /content/CosyVoice
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio

# Build the model from the downloaded weights directory; the resulting
# `cosyvoice` object is the one voice_clone() calls for inference.
cosyvoice = CosyVoice('/content/CosyVoice/pretrained_models/CosyVoice-300M')
from IPython.display import clear_output
clear_output()
#@title utils
import librosa
import whisper
from pydub import AudioSegment
import os
import random
import numpy as np
import uuid
import torch
import shutil
from google.colab import files

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from IPython.display import Audio
from IPython.core.display import display
from tqdm.notebook import tqdm

# Name of the Whisper checkpoint handed to whisper.load_model.
select_model ="tiny" # ['tiny', 'base']
whisper_model = whisper.load_model(select_model)
# Maps the language names offered to the user onto short codes. The codes are
# passed as `language=` to whisper_model.transcribe in postprocess() and used
# for the "<|..|>" text prefix built in voice_clone().
language_dict = {
    "Chinese": "zh",
    "English": "en",
    "Japanese": "ja",
    "Cantonese": "yue",
    "Korean": "ko"
}
def convert_to_wav(file_path):
    """Return the path of a WAV version of ``file_path``.

    A path that already ends in ``.wav`` (compared case-insensitively) is
    returned unchanged. Any other file is decoded with pydub and exported as
    WAV next to the source, with its extension replaced by ``.wav``; the path
    of that new file is returned.
    """
    if not file_path.lower().endswith('.wav'):
        # Same location and stem as the input, only the extension changes.
        wav_path = os.path.splitext(file_path)[0] + '.wav'
        AudioSegment.from_file(file_path).export(wav_path, format='wav')
        return wav_path

    # Already WAV: nothing to convert.
    return file_path

def set_all_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Calls, in order, ``random.seed``, ``np.random.seed``,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed_all`` with ``seed``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def postprocess(wav_file_path, Language,audio_to_text=False,top_db=60, hop_length=220, win_length=440):
    """Prepare a reference recording for use as a CosyVoice prompt.

    The file is loaded with ``load_wav``, trimmed with
    ``librosa.effects.trim``, scaled down so its absolute peak does not exceed
    0.8, and padded at the end with zeros (0.2 * 22050 samples, concatenated
    along dim 1).

    Args:
        wav_file_path: Path of the reference audio file.
        Language: Key of ``language_dict``; only looked up when
            ``audio_to_text`` is true.
        audio_to_text: When true, the file is transcribed with
            ``whisper_model`` and the stripped text is returned as the prompt
            text; otherwise the prompt text is an empty string.
        top_db: Forwarded to ``librosa.effects.trim`` as ``top_db``.
        hop_length: Forwarded to ``librosa.effects.trim`` as ``hop_length``.
        win_length: Forwarded to ``librosa.effects.trim`` as ``frame_length``.

    Returns:
        ``(speech, prompt_text)``: the processed audio tensor and the
        transcript (or ``""``).

    Raises:
        KeyError: If ``audio_to_text`` is true and ``Language`` is not a key
            of ``language_dict``.
    """
    max_val = 0.8
    prompt_sr, target_sr = 16000, 22050
    # NOTE(review): a file sampled below 16 kHz is loaded at 22050 instead of
    # 16000 -- the intent is not evident from this file; confirm against load_wav.
    if torchaudio.info(wav_file_path).sample_rate < prompt_sr:
        prompt_sr = target_sr
    speech = load_wav(wav_file_path, prompt_sr)
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length
    )
    # Compute the peak once and reuse it for both the check and the scaling.
    peak = speech.abs().max()
    if peak > max_val:
        speech = speech / peak * max_val
    speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
    #extract text
    if audio_to_text:
        result = whisper_model.transcribe(wav_file_path, language=language_dict[Language])
        prompt_text = result["text"].strip()
    else:
        prompt_text = ""
    print(f"Reference Audio Text:\n{prompt_text}")
    return speech, prompt_text
from pydub import AudioSegment

def merge_audio(audio_chunks_list, save_path):
    """Concatenate audio files in list order and write the result as WAV.

    Args:
        audio_chunks_list: Paths of the audio files to join, in playback order.
        save_path: Destination path of the merged WAV file.
    """
    # Files are decoded lazily, one at a time, as the running sum consumes them.
    segments = (AudioSegment.from_file(chunk_path) for chunk_path in audio_chunks_list)
    # Start from an empty segment so an empty list still yields a valid export.
    combined = sum(segments, AudioSegment.empty())
    combined.export(save_path, format="wav")


def chunks_sentences(paragraph, join_limit=2):
    """Split ``paragraph`` into sentences and regroup them ``join_limit`` at a time.

    Sentences come from ``sent_tokenize``; each consecutive group of up to
    ``join_limit`` sentences is joined with a single space.

    Returns:
        List of the joined groups, in original order (the last group may hold
        fewer than ``join_limit`` sentences).
    """
    sentences = sent_tokenize(paragraph)
    group_starts = range(0, len(sentences), join_limit)
    return [' '.join(sentences[start:start + join_limit]) for start in group_starts]

def speed_change(clone_voice_save_path,speedup_factor):
  """Write a tempo-adjusted copy of an audio file with ffmpeg's ``atempo`` filter.

  ffmpeg is started from an argument list instead of a shell command string,
  so paths containing spaces, quotes or other shell metacharacters (output
  names are derived from user text) are passed through intact.

  Args:
    clone_voice_save_path: Path of the audio file to process.
    speedup_factor: Value given to the ``atempo`` filter.

  Returns:
    Path of the new ``<name>_speed_up.wav`` file, or ``None`` when ffmpeg
    could not be started or exited with a non-zero status (the attempted
    command is printed in that case).
  """
  import shlex
  import subprocess

  source_path=os.fspath(clone_voice_save_path)
  # Replace only the final extension; this can never yield the input path
  # itself, which ffmpeg's -y would otherwise overwrite in place.
  speedup_filename=os.path.splitext(source_path)[0]+"_speed_up.wav"
  speed_change_command=[
    "ffmpeg","-y",
    "-i",source_path,
    "-filter:a",f"atempo={speedup_factor}",
    speedup_filename,
  ]
  try:
    var=subprocess.run(speed_change_command).returncode
  except OSError:
    # ffmpeg missing or not executable: treated like any other failed run.
    var=None
  if var==0:
    return speedup_filename
  print(shlex.join(speed_change_command))
  return None


def voice_clone(text,reference_audio_file,Language="English",clone_method="3s Quick Clone",seed=0,save_folder="."):
  """Synthesize ``text`` in the voice of a reference recording and save one WAV.

  The text is split into chunks with ``chunks_sentences``; each chunk is
  synthesized separately into a scratch folder and the pieces are then
  joined into a single file inside ``save_folder``.

  Args:
    text: Text to speak.
    reference_audio_file: Path of the voice sample; converted to WAV first
      when it is not already a ``.wav`` file.
    Language: Key of ``language_dict``; used for transcribing the reference
      audio in "3s Quick Clone" mode.
    clone_method: ``"3s Quick Clone"`` uses ``inference_zero_shot`` with a
      Whisper transcript of the reference; any other value uses
      ``inference_cross_lingual``.
    seed: Base random seed; the number of text chunks is added to it before
      seeding.
    save_folder: Folder for the final file; created (including parents) when
      missing.

  Returns:
    Path of the generated WAV file, or ``None`` when ``text`` yields no
    sentences.

  Side effects:
    Deletes and recreates the ``cosy_tts`` scratch folder
    (``/content/cosy_tts`` when ``COLAB_GPU`` is set in the environment,
    ``./cosy_tts`` otherwise).
  """
  import re

  if save_folder.endswith("/"):
    save_folder=save_folder[:-1]
  # makedirs instead of mkdir so a nested, not-yet-existing folder works too.
  os.makedirs(save_folder, exist_ok=True)

  small_sentences=chunks_sentences(text)
  if not small_sentences:
    # Nothing to synthesize: skip transcription and model inference entirely.
    return None
  #may be I am wrong
  seed=seed+len(small_sentences)
  set_all_random_seed(seed)
  wav_file_path=convert_to_wav(reference_audio_file)

  if 'COLAB_GPU' in os.environ:
    tts_save_folder="/content/cosy_tts"
  else:
    tts_save_folder="./cosy_tts"
  # Start from an empty scratch folder so chunks of an earlier run cannot leak in.
  if os.path.exists(tts_save_folder):
    shutil.rmtree(tts_save_folder)
  os.mkdir(tts_save_folder)

  zero_shot=clone_method=="3s Quick Clone"
  # Only the zero-shot path needs a transcript of the reference audio.
  prompt_speech_16k,prompt_text = postprocess(wav_file_path,Language,audio_to_text=zero_shot)

  audio_chunks_list=[]
  for index, tts_text in tqdm(enumerate(small_sentences), total=len(small_sentences)):
    if zero_shot:
      output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    else:
      # NOTE(review): the tag is always the English code, regardless of
      # `Language` -- confirm whether language_dict[Language] was intended.
      tts_text=f"<|{language_dict['English']}|> {tts_text}"
      output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
    temp_file_path=f"{tts_save_folder}/{index}.wav"
    torchaudio.save(temp_file_path, output['tts_speech'], 22050)
    audio_chunks_list.append(temp_file_path)

  # Name the file after the start of the text. Language tags are dropped and
  # every character that is not a word character or '-' becomes '_', so
  # slashes, quotes or newlines in the text cannot produce an invalid path.
  file_name=text[:20].replace("<|","").replace("|>","")
  file_name=re.sub(r"[^\w\-]","_",file_name)
  random_uuid = str(uuid.uuid4())[:6]
  save_clone_voice_path=f"{save_folder}/{file_name}_{random_uuid}.wav"

  if len(audio_chunks_list)==1:
    # A single chunk needs no merging; copy it as-is.
    shutil.copy(audio_chunks_list[-1],save_clone_voice_path)
  else:
    merge_audio(audio_chunks_list, save_clone_voice_path)
  return save_clone_voice_path

from IPython.display import clear_output
clear_output()

[Online Audio Cutter](https://mp3cut.net/) &emsp;If you have a longer audio clip<br>
[Vocal & Music Separator](https://vocalremover.org/)&emsp;If you have an audio file with background music<br>
[Enhance low quality audio](https://podcast.adobe.com/)&emsp;If your audio file has poor audio quality

In [None]:
#@title Upload 6s Audio clip
import os
from google.colab import files
%cd /content/
uploaded = files.upload()
%cd /content/CosyVoice/
# Build a '/content/<name>' path for every key of `uploaded`.
upload_list=[]
for fn in uploaded.keys():
  upload_list.append('/content/'+fn)
from IPython.display import clear_output
clear_output()
# Last expression of the cell: shows the path of the last uploaded file
# (raises IndexError when nothing was uploaded).
upload_list[-1]

In [None]:
#@title Run CosyVoice for voice cloning (Support Long Text)
# Form inputs forwarded, in this order, to voice_clone(); the resulting file
# path is kept in `clone_voice_save_path` for the cells that follow.
text="This is Cosy Voice Google Colab Demo"# @param {type: "string"}
Reference_Audio_File="/content/audio.wav"# @param {type: "string"}
Language = "English" # @param [ 'English','Chinese', 'Japanese', 'Cantonese', 'Korean'] {allow-input: true}
Clone_Method = "3s Quick Clone" # @param ["3s Quick Clone", "Cross-lingual Clone"] {allow-input: true}

Seed=0 # @param {type: "number"}
# save_folder="/content/clone_voice" # @param {type: "string"}
save_folder="/content/clone_voice"
clone_voice_save_path=voice_clone(text,Reference_Audio_File,Language,Clone_Method,Seed,save_folder)


In [None]:
#@title Display Original & Cloned Voice
from pydub import AudioSegment

def audio_duration_check(file_path):
    """Return True when the audio at ``file_path`` lasts at most 4 minutes.

    The file is decoded with pydub, whose segment length is in milliseconds.
    """
    max_seconds = 240  # 4 minutes
    length_ms = len(AudioSegment.from_file(file_path))
    return length_ms / 1000 <= max_seconds

# Show the reference recording inline when it passes audio_duration_check;
# otherwise only print where it is stored.
print(f"Original Audio File:")
if audio_duration_check(Reference_Audio_File):
  display(Audio(Reference_Audio_File, autoplay=False))
else:
  print("Audio File is larger please download the audio in your local device")
  print(f"Save at {Reference_Audio_File}")

# Same for the cloned voice, except that an over-long file is additionally
# passed to files.download.
print(f"Cloned Audio File:")
if audio_duration_check(clone_voice_save_path):
  display(Audio(clone_voice_save_path, autoplay=False))
else:
  print("Audio File is larger please download the audio in your local device")
  print(f"Save at {clone_voice_save_path}")
  files.download(clone_voice_save_path)


In [None]:
#@title Download Cloned Voice
from google.colab import files
# Print where the cloned voice was saved, then pass that path to files.download.
print(f"Save at {clone_voice_save_path}")
files.download(clone_voice_save_path)

Save at /content/clone_voice/In_this_video_I'll_s_76c911.wav


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Change Audio Speed

In [None]:
#@title Change the audio speed to achieve your desired voice
# Re-render the cloned voice at a different tempo and show both versions.
speed="0.9"  # @param {type: "string"}
speedup_factor=float(speed)
speed_change_file=speed_change(clone_voice_save_path,speedup_factor)

print(f"Cloned Audio File:")
if audio_duration_check(clone_voice_save_path):
  display(Audio(clone_voice_save_path, autoplay=False))
else:
  print("Audio File is larger please download the audio in your local device")
  print(f"Save at {clone_voice_save_path}")
  files.download(clone_voice_save_path)

print(f"Cloned Audio File After Speed Change:")
# speed_change returns None when ffmpeg fails; report that instead of
# crashing inside audio_duration_check(None).
if speed_change_file is None:
  print("Speed change failed; see the ffmpeg command printed above")
elif audio_duration_check(speed_change_file):
  display(Audio(speed_change_file, autoplay=False))
else:
  print("Audio File is larger please download the audio in your local device")
  print(f"Save at {speed_change_file}")
  files.download(speed_change_file)

In [None]:
#@title Download the speed-changed cloned voice
from google.colab import files
# Download the file produced by the speed-change cell, not the unmodified
# clone (that one has its own download cell).
if speed_change_file is None:
  print("Speed change did not produce a file; nothing to download")
else:
  print(f"Save at {speed_change_file}")
  files.download(speed_change_file)

In [None]:
#@title Use as function

# voice_clone(text="hi how are you ",reference_audio_file='/content/audio.wav',Language='English',clone_method="3s Quick Clone",seed=0,save_folder="/content/clone_voice")