## ⚠️ Misuse & Abuse Policy  

This speech generation model is for research and educational use only. The following are strictly prohibited:  

- **Impersonation**: No mimicking real individuals without consent.  
- **Misinformation**: No deceptive or misleading content (e.g., fake news, fraud, scams).  
- **Illegal Activities**: No use for harm, crime, or malicious purposes.  

Use responsibly and ethically. 🚫  
## ⚠️ Disclaimer  

The developers are **not responsible** for any misuse of this technology.  
Unethical use is **strongly condemned**—follow your **local laws** and use responsibly. 🚫  


## Before Running [CSM](https://github.com/SesameAILabs/csm)

### 1. Accept License Agreements  
- [CSM-1B License](https://huggingface.co/sesame/csm-1b)  
- [Llama-3.2-1B License](https://huggingface.co/meta-llama/Llama-3.2-1B)  

### 2. Get Hugging Face Access Token  
- Generate or retrieve your token from [Hugging Face Access Tokens](https://huggingface.co/settings/tokens)  

### 3. Run CSM  
Once the licenses are accepted and the access token is ready, you can proceed with setting up and running CSM.  


In [None]:
#@title Install CSM
HF_TOKEN="asfdagfhjgsdfjhagdhjshjf" # @param {type: "string"}
root_path="/content" # @param {type: "string"}

#
%cd $root_path
!git clone https://github.com/SesameAILabs/csm.git
%cd $root_path/csm
!pip install -r requirements.txt
!pip install -U openai-whisper
!pip install gradio>=5.9.1
!pip install click
!pip install pydub>=0.25.1

!apt-get update && apt-get install -y ffmpeg

from huggingface_hub import login
login(token=HF_TOKEN)

#Restart Sesstion otherwise huggingface authentication will not work
from IPython.display import clear_output
clear_output()
import time
time.sleep(5)
import os
os.kill(os.getpid(), 9)

In [None]:
#@title Create /content/csm/app.py

%%writefile /content/csm/app.py
from generator import load_csm_1b,Segment
import torchaudio
import torch
import os
import whisper
import hashlib
import os
from pydub import AudioSegment

# Working directories: generated audio goes to ./result, converted reference
# clips go to ./upload/.
os.makedirs("./result", exist_ok=True)
os.makedirs("./upload/", exist_ok=True)

# Pick the compute device: MPS first, then CUDA, otherwise fall back to CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# CSM-1B speech generator, loaded onto the selected device.
generator = load_csm_1b(device=device)

# Whisper "tiny" model, used to transcribe the reference audio clips.
whisper_model = whisper.load_model("tiny")


# Single-entry transcript cache shared with get_transcripts().
old_hash_key = ""  # hash of the most recently transcribed set of audio files
old_transcripts = []  # transcripts that belong to old_hash_key



def csm_tts(text, speaker_id=0):
    """Synthesize ``text`` with the CSM generator and save it as a WAV file.

    Args:
        text: The text to speak. Its first 20 characters are also used to
            name the output file.
        speaker_id: Speaker identifier forwarded to the generator.

    Returns:
        Path of the written WAV file inside the ``./result`` directory.
    """
    audio = generator.generate(
        text=text,
        speaker=speaker_id,
        context=[],
        max_audio_length_ms=10_000,
    )
    # Derive the file name from the start of the text, replacing anything that
    # is not alphanumeric, space, "_" or "-" so characters such as "/" cannot
    # break the path. Fall back to a fixed name when nothing usable is left.
    stem = "".join(
        ch if ch.isalnum() or ch in " _-" else "_" for ch in text[:20]
    ).strip() or "output"
    # Write inside ./result (created here too, in case the working directory
    # changed since import) instead of next to it.
    os.makedirs("./result", exist_ok=True)
    save_path = os.path.join("./result", f"{stem}.wav")
    torchaudio.save(save_path, audio.unsqueeze(0).cpu(), generator.sample_rate)
    return save_path


def get_audio_hash(audio_paths):
    """Return one MD5 hex digest covering the contents of all given files.

    The files are fed to the hasher in the order given, so the result equals
    the MD5 of their concatenated bytes. The digest serves as a cache key for
    detecting identical audio, not as a security measure.

    Args:
        audio_paths: Iterable of paths to the files to hash.

    Returns:
        The hexadecimal MD5 digest as a string.

    Raises:
        OSError: If one of the files cannot be opened or read.
    """
    chunk_size = 1024 * 1024
    hasher = hashlib.md5()
    for path in audio_paths:
        with open(path, "rb") as f:
            # Stream the file in fixed-size chunks so a long recording is
            # never held in memory all at once; the digest is unchanged.
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hasher.update(chunk)
    return hasher.hexdigest()



# CSM raises an error on stereo input, so every reference clip is converted
# to mono before it is used.
def convert_stereo_to_mono(audio_paths):
    """Write a mono WAV copy of each audio file into ``./upload/``.

    Args:
        audio_paths: Iterable of paths to the source audio files.

    Returns:
        List of paths to the mono WAV files, in the same order as the input.
    """
    mono_audio_paths = []
    used_stems = set()
    for input_file in audio_paths:
        stem = os.path.splitext(os.path.basename(input_file))[0]
        # Inputs from different directories can share a basename (e.g. several
        # recordings all called "audio.wav"). Add a numeric suffix so a later
        # clip never overwrites an earlier one from the same call.
        unique_stem = stem
        suffix = 1
        while unique_stem in used_stems:
            unique_stem = f"{stem}_{suffix}"
            suffix += 1
        used_stems.add(unique_stem)
        output_file = f"./upload/{unique_stem}_mono.wav"  # Append "_mono"

        audio = AudioSegment.from_file(input_file)
        mono_audio = audio.set_channels(1)  # Convert to mono
        mono_audio.export(output_file, format="wav")  # Save as WAV
        mono_audio_paths.append(output_file)
    return mono_audio_paths

def get_transcripts(audio_paths):
    """Transcribe each audio file with Whisper, reusing the last result.

    The previous call's transcripts are kept in the module-level cache
    (``old_hash_key`` / ``old_transcripts``) and returned again when the
    content hash of ``audio_paths`` is unchanged and the cache is non-empty.

    Args:
        audio_paths: Paths of the audio files to transcribe.

    Returns:
        List of stripped transcript strings, one per input path.
    """
    global old_hash_key, old_transcripts

    current_key = get_audio_hash(audio_paths)

    # Cache hit: same audio content as last time and something was stored.
    cache_hit = current_key == old_hash_key and old_transcripts
    if cache_hit:
        return old_transcripts

    # Cache miss: run Whisper on every file.
    fresh_transcripts = [
        whisper_model.transcribe(path)["text"].strip()
        for path in audio_paths
    ]

    # Remember this result for the next call.
    old_hash_key = current_key
    old_transcripts = fresh_transcripts

    return fresh_transcripts


def load_audio(audio_path):
    """Load an audio file and resample it to the generator's sample rate.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The resampled audio tensor.
    """
    waveform, source_rate = torchaudio.load(audio_path)
    # squeeze(0) drops the leading dimension only when its size is 1
    # (presumably the channel axis of a mono clip -- confirm against callers).
    waveform = waveform.squeeze(0)
    return torchaudio.functional.resample(
        waveform,
        orig_freq=source_rate,
        new_freq=generator.sample_rate,
    )

def get_segments(audio_paths, speaker_id=0):
    """Build one ``Segment`` per reference audio file.

    Each segment pairs the file's transcript with its loaded audio and the
    given speaker id.

    Args:
        audio_paths: Paths of the reference audio files.
        speaker_id: Speaker identifier stored on every segment.

    Returns:
        List of ``Segment`` objects in the same order as ``audio_paths``.
    """
    transcripts = get_transcripts(audio_paths)
    return [
        Segment(text=transcript, speaker=speaker_id, audio=load_audio(path))
        for path, transcript in zip(audio_paths, transcripts)
    ]

def csm_clone(text, speaker_id, audio_paths):
    """Synthesize ``text`` using reference audio clips as generation context.

    The reference clips are converted to mono, transcribed, and passed to the
    generator as context segments.

    Args:
        text: The text to speak. Its first 20 characters are also used to
            name the output file.
        speaker_id: Speaker identifier used for the context segments and for
            the generated audio.
        audio_paths: Paths of the reference audio files.

    Returns:
        Path of the written WAV file inside the ``./result`` directory.
    """
    audio_paths = convert_stereo_to_mono(audio_paths)
    segments = get_segments(audio_paths, speaker_id)
    audio = generator.generate(
        text=text,
        speaker=speaker_id,
        context=segments,
        max_audio_length_ms=10_000,
    )
    # Derive the file name from the start of the text, replacing anything that
    # is not alphanumeric, space, "_" or "-" so characters such as "/" cannot
    # break the path. Fall back to a fixed name when nothing usable is left.
    stem = "".join(
        ch if ch.isalnum() or ch in " _-" else "_" for ch in text[:20]
    ).strip() or "output"
    # Write inside ./result (created here too, in case the working directory
    # changed since import) instead of next to it.
    os.makedirs("./result", exist_ok=True)
    save_path = os.path.join("./result", f"{stem}.wav")
    torchaudio.save(save_path, audio.unsqueeze(0).cpu(), generator.sample_rate)
    return save_path



import gradio as gr

def toggle_autoplay(autoplay):
    """Return a Gradio update that sets the target component's autoplay flag."""
    # An update modifies the existing Audio component instead of creating a
    # new one.
    autoplay_update = gr.update(autoplay=autoplay)
    return autoplay_update

def ui1():
    """Build the text-to-speech tab and return its ``gr.Blocks`` object.

    The tab has a text box, a speaker-id field and a generate button on the
    left, and the audio player with an autoplay toggle on the right.
    """
    with gr.Blocks() as tts_page:
        with gr.Row():
            # Left column: inputs.
            with gr.Column():
                prompt_box = gr.Textbox(label="📝 Enter Text", lines=3)
                speaker_box = gr.Number(label="🎙️ Speaker ID", value=0)
                run_button = gr.Button("🚀 Generate")
            # Right column: result player and its autoplay switch.
            with gr.Column():
                player = gr.Audio(label="Generated Audio", autoplay=True)
                with gr.Accordion('▶️ Autoplay', open=False):
                    autoplay_toggle = gr.Checkbox(value=True, label='Autoplay')
                    autoplay_toggle.change(
                        toggle_autoplay,
                        inputs=[autoplay_toggle],
                        outputs=[player],
                    )

        # Generate speech from the text and speaker id, then show it in the player.
        run_button.click(
            csm_tts,
            inputs=[prompt_box, speaker_box],
            outputs=player,
        )

    return tts_page

# tab1 = ui1()
# tab1.queue().launch(debug=True, share=True)
def tem_csm_clone(text_input, number_input, file_input, multiple_file_input=None):
    """Gradio callback for the voice-clone tab.

    Uses the multi-file upload when it contains files; otherwise falls back
    to the single recorded/uploaded reference clip.

    Args:
        text_input: Text to synthesize.
        number_input: Speaker id.
        file_input: Path of the single reference clip, or ``None``.
        multiple_file_input: List of reference clip paths, or ``None``.

    Returns:
        Path of the generated WAV file, as returned by ``csm_clone``.

    Raises:
        gr.Error: If no reference audio was supplied at all.
    """
    if multiple_file_input:
        # A non-empty multi-file selection takes precedence.
        reference_paths = multiple_file_input
    elif file_input is not None:
        reference_paths = [file_input]
    else:
        # Fail with a readable message instead of passing [None] downstream,
        # where it would crash inside the path handling.
        raise gr.Error("Please record or upload at least one reference audio clip.")
    return csm_clone(text_input, number_input, reference_paths)

def ui2():
    """Build the voice-clone tab and return its ``gr.Blocks`` object.

    The left column takes the text, the speaker id, one reference clip and an
    optional set of several reference files. The right column holds the audio
    player with an autoplay toggle.
    """
    with gr.Blocks() as clone_page:
        with gr.Row():
            # Left column: inputs.
            with gr.Column():
                prompt_box = gr.Textbox(label="📝 Enter Text", lines=3)
                speaker_box = gr.Number(label="🎙️ Speaker ID", value=0)
                reference_clip = gr.Audio(
                    label="🎤 Record or Upload Reference Audio",
                    type='filepath',
                )
                run_button = gr.Button("🚀 Generate")
                with gr.Accordion('🎧 Multiple Reference Audio', open=False):
                    reference_files = gr.File(
                        label="📂 Upload Multiple Reference Audios",
                        type='filepath',
                        file_types=['.wav'],
                        file_count='multiple',
                        value=None,
                    )

            # Right column: result player and its autoplay switch.
            with gr.Column():
                player = gr.Audio(label="Generated Audio", autoplay=True)
                with gr.Accordion('▶️ Autoplay', open=False):
                    autoplay_toggle = gr.Checkbox(value=True, label="Autoplay")
                    autoplay_toggle.change(
                        toggle_autoplay,
                        inputs=[autoplay_toggle],
                        outputs=[player],
                    )

        # Run the cloning callback with all four inputs and show the result.
        run_button.click(
            tem_csm_clone,
            inputs=[prompt_box, speaker_box, reference_clip, reference_files],
            outputs=player,
        )

    return clone_page

# tab2 = ui2()
# tab2.queue().launch(debug=True, share=True)


# tab1 = ui1()
# tab2 = ui2()
# demo = gr.TabbedInterface([tab1, tab2],["CSM TTS","CSM VOICE CLONE"],title="CSM-1B")#,theme='JohnSmith9982/small_and_pretty')
# demo.queue().launch(debug=True, share=True)


import click
# Command-line entry point: builds both tabs, combines them into one tabbed
# interface and launches it. (Kept as a comment rather than a docstring so the
# click --help output stays unchanged.)
@click.command()
@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
def main(debug, share):
    tts_tab = ui1()
    clone_tab = ui2()
    tab_titles = ["CSM TTS", "CSM VOICE CLONE"]
    # Optional theme, currently disabled: theme='JohnSmith9982/small_and_pretty'
    app = gr.TabbedInterface([tts_tab, clone_tab], tab_titles, title="CSM-1B")
    app.queue().launch(debug=debug, share=share)


if __name__ == "__main__":
    main()


Overwriting /content/csm/app.py


In [None]:
# Start the Gradio app from the CSM checkout; --share turns on the app's
# "sharing of the interface" option.
%cd /content/csm
!python app.py --share