<a href="https://colab.research.google.com/github/Justmalhar/csm-google-collab/blob/main/Sesame_AI_CSM_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sesame AI - Google Colab CSM Notebook

Author: justmalhar

http://x.com/justmalhar

http://github.com/justmalhar

HuggingFace Model - https://huggingface.co/sesame/csm-1b

## Instructions

Run each step below, in order, until you see the Gradio UI running.


## Steps:
1. Setup Gradio
2. Clone the repo and install the requirements
3. Log in with your Hugging Face account to access the model
4. Run all remaining steps until you see the Gradio UI

In [None]:
# Install dependencies
!pip install gradio



In [None]:
# Clone the repository
!git clone https://github.com/SesameAILabs/csm.git

fatal: destination path 'csm' already exists and is not an empty directory.


In [None]:
# Install dependencies from requirements.txt
!pip install -r /content/csm/requirements.txt

Collecting silentcipher@ git+https://github.com/SesameAILabs/silentcipher@master (from -r /content/csm/requirements.txt (line 9))
  Cloning https://github.com/SesameAILabs/silentcipher (to revision master) to /tmp/pip-install-d9ght3zq/silentcipher_3dbdf716654d405bacc05df088282f64
  Running command git clone --filter=blob:none --quiet https://github.com/SesameAILabs/silentcipher /tmp/pip-install-d9ght3zq/silentcipher_3dbdf716654d405bacc05df088282f64
  Resolved https://github.com/SesameAILabs/silentcipher to commit d46d7d0893a583d8968ab3a6626e2289faec9152
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Add the cloned repository to Python's import path so that later cells can
# import modules that live inside it.
import sys

CSM_REPO_DIR = "/content/csm"

# Only append once: re-running this cell must not pile up duplicate entries.
if CSM_REPO_DIR not in sys.path:
    sys.path.append(CSM_REPO_DIR)

In [None]:
# Authenticate with Hugging Face (needed to access the model).
from huggingface_hub import notebook_login

# Call with no arguments. The original passed an empty string positionally,
# which is not a token: depending on the huggingface_hub version it is either
# rejected or interpreted as a falsy `new_session` flag — TODO confirm against
# the installed version.
notebook_login()

In [None]:
# Load the model
import torch
import torchaudio  # used by generate_audio() to write the WAV file
from generator import load_csm_1b

# Prefer the GPU, but fall back to the CPU instead of failing outright when the
# runtime has no CUDA device (expect CPU inference to be much slower).
device = "cuda" if torch.cuda.is_available() else "cpu"

generator = load_csm_1b(device=device)
print("Model loaded successfully! 🎉")

TypeError: Model.__init__() missing 1 required positional argument: 'config'

In [None]:
# Create a function to generate audio
def generate_audio(text, speaker=0, max_audio_length_ms=10_000, output_file="output_audio.wav"):
    """Generate speech for ``text`` and save it as an audio file.

    Uses the module-level ``generator`` (created in the model-loading cell) and
    ``torchaudio`` to write the result to disk.

    Args:
        text: The text to synthesize.
        speaker: Speaker id passed to the model. It is cast to ``int`` because
            UI widgets may deliver a numeric value as a float.
        max_audio_length_ms: Upper bound on the generated audio length, in
            milliseconds. Defaults to the previously hard-coded 10 seconds.
        output_file: Path the audio is written to. Defaults to the previously
            hard-coded ``"output_audio.wav"``; the file is overwritten on
            every call.

    Returns:
        The path of the written audio file (``output_file``).
    """
    # Generate audio from text (no conversational context is supplied).
    audio = generator.generate(
        text=text,
        speaker=int(speaker),
        context=[],
        max_audio_length_ms=max_audio_length_ms,
    )

    # Save the generated audio. unsqueeze(0) adds a leading dimension —
    # presumably the channel axis that torchaudio.save expects; TODO confirm.
    torchaudio.save(output_file, audio.unsqueeze(0).cpu(), generator.sample_rate)

    return output_file

In [None]:
# Step 8: Define the Gradio interface function
def gradio_interface(text, speaker):
    """Gradio callback: synthesize ``text`` for ``speaker`` and return the audio file path.

    Args:
        text: Text entered in the UI.
        speaker: Speaker id selected in the UI.

    Returns:
        The path of the generated audio file, as returned by ``generate_audio``.

    Raises:
        gr.Error: If generation fails. Gradio shows the message to the user
            instead of silently leaving the audio output empty.
    """
    try:
        # Call the generate_audio function
        return generate_audio(text, speaker)
    except Exception as e:
        # Keep the log line for the notebook output...
        print(f"Error generating audio: {e}")
        # ...and also surface the failure in the UI. Imported locally because
        # this cell is defined before the cell that imports gradio.
        import gradio as gr
        raise gr.Error(f"Error generating audio: {e}") from e

SIMPLE INTERFACE

In [None]:
# Create the Gradio UI
import gradio as gr

# Step 9: Create the Gradio app.
# The inputs are passed to gradio_interface(text, speaker) in the order listed.
iface = gr.Interface(
    fn=gradio_interface,  # Function to call
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Enter text here..."),  # Text input
        gr.Slider(minimum=0, maximum=10, step=1, label="Speaker ID", value=0)  # Speaker ID slider
    ],
    outputs=gr.Audio(label="Generated Audio"),  # Audio output
    title="Sesame CSM-1B Text-to-Speech",  # Title of the app
    description="Generate audio from text using the Sesame CSM-1B model."
)

# Step 10: Launch the Gradio app
iface.launch(share=True)  # Set `share=True` to get a public link


INTERFACE WITH MORE FUNCTIONALITY

In [None]:
# Define the Gradio interface using Blocks API.
# Imported here too so this cell also runs when the simple-interface cell
# (the only other place gradio is imported) was skipped.
import gradio as gr

with gr.Blocks(title="Sesame CSM-1B Text-to-Speech") as demo:
    gr.Markdown("# 🎙️ Sesame CSM-1B Text-to-Speech")
    gr.Markdown("Generate high-quality audio from text using the Sesame CSM-1B model.")

    with gr.Row():
        with gr.Column():
            # Text input
            text_input = gr.Textbox(label="Enter Text", placeholder="Type your text here...", lines=5)

            # Speaker selection. process_input() parses the trailing number of
            # the chosen label, so every choice must end in the speaker id.
            speaker_dropdown = gr.Dropdown(
                choices=["Speaker 0", "Speaker 1", "Speaker 2", "Speaker 3"],  # Add more options if needed
                label="Select Speaker",
                value="Speaker 0"
            )

            # File upload for text
            file_upload = gr.File(label="Or Upload a Text File", file_types=[".txt"])

            # Generate button
            generate_button = gr.Button("Generate Audio 🎵")

        with gr.Column():
            # Audio output
            audio_output = gr.Audio(label="Generated Audio", interactive=False)

            # Playback controls
            with gr.Row():
                play_button = gr.Button("▶️ Play")
                pause_button = gr.Button("⏸️ Pause")
                stop_button = gr.Button("⏹️ Stop")

            # Volume control (not wired to any handler in this cell)
            volume_slider = gr.Slider(minimum=0, maximum=100, value=50, label="Volume")

    # Define interactions
    def process_input(text, file, speaker):
        """Generate audio from the typed text or, if given, the uploaded file.

        Args:
            text: Contents of the text box.
            file: The uploaded file, or ``None``. When present, its contents
                replace ``text``.
            speaker: Dropdown label such as ``"Speaker 2"``; the trailing
                number is used as the speaker id.

        Returns:
            The path of the generated audio file.

        Raises:
            gr.Error: If there is no text to synthesize.
        """
        if file is not None:
            # The upload may arrive as an object exposing `.name` or as a plain
            # path string, depending on the Gradio version; accept both.
            file_path = getattr(file, "name", file)
            # Read text from the uploaded file; be explicit about the encoding
            # rather than relying on the platform default.
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

        # Refuse to call the model with nothing to say.
        if not text or not text.strip():
            raise gr.Error("Please enter some text or upload a non-empty text file.")

        # Get speaker ID from the dropdown
        speaker_id = int(speaker.split()[-1])
        # Generate audio
        return generate_audio(text, speaker_id)

    # Link inputs and outputs
    generate_button.click(
        fn=process_input,
        inputs=[text_input, file_upload, speaker_dropdown],
        outputs=audio_output
    )

    # Playback controls (placeholders, as Gradio Audio already has built-in controls)
    play_button.click(fn=lambda: None)
    pause_button.click(fn=lambda: None)
    stop_button.click(fn=lambda: None)

# Launch the Gradio app
demo.launch(share=True)  # Set `share=True` to get a public link