# 🤖 PersonaPlex: Local Control & Emotional System (Inside Out)

This notebook allows you to run PersonaPlex locally without depending on Modal. 

**Features:**
1. **CPU Offloading**: Allows running the 7B model on GPUs with low VRAM (like the T550).
2. **Emotional Prompting**: System based on *Inside Out* to vary the tone (Joy, Sadness, etc.).
3. **Local Text-to-Audio**: Generates and saves `.wav` files directly.

In [None]:
import os
import torch
import numpy as np
from pathlib import Path
import IPython.display as ipd
import sys
import asyncio
import wave

# Add moshi path if not installed globally.
# The relative name 'moshi' is resolved against the current working directory.
# Because the entry is appended (not inserted at the front), any `moshi`
# package that is already importable from an earlier sys.path entry wins.
sys.path.append(os.path.abspath('moshi'))

# These imports are placed after the sys.path change above so they can still
# resolve when moshi is only reachable through the local 'moshi' directory.
from moshi.models import loaders
from sentencepiece import SentencePieceProcessor
from moshi.models.lm import LMGen

print("‚úÖ Libraries loaded.")

## 1. Model Initialization
Loading Mimi (Audio) and Moshi LM (Brain).

In [None]:
# Directory holding the checkpoint files, and the device everything runs on.
weights_dir = Path('weights')
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f"Loading models on {device} (this may take a few minutes)...")

# Audio tokenizer (Mimi), loaded straight onto the selected device.
mimi = loaders.get_mimi(weights_dir / loaders.MIMI_NAME, device)

# Language model (Moshi LM, 7B). cpu_offload=True is always requested so the
# model can be used on GPUs with little VRAM.
moshi_lm = loaders.get_moshi_lm(weights_dir / loaders.MOSHI_NAME, device=device, cpu_offload=True)

# SentencePiece text tokenizer; the processor is given the path as a string.
tokenizer_path = weights_dir / loaders.TEXT_TOKENIZER_NAME
text_tokenizer = SentencePieceProcessor(str(tokenizer_path))

# Generation wrapper used by the inference cell further down.
lm_gen = LMGen(moshi_lm, text_tokenizer)

# Report total memory of GPU 0 in MiB, or 'N/A' when running on the CPU.
print(
    "üöÄ System ready. GPU VRAM detected:",
    'N/A' if device != 'cuda' else torch.cuda.get_device_properties(0).total_memory / 1024**2,
    "MB",
)

## 2. Emotional Configuration 'Inside Out'
Injecting emotional state via specialized prompts.

In [None]:
# Emotion name -> text that get_emotional_prompt() places in front of the
# user's text. Names that are not listed here fall back to an empty prefix
# there. The bracketed cues such as [laugh] / [sigh] are plain prompt text;
# NOTE(review): whether the model treats them as vocal cues is not verified here.
EMOTIONS = {
    "Joy": "Hello! I'm so excited to talk to you! Hee-hee! Everything is wonderful! [laugh]",
    "Sadness": "Hello... I'm feeling a bit down today. Everything seems so gray... [sigh]",
    "Anger": "I can't believe it! This is unacceptable! Beep-boop-grrr! I'm very angry!",
    "Fear": "What was that? I'm scared... are you there? Please don't go...",
    "Disgust": "Ugh, how gross. That's repulsive. I don't even want to look at it. Puaj."
}

def wrap_with_system_tags(text):
    """Return ``text`` enclosed between the markers "(user)" and "(assistant)".

    A single space separates each marker from the text.
    NOTE(review): presumably this is the turn format the model expects for
    instructions -- confirm against the model's prompt conventions.
    """
    opening, closing = "(user)", "(assistant)"
    return "{} {} {}".format(opening, text, closing)

def get_emotional_prompt(emotion, user_text):
    """Build the tagged prompt for ``user_text`` spoken with ``emotion``.

    Args:
        emotion: Key into ``EMOTIONS``. An unknown key is not an error; it
            simply contributes no emotional prefix.
        user_text: Text to place after the emotional prefix.

    Returns:
        The prefix and the user text joined by one space and passed through
        ``wrap_with_system_tags``.
    """
    parts = [EMOTIONS.get(emotion, ""), f"{user_text}"]
    # Join only the non-empty parts so that an unknown emotion (empty prefix)
    # or empty user text does not leave a stray extra space in the prompt.
    return wrap_with_system_tags(" ".join(part for part in parts if part))

## 3. Pepper's Voice Generation
This cell runs the actual inference loop.

In [None]:
def generate_audio_local(text, emotion="Joy", voice_pt="weights/pepper.pt", output_wav="pepper_reply.wav", duration_frames=300):
    """Generate a spoken reply to ``text`` and save it as a 16-bit WAV file.

    Uses the notebook-level globals ``lm_gen``, ``mimi``, ``text_tokenizer``
    and ``device`` created by the model-initialization cell.

    Args:
        text: User text the reply is generated for.
        emotion: Emotion name forwarded to ``get_emotional_prompt``.
        voice_pt: Path to voice-prompt embeddings. Loaded into ``lm_gen`` when
            the file exists; otherwise a notice is printed and generation
            continues without loading it.
        output_wav: Destination path of the WAV file.
        duration_frames: Number of ``lm_gen.step`` calls to perform.

    Returns:
        Tuple ``(output_wav, generated_text)`` where ``generated_text`` is the
        text decoded from the generated text tokens.

    Raises:
        ValueError: If no audio chunk was produced (for example when
            ``duration_frames`` is 0 or every step returned ``None``).
    """
    # 1. Load Pepper's voice identity
    if os.path.exists(voice_pt):
        lm_gen.load_voice_prompt_embeddings(voice_pt)
        print(f"‚úÖ Voice loaded from {voice_pt}")
    else:
        # Make the skip visible instead of silently ignoring a wrong path.
        print(f"Voice prompt not found at {voice_pt}; continuing without loading it.")

    # 2. Prepare emotional text prompt
    emotional_text = get_emotional_prompt(emotion, text)
    lm_gen.text_prompt_tokens = text_tokenizer.encode(emotional_text)

    print(f"üé§ Pepper acting as: {emotion}...")
    audio_chunks = []
    text_pieces = []

    # 3. Step-by-step generation loop (Streaming)
    # Use zero audio codes for 'moshi_tokens' (empty input audio)
    empty_audio_codes = torch.zeros((1, 8, 1), device=device, dtype=torch.long)

    # Pure inference: disable autograd so no graph is recorded and the decoded
    # tensors can always be converted to NumPy.
    # NOTE(review): lm_gen / mimi may additionally need to be put into a
    # streaming mode before step() is called -- confirm against the library.
    with torch.no_grad():
        for step in range(duration_frames):
            # Model generates audio and text tokens simultaneously
            tokens = lm_gen.step(moshi_tokens=empty_audio_codes)
            if tokens is None:
                continue

            # Audio tokens (indices 1 to 8 are for Mimi decoder)
            pcm_chunk = mimi.decode(tokens[:, 1:9])
            audio_chunks.append(pcm_chunk.cpu().numpy().flatten())

            # Text token (index 0)
            text_token = tokens[0, 0, 0].item()
            if text_token > 3:  # Ignore special tokens (padding, etc)
                # U+2581 is SentencePiece's word-boundary marker; written as an
                # escape so the literal cannot be corrupted by file encoding.
                piece = text_tokenizer.id_to_piece(text_token).replace('\u2581', ' ')
                text_pieces.append(piece)
                if step % 20 == 0:
                    print("Text: " + "".join(text_pieces))

    generated_text = "".join(text_pieces)

    # 4. Concatenate and save audio
    if not audio_chunks:
        raise ValueError(
            "No audio was generated: duration_frames must be positive and the "
            "model must return tokens for at least one step."
        )
    full_audio = np.concatenate(audio_chunks)

    # Clip to [-1, 1] before scaling so out-of-range samples saturate instead
    # of wrapping around when cast to int16.
    pcm16 = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)

    # Save to .wav file
    # NOTE(review): 24000 Hz is hard-coded; presumably Mimi's output sample
    # rate -- confirm.
    import scipy.io.wavfile as wavfile
    wavfile.write(output_wav, 24000, pcm16)

    print("\n‚ú® Pepper's reply finished.")
    return output_wav, generated_text

# --- LOCAL TEST ---
# wav_file, text_out = generate_audio_local("Hello Pepper! How are you today?", emotion="Joy")
# ipd.Audio(wav_file)