# Part 2: Speech-to-Text with Whisper

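This notebook implements the speech-to-text component using Whisper: it downloads a few sample clips from the RAVDESS dataset, transcribes them, and runs a placeholder rule-based sentiment analysis on the transcripts.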

In [None]:
import os
import sys
import whisper
from IPython.display import Audio

# Add src to path
sys.path.append(os.path.join(os.pardir, "src"))

# Import our custom module
from speech_analysis.whisper_processor import WhisperTranscriber

## 1. Download Sample Audio Files

We will download sample emotional speech files from the RAVDESS dataset.

In [None]:
import requests

def download_file(url, target_path, timeout=30):
    """Download a file from a URL to a target path.

    Nothing is fetched when ``target_path`` already exists. Otherwise the
    response body is streamed into ``<target_path>.part`` and moved into place
    only after every chunk has been written, so an interrupted download never
    leaves a truncated file that a later run would report as already present.

    Args:
        url: Address to fetch.
        target_path: Destination file. Missing parent directories are created;
            a bare file name (no directory part) is written to the current
            working directory.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the notebook indefinitely.

    Returns:
        None.

    Raises:
        Whatever ``requests.get`` / ``raise_for_status`` raise for network or
        HTTP errors, and ``OSError`` for file-system failures. The partial
        file is removed before the exception propagates.
    """
    if os.path.exists(target_path):
        print(f"File already exists at {target_path}")
        return

    partial_path = f"{target_path}.part"
    try:
        # The context manager releases the streamed connection even on error.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # os.makedirs("") raises, so only create a directory when the
            # path actually has a directory component.
            parent_dir = os.path.dirname(target_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)

        # Publish the file only once it is complete.
        os.replace(partial_path, target_path)
    except BaseException:
        # Also covers KeyboardInterrupt from an interrupted notebook cell.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    print(f"Downloaded {url} to {target_path}")

In [None]:
# RAVDESS dataset sample files
# File names follow Modality-VocalChannel-Emotion-Intensity-Statement-Repetition-Actor.
# Emotions used here: 01=neutral, 03=happy, 04=sad, 05=angry
# The actor is the last field; odd-numbered actors are male, even-numbered are female.
# NOTE(review): the Zenodo record appears to publish zip archives rather than
# individual .wav files — confirm that these per-file URLs actually resolve.

base_url = "https://zenodo.org/records/1188976/files/"
audio_files = [
    # Neutral sample
    "03-01-01-01-01-01-01.wav",  # Neutral, male (actor 01)

    # Happy sample
    "03-01-03-01-01-01-11.wav",  # Happy, male (actor 11)

    # Sad sample
    "03-01-04-01-01-01-01.wav",  # Sad, male (actor 01)

    # Angry sample
    "03-01-05-01-01-01-11.wav"   # Angry, male (actor 11)
]

audio_dir = "../data/audio/samples"

# Emotion code (third dash-separated field of the file name) -> readable label.
# Defined once here instead of being rebuilt on every loop iteration.
emotion_map = {
    "01": "neutral",
    "03": "happy",
    "04": "sad",
    "05": "angry"
}

# Maps each local audio path to its emotion label; later cells iterate over it.
audio_paths = {}

# Download the audio files
for file_name in audio_files:
    url = base_url + file_name
    target_path = os.path.join(audio_dir, file_name)
    download_file(url, target_path)

    # Map file name to emotion
    emotion_code = file_name.split("-")[2]
    emotion = emotion_map.get(emotion_code, "unknown")
    audio_paths[target_path] = emotion

print(f"Downloaded {len(audio_files)} audio files")

## 2. Initialize Whisper Model

In [None]:
# Initialize the Whisper transcriber
# `transcriber` is a notebook-level object reused by the transcription loop and
# by analyze_speech_sentiment in later cells.
# NOTE(review): presumably model_name selects the Whisper checkpoint to load
# ("base") — confirm against speech_analysis.whisper_processor.
transcriber = WhisperTranscriber(model_name="base")

## 3. Transcribe Audio Samples

In [None]:
import pandas as pd

# One record per audio clip; turned into a DataFrame after the loop.
transcriptions = []

for audio_path, emotion in audio_paths.items():
    # Display audio player
    display(Audio(audio_path))

    # Transcribe the audio; the result is indexed by "text" for the transcript.
    result = transcriber.transcribe_file(audio_path)

    # Get filename
    filename = os.path.basename(audio_path)

    # Add to our list
    transcriptions.append({
        "filename": filename,
        "emotion": emotion,
        "text": result["text"],
        "audio_path": audio_path
    })

    # Print the transcription
    print(f"File: {filename} (Emotion: {emotion})")
    # Single quotes inside the f-string: reusing the enclosing double quote is
    # a SyntaxError on Python versions before 3.12.
    print(f"Transcription: {result['text']}")
    print("-" * 50)

# Create DataFrame
transcriptions_df = pd.DataFrame(transcriptions)
transcriptions_df

## 4. Simple Text Sentiment Analysis

As a placeholder, we will use a simple rule-based sentiment analyzer. In a complete implementation, you would integrate with your model from Part 1.

In [None]:
# Simple sentiment analysis function
def simple_sentiment_analysis(text):
    """A simple rule-based sentiment analyzer for demonstration.

    The text is lower-cased and split into alphabetic words, and each word is
    compared against a small positive and a small negative lexicon. Matching
    is on whole words, so unrelated words that merely contain a lexicon entry
    ("whatever" contains "hate", "goodbye" contains "good") are not counted.
    Inflected forms such as "loved" are not matched either.

    Args:
        text: The string to score.

    Returns:
        A float between 0 and 1: the share of distinct matched lexicon words
        that are positive, or 0.5 when no lexicon word occurs in the text.
    """
    import re  # local import keeps this cell runnable on its own

    # Define positive and negative word lists
    positive_words = {"happy", "good", "great", "joy", "love", "excellent", "positive"}
    negative_words = {"sad", "bad", "terrible", "awful", "hate", "dislike", "negative"}

    # Distinct whole words found in the text
    tokens = set(re.findall(r"[a-z]+", text.lower()))

    # Count how many lexicon words of each polarity are present
    positive_count = len(tokens & positive_words)
    negative_count = len(tokens & negative_words)

    # Calculate sentiment score (0 to 1)
    matched = positive_count + negative_count
    if matched == 0:
        return 0.5  # Neutral if no positive or negative words
    return positive_count / matched

# Apply simple sentiment analysis to transcriptions
# Adds a "sentiment_score" column to transcriptions_df in place by calling
# simple_sentiment_analysis on each value of the "text" column.
transcriptions_df["sentiment_score"] = transcriptions_df["text"].apply(simple_sentiment_analysis)

# Convert score to label (positive, neutral, negative)
def score_to_label(score):
    """Translate a numeric sentiment score into a coarse label.

    Scores of 0.3 or less are "negative", scores of 0.7 or more are
    "positive", and anything in between is "neutral".
    """
    # The two bands cannot overlap, so each check can return immediately.
    if score <= 0.3:
        return "negative"
    if score >= 0.7:
        return "positive"
    return "neutral"
        
# Adds a "sentiment" label column in place, derived from "sentiment_score"
# through score_to_label.
transcriptions_df["sentiment"] = transcriptions_df["sentiment_score"].apply(score_to_label)

# Display results
# The bare expression on the last line is what the notebook renders; the
# "audio_path" column is left out of this view.
transcriptions_df[["filename", "emotion", "text", "sentiment_score", "sentiment"]]

## 5. Create an End-to-End Pipeline

In [None]:
def analyze_speech_sentiment(audio_path):
    """Run the full speech-to-sentiment pipeline on one audio file.

    Shows an audio player for the file, transcribes it with the notebook-level
    ``transcriber``, scores the transcript with ``simple_sentiment_analysis``,
    and converts the score to a label with ``score_to_label``. Progress and
    results are printed along the way.

    Args:
        audio_path: Path of the audio file to analyze.

    Returns:
        A dict with the keys "audio_path", "transcription",
        "sentiment_score" and "sentiment_label".
    """
    # Let the reader listen to the clip being analyzed
    display(Audio(audio_path))

    # Step 1: speech -> text
    print("Transcribing audio...")
    transcription = transcriber.transcribe_file(audio_path)["text"]
    print(f"Transcription: {transcription}")

    # Step 2: text -> score -> label
    print("Analyzing sentiment...")
    score = simple_sentiment_analysis(transcription)
    label = score_to_label(score)
    print(f"Sentiment: {label} (score: {score:.2f})")

    return dict(
        audio_path=audio_path,
        transcription=transcription,
        sentiment_score=score,
        sentiment_label=label,
    )

In [None]:
# Test the pipeline on a sample
# Select the clip by its emotion label instead of by its position in the dict,
# so the choice does not depend on the order of the download list.
sample_path = next(path for path, emotion in audio_paths.items() if emotion == "happy")  # Happy sample
result = analyze_speech_sentiment(sample_path)
# The newline is written as an escape; a literal line break inside the quotes
# is an unterminated string and stops the cell from parsing.
print("\nResult:")
result

## 6. Next Steps

1. **Integrate with Text Model**: Replace the simple rule-based sentiment analyzer with your trained model from Part 1
2. **Create a Web Interface**: Implement the Flask web app for uploading and analyzing audio
3. **Support Longer Audio**: Add functionality to process longer audio files
4. **Compare with LLMs**: In Part 3, you will compare this approach with LLM-based sentiment analysis