In [None]:
from IPython import get_ipython
from IPython.display import display
import tensorflow as tf
import librosa
import numpy as np
import soundfile as sf
import torch
import torchaudio
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from IPython.display import Audio
from langchain_core.runnables import Runnable
from langchain_core.runnables import RunnableLambda, RunnableMap

In [None]:
from google.colab import files

# Opens Colab's interactive file-upload dialog; `uploaded` holds whatever
# files.upload() returns for the selected files.
# Presumably this is where 'my_model.h5' (loaded in the next cell) is provided — TODO confirm.
uploaded = files.upload()

In [None]:

# Load the emotion recognition model.
# 'my_model.h5' is a relative path, so the file must be present in the
# current working directory before this cell runs.
emotion_model = tf.keras.models.load_model('my_model.h5')

# Print the model summary to verify the load (the layer shapes shown here are
# what the inputs built by the predict functions below have to match).
emotion_model.summary()



In [None]:
# Emotion labels, looked up by the index returned from np.argmax over the
# model's prediction in the predict functions below.
# NOTE(review): the original comment marked this list as "Example classes". The
# order must match the label encoding used when 'my_model.h5' was trained —
# confirm against the training code, otherwise predictions get the wrong names.
emotion_classes = ['neutral', 'calm', 'sad',  'fear', 'disgust', 'happy', 'angry','surprise']

In [None]:
from google.colab import files
# Second upload dialog. Presumably for the audio clip ('audio2.wav') used by the
# cells below — TODO confirm.
# Note: this rebinds `uploaded`, replacing the result of the earlier upload cell.
uploaded = files.upload()


In [None]:
# Path of the sample clip used by the demo cells below (relative to the
# current working directory).
audio = 'audio2.wav'
# Last expression of the cell: displays an audio widget for the clip.
Audio(audio)

In [None]:
# === Step 2: Speech-to-Text (ASR) ===
# Whisper-base speech recogniser; placed on GPU 0 when CUDA is available,
# otherwise on the CPU (device=-1).
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=0 if torch.cuda.is_available() else -1,
)

def speech_to_text(audio_path: str) -> str:
    """Transcribe an audio file with the module-level ``asr_pipeline``.

    Args:
        audio_path: Path of the audio file handed to the ASR pipeline.

    Returns:
        The value stored under the ``"text"`` key of the pipeline result.
    """
    return asr_pipeline(audio_path)["text"]

# %%
# Quick check: transcribe the sample clip (`audio`, set in an earlier cell) and
# print the recognised text.
print(speech_to_text(audio))

In [None]:
# === Step 3: Text Response Generator (LLM) ===
# Tokenizer and causal language model for the Zephyr-7B-alpha checkpoint.
# Weights are loaded as float16 when CUDA is available and as float32 otherwise.
llm_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")
llm_model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-alpha",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

In [None]:
def generate_response(inputs: dict) -> str:
    """Generate an assistant reply conditioned on the transcript and detected emotion.

    Args:
        inputs: Mapping with two keys:
            ``"text"``      - what the user said (transcript).
            ``"sentiment"`` - emotion label for the user's voice; may be ``None``
                              when emotion prediction failed.

    Returns:
        The text generated by ``llm_model`` after the prompt, stripped of
        surrounding whitespace.
    """
    text = inputs["text"]
    # predict_emotion returns None when feature extraction fails; fall back to a
    # neutral tone instead of telling the model that the user "sounds None".
    sentiment = inputs["sentiment"] or "neutral"

    prompt = f"You are a helpful assistant. The user sounds {sentiment}. Respond appropriately.\nUser said: {text}\nAssistant:"
    encoded = llm_tokenizer(prompt, return_tensors="pt")
    input_ids = encoded.input_ids.to(llm_model.device)
    # Pass the attention mask explicitly so generate() does not have to guess it.
    attention_mask = encoded.attention_mask.to(llm_model.device)

    with torch.no_grad():
        output = llm_model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=100,
            do_sample=True,
        )

    # The returned sequence starts with the prompt tokens. Decode only what was
    # generated after them rather than splitting the decoded text on
    # "Assistant:", which breaks if the transcript or the reply contains that marker.
    generated_ids = output[0][input_ids.shape[-1]:]
    return llm_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

In [None]:
def extract_features(file_path, max_pad_len=174):
    """Extract a fixed-width MFCC matrix from an audio file.

    The file is read with soundfile, multi-channel audio is down-mixed to mono,
    40 MFCCs are computed, and the time axis (second axis) is zero-padded or
    truncated to exactly ``max_pad_len`` columns.

    Args:
        file_path: Path of an audio file readable by ``soundfile``.
        max_pad_len: Number of columns in the returned matrix.

    Returns:
        The MFCC array with ``max_pad_len`` columns, or ``None`` if any step
        fails (the error is printed, not raised).
    """
    try:
        with sf.SoundFile(file_path) as sound_file:
            X = sound_file.read(dtype="float32")
            sample_rate = sound_file.samplerate

        if X.ndim > 1:
            # soundfile returns multi-channel audio as (frames, channels), while
            # librosa.to_mono averages over every axis except the last one. On
            # this layout it would collapse the signal to one value per channel,
            # so average across the channel axis explicitly.
            X = X.mean(axis=1)

        mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)

        n_frames = mfccs.shape[1]
        if n_frames < max_pad_len:
            # Too short: zero-pad on the right of the time axis.
            pad_width = max_pad_len - n_frames
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            # Long enough: keep the first max_pad_len frames.
            mfccs = mfccs[:, :max_pad_len]

        return mfccs
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller handle None.
        print(f"Error processing file {file_path}: {e}")
        return None

def predict_emotion(audio_path):
    """Classify the emotion of an audio file from its frame-averaged MFCCs.

    Prints the predicted label and its confidence.

    NOTE(review): a later cell redefines ``extract_features`` with a
    ``(data, sr)`` signature; this function needs the path-based version, so
    calling it after that cell has run will fail.
    NOTE(review): the augmentation-based predictor later in the notebook feeds
    the same model inputs of length 162, whereas this one sends 174 steps —
    confirm which length the model actually accepts.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        The predicted label from ``emotion_classes``, or ``None`` when feature
        extraction failed.
    """
    mfcc_matrix = extract_features(audio_path)
    if mfcc_matrix is None:
        print("Feature extraction failed.")
        return None

    # Average over the coefficient axis to get one value per frame, then add
    # batch and channel axes: (frames,) -> (1, frames, 1).
    per_frame = np.mean(mfcc_matrix, axis=0)
    model_input = per_frame[np.newaxis, :, np.newaxis]

    # Score with the global model and pick the most probable class.
    scores = emotion_model.predict(model_input)[0]
    best_index = np.argmax(scores)
    label = emotion_classes[best_index]
    top_score = np.max(scores)

    print(f"Predicted Emotion: {label}")
    print(f"Confidence: {top_score:.4f}")
    return label


In [None]:
# LangChain wiring: every stage is wrapped as a Runnable so stages compose with `|`.
# Each branch takes the audio path and wraps its result in a single-key dict.
speech_to_text_chain = RunnableLambda(
    lambda path: {"text": speech_to_text(path)}
)
sentiment_chain = RunnableLambda(
    lambda path: {"sentiment": predict_emotion(path)}
)

# The map below nests the branch outputs as
# {"text": {"text": ...}, "sentiment": {"sentiment": ...}};
# flatten that into the {"text", "sentiment"} dict generate_response reads.
merge_chain = RunnableLambda(
    lambda branches: {
        "text": branches["text"]["text"],
        "sentiment": branches["sentiment"]["sentiment"],
    }
)

# Final stage: turn transcript + emotion into the assistant's reply.
llm_response_chain = RunnableLambda(generate_response)

# Full chain: audio path -> both branches -> flatten -> LLM reply.
full_chain: Runnable = (
    RunnableMap({"text": speech_to_text_chain, "sentiment": sentiment_chain})
    | merge_chain
    | llm_response_chain
)

In [None]:
# Run the whole pipeline (transcription + emotion -> LLM reply) on one clip.
# Note: same file name as the `audio` variable defined in an earlier cell.
audio_file = "audio2.wav"  # Replace with your file
result = full_chain.invoke(audio_file)
print("Generated Response:", result)

# Preprocessing

# Feature extraction

In [None]:


# Load the emotion model. Only the model is loaded here: the class labels are
# not, so `emotion_classes` from the earlier cell is what the predictor below uses.
# Note: this re-loads the same 'my_model.h5' file that an earlier cell already
# bound to `emotion_model`.
emotion_model = tf.keras.models.load_model('my_model.h5')


# Augmentation functions
def noise(data):
    """Return ``data`` with random Gaussian noise added.

    The noise amplitude is ``0.035 * u * max(data)`` where ``u`` is one draw
    from ``np.random.uniform()``. One normal sample is drawn per element along
    the first axis of ``data``.

    Args:
        data: Audio samples as a NumPy array.

    Returns:
        A new array equal to ``data`` plus the scaled noise.
    """
    amplitude = 0.035 * np.random.uniform() * np.amax(data)
    gaussian = np.random.normal(size=data.shape[0])
    return data + amplitude * gaussian

def stretch(data, rate=0.8):
    """Apply librosa's time-stretch effect to ``data``.

    Args:
        data: Audio samples.
        rate: Stretch factor forwarded to ``librosa.effects.time_stretch``.

    Returns:
        The time-stretched audio samples.
    """
    stretched = librosa.effects.time_stretch(y=data, rate=rate)
    return stretched

def pitch(data, sr, n_steps=2):
    """Apply librosa's pitch-shift effect to ``data``.

    Args:
        data: Audio samples.
        sr: Sampling rate of ``data``.
        n_steps: Number of steps forwarded to ``librosa.effects.pitch_shift``.

    Returns:
        The pitch-shifted audio samples.
    """
    shifted = librosa.effects.pitch_shift(y=data, sr=sr, n_steps=n_steps)
    return shifted

# Feature extraction - make sure output length matches 162 features
def extract_features(data, sr, n_mfcc=40, desired_length=162):
    """Build one fixed-length feature vector from an audio signal.

    The time-averaged values of five librosa features are concatenated in this
    order: zero-crossing rate, chroma (from the STFT magnitude), MFCC, RMS
    energy and mel spectrogram. The vector is then zero-padded or truncated to
    ``desired_length``.

    NOTE(review): this definition replaces the earlier path-based
    ``extract_features(file_path, max_pad_len)`` from a previous cell, which
    ``predict_emotion`` relies on.
    NOTE(review): assuming librosa's defaults (12 chroma bins, 128 mel bands),
    the stacked vector has ``1 + 12 + n_mfcc + 1 + 128`` entries. With
    ``n_mfcc=40`` that is 182, so the last 20 mel values are always cut off.
    With ``n_mfcc=20`` the length would be exactly 162. Confirm which layout the
    model was trained on and pass ``n_mfcc`` accordingly.

    Args:
        data: Audio samples.
        sr: Sampling rate of ``data``.
        n_mfcc: Number of MFCC coefficients to compute (default keeps the
            original behaviour).
        desired_length: Length of the returned vector (default keeps the
            original behaviour).

    Returns:
        1-D float array of length ``desired_length``.
    """
    def _frame_mean(feature):
        # Average each feature row over time (frames).
        return np.mean(feature.T, axis=0)

    stft_magnitude = np.abs(librosa.stft(data))

    parts = [
        _frame_mean(librosa.feature.zero_crossing_rate(y=data)),
        _frame_mean(librosa.feature.chroma_stft(S=stft_magnitude, sr=sr)),
        _frame_mean(librosa.feature.mfcc(y=data, sr=sr, n_mfcc=n_mfcc)),
        _frame_mean(librosa.feature.rms(y=data)),
        _frame_mean(librosa.feature.melspectrogram(y=data, sr=sr)),
    ]
    # Leading empty float array keeps the float64 result dtype of the original
    # incremental hstack.
    result = np.hstack([np.array([])] + parts)

    if len(result) < desired_length:
        # Too short: pad with zeros on the right.
        result = np.pad(result, (0, desired_length - len(result)), mode='constant')
    else:
        # Long enough: keep only the first desired_length values.
        result = result[:desired_length]

    return result

# Get features for original + augmented audios
def get_features(path):
    """Compute feature vectors for an audio file and two augmented variants.

    A 2.5 s excerpt starting 0.6 s into the file is loaded, then features are
    extracted from (1) the excerpt as-is, (2) the excerpt with added noise and
    (3) the excerpt time-stretched and then pitch-shifted.

    Args:
        path: Path of the audio file.

    Returns:
        Array stacking the three feature vectors, one per row.
    """
    signal, rate = librosa.load(path, duration=2.5, offset=0.6)

    # The list literal is evaluated in order: original, noisy, stretched + pitched.
    rows = [
        extract_features(signal, rate),
        extract_features(noise(signal), rate),
        extract_features(pitch(stretch(signal), rate), rate),
    ]
    return np.array(rows)  # Shape: (3, 162)

# Predict emotion averaging augmented inputs
def predict_emotion_with_augmentation(audio_path):
    """Predict an emotion by averaging model outputs over augmented inputs.

    Features for the original clip and its augmented variants come from
    ``get_features``. The model scores every variant and the scores are
    averaged before the most probable class is picked. Shapes, label and
    confidence are printed.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        The predicted label from ``emotion_classes``.
    """
    batch = get_features(audio_path)
    print("Features shape before expand_dims:", batch.shape)  # Should be (3, 162)

    # Add a trailing channel axis for Conv1D: (batch, time_steps, channels).
    batch = batch[..., np.newaxis]  # (3, 162, 1)
    print("Features shape after expand_dims:", batch.shape)

    # One row of class scores per variant; average them into a single vector.
    per_variant = emotion_model.predict(batch)  # (3, num_classes)
    averaged = np.mean(per_variant, axis=0)

    label = emotion_classes[np.argmax(averaged)]
    top_score = np.max(averaged)

    print(f"Predicted Emotion: {label}")
    print(f"Confidence: {top_score:.4f}")

    return label


In [None]:
# Classify the sample clip (`audio`, set in an earlier cell) with the
# augmentation-averaged predictor, then display an audio widget for the clip.
emotion = predict_emotion_with_augmentation(audio)
Audio(audio)