In [1]:
from ASR.models.asr_model import HindiASRModel
from transformers import MarianMTModel, MarianTokenizer
import torch
import numpy as np
import joblib
import soundfile
import numpy as np
import librosa
import os
from sklearn.model_selection import train_test_split
import glob
from inference import infer

  checkpoint = torch.load(fp, map_location=device)


ESR Loaded model from ./ESR/result/random_forest_model.joblib




Model loaded on cuda


### Loading all the models

In [2]:
asr = HindiASRModel(model_size="medium")

int2emotion_new = {
    "anger": "angry",
    "sad": "sad",
    "happy": "happy",
    "neutral": "calm"
}

# we allow only these four emotions
AVAILABLE_EMOTIONS_NEW = set(int2emotion_new.values())

def extract_feature(file_name, **kwargs):
    """
    Extract feature from audio file `file_name`
    Features supported:
        - MFCC (mfcc)
        - Chroma (chroma)
        - MEL Spectrogram Frequency (mel)
        - Contrast (contrast)
        - Tonnetz (tonnetz)
    e.g:
    `features = extract_feature(path, mel=True, mfcc=True)`
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        
        if chroma or contrast:
            stft = np.abs(librosa.stft(X))
            
        result = np.array([])
        
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
            
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
            
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, contrast))
            
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
            result = np.hstack((result, tonnetz))
            
    return result


model_filename = "./ESR/result/random_forest_model.joblib" # achieved around 0.75 accuracy with high precision and recall for this model
loaded_classifier = joblib.load(model_filename)
print(f"ESR Loaded model from {model_filename}")


model_name = "Helsinki-NLP/opus-mt-hi-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

print(f"Model loaded on {device}")


def translate_hindi_to_english(text):
    """
    Translate Hindi text to English using the pretrained model
    
    Args:
        text (str): Hindi text to be translated
        
    Returns:
        str: Translated English text
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs)
    
    translated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)    
    return translated_text[0]

ESR Loaded model from ./ESR/result/random_forest_model.joblib
Model loaded on cuda


### Sample Inference

In [3]:
# path of audio 
sample_audio_path = "./sample_audio/5.3.sad-06.wav"
print("Performing Inference on ",sample_audio_path)

# ASR
hindi_text = asr.transcribe(sample_audio_path)
print("ASR output (Hindi text) : ",hindi_text)

# ESR
X = []
features = extract_feature(sample_audio_path, mfcc=True, chroma=True, mel=True)
X.append(features)
loaded_prediction = loaded_classifier.predict(X)
print("Emotion Predicted :", loaded_prediction[0])

# MT
english_translation = translate_hindi_to_english(hindi_text)
print("English Translation : ",english_translation)

Performing Inference on  ./sample_audio/5.3.sad-06.wav
ASR output (Hindi text) :   मैंने ये पाट पूरा नहीं पढ़ा है।
Emotion Predicted : sad
English Translation :  I haven't read that piece.


In [11]:
sample_audio_path = "./sample_audio/5.3.happy-06.wav"
print("Performing Inference on ",sample_audio_path)
infer(sample_audio_path)


Performing Inference on  ./sample_audio/5.3.happy-06.wav
Hindi text :   मैंने ये पाट पूरा नहीं पढ़ा है।
Emotion Predicted : happy
English Translation :  I haven't read that piece.


In [5]:
sample_audio_path = "./sample_audio/6.4.neutral-03.wav"
print("Performing Inference on ",sample_audio_path)
infer(sample_audio_path)

Performing Inference on  ./sample_audio/6.4.neutral-03.wav
Hindi text :   मुझे अच्छे अंक लाने हैं
Emotion Predicted : calm
English Translation :  Let me bring a good score.


In [14]:
sample_audio_path = "./sample_audio/4.5.anger-07.wav"
print("Performing Inference on ",sample_audio_path)
infer(sample_audio_path)

Performing Inference on  ./sample_audio/4.5.anger-07.wav
Hindi text :   अच्छा विद्यार्थी अहंकार से दूर रहता हैं।
Emotion Predicted : angry
English Translation :  Good students are far from pride.
