<a href="https://colab.research.google.com/github/MITHRAMA/-multilingual-speech-recognition_/blob/main/multilingual_speech_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool hidden states along dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are replaced by 0.0 before the
    sum, and the sum is divided by the per-row count of attended positions.

    Args:
        last_hidden_states: Hidden states with one more trailing dimension
            than ``attention_mask`` (presumably ``(batch, seq_len, hidden)``
            -- TODO confirm against the caller).
        attention_mask: Mask whose non-zero entries mark positions to keep.

    Returns:
        The masked sum over dim 1 divided by ``attention_mask.sum(dim=1)``.
    """
    # Boolean mask that is True exactly where a position must be ignored.
    ignored = attention_mask.unsqueeze(-1).bool().logical_not()
    zeroed = last_hidden_states.masked_fill(ignored, 0.0)

    # Number of kept positions per row, shaped to broadcast over the last dim.
    kept_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    summed = zeroed.sum(dim=1)
    return summed / kept_counts

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line instruction prompt for a query.

    The result is ``Instruct: <task_description>`` on the first line and
    ``Query: <query>`` on the second, separated by a single newline.
    """
    template = 'Instruct: {}\nQuery: {}'
    return template.format(task_description, query)

# Stand-in for a speech front end: no audio is processed, the text is only labelled.
def simulate_speech_input(language, query):
    """Return ``query`` prefixed with a ``Speech Input (<language>): `` label."""
    labelled = "Speech Input ({}): {}".format(language, query)
    return labelled

# The same question, as simulated speech transcripts in English and Telugu.
english_query = simulate_speech_input('English', 'How much protein should a female eat?')
telugu_query = simulate_speech_input('Telugu', 'మహిళలు ఎంత ప్రోటీన్ తినాలి?')

# Dummy RAG document
rag_document = """
RAG Document:
1. Introduction to Protein Requirements
   - Overview of daily protein needs based on age, gender, and activity level.
2. Protein Requirements for Females
   - Specific guidelines for females, including recommended daily intake and sources of protein.
3. Factors Affecting Protein Needs
   - Discusses factors like pregnancy, breastfeeding, and physical activity influencing protein requirements.
4. Conclusion
   - Summarizes the importance of meeting daily protein needs for optimal health.
"""

# Load tokenizer and model. This is the multilingual E5 *text-embedding*
# model (not a speech model such as Whisper): it embeds text, so the
# "speech" queries above are plain strings.
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large-instruct')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large-instruct')

# One-sentence task description prepended to every query. The instruct
# variant of E5 expects queries in "Instruct: ...\nQuery: ..." form, while
# documents are embedded without any instruction.
task = 'Given a web search query, retrieve relevant passages that answer the query'

# Combine queries and document for processing. The document goes last so the
# similarity step below can split queries from the document by position.
# NOTE(review): the "Speech Input (<language>): " label is embedded together
# with the question text; consider embedding the raw transcript instead.
input_texts = [
    get_detailed_instruct(task, english_query),
    get_detailed_instruct(task, telugu_query),
    rag_document,
]

# Tokenize input texts
batch_dict = tokenizer(input_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)

# Get model outputs. Pool with the attention-mask-aware mean over tokens
# (average_pool) rather than the first-token state: the batch is padded, and
# masked mean pooling is the documented pooling for this model.
with torch.no_grad():
    outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# Normalize embeddings so the dot product below is a cosine similarity
embeddings = F.normalize(embeddings, p=2, dim=1)

# Calculate similarity scores of every query against the document (last row).
# A matrix-vector product always yields a 1-D tensor, so `scores` stays a
# list even when there is only a single query.
scores = (embeddings[:-1] @ embeddings[-1]).tolist()

# Print similarity scores
print("Similarity Scores:")
for label, text, score in zip(("English", "Telugu"), (english_query, telugu_query), scores):
    print(f"{label} Query: {text}\nScore: {score}")


Similarity Scores:
English Query: Speech Input (English): How much protein should a female eat?
Score: 0.8464750647544861
Telugu Query: Speech Input (Telugu): మహిళలు ఎంత ప్రోటీన్ తినాలి?
Score: 0.8068825006484985
