<a href="https://colab.research.google.com/github/Kavya-sri-05/genai/blob/main/multimode_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from flask import Flask, request, jsonify
import os
from io import BytesIO
from pydub import AudioSegment
from PIL import Image
import pytesseract
import librosa
import numpy as np
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
import requests

app = Flask(__name__)

# Set your Gemini 1.5 Pro API key here securely using environment variables
GEMINI_API_KEY = os.getenv("AIzaSyBe7E_z4LXe2AXwfBxAWtHYr87Jfwug09M")  # Ensure you set the environment variable

# Initialize embedding model
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store = Chroma(persist_directory="./db", embedding_function=embedding_model)

def process_audio(audio_file):
    """Convert audio to text."""
    try:
        audio = AudioSegment.from_file(audio_file)
        audio = audio.set_channels(1).set_frame_rate(16000)
        audio.export("temp.wav", format="wav")
        audio_data, _ = librosa.load("temp.wav", sr=16000)
        # You can replace this with an actual transcription API (e.g., Google Speech-to-Text)
        return "Transcribed text from audio"  # Placeholder
    except Exception as e:
        return f"Error processing audio: {str(e)}"

def process_image(image_file):
    """Convert image to text using OCR."""
    try:
        image = Image.open(image_file)
        text = pytesseract.image_to_string(image)  # Extract text from the image using OCR
        return text if text else "No text found in image"
    except Exception as e:
        return f"Error processing image: {str(e)}"

def process_text(text):
    """Process and embed text."""
    return text

def embed_and_store(data, source):
    """Embed data and store in vector store."""
    try:
        embeddings = embedding_model.embed_documents([data])
        vector_store.add_texts([data], embeddings=embeddings, metadatas=[{"source": source}])
    except Exception as e:
        return f"Error embedding and storing data: {str(e)}"

def generate_response(query):
    """Generate a response based on the query."""
    try:
        docs = vector_store.similarity_search(query, k=3)
        return docs[0] if docs else "No relevant documents found."
    except Exception as e:
        return f"Error generating response: {str(e)}"

def multimodal_rag(input_data, input_type):
    """Process input data based on its type and generate a response."""
    try:
        if input_type == "audio":
            text_data = process_audio(input_data)
            source = "Audio"
        elif input_type == "image":
            text_data = process_image(input_data)
            source = "Image"
        elif input_type == "text":
            text_data = process_text(input_data)
            source = "Text"
        else:
            return "Unsupported input type"

        embed_and_store(text_data, source)
        return generate_response(text_data)
    except Exception as e:
        return f"Error processing input data: {str(e)}"

def query_gemini_api(prompt):
    """Query the Gemini 1.5 Pro API."""
    try:
        url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent"
        headers = {
            "Authorization": f"Bearer {GEMINI_API_KEY}",
            "Content-Type": "application/json",
        }
        data = {
            "model": "gemini-1.5-pro-latest",
            "prompt": prompt,
            "max_tokens": 100,
        }
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()
        return response.json().get("choices", [{}])[0].get("text", "")
    except requests.exceptions.RequestException as e:
        return f"Error querying Gemini API: {str(e)}"

@app.route('/upload', methods=['POST'])
def upload_file():
    try:
        file = request.files['file']
        input_type = request.form.get('type')

        if input_type == "audio":
            response = multimodal_rag(file, "audio")
        elif input_type == "image":
            response = multimodal_rag(file, "image")
        elif input_type == "text":
            text_input = file.read().decode('utf-8')
            response = multimodal_rag(text_input, "text")
        else:
            response = "Invalid input type specified."

        return jsonify({"response": response})

    except Exception as e:
        return jsonify({"error": f"Error handling upload: {str(e)}"})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


In [1]:
!pip install -U --quiet langchain langchain_community chromadb langchain-google-vertexai
!pip install --quiet "unstructured[all-docs]" pypdf pillow pydantic lxml matplotlib opencv-python tiktoken
!pip install -q langchain google-generativeai faiss-cpu sentence-transformers chromadb pydub librosa transformers torch


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2