In [1]:
import os
import io
import json
import logging
from traceback import format_exc
from dotenv import load_dotenv

# Web Server & Framework
from flask import Flask, request
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address

# API Clients
import requests
from twilio.rest import Client
from googleapiclient.discovery import build
import google.generativeai as genai

# Media & AI Processing
from PIL import Image
import pytesseract
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet18, ResNet18_Weights
from faster_whisper import WhisperModel

#Google Text to Speech
import uuid
from gtts import gTTS
from flask import send_from_directory

In [2]:
# --- INITIALIZATION AND CONFIGURATION ---

load_dotenv() # Load environment variables from .env file
app = Flask(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
limiter = Limiter(app=app, key_func=get_remote_address, default_limits=["2000 per day", "500 per hour"])




In [None]:
# --- API CLIENTS AND ENVIRONMENT VARIABLES ---

# Securely loading keys from environment variables
# --- Configuration: get keys from environment ---
TWILIO_ACCOUNT_SID = ""
TWILIO_AUTH_TOKEN = ""
GOOGLE_AI_API_KEY = ""
GOOGLE_API_KEY = ""
GOOGLE_CSE_ID = ""
TWILIO_WHATSAPP_NUMBER = 'whatsapp:+14155238886'

#GTTS Stuff
NGROK_PUBLIC_URL = "" 

# Create a directory to store the audio files
AUDIO_DIR = "/tmp/vaidya_audio" 
os.makedirs(AUDIO_DIR, exist_ok=True)

# Validate that all necessary credentials are set
if not all([TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN, GOOGLE_API_KEY, GOOGLE_CSE_ID, GOOGLE_AI_API_KEY]):
    raise EnvironmentError("One or more required environment variables are not set.")

In [4]:
# Initialize API clients
twilio_client = Client(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)
genai.configure(api_key=GOOGLE_AI_API_KEY)
gemini_model = genai.GenerativeModel('gemini-2.5-flash')
audio_model = WhisperModel("base", device="cuda", compute_type="float16")
vision_weights = ResNet18_Weights.DEFAULT
vision_model = resnet18(weights=vision_weights)
vision_model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [5]:
# --- CONVERSATION MEMORY ---
# Bot's memory is a simple dictionary that resets on server restart.
conversation_states = {}

In [6]:
# --- AI SERVICES ---

def get_conversational_reply(history):
    """Generates a natural, conversational reply from Gemini."""
    history_str = "\n".join([f"{msg['role']}: {msg['content']}" for msg in history])
    prompt = (
        "You are VaidyaAI, a warm, empathetic, and insightful AI health companion rooted in the wisdom of Ayurveda. Your goal is to have a natural, free-flowing conversation to help a user understand their health concerns."
        "\n\n**Your Core Rules:**\n"
        "1.  **Be Conversational:** Talk like a caring, knowledgeable guide, not a robot. Ask questions naturally to understand symptoms (nature, duration, intensity).\n"
        "2.  **Safety First:** NEVER give a medical diagnosis or prescribe specific dosages. Always include disclaimers and advise consulting a qualified practitioner.\n"
        "3.  **Stay Focused:** If the user talks about unrelated topics, gently guide them back by saying, 'That's interesting, but my main purpose is to help with your wellness. How can I assist you with that today?'\n"
        "4.  **Implicitly Understand:** When you have enough information and the user asks for a remedy, provide one in a structured, easy-to-read format using Markdown with bold asterisks for headings.\n\n"
        f"**Current Conversation:**\n{history_str}\nassistant: "
    )
    try:
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logging.error(f"Gemini API call failed: {e}")
        return "I'm sorry, I'm having trouble connecting to my knowledge base at the moment."

# --- MEDIA ANALYSIS ---

def transcribe_audio(audio_bytes):
    """Transcribes audio bytes to text using Whisper."""
    try:
        temp_audio_path = "/tmp/temp_audio.ogg"
        with open(temp_audio_path, "wb") as f: f.write(audio_bytes)
        segments, _ = audio_model.transcribe(temp_audio_path, beam_size=5)
        return "".join(segment.text for segment in segments).strip()
    except Exception: return "Error transcribing audio."

def process_image_bytes(image_bytes):
    """Performs OCR on the raw bytes of an image."""
    try:
        return pytesseract.image_to_string(Image.open(io.BytesIO(image_bytes))).strip()
    except Exception: return ""

def recognize_image_content(image_bytes):
    """Uses a ResNet model to identify the main object in an image."""
    try:
        labels = vision_weights.meta["categories"]
        preprocess = vision_weights.transforms()
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        batch = preprocess(image).unsqueeze(0)
        prediction = vision_model(batch).squeeze(0).softmax(0)
        class_id = prediction.argmax().item()
        score = prediction[class_id].item()
        return labels[class_id] if score > 0.1 else ""
    except Exception: return ""

def analyze_image(image_bytes):
    """Analyzes an image by first trying OCR, then falling back to image recognition."""
    ocr_text = process_image_bytes(image_bytes)
    return ocr_text if ocr_text and len(ocr_text) > 5 else recognize_image_content(image_bytes)

# --- UTILITIES ---

def soft_shorten_message(full_text):
    """
    Intelligently shortens a message to fit within the 1600 character limit
    by truncating at the end of the last full sentence that fits.
    """
    limit = 1600
    if len(full_text) <= limit:
        return full_text

    logging.warning("Message exceeds 1600 characters, performing soft shortening.")
    
    # Isolate the main body from the disclaimer we always want to include
    disclaimer = "\n\n*Disclaimer: This is AI-generated advice. Please consult a qualified practitioner.*"
    text_body = full_text.replace(disclaimer, "").strip()
    notice = "\n\n...(message shortened for brevity)"
    
    # Calculate the maximum possible length for the main text body
    max_body_len = limit - len(notice) - len(disclaimer)
    
    if len(text_body) <= max_body_len:
        return full_text # Should not happen, but a good safeguard

    # Find the last period (end of a sentence) before the maximum allowed length
    cut_off_point = text_body.rfind('.', 0, max_body_len)
    
    if cut_off_point != -1:
        # If a period is found, cut the message cleanly at the end of that sentence
        shortened_body = text_body[:cut_off_point + 1]
    else:
        # If no period is found (e.g., one very long paragraph), perform the hard truncate as a last resort
        shortened_body = text_body[:max_body_len]

    return shortened_body.strip() + notice + disclaimer

def find_image_url(query):
    """Uses Google Custom Search to find an image URL."""
    try:
        service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)
        res = service.cse().list(q=query, cx=GOOGLE_CSE_ID, searchType='image', num=1, safe='high').execute()
        return res['items'][0]['link'] if 'items' in res and len(res['items']) > 0 else None
    except Exception: return None

def get_media_content(message_sid):
    """Securely fetches media content from Twilio."""
    try:
        media_list = twilio_client.messages(message_sid).media.list()
        if not media_list: return None
        media_instance = media_list[0]
        media_url = f"https://api.twilio.com{media_instance.uri.replace('.json', '')}"
        response = requests.get(media_url, auth=(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN))
        response.raise_for_status()
        return response.content
    except Exception: return None

def send_whatsapp_message(to_number, message_body, media_url=None):
    """Sends a message via Twilio, optionally with a single media file (image or audio)."""
    try:
        message_args = {'from_': TWILIO_WHATSAPP_NUMBER, 'to': to_number, 'body': message_body}
        
        if media_url:
            # Check if it's a valid media type Twilio supports
            if media_url.lower().endswith(('.png', '.jpg', '.jpeg', '.mp3', '.ogg')):
                 message_args['media_url'] = [media_url]
            else:
                logging.warning(f"Unsupported media_url skipped: {media_url}")
                
        twilio_client.messages.create(**message_args)
    except Exception as e:
        logging.error(f"Twilio API error: {format_exc()}")

@app.route('/audio/<filename>')
def serve_audio(filename):
    """Serves static audio files from the tmp directory."""
    try:
        return send_from_directory(AUDIO_DIR, filename)
    except FileNotFoundError:
        return "Audio file not found", 404

def generate_tts_audio_url(text_to_speak):
    """Generates a TTS audio file, saves it, and returns the public URL."""
    try:
        if not NGROK_PUBLIC_URL or "YOUR-UNIQUE-ID" in NGROK_PUBLIC_URL:
            logging.warning("NGROK_PUBLIC_URL not set. Skipping TTS.")
            return None

        # Clean text for speech
        text_to_speak = text_to_speak.replace("*", "") # Remove markdown

        tts = gTTS(text=text_to_speak, lang='en')
        filename = f"{uuid.uuid4()}.mp3"
        filepath = os.path.join(AUDIO_DIR, filename)
        tts.save(filepath)

        # Build the public URL for Twilio
        public_url = f"{NGROK_PUBLIC_URL}/audio/{filename}"
        return public_url
    except Exception as e:
        logging.error(f"TTS generation failed: {e}")
        return None

In [7]:
# --- MAIN WEBHOOK ---

@app.route('/webhook', methods=['POST'])
@limiter.limit("100 per minute")
def webhook():
    from_number = request.form.get('From')
    incoming_msg_body = request.form.get('Body', '').strip()
    
    if from_number not in conversation_states:
        conversation_states[from_number] = {'history': []}
    state_data = conversation_states[from_number]
    history = state_data['history']
    
    media_text = ""
    is_audio_message = False # Flag to track if user sent audio
    
    if int(request.form.get('NumMedia', 0)) > 0:
        media_bytes = get_media_content(request.form.get('MessageSid')) 
        if media_bytes:
            media_type = request.form.get('MediaContentType0', '')
            if 'audio' in media_type:
                is_audio_message = True
                media_text = transcribe_audio(media_bytes)
            else:
                media_text = analyze_image(media_bytes)

    # --- Format the full incoming message with context ---
    full_incoming_msg = ""
    if media_text and incoming_msg_body:
        full_incoming_msg = f"User sent a media file (content analyzed as: '{media_text}') and also typed: '{incoming_msg_body}'"
    elif media_text:
        full_incoming_msg = f"User sent a media file. The content I extracted from it is: '{media_text}'. Please respond to this in the context of our conversation."
    elif incoming_msg_body:
        full_incoming_msg = incoming_msg_body
    else:
        return '', 200 # Ignore empty messages

    history.append({'role': 'user', 'content': full_incoming_msg})
    
    reply_text, image_url = "", None
    
    if full_incoming_msg.lower().startswith("show me a picture of"):
        query = full_incoming_msg[len("show me a picture of"):].strip()
        if query:
            image_url = find_image_url(query) 
            if image_url:
                reply_text = f"Here is a picture of {query}."
            else:
                reply_text = f"I'm sorry, I couldn't find a picture of {query}."
        else:
            reply_text = "Please tell me what you want to see a picture of."
    else:
        reply_text = get_conversational_reply(history)

    if not reply_text: 
        reply_text = "I'm sorry, I'm having a bit of trouble at the moment."
    
    history.append({'role': 'assistant', 'content': reply_text})
    
    disclaimer = "\n\n*Disclaimer: This is AI-generated advice. Please consult a qualified practitioner.*"
    full_reply = reply_text + disclaimer
    final_message = soft_shorten_message(full_reply)
    
    # --- Media Reply Logic: Prioritize Image, then Audio ---
    media_to_send_url = None
    if image_url:
        # Priority 1: Send the image if one was found
        media_to_send_url = image_url
    elif is_audio_message:
        # Priority 2: If user sent audio (and we're not sending an image), reply with audio
        # We send the *original* text to TTS, not the one with the disclaimer.
        media_to_send_url = generate_tts_audio_url(reply_text)

    send_whatsapp_message(from_number, final_message, media_to_send_url)
    
    return '', 200

In [None]:
# --- MAIN EXECUTION ---
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.17.213.47:5000
2025-11-16 14:54:10,363 - INFO - [33mPress CTRL+C to quit[0m
