In [3]:
import torch
# Ask PyTorch to release GPU memory that its caching allocator is holding but
# no longer using.
torch.cuda.empty_cache()

In [2]:
# Launch the assistant without request logging
import torch
from transformers import BitsAndBytesConfig, pipeline
import warnings
warnings.filterwarnings("ignore")
from PIL import Image
import re
import whisper
import gradio as gr
import os
from gtts import gTTS

# Load the LLaVA weights in 4-bit precision (bitsandbytes) with float16 compute.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Image-to-text pipeline backed by the LLaVA 1.5 7B checkpoint.
model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline(
    "image-to-text",
    model=model_id,
    model_kwargs=dict(quantization_config=quantization_config),
)

# Run Whisper on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# print(f"Using torch {torch.__version__} ({DEVICE})")  # uncomment to report the runtime

# Speech-to-text model ("base" checkpoint) placed on the selected device.
whisper_model = whisper.load_model("base", device=DEVICE)

def transcribe(audio_path):
    """Transcribe an English audio clip to text with the loaded Whisper model.

    Args:
        audio_path: Path to an audio file readable by ``whisper.load_audio``.
            A falsy value (``None`` or ``""``) means no audio was supplied.

    Returns:
        str: The decoded text, or an empty string when ``audio_path`` is falsy.
    """
    # Every path returns a plain string so callers can treat the result
    # uniformly (the previous guard returned a 3-tuple here).
    if not audio_path:
        return ''

    audio = whisper.load_audio(audio_path)
    # pad_or_trim fits the waveform to Whisper's fixed input window
    # (30 seconds by default), so longer recordings are cut off.
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # DecodingOptions defaults to fp16=True. Keep that on CUDA, but decode in
    # full precision when the model was loaded on the CPU fallback device.
    use_fp16 = whisper_model.device.type == "cuda"
    options = whisper.DecodingOptions(language="en", fp16=use_fp16)
    result = whisper.decode(whisper_model, mel, options)
    return result.text

def text_to_speech(text, file_path):
    """Synthesize English speech for ``text`` with gTTS and save it to disk.

    Args:
        text: The text to be spoken.
        file_path: Destination path passed to ``gTTS.save``.

    Returns:
        The same ``file_path`` that was passed in.
    """
    # Normal-speed ('slow=False') English voice, written straight to file_path.
    gTTS(text=text, lang='en', slow=False).save(file_path)
    return file_path

def img2txt(input_text, input_image):
    """Ask the LLaVA pipeline to respond to a prompt about an image.

    Args:
        input_text: The user's question or instruction; it is embedded in the
            instruction sentence of the prompt.
        input_image: Anything ``PIL.Image.open`` accepts (the caller passes a
            file path).

    Returns:
        str: The text that follows the first ``ASSISTANT:`` marker in the
        generated output, with surrounding whitespace removed. Returns
        ``"No response found."`` when the marker is absent and
        ``"No response generated."`` when the pipeline produced no text.
    """
    prompt_instructions = f"Act as an expert in imagery descriptive analysis, respond to the prompt: {input_text}"
    prompt = f"USER: <image>\n{prompt_instructions}\nASSISTANT:"

    # Use the image as a context manager so its file handle is closed once the
    # pipeline has consumed it.
    with Image.open(input_image) as image:
        outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 6000})

    generated_text = outputs[0]["generated_text"] if outputs else ""
    if not generated_text:
        return "No response generated."

    # re.DOTALL lets '.' span newlines; without it a multi-line answer was
    # silently truncated at its first line break.
    match = re.search(r'ASSISTANT:\s*(.*)', generated_text, re.DOTALL)
    if match is None:
        return "No response found."
    return match.group(1).strip()

# Original machine-specific fallback image; still preferred when it exists.
_DEFAULT_IMAGE_PATH = "C:/Users/kmano/OneDrive/Documents/Certificates/blk.png"


def _default_image_path():
    """Return a usable image path for requests that arrive without an image."""
    if os.path.isfile(_DEFAULT_IMAGE_PATH):
        return _DEFAULT_IMAGE_PATH
    # The hard-coded file only exists on the author's machine. Elsewhere, fall
    # back to a generated solid-black placeholder instead of failing.
    # NOTE(review): assumes blk.png is a blank/black image - confirm.
    import tempfile
    placeholder = os.path.join(tempfile.gettempdir(), "llava_default_placeholder.png")
    if not os.path.isfile(placeholder):
        # Dimensions are arbitrary; the placeholder carries no content.
        Image.new("RGB", (336, 336)).save(placeholder)
    return placeholder


def process_inputs(text_input, audio_path, image_path):
    """Gradio callback: combine text/voice/image inputs into a spoken answer.

    Args:
        text_input: Typed prompt; may be empty or ``None``.
        audio_path: Path to a recorded/uploaded audio file, or a falsy value.
        image_path: Path to the uploaded image, or a falsy value.

    Returns:
        tuple: ``(speech_to_text_output, generated_response, audio_file)``.
        ``speech_to_text_output`` is the transcription (``''`` without audio),
        ``generated_response`` is the text from ``img2txt``, and ``audio_file``
        is the path of the synthesized speech or ``None`` when the response
        is empty.
    """
    speech_to_text_output = transcribe(audio_path) if audio_path else ''

    # Typed text wins over speech; a fixed instruction is used when neither is
    # available. None and whitespace-only text count as "not provided".
    typed_text = (text_input or '').strip()
    combined_input = typed_text or speech_to_text_output or "Describe this image."

    if not image_path:
        image_path = _default_image_path()  # Default image if none provided
    chatgpt_output = img2txt(combined_input, image_path)

    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3") if chatgpt_output else None

    return speech_to_text_output, chatgpt_output, processed_audio_path

# Gradio front end: the three inputs are passed to process_inputs in order, and
# its three return values fill the three outputs in order.
iface = gr.Interface(
    title="Intelligent Voice and Text Assistant using Llava",
    description="Upload an image, provide voice input, or type a prompt to receive a detailed analysis and audio response.",
    fn=process_inputs,
    inputs=[
        gr.Textbox(label="Text Input"),
        gr.Audio(type="filepath", label="Audio Input"),
        gr.Image(type="filepath", label="Image Input"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text Output"),
        gr.Textbox(label="Generated Response"),
        gr.Audio(label="Text to Speech Output"),
    ],
)

# share=True also requests a public gradio.live URL in addition to the local one.
iface.launch(share=True, debug=True)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 3/3 [00:37<00:00, 12.52s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://ecd3e9415d4e09c72c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://ecd3e9415d4e09c72c.gradio.live




In [1]:
import torch
# Ask PyTorch to release GPU memory that its caching allocator is holding but
# no longer using.
torch.cuda.empty_cache()

In [2]:
# Launch the assistant with request logging
import torch
from transformers import BitsAndBytesConfig, pipeline
import warnings
warnings.filterwarnings("ignore")
from PIL import Image
import re
import whisper
import gradio as gr
import os
from gtts import gTTS
import socket

def get_ip_address():
    """Return the IPv4 address that this machine's hostname resolves to.

    This is the address of the host running the notebook, not of a remote
    client connecting to the app.

    Returns:
        str: The resolved address, or ``"unknown"`` when the hostname cannot
        be resolved.
    """
    try:
        return socket.gethostbyname(socket.gethostname())
    except OSError:
        # socket.gaierror / socket.herror are OSError subclasses. A failed
        # lookup should not abort the request that only wants a log field.
        return "unknown"
# Load the LLaVA weights in 4-bit precision (bitsandbytes) with float16 compute.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Image-to-text pipeline backed by the LLaVA 1.5 7B checkpoint.
model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline(
    "image-to-text",
    model=model_id,
    model_kwargs=dict(quantization_config=quantization_config),
)

# Run Whisper on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# print(f"Using torch {torch.__version__} ({DEVICE})")  # uncomment to report the runtime

# Speech-to-text model ("base" checkpoint) placed on the selected device.
whisper_model = whisper.load_model("base", device=DEVICE)

def transcribe(audio_path):
    """Transcribe an English audio clip to text with the loaded Whisper model.

    Args:
        audio_path: Path to an audio file readable by ``whisper.load_audio``.
            A falsy value (``None`` or ``""``) means no audio was supplied.

    Returns:
        str: The decoded text, or an empty string when ``audio_path`` is falsy.
    """
    # Every path returns a plain string so callers can treat the result
    # uniformly (the previous guard returned a 3-tuple here).
    if not audio_path:
        return ''

    audio = whisper.load_audio(audio_path)
    # pad_or_trim fits the waveform to Whisper's fixed input window
    # (30 seconds by default), so longer recordings are cut off.
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # DecodingOptions defaults to fp16=True. Keep that on CUDA, but decode in
    # full precision when the model was loaded on the CPU fallback device.
    use_fp16 = whisper_model.device.type == "cuda"
    options = whisper.DecodingOptions(language="en", fp16=use_fp16)
    result = whisper.decode(whisper_model, mel, options)
    return result.text

def text_to_speech(text, file_path):
    """Synthesize English speech for ``text`` with gTTS and save it to disk.

    Args:
        text: The text to be spoken.
        file_path: Destination path passed to ``gTTS.save``.

    Returns:
        The same ``file_path`` that was passed in.
    """
    # Normal-speed ('slow=False') English voice, written straight to file_path.
    gTTS(text=text, lang='en', slow=False).save(file_path)
    return file_path

def img2txt(input_text, input_image):
    """Ask the LLaVA pipeline to respond to a prompt about an image.

    Args:
        input_text: The user's question or instruction; it is embedded in the
            instruction sentence of the prompt.
        input_image: Anything ``PIL.Image.open`` accepts (the caller passes a
            file path).

    Returns:
        str: The text that follows the first ``ASSISTANT:`` marker in the
        generated output, with surrounding whitespace removed. Returns
        ``"No response found."`` when the marker is absent and
        ``"No response generated."`` when the pipeline produced no text.
    """
    prompt_instructions = f"Act as an expert in imagery descriptive analysis, respond to the prompt: {input_text}"
    prompt = f"USER: <image>\n{prompt_instructions}\nASSISTANT:"

    # Use the image as a context manager so its file handle is closed once the
    # pipeline has consumed it.
    with Image.open(input_image) as image:
        outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 6000})

    generated_text = outputs[0]["generated_text"] if outputs else ""
    if not generated_text:
        return "No response generated."

    # re.DOTALL lets '.' span newlines; without it a multi-line answer was
    # silently truncated at its first line break.
    match = re.search(r'ASSISTANT:\s*(.*)', generated_text, re.DOTALL)
    if match is None:
        return "No response found."
    return match.group(1).strip()

# Original machine-specific fallback image; still preferred when it exists.
_DEFAULT_IMAGE_PATH = "C:/Users/kmano/OneDrive/Documents/Certificates/blk.png"


def _default_image_path():
    """Return a usable image path for requests that arrive without an image."""
    if os.path.isfile(_DEFAULT_IMAGE_PATH):
        return _DEFAULT_IMAGE_PATH
    # The hard-coded file only exists on the author's machine. Elsewhere, fall
    # back to a generated solid-black placeholder instead of failing.
    # NOTE(review): assumes blk.png is a blank/black image - confirm.
    import tempfile
    placeholder = os.path.join(tempfile.gettempdir(), "llava_default_placeholder.png")
    if not os.path.isfile(placeholder):
        # Dimensions are arbitrary; the placeholder carries no content.
        Image.new("RGB", (336, 336)).save(placeholder)
    return placeholder


def process_inputs(text_input, audio_path, image_path):
    """Gradio callback: answer a text/voice/image request and log the exchange.

    Args:
        text_input: Typed prompt; may be empty or ``None``.
        audio_path: Path to a recorded/uploaded audio file, or a falsy value.
        image_path: Path to the uploaded image, or a falsy value.

    Returns:
        tuple: ``(speech_to_text_output, generated_response, audio_file)``.
        ``speech_to_text_output`` is the transcription (``''`` without audio),
        ``generated_response`` is the text from ``img2txt``, and ``audio_file``
        is the path of the synthesized speech or ``None`` when the response
        is empty.

    Side effects:
        Prints one log entry and appends it to ``User_Log.txt`` in the current
        working directory.
    """
    # NOTE(review): get_ip_address() resolves this machine's own hostname, so
    # the "User:" field records the server address, not the remote caller's.
    userIP = get_ip_address()
    speech_to_text_output = transcribe(audio_path) if audio_path else ''

    # Typed text wins over speech; a fixed instruction is used when neither is
    # available. None and whitespace-only text count as "not provided".
    raw_text = text_input or ''
    combined_input = raw_text.strip() or speech_to_text_output or "Describe this image."

    if not image_path:
        image_path = _default_image_path()  # Default image if none provided
    gpt_output = img2txt(combined_input, image_path)

    processed_audio_path = text_to_speech(gpt_output, "Temp3.mp3") if gpt_output else None

    # The f-string tolerates a None text_input, which plain '+' concatenation
    # did not.
    logData = (
        f"\n*******NEW DATA*******\nUser:{userIP}"
        f"\nText Input:{raw_text}"
        f"\nAudio input:{speech_to_text_output}"
        f"\nOutput:{gpt_output}"
    )
    print(logData)
    # Explicit UTF-8 so replies with non-ASCII characters cannot fail on
    # platforms whose default file encoding is narrower.
    with open('User_Log.txt', 'a', encoding='utf-8') as file:
        file.write(logData)
    return speech_to_text_output, gpt_output, processed_audio_path

# Gradio front end: the three inputs are passed to process_inputs in order, and
# its three return values fill the three outputs in order.
iface = gr.Interface(
    title="Intelligent Voice Assistant using Llava",
    description="Upload an image, provide voice input, or type a prompt to receive a detailed analysis and audio response.",
    fn=process_inputs,
    inputs=[
        gr.Textbox(label="Text Input"),
        gr.Audio(type="filepath", label="Audio Input"),
        gr.Image(type="filepath", label="Image Input"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text Output"),
        gr.Textbox(label="Generated Response"),
        gr.Audio(label="Text to Speech Output"),
    ],
)

# share=True also requests a public gradio.live URL in addition to the local one.
iface.launch(share=True, debug=True)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 3/3 [00:28<00:00,  9.49s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://f4d89f1eebcbb251bd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)



*******NEW DATA*******
User:192.168.1.7
Text Input:What is CRDI Engine
Audio input:
Output:CRDI stands for Compact and Rural Diesel Initiative. It is a technology developed by the Indian government to improve the efficiency and performance of diesel engines in rural and remote areas. The CRDI engine is designed to operate on low-quality diesel fuel, which is commonly available in rural areas. The engine is equipped with advanced features such as improved combustion, better fuel atomization, and enhanced engine control systems. These features help to optimize the engine's performance, reduce emissions, and improve overall engine efficiency, making it an ideal choice for rural and remote locations.

*******NEW DATA*******
User:192.168.1.7
Text Input:What is CRDI 
Audio input:
Output:CRDI stands for "Cognitive Reasoning and Decision Instrument." It is a psychometric tool used to assess an individual's cognitive abilities, decision-making skills, and problem-solving capabilities. The CRDI

