In [3]:
# Load model directly
from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image
import requests
import time

processor = AutoImageProcessor.from_pretrained("dima806/medicinal_plants_image_detection")
model = AutoModelForImageClassification.from_pretrained("dima806/medicinal_plants_image_detection")
model.to("cuda")

# Load the local image file
image = Image.open("download (2).jpeg")

# Process the image for model input
inputs = processor(images=image, return_tensors="pt")

time_start = time.time()
# Get model predictions
outputs = model(**inputs.to(model.device))
time_end = time.time()
print("Time taken for prediction:", time_end - time_start)
predictions = outputs.logits.softmax(dim=1)

# Get the predicted class
predicted_class_idx = predictions.argmax().item()
predicted_class = model.config.id2label[predicted_class_idx]
confidence = predictions[0][predicted_class_idx].item()

print(f"Predicted class: {predicted_class}")
print(f"Confidence: {confidence:.2%}")


Time taken for prediction: 0.03592205047607422
Predicted class: Mint
Confidence: 38.10%


In [1]:
import sounddevice as sd
import numpy as np
import keyboard
import threading
import queue
import time

# Setup audio parameters
sample_rate = 16000  # Sample rate expected by Whisper
channels = 1
dtype = 'float32'

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None

# Create a queue to store audio chunks
audio_queue = queue.Queue()
is_recording = False
stop_recording = False

def audio_callback(indata, frames, time, status):
    """This is called for each audio block"""
    if is_recording:
        # Add the current chunk to our queue
        audio_queue.put(indata.copy())

def key_monitor():
    """Monitor space bar presses and control recording"""
    global is_recording, stop_recording
    
    print("Press and hold SPACE to record. Release to transcribe. Press ESC to quit.")
    
    while not stop_recording:
        # Start recording when space is pressed
        if keyboard.is_pressed('space') and not is_recording:
            is_recording = True
            audio_queue.queue.clear()  # Clear any old audio
            print("Recording... (holding space)")
        
        # Stop recording when space is released
        elif not keyboard.is_pressed('space') and is_recording:
            is_recording = False
            print("Processing...")
            process_audio()
        
        # Exit on ESC
        if keyboard.is_pressed('esc'):
            stop_recording = True
            is_recording = False
            print("Stopping...")
        
        time.sleep(0.01)  # Small sleep to prevent CPU hogging

def process_audio():
    """Process collected audio chunks and transcribe"""
    if audio_queue.empty():
        print("No audio recorded")
        return
    
    # Combine all audio chunks
    chunks = []
    while not audio_queue.empty():
        chunks.append(audio_queue.get())
    
    if not chunks:
        return
        
    audio_data = np.concatenate(chunks, axis=0)
    audio_flat = audio_data.flatten()
    
    # Process through Whisper
    input_features = processor(
        audio_flat,
        sampling_rate=sample_rate,
        return_tensors="pt"
    ).input_features
    
    # Generate transcription
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    
    print(f"Transcription: {transcription}")

def start_space_bar_transcription():
    """Start the main transcription loop"""
    global stop_recording, is_recording
    
    # Reset flags
    stop_recording = False
    is_recording = False
    
    # Start audio stream
    with sd.InputStream(samplerate=sample_rate, channels=channels, dtype=dtype, callback=audio_callback):
        # Start key monitoring in a separate thread
        monitor_thread = threading.Thread(target=key_monitor)
        monitor_thread.start()
        
        try:
            # Wait for the monitor thread to finish
            while not stop_recording:
                time.sleep(0.1)
        except KeyboardInterrupt:
            stop_recording = True
            is_recording = False
            print("Interrupted by user")
        
        # Wait for the monitor thread to finish
        monitor_thread.join()
    
    print("Transcription stopped")                     


start_space_bar_transcription()

Press and hold SPACE to record. Release to transcribe. Press ESC to quit.
Recording... (holding space)


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Processing...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcription:  this is a task of the transcription feature of the visfer model.
Recording... (holding space)
Processing...
Transcription:  of the whisper model.
Recording... (holding space)
Processing...
Transcription:  The transcription is very good and it is very fast as well.
Recording... (holding space)
Processing...
Transcription:  I think it is phenomenal and I think that is the best transcription I have ever seen like this these two text is awesome.
Recording... (holding space)
Processing...
Transcription:  this speech to text.
Recording... (holding space)
Processing...
Transcription:  the speed to text I meant the speed to text model
Recording... (holding space)
Processing...
Transcription:  I don't mean the speed, I mean the speech to text mode.
Recording... (holding space)
Processing...
Transcription:  the switch to text model is very
Recording... (holding space)
Processing...
Transcription:  the speech to text model is very good.
Interrupted by user
Transcription stopped
