In [4]:
!pip install transformers torch torchaudio



In [5]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Select device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load Whisper model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to(device)

In [6]:
import torchaudio
import numpy as np
from google.colab import files

# Upload an audio file
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Load audio
waveform, sample_rate = torchaudio.load(filename)

# Convert stereo to mono if needed
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# Resample if necessary (Whisper expects 16kHz)
if sample_rate != 16000:
    transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = transform(waveform)

# Convert to NumPy array
audio_np = waveform.squeeze().numpy()

Saving health.wav to health (1).wav


In [7]:
# Tokenize input audio
input_features = processor(audio_np, sampling_rate=16000, return_tensors="pt").input_features.to(device)

# Generate transcription
with torch.no_grad():
    predicted_ids = model.generate(input_features)

# Decode transcription
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print("Transcription:", transcription)


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcription:  Hello, I would like to say that today actually I am having very high headache and I am feeling some nauseous and vomiting and also I am having a strange type of stomach ache and I have some rash all around me. I see little blurry and the images I see are blurry and my head is very much bleeding.


In [9]:
from transformers import pipeline

# Load text classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define labels (expanded health-related keywords)
labels = [
    "doctor", "medicine", "hospital", "treatment", "infection",
    "symptoms", "disease", "fever", "cough", "pain", "surgery",
    "health", "diagnosis", "vaccine", "therapy", "illness",
    "pharmacy", "clinic", "nurse", "emergency", "ambulance",
    "prescription", "medical", "patient", "checkup", "wellness",
    "recovery", "disorder", "pathology", "biopsy", "screening",
    "diabetes", "cancer", "asthma", "allergy", "arthritis",
    "hypertension", "stroke", "heart disease", "migraine", "flu",
    "pneumonia", "tuberculosis", "hepatitis", "depression",
    "anxiety", "obesity", "cholesterol", "HIV", "AIDS",
    "Alzheimer's", "dementia", "osteoporosis", "bronchitis",
    "insomnia", "autism", "epilepsy", "malaria",
    "headache", "vomiting", "diarrhea", "fatigue", "shortness of breath",
    "rash", "dizziness", "swelling", "inflammation", "nausea",
    "cold", "sore throat", "chills", "loss of appetite", "muscle pain",
    "stiffness", "congestion", "blurred vision", "bleeding",
    "skin irritation", "fainting", "sneezing", "joint pain",
    "blood test", "MRI", "CT scan", "X-ray", "ultrasound",
    "endoscopy", "chemotherapy", "radiation therapy", "dialysis",
    "organ transplant", "sutures", "vaccination", "physical therapy",
    "rehabilitation", "IV therapy", "electrocardiogram (ECG)",
    "colonoscopy", "biopsy", "genetic testing", "anesthesia",
    "antibiotic", "painkiller", "antidepressant", "antiviral",
    "steroid", "immunotherapy", "chemotherapy", "probiotic",
    "insulin", "supplement", "multivitamin", "herbal medicine",
    "homeopathy", "acupuncture", "radiotherapy", "ointment",
    "capsule", "syrup", "injection", "dosage",
    "nutrition", "diet", "exercise", "mental health", "yoga",
    "meditation", "hydration", "sleep", "hygiene", "weight loss",
    "fitness", "stress", "workout", "balanced diet", "calories",
    "organic food", "detox", "immunity", "self-care",
    "preventive care", "public health", "health insurance",
    "cardiology", "neurology", "dermatology", "orthopedics",
    "gynecology", "pediatrics", "oncology", "psychiatry",
    "radiology", "urology", "gastroenterology", "pulmonology",
    "endocrinology", "nephrology", "ophthalmology",
    "hematology", "rheumatology", "anesthesiology",
    "genetics", "forensic medicine", "sports medicine"
]

# Perform classification
result = classifier(transcription, candidate_labels=labels, multi_label=True)

# Get the top-scoring labels
top_labels = result["labels"][:5]  # Consider top 5 predictions
top_scores = result["scores"][:5]

# Determine if any label belongs to our predefined health-related categories
threshold = 0.4  # Adjust confidence threshold as needed
is_health_related = any(label in labels and score > threshold for label, score in zip(top_labels, top_scores))

# Print results
print("Health-Related:", is_health_related)
print("Top Predictions:", list(zip(top_labels, top_scores)))


Device set to use cuda:0


Health-Related: True
Top Predictions: [('vomiting', 0.9976921081542969), ('bleeding', 0.997310996055603), ('headache', 0.9972224831581116), ('blurred vision', 0.9921610951423645), ('nausea', 0.9908934831619263)]
