In [None]:
!pip install git+https://github.com/openai/whisper.git -q -U

In [None]:
!pip install yt-dlp -q -U

In [None]:
!yt-dlp https://youtu.be/S4wWClQhZaA --format mp4 -o "/content/%(id)s.%(ext)s" -q
!mv /content/S4wWClQhZaA.mp4 /content/audio.mp4

In [None]:
# Transcribe the downloaded file with the Whisper command-line tool
# (model "small", language set to English).
# Note: a later cell transcribes the same file again in Python with the "base" model.
!whisper "/content/audio.mp4" --model small --language English

In [None]:
!pip install -q openai-whisper resemblyzer pydub numpy scikit-learn

In [None]:
import whisper
from resemblyzer import VoiceEncoder, preprocess_wav
from sklearn.cluster import DBSCAN
from pydub import AudioSegment
from scipy.spatial.distance import cosine
import numpy as np
import os

# Input recording produced by the download cell
audio_path = "/content/audio.mp4"

# Speech-to-text model (Whisper "base") and speaker-embedding model (Resemblyzer)
whisper_model = whisper.load_model("base")
encoder = VoiceEncoder()

# Run the transcription; result["segments"] drives the per-segment chunking further down
result = whisper_model.transcribe(
    audio_path,
    word_timestamps=True,
)

import tempfile  # local import: only this step needs scratch files

# Load the full recording once so every Whisper segment can be sliced out of it
audio = AudioSegment.from_file(audio_path, format="mp4")

# segments:   one (chunk file name, start seconds, end seconds, text) tuple per Whisper segment
# embeddings: one Resemblyzer utterance embedding per segment, in the same order
segments = []
embeddings = []

# Chunks are scratch data: write them into a temporary directory so they never
# collide with files in the working directory and are removed even if the
# embedding step raises part-way through.
with tempfile.TemporaryDirectory() as chunk_dir:
    for seg in result["segments"]:
        # pydub slices by milliseconds; Whisper reports seconds
        start_ms = int(seg["start"] * 1000)
        end_ms = int(seg["end"] * 1000)

        chunk_name = f"chunk_{seg['id']}.wav"
        chunk_path = os.path.join(chunk_dir, chunk_name)

        # export() hands back the open output file; close it before re-reading the chunk
        audio[start_ms:end_ms].export(chunk_path, format="wav").close()

        wav = preprocess_wav(chunk_path)
        embeddings.append(encoder.embed_utterance(wav))
        segments.append((chunk_name, seg["start"], seg["end"], seg["text"]))

# Stack into a 2-D array (one row per segment) for clustering
embeddings = np.array(embeddings)

# DBSCAN clustering (auto speaker detection).
# NOTE: eps is a radius under DBSCAN's default (Euclidean) metric, while the noise
# reassignment below ranks neighbours by cosine distance.
clustering = DBSCAN(eps=0.55, min_samples=2).fit(embeddings)

# Keep DBSCAN's own output untouched and work on a copy, so the reassignment below
# neither mutates the fitted estimator nor feeds on its own results.
raw_labels = clustering.labels_
labels = raw_labels.copy()

# Handle "Unknown" labels (-1) by assigning them to the closest *originally clustered*
# segment. Candidates come from raw_labels, so a noise point reassigned earlier in the
# loop can never become the anchor for a later noise point.
clustered_idx = [j for j, raw in enumerate(raw_labels) if raw != -1]
if clustered_idx:
    for i, raw in enumerate(raw_labels):
        if raw != -1:
            continue
        # min() keeps the first of equally-close candidates, i.e. the lowest index
        closest_idx = min(
            clustered_idx,
            key=lambda j: cosine(embeddings[i], embeddings[j]),
        )
        labels[i] = raw_labels[closest_idx]

# Show number of unique speakers; -1 (still unassigned because DBSCAN found no cluster
# at all) is not a speaker and must not be counted as one.
num_speakers = len(set(labels.tolist()) - {-1})
print(f"\n🔊 Estimated number of speakers (after reassignment): {num_speakers}\n")

# Output transcript with speaker labels ("Unknown" for any remaining -1)
for (_, start, end, text), label in zip(segments, labels):
    speaker = f"Speaker {label}" if label != -1 else "Unknown"
    print(f"[{speaker}] {start:.2f} - {end:.2f}: {text}")


In [None]:
from collections import Counter

# Tally how many transcript segments were attributed to each speaker label
speaker_counts = Counter()
for _, label in zip(segments, labels):
    speaker_counts[label] += 1

# Every segment counts as one line
total_lines = len(segments)

print("\nSpeaker Line Counts:")
for label, count in speaker_counts.items():
    # -1 is the clustering's "no cluster" marker
    if label == -1:
        speaker = "Unknown"
    else:
        speaker = f"Speaker {label}"
    print(f"{speaker}: {count} lines")

print(f"\nTotal lines: {total_lines}")

In [None]:
import google.generativeai as genai
from google.colab import userdata

# Read the API key from the Colab secrets store (google.colab.userdata) instead of
# hard-coding it in the notebook, then register it with the genai SDK.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
# Build the plain-text transcript handed to the LLM: one "[speaker] text" line per segment.
# The clustering's "no cluster" label (-1) is rendered as "Unknown", matching the
# speaker-count summary, instead of leaking "Speaker -1" into the prompt.
transcript_lines = []
for (_, _, _, text), label in zip(segments, labels):
    speaker = "Unknown" if label == -1 else f"Speaker {label}"
    transcript_lines.append(f"[{speaker}] {text}\n")

# Join once at the end rather than growing the string inside the loop
formatted_transcript = "".join(transcript_lines)

print(formatted_transcript)

In [None]:
# A SOAP note written from an empty transcript would be pure invention by the model,
# so stop here instead of sending a prompt with no conversation in it.
if not formatted_transcript.strip():
    raise ValueError("Transcript is empty; refusing to generate a SOAP note from no input.")

# Prompt = fixed instructions followed by the speaker-labelled transcript
prompt = f"""
You are a medical professional creating a SOAP note from a patient conversation transcript.
Generate a SOAP note based on the following transcript.

{formatted_transcript}
"""

# Single-shot generation (no chat history, no system instruction)
model = genai.GenerativeModel('gemini-1.5-flash')
response = model.generate_content(prompt)

# Keep the generated note in its own variable; the token-count cell reads it
soap_note = response.text
print(soap_note)

In [None]:
import tiktoken

def count_tokens(text, model_name="gpt-4"):
    """Return how many tokens `text` occupies under tiktoken's encoding for `model_name`.

    Args:
        text: The string to tokenize.
        model_name: Model name passed to `tiktoken.encoding_for_model` to select
            the encoding. Defaults to "gpt-4".

    Returns:
        The length of the encoded token sequence.
    """
    token_ids = tiktoken.encoding_for_model(model_name).encode(text)
    return len(token_ids)

# Token counts for the SOAP-note request (prompt) and its reply (soap_note).
# NOTE(review): count_tokens defaults to tiktoken's "gpt-4" encoding, while the text was
# sent to / produced by 'gemini-1.5-flash' — presumably these figures are estimates
# only; confirm that is the intent.
input_tokens = count_tokens(prompt)
print(f"Input Tokens: {input_tokens}")

output_tokens = count_tokens(soap_note)
print(f"Output Tokens: {output_tokens}")

In [None]:
# System prompt for the intake chatbot: defines what to collect, the tone to use, the
# one-question-at-a-time rule, the "done" summary trigger and the opening greeting.
# Passed as `system_instruction` when the Gemini model is constructed below.
system_instruction = """
You are a clinical intake assistant. Your task is to conduct a focused medical interview with a patient through short, structured chat messages.

You must collect the following information in a clear, concise manner:
1. What symptoms the patient is currently experiencing
2. When the issue started
3. What makes the symptoms better or worse
4. How the symptoms feel (e.g., severity, quality, location)
5. Any associated symptoms (e.g., fever, nausea, vision changes)
6. Any past history of similar problems or known medical conditions

Important instructions:
- Do NOT ask for name, age, gender, lifestyle, or occupation.
- Avoid excessive friendliness, emojis, or small talk.
- Be direct and medical in tone, like a triage nurse or clinical assistant.
- Ask only one question at a time and wait for the user’s response.
- Summarize and show the collected data only when the user types "done".
- Format the summary in structured sections with labels: “Symptoms”, “Onset”, “Triggers”, “Pain Description”, “Associated Symptoms”, “Medical History”.

Begin by asking:
"Hey I'm Hami! How are you feeling today?"

"""


In [None]:
import google.generativeai as genai

# Gemini model that always runs under the intake-assistant system prompt
model = genai.GenerativeModel(
    system_instruction=system_instruction,
    model_name="gemini-1.5-flash",
)

# Fresh chat session with no prior turns
chat = model.start_chat(history=[])


In [None]:
import google.generativeai as genai

# Configure genai with the API key
genai.configure(api_key=GOOGLE_API_KEY)

# Reuse the shared `system_instruction` prompt defined earlier in the notebook instead
# of carrying a second, identical copy of the text that could drift out of sync.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    system_instruction=system_instruction,
)

chat = model.start_chat(history=[])

print("🩺 AI Medical Assistant Ready (type 'done' to get the report)\n")

# Open with a neutral user turn so the model itself produces the greeting its system
# prompt prescribes. Passing the greeting text to send_message would record it as a
# *user* message, and the model would answer it as though the patient had said it.
initial_response = chat.send_message("Begin the intake interview.")
print("🤖 Assistant:", initial_response.text)

# Inputs that end the interview and trigger the summary
exit_commands = {"done", "exit", "quit"}

while True:
    # Strip surrounding whitespace so " done " still ends the session
    user_input = input("👤 You: ").strip()

    if not user_input:
        # Nothing was typed: ask again rather than sending an empty message
        continue

    if user_input.lower() in exit_commands:
        # Trigger the summary
        response = chat.send_message("Please summarize the entire conversation in the format mentioned above.")
        print("\n📋 Pre-Visit Summary:\n")
        print(response.text)
        break

    response = chat.send_message(user_input)
    print("🤖 Assistant:", response.text)