# Splitting Audio
- Creates a half-length copy of one recording so the pipeline can be tested quickly.

In [None]:
import subprocess

def get_audio_duration(filepath):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        filepath: Path to the audio (or other media) file to inspect.

    Returns:
        The container duration in seconds as a float.

    Raises:
        FileNotFoundError: If the ``ffprobe`` executable cannot be found.
        ValueError: If ffprobe exits with an error or does not print a
            parsable duration. The message includes ffprobe's own output so
            the cause (missing file, unsupported format, ...) is visible.
    """
    result = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", filepath],
        stdout=subprocess.PIPE,
        # Keep diagnostics out of stdout so they can never be mixed into the
        # duration value that is parsed below.
        stderr=subprocess.PIPE,
        text=True,
    )
    if result.returncode != 0:
        raise ValueError(
            f"ffprobe failed for {filepath!r} (exit code {result.returncode}): "
            f"{result.stderr.strip()}"
        )

    raw_duration = result.stdout.strip()
    try:
        return float(raw_duration)
    except ValueError:
        # e.g. ffprobe prints "N/A" when the container has no known duration.
        raise ValueError(
            f"ffprobe reported no usable duration for {filepath!r}: {raw_duration!r}"
        ) from None

# Example: measure the full-length scenario recording.
# `file_path` and `duration` are reused by the next cell, which cuts the recording in half.
file_path = "/content/drive/MyDrive/Uni/FS25/audio_files/scenario/ES2016a.Mix-Headset.wav"
duration = get_audio_duration(file_path)
print(f"Original Duration: {duration:.2f} seconds")


Original Duration: 1384.19 seconds


In [None]:
import os
import subprocess

# Set up input and output: keep only the first half of the recording.
half_duration = duration / 2
# Insert "_half" in front of the extension. splitext() only touches the real
# extension, so ".wav" elsewhere in the path is left alone and files with a
# different extension (e.g. ".WAV") still get a distinct output name.
# NOTE(review): the resulting name ends in "_half.wav", which does not match the
# ".Mix-Headset.wav" suffix that transcribe_audio_folder filters on.
stem, ext = os.path.splitext(file_path)
output_path = f"{stem}_half{ext}"

# Run ffmpeg to extract the first half.
# The arguments are passed as a list (no shell), so paths containing quotes,
# spaces or shell metacharacters are handled correctly. check=True raises
# CalledProcessError if ffmpeg fails, instead of printing a success message anyway.
subprocess.run(
    ["ffmpeg", "-y", "-i", file_path, "-t", f"{half_duration:.2f}", output_path],
    check=True,
)

print(f"✅ Half audio saved to: {output_path}")


✅ Half audio saved to: /content/drive/MyDrive/Uni/FS25/audio_files/scenario/ES2016a.Mix-Headset_half.wav


# Decoding

In [None]:
# 1) Set up Environment
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg
#!pip install jiwer

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-ynioknir
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-ynioknir
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_c

In [None]:
import torch
# 2) Check that PyTorch can see a CUDA GPU before loading Whisper
print(torch.cuda.is_available())  # Should print: True

True


In [None]:
# 3) Load Whisper
import whisper
import json
import os

import torch

# Use the GPU when one is available, but fall back to the CPU instead of
# crashing on a CPU-only runtime (transcription is then much slower).
whisper_device = "cuda" if torch.cuda.is_available() else "cpu"

# The checkpoint is downloaded into the current working directory.
model = whisper.load_model("base", device=whisper_device, download_root='./')  # You can use "small", "medium", etc.

100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 65.2MiB/s]


In [None]:
# ADJUST: point project_dir at your own project folder; every other path is derived from it.
project_dir = "/content/drive/MyDrive/Uni/FS25"
audio_root = f"{project_dir}/audio_files"

# Input folders, one per group of meeting recordings.
test_audio_dir = f"{audio_root}/test"
nat_audio_dir = f"{audio_root}/natural"
scen_audio_dir = f"{audio_root}/scenario"

# Root folder for everything Whisper produces.
output_dir = f"{project_dir}/Whisper_outputs"


In [None]:
# 4) Transcribe
def transcribe_audio_folder(audio_dir, output_dir, asr_model=None):
    """Transcribe every ``*.Mix-Headset.wav`` file in a folder.

    Each transcription result is also written to ``output_dir`` as a JSON file
    named after the audio file (``.wav`` replaced by ``.json``).

    Args:
        audio_dir: Folder that is scanned (non-recursively) for audio files.
        output_dir: Folder for the JSON results; created if it does not exist.
        asr_model: Object with a ``transcribe(path)`` method that returns a
            JSON-serialisable dict. Defaults to the notebook-level ``model``.

    Returns:
        Dict mapping each transcribed file name to its transcription result,
        in alphabetical order of file name.
    """
    os.makedirs(output_dir, exist_ok=True)
    results = {}  # file name -> transcription result

    # os.listdir() returns entries in arbitrary order; sort so that the
    # processing order (and the order of the returned dict) is reproducible.
    for fname in sorted(os.listdir(audio_dir)):
        if not fname.endswith(".Mix-Headset.wav"):
            continue

        # Resolve the global lazily so that it is only required when there is
        # actually something to transcribe and no model was passed in.
        transcriber = model if asr_model is None else asr_model

        audio_path = os.path.join(audio_dir, fname)
        print(f"🔊 Transcribing {fname} ...")
        result = transcriber.transcribe(audio_path)

        json_name = os.path.splitext(fname)[0] + ".json"
        output_path = os.path.join(output_dir, json_name)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        print(f"✅ Saved to {output_path}")
        results[fname] = result

    return results

def save_transcripts_as_txt(results_dict, output_txt_dir):
    """Write the plain-text transcript of every result to its own ``.txt`` file.

    Args:
        results_dict: Mapping of audio file name to a transcription result
            that has a ``"text"`` entry.
        output_txt_dir: Folder that receives the text files; created if missing.
            Each file is named after the audio file, with its last extension
            replaced by ``.txt``.
    """
    os.makedirs(output_txt_dir, exist_ok=True)

    for audio_name, transcription in results_dict.items():
        stem, _ = os.path.splitext(audio_name)  # drop the .wav extension
        txt_path = os.path.join(output_txt_dir, stem + ".txt")

        with open(txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(transcription["text"])
        print(f"📝 Saved text to {txt_path}")

In [None]:
# Transcribe the test recordings; the JSON results and the plain-text
# transcripts are both written to <output_dir>/test.
print("TEST MEETINGS:")
output_path = os.path.join(output_dir, "test")
os.makedirs(output_path, exist_ok=True)
test_results = transcribe_audio_folder(test_audio_dir, output_path)
save_transcripts_as_txt(test_results, output_path)

TEST MEETINGS:
🔊 Transcribing ES2016a_half.Mix-Headset.wav ...
✅ Saved to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/test/ES2016a_half.Mix-Headset.json
📝 Saved text to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/test/ES2016a_half.Mix-Headset.txt


In [None]:
# Transcribe the scenario recordings; the JSON results and the plain-text
# transcripts are both written to <output_dir>/scenario.
print("SCENARIO MEETINGS:")
output_path = os.path.join(output_dir, "scenario")
os.makedirs(output_path, exist_ok=True)
scen_results = transcribe_audio_folder(scen_audio_dir, output_path)
save_transcripts_as_txt(scen_results, output_path)

SCENARIO MEETINGS:
🔊 Transcribing ES2016a.Mix-Headset.wav ...
✅ Saved to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/scenario/ES2016a.Mix-Headset.json
🔊 Transcribing ES2016b.Mix-Headset.wav ...
✅ Saved to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/scenario/ES2016b.Mix-Headset.json
🔊 Transcribing ES2016c.Mix-Headset.wav ...
✅ Saved to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/scenario/ES2016c.Mix-Headset.json
🔊 Transcribing ES2016d.Mix-Headset.wav ...
✅ Saved to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/scenario/ES2016d.Mix-Headset.json
📝 Saved text to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/scenario/ES2016a.Mix-Headset.txt
📝 Saved text to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/scenario/ES2016b.Mix-Headset.txt
📝 Saved text to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/scenario/ES2016c.Mix-Headset.txt
📝 Saved text to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/scenario/ES2016d.Mix-Headset.txt


In [None]:
# Transcribe the natural recordings; the JSON results and the plain-text
# transcripts are both written to <output_dir>/natural.
print("NATURAL MEETINGS:")
output_path = os.path.join(output_dir, "natural")
os.makedirs(output_path, exist_ok=True)
nat_results = transcribe_audio_folder(nat_audio_dir, output_path)
save_transcripts_as_txt(nat_results, output_path)

NATURAL MEETINGS:
🔊 Transcribing EN2009c.Mix-Headset.wav ...
✅ Saved to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/natural/EN2009c.Mix-Headset.json
🔊 Transcribing EN2009d.Mix-Headset.wav ...
✅ Saved to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/natural/EN2009d.Mix-Headset.json
📝 Saved text to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/natural/EN2009c.Mix-Headset.txt
📝 Saved text to /content/drive/MyDrive/Uni/FS25/Whisper_outputs/natural/EN2009d.Mix-Headset.txt


# NER
- Exploratory experiments only (not part of the actual pipeline script)

In [None]:
# NER
!pip install spacy
!python -m spacy download en_core_web_sm
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

for ent in doc.ents:
    print(f"{ent.text} → {ent.label_}")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m117.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Nick Tabusk → PERSON
Corinne Whiting → ORG
three → CARDINAL
Ryan → PERSON
three → CARDINAL
Manuel → PERSON
25 minutes → TIME
three → CARDINAL
one → CARDINAL
first → ORDINAL
One → CARDINAL
third → ORDINAL
tomorrow → DATE
one → CARDINAL
about 25 → CARDINAL
around 50 million → CARDINAL
the International Remote Control Association → ORG


In [None]:
# NER with huggingface:
!pip install transformers
from transformers import pipeline

ner_pipeline = pipeline("ner", grouped_entities=True)
entities = ner_pipeline(text)

for e in entities:
    print(e)




No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0


{'entity_group': 'PER', 'score': np.float32(0.98454154), 'word': 'Nick Tabusk', 'start': 147, 'end': 158}
{'entity_group': 'PER', 'score': np.float32(0.9981381), 'word': 'Corinne Whiting', 'start': 330, 'end': 345}
{'entity_group': 'PER', 'score': np.float32(0.99960715), 'word': 'Ryan', 'start': 938, 'end': 942}
{'entity_group': 'PER', 'score': np.float32(0.9994386), 'word': 'Manuel', 'start': 1400, 'end': 1406}


In [None]:
# 5) Format Output for Evaluation
import json
import glob

def convert_to_jsonl_format(whisper_json_folder, output_jsonl_path):
    """Flatten Whisper JSON results into a single JSON-lines file of segments.

    Every segment of every input file becomes one output line: a JSON object
    with the keys ``start_time``, ``end_time`` and ``text``, copied from the
    segment's ``start``, ``end`` and ``text`` entries.

    Args:
        whisper_json_folder: Folder containing Whisper ``*.json`` result files,
            or the path of a single Whisper JSON file.
        output_jsonl_path: Destination file. Missing parent folders are
            created and an existing file is overwritten.
    """
    if os.path.isfile(whisper_json_folder):
        # A single result file was passed instead of a folder.
        json_paths = [whisper_json_folder]
    else:
        # glob() returns matches in arbitrary order; sort for reproducible output.
        json_paths = sorted(glob.glob(os.path.join(whisper_json_folder, "*.json")))

    if not json_paths:
        # Still write the (empty) output file, but do not stay silent about it.
        print(f"WARNING: no Whisper JSON files found at {whisper_json_folder}")

    parent_dir = os.path.dirname(output_jsonl_path)
    if parent_dir:
        # open(..., "w") does not create missing folders.
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_jsonl_path, "w", encoding="utf-8") as fout:
        for path in json_paths:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            for segment in data["segments"]:
                record = {
                    "start_time": segment["start"],
                    "end_time": segment["end"],
                    "text": segment["text"]
                }
                fout.write(json.dumps(record) + "\n")

# The transcription step saves the test meeting's Whisper JSON under
# Whisper_outputs/test/, and convert_to_jsonl_format takes the folder that
# contains the JSON files.
test = "/content/drive/MyDrive/Uni/FS25/Whisper_outputs/test"
test_converted = "/content/drive/MyDrive/Uni/FS25/Whisper_outputs/converted/converted.jsonl"
# Make sure the target folder exists before the output file is opened for writing.
os.makedirs(os.path.dirname(test_converted), exist_ok=True)
# Convert Whisper JSONs to jsonl
convert_to_jsonl_format(test, test_converted)