In [None]:
!pip install pandas pydub openpyxl tqdm

# =======================
#  Import Libraries
# =======================
import os
from pydub import AudioSegment



In [None]:
import pandas as pd
from tqdm import tqdm


# Project working directory on Google Drive. Nothing is mounted by this cell:
# the directory is only created when missing, so Drive is expected to be
# available under /content/drive already (TODO confirm where the mount happens).
BASE_DIR = "/content/drive/MyDrive/disfluency_project"
os.makedirs(BASE_DIR, exist_ok=True)



In [None]:
# Locations of the three input files.
csv_path = "/content/drive/MyDrive/whisper_train_manifest_fullpath.csv"
list_path = "/content/drive/MyDrive/Speech Disfluencies List.xlsx"
result_path = "/content/drive/MyDrive/Speech Disfluencies Result.xlsx"

# Folder for saving short clips; created up front so later exports can write into it.
output_audio_dir = "/content/disfluency_segments"
os.makedirs(output_audio_dir, exist_ok=True)

# Read the CSV manifest first, then the two Excel workbooks.
df_main = pd.read_csv(csv_path)
df_list, df_result = (pd.read_excel(xlsx) for xlsx in (list_path, result_path))

# Report the column names of each frame as a quick sanity check.
print(" Files Loaded Successfully!")
for label, frame in (
    ("Main Dataset Columns:", df_main),
    ("List Columns:", df_list),
    ("Result Columns:", df_result),
):
    print(label, frame.columns.tolist())


✅ Files Loaded Successfully!
Main Dataset Columns: ['audio', 'transcript']
List Columns: ['Filled Pause', 'Repetition', 'False Start', 'Prolongation', 'Self-Correction']
Result Columns: ['disfluency_type', 'audio_segment_url', 'start_time (s)', 'end_time (s)', 'transcription_snippet', 'notes']


In [None]:
# Preview the manifest that was read from csv_path.
df_main

Unnamed: 0,audio,transcript
0,/content/drive/MyDrive/whisper_audio/split_aud...,"[\n {\n ""start"": 0.11,\n ""end"": 14.42,\..."
1,/content/drive/MyDrive/whisper_audio/split_aud...,‡§Ö‡§®‡•Å‡§≠‡§µ ‡§ï‡§∞‡§ï‡•á ‡§ï‡•Å‡§õ ‡§≤‡§ø‡§ñ‡§®‡§æ ‡§•‡§æ ‡§§‡•ã ‡§µ‡§π ‡§§‡•ã ‡§¨‡§ø‡§®‡§æ ‡§¶‡•á‡§ñ‡§ø‡§è ‡§®‡§π...
2,/content/drive/MyDrive/whisper_audio/split_aud...,"‡§®‡•Ä‡§ö‡•á""\n },\n {\n ""start"": 42.47,\n ""en..."
3,/content/drive/MyDrive/whisper_audio/split_aud...,‡§§‡•ã ‡§ú‡§¨ ‡§™‡§§‡§æ ‡§ú‡•à‡§∏‡•Ä ‡§∞‡§æ‡§§ ‡§π‡•Å‡§Ü ‡§®‡§æ ‡§∂‡§æ‡§Æ ‡§Æ‡§§‡§≤‡§¨ ‡§õ‡•à ‡§∏‡§æ‡§§ ‡§Æ‡•á‡§Ç ...
4,/content/drive/MyDrive/whisper_audio/split_aud...,‡§§‡§æ ‡§∏‡§¨ ‡§Ö‡§™‡§®‡§æ ‡§Ö‡§™‡§®‡§æ ‡§ï‡•à‡§Æ‡•ç‡§™ ‡§°‡§æ‡§≤ ‡§ï‡•á ‡§∞‡§π ‡§∞‡§π‡§æ ‡§•‡§æ ‡§î‡§∞ ‡§π‡§Æ ‡§Ö...
...,...,...
2495,/content/drive/MyDrive/whisper_audio/split_aud...,‡•á‡§Ç ‡§ï‡§π‡•Ä ‡§™‡•á ‡§≠‡•Ä ‡§®‡§π‡•Ä‡§Ç ‡§π‡•à ‡§§‡•ã ‡§Ö‡§ó‡§∞ ‡§π‡§Æ ‡§¶‡•ã ‡§π‡§ú‡§æ‡§∞ ‡§™‡•à‡§§‡§æ‡§≤‡•Ä‡§∏...
2496,/content/drive/MyDrive/whisper_audio/split_aud...,""": ""‡§ú‡•Ä ‡§ú‡•Ä""\n },\n {\n ""start"": 993.07,\n ..."
2497,/content/drive/MyDrive/whisper_audio/split_aud...,‡§∏‡•á ‡§¨‡§°‡§º‡§æ ‡§§‡•ã ‡§π‡§æ‡§• ‡§â‡§®‡§ï‡§æ ‡§ú‡•ã ‡§Ü‡§™‡§ï‡•á ‡§á‡§≤‡§æ‡§ï‡•á ‡§ï‡§æ ‡§°‡•Ä‡§è‡§Æ ‡§π‡•ã‡§§‡§æ...
2498,/content/drive/MyDrive/whisper_audio/split_aud...,‡§π‡§ú‡§æ‡§∞ ‡§™‡•à‡§Ç‡§§‡§æ‡§≤‡•Ä‡§∏ ‡§§‡§ï ‡§Ö‡§ó‡§∞ ‡§á‡§∏‡§Æ‡•á‡§Ç ‡§¨‡§¶‡§≤‡§æ‡§µ ‡§®‡§π‡•Ä‡§Ç ‡§Ü‡§è‡§ó‡§æ ‡§≤‡•ã‡§ó...


In [None]:
# =======================
#  Prepare Disfluency Terms
# =======================
# Flatten every column of the term sheet into a single list of search strings.
disfluency_terms = []

for col in df_list.columns:
    # Drop empty cells and trim surrounding whitespace.
    words = df_list[col].dropna().astype(str).str.strip().tolist()
    # A blank term must never reach the matcher: `"" in transcript` is True for
    # every transcript, so a single whitespace-only cell would flag every segment.
    disfluency_terms.extend(word for word in words if word)

# Remove duplicates while keeping first-seen (sheet) order. list(set(...)) would
# reorder the terms on each kernel start because str hashes are randomized,
# which makes the preview below and the downstream row order non-reproducible.
disfluency_terms = list(dict.fromkeys(disfluency_terms))

print(f" Total Disfluency Terms Loaded: {len(disfluency_terms)}")
print(disfluency_terms[:10])

✅ Total Disfluency Terms Loaded: 193
['‡§è‡§ï‡•ç‡§ö‡•Å‡§Ö‡§≤‡•Ä ‡§¨‡§®‡§æ‡§§‡•á ‡§π‡•Å‡§è ‡§Æ‡§§‡§≤‡§¨ ‡§Æ‡§æ‡§® ‡§ï‡•á ‡§ö‡§≤‡§ø‡§è', '‡§π‡§Æ‡•ç‡§Æ‡•ç‡§Æ', '‡§π ‡§π ‡§π', '‡§ú‡•Ä-‡§π‡§æ‡§Å-‡§π‡§æ‡§Å-‡§ú‡•Ä-‡§π‡§æ‡§Å', '‡§ï‡•Å‡§õ ‡§®‡§æ ‡§ï‡•Å‡§õ ‡§§‡•ã ‡§Æ‡•á‡§∞‡•á ‡§§‡•ã', '‡§¨‡§∏ ‡§¨‡§∏', '‡§Ü ‡§∞‡§π‡•Ä ‡§Ü ‡§∞‡§π‡•Ä ‡§Ü ‡§∞‡§π‡•Ä', '‡§Ü‡§π‚Äî‡§®‡§π‡•Ä‡§Ç‚Äî', '‡§Ü‡§™ ‡§Ö', '‡§Ö‡§ö‡•ç‡§õ‡•ç‡§õ‡•ç‡§õ‡§æ']


In [None]:
# =======================
#  Search Disfluencies in Transcripts
# =======================
# Builds one record per (segment, term) pair. `in` is a plain substring test,
# so a term contained in a longer word or in another term still counts as a hit.
results = []

for row_idx, row in tqdm(df_main.iterrows(), total=len(df_main)):
    audio_path = row['audio']
    transcript = str(row['transcript']).strip()

    # Emit a record for every term found in this transcript; nothing is
    # added when no term matches.
    results.extend(
        {
            "recording_id": os.path.basename(audio_path),
            "disfluency": term,
            "audio_path": audio_path,
            "transcript": transcript,
        }
        for term in disfluency_terms
        if term in transcript
    )


100%|██████████| 2500/2500 [00:00<00:00, 5492.05it/s]


In [None]:
# =======================
# Create DataFrame of Results
# =======================
# Every record collected by the search becomes one row.
df_out = pd.DataFrame(results)
n_found = len(df_out)
print(f"Found Disfluency Segments: {n_found}")

if n_found == 0:
    # Nothing matched: no file is written, only a hint is printed.
    print(" No disfluencies found. Check if words match exactly with transcripts.")
else:
    # Persist the structured dataset next to the inputs on Drive.
    output_csv = "/content/drive/MyDrive/Detected_Disfluencies.csv"
    df_out.to_csv(output_csv, index=False)
    print(f" Structured dataset saved at: {output_csv}")

Found Disfluency Segments: 2583
 Structured dataset saved at: /content/drive/MyDrive/Detected_Disfluencies.csv


In [None]:
# Preview the detected matches (one row per segment/term pair).
df_out

Unnamed: 0,recording_id,disfluency,audio_path,transcript
0,825780_audio_seg0.wav,‡§Ö‡§ö‡•ç‡§õ,/content/drive/MyDrive/whisper_audio/split_aud...,"[\n {\n ""start"": 0.11,\n ""end"": 14.42,\..."
1,825780_audio_seg0.wav,‡§µ‡•ã ‡§§‡•ã,/content/drive/MyDrive/whisper_audio/split_aud...,"[\n {\n ""start"": 0.11,\n ""end"": 14.42,\..."
2,825780_audio_seg6.wav,‡§π‡§Æ‡•ç,/content/drive/MyDrive/whisper_audio/split_aud...,"""speaker_id"": 245746,\n ""text"": ""‡§π‡§Æ‡•ç‡§Æ ‡§π‡§Æ ‡§≤‡•ã..."
3,825780_audio_seg6.wav,‡§π‡§Æ‡•ç‡§Æ,/content/drive/MyDrive/whisper_audio/split_aud...,"""speaker_id"": 245746,\n ""text"": ""‡§π‡§Æ‡•ç‡§Æ ‡§π‡§Æ ‡§≤‡•ã..."
4,825780_audio_seg7.wav,‡§Ö‡§Ç,/content/drive/MyDrive/whisper_audio/split_aud...,‡§•‡•á ‡§§‡•ã ‡§Æ‡•á‡§ï‡•ã ‡§∏‡§Æ‡§ù ‡§Æ‡•á‡§Ç ‡§Ü ‡§®‡§π‡•Ä‡§Ç ‡§∞‡§π‡§æ ‡§•‡§æ ‡§ï‡§ø ‡§ï‡•ç‡§Ø‡§æ ‡§¨‡•ã‡§≤ ‡§∞...
...,...,...,...,...
2578,840781_audio_seg34.wav,‡§Ö‡§ö‡•ç‡§õ,/content/drive/MyDrive/whisper_audio/split_aud...,‡§∏‡•á ‡§¨‡§°‡§º‡§æ ‡§§‡•ã ‡§π‡§æ‡§• ‡§â‡§®‡§ï‡§æ ‡§ú‡•ã ‡§Ü‡§™‡§ï‡•á ‡§á‡§≤‡§æ‡§ï‡•á ‡§ï‡§æ ‡§°‡•Ä‡§è‡§Æ ‡§π‡•ã‡§§‡§æ...
2579,840781_audio_seg34.wav,‡§Ö‡§Ç,/content/drive/MyDrive/whisper_audio/split_aud...,‡§∏‡•á ‡§¨‡§°‡§º‡§æ ‡§§‡•ã ‡§π‡§æ‡§• ‡§â‡§®‡§ï‡§æ ‡§ú‡•ã ‡§Ü‡§™‡§ï‡•á ‡§á‡§≤‡§æ‡§ï‡•á ‡§ï‡§æ ‡§°‡•Ä‡§è‡§Æ ‡§π‡•ã‡§§‡§æ...
2580,840781_audio_seg35.wav,‡§¨‡§ø‡§≤,/content/drive/MyDrive/whisper_audio/split_aud...,‡§π‡§ú‡§æ‡§∞ ‡§™‡•à‡§Ç‡§§‡§æ‡§≤‡•Ä‡§∏ ‡§§‡§ï ‡§Ö‡§ó‡§∞ ‡§á‡§∏‡§Æ‡•á‡§Ç ‡§¨‡§¶‡§≤‡§æ‡§µ ‡§®‡§π‡•Ä‡§Ç ‡§Ü‡§è‡§ó‡§æ ‡§≤‡•ã‡§ó...
2581,840781_audio_seg36.wav,‡§Ö‡§ö‡•ç‡§õ,/content/drive/MyDrive/whisper_audio/split_aud...,‡§∞ ‡§á‡§ï‡§ü‡•ç‡§†‡§æ ‡§ï‡§∞‡§ï‡•á ‡§Ö‡§™‡§®‡•Ä ‡§´‡•à‡§Æ ‡§Æ‡§§‡§≤‡§¨ ‡§ú‡•ç‡§Ø‡§æ‡§¶‡§æ ‡§¨‡§π‡•Å‡§§ ‡§ú‡•ç‡§Ø‡§æ‡§¶‡§æ...


In [None]:
# =======================
# Export Audio for Each Detected Disfluency
# =======================
# No per-term timestamps are computed, so the entire source segment is
# re-exported as WAV for every row of df_out (one row per segment/term pair).

segmented_paths = []

# One-entry decode cache. The search cell emits all terms of a segment
# consecutively, so remembering only the most recently decoded file avoids
# re-decoding the same audio once per matched term without holding more than
# one segment in memory.
audio = None
loaded_path = None

for i, row in tqdm(df_out.iterrows(), total=len(df_out)):
    audio_path = row['audio_path']
    dis_term = row['disfluency']

    if not os.path.exists(audio_path):
        segmented_paths.append("NOT_FOUND")
        continue

    try:
        if audio_path != loaded_path:
            audio = AudioSegment.from_file(audio_path)
            # Only mark the file as cached after a successful decode, so a
            # failing file is retried (and reported) for each of its rows.
            loaded_path = audio_path

        # A path separator inside a term would point the export at a
        # non-existent subdirectory; neutralise it in the file name.
        safe_term = str(dis_term).replace(os.sep, "_")
        out_path = os.path.join(output_audio_dir, f"{i}_{safe_term}.wav")
        audio.export(out_path, format="wav")
        segmented_paths.append(out_path)
    except Exception as e:
        # Best effort: record the failure for this row and keep going.
        print(f"⚠️ Error processing {audio_path}: {e}")
        segmented_paths.append("ERROR")

# One entry was appended per row above, so the list lines up with df_out.
df_out["segmented_audio_path"] = segmented_paths

# Save final dataset with segmented clip paths
final_csv = "/content/Final_Disfluency_Segments.csv"
df_out.to_csv(final_csv, index=False)

print(f" Final Dataset saved at: {final_csv}")
print(f" Segmented clips stored in: {output_audio_dir}")

100%|██████████| 2583/2583 [14:08<00:00,  3.04it/s]

✅ Final Dataset saved at: /content/Final_Disfluency_Segments.csv
🎧 Segmented clips stored in: /content/disfluency_segments





In [None]:
# Read the saved CSV back from disk and show its first 50 rows to check what was written.
pd.read_csv(final_csv).head(50)

Unnamed: 0,recording_id,disfluency,audio_path,transcript,segmented_audio_path
0,825780_audio_seg0.wav,‡§Ö‡§ö‡•ç‡§õ,/content/drive/MyDrive/whisper_audio/split_aud...,"[\n {\n ""start"": 0.11,\n ""end"": 14.42,\...",/content/disfluency_segments/0_‡§Ö‡§ö‡•ç‡§õ.wav
1,825780_audio_seg0.wav,‡§µ‡•ã ‡§§‡•ã,/content/drive/MyDrive/whisper_audio/split_aud...,"[\n {\n ""start"": 0.11,\n ""end"": 14.42,\...",/content/disfluency_segments/1_‡§µ‡•ã ‡§§‡•ã.wav
2,825780_audio_seg6.wav,‡§π‡§Æ‡•ç,/content/drive/MyDrive/whisper_audio/split_aud...,"""speaker_id"": 245746,\n ""text"": ""‡§π‡§Æ‡•ç‡§Æ ‡§π‡§Æ ‡§≤‡•ã...",/content/disfluency_segments/2_‡§π‡§Æ‡•ç.wav
3,825780_audio_seg6.wav,‡§π‡§Æ‡•ç‡§Æ,/content/drive/MyDrive/whisper_audio/split_aud...,"""speaker_id"": 245746,\n ""text"": ""‡§π‡§Æ‡•ç‡§Æ ‡§π‡§Æ ‡§≤‡•ã...",/content/disfluency_segments/3_‡§π‡§Æ‡•ç‡§Æ.wav
4,825780_audio_seg7.wav,‡§Ö‡§Ç,/content/drive/MyDrive/whisper_audio/split_aud...,‡§•‡•á ‡§§‡•ã ‡§Æ‡•á‡§ï‡•ã ‡§∏‡§Æ‡§ù ‡§Æ‡•á‡§Ç ‡§Ü ‡§®‡§π‡•Ä‡§Ç ‡§∞‡§π‡§æ ‡§•‡§æ ‡§ï‡§ø ‡§ï‡•ç‡§Ø‡§æ ‡§¨‡•ã‡§≤ ‡§∞...,/content/disfluency_segments/4_‡§Ö‡§Ç.wav
5,825727_audio_seg1.wav,‡§π‡§æ ‡§π‡§æ,/content/drive/MyDrive/whisper_audio/split_aud...,"},\n {\n ""start"": 34.37,\n ""end"": 34.88...",/content/disfluency_segments/5_‡§π‡§æ ‡§π‡§æ.wav
6,825727_audio_seg2.wav,‡§¨‡§ø‡§≤,/content/drive/MyDrive/whisper_audio/split_aud...,",\n ""speaker_id"": 291038,\n ""text"": ""‡§¨‡§ø‡§≤...",/content/disfluency_segments/6_‡§¨‡§ø‡§≤.wav
7,825727_audio_seg3.wav,‡§Ü‡§π,/content/drive/MyDrive/whisper_audio/split_aud...,"id"": 291038,\n ""text"": ""‡§ú‡•Ä ‡§ú‡•Ä ‡§ú‡•Ä ‡§Ü‡§π""\n },\...",/content/disfluency_segments/7_‡§Ü‡§π.wav
8,825727_audio_seg3.wav,‡§π‡§æ ‡§π‡§æ,/content/drive/MyDrive/whisper_audio/split_aud...,"id"": 291038,\n ""text"": ""‡§ú‡•Ä ‡§ú‡•Ä ‡§ú‡•Ä ‡§Ü‡§π""\n },\...",/content/disfluency_segments/8_‡§π‡§æ ‡§π‡§æ.wav
9,825727_audio_seg4.wav,‡§Ö‡§ö‡•ç‡§õ,/content/drive/MyDrive/whisper_audio/split_aud...,"""end"": 140.36,\n ""speaker_id"": 291038,\n ...",/content/disfluency_segments/9_‡§Ö‡§ö‡•ç‡§õ.wav
