In [None]:
!pip install pandas pydub openpyxl tqdm

# =======================
#  Import Libraries
# =======================
import os
from pydub import AudioSegment



In [None]:
import pandas as pd
from tqdm import tqdm


# Project working directory under the Google Drive path; created if missing.
# This does not mount anything by itself.
# NOTE(review): no cell visible here mounts Drive — presumably it is mounted
# beforehand; otherwise makedirs would just create a plain local folder at
# this path. TODO confirm.
BASE_DIR = "/content/drive/MyDrive/disfluency_project"
os.makedirs(BASE_DIR, exist_ok=True)



In [None]:
# --- Input locations (Google Drive) ---
csv_path = "/content/drive/MyDrive/whisper_train_manifest_fullpath.csv"
list_path = "/content/drive/MyDrive/Speech Disfluencies List.xlsx"
result_path = "/content/drive/MyDrive/Speech Disfluencies Result.xlsx"

# --- Output location: folder for saving short clips ---
output_audio_dir = "/content/disfluency_segments"
os.makedirs(output_audio_dir, exist_ok=True)

# Load the manifest (audio path + transcript), the sheet of disfluency terms,
# and the result sheet.
df_main = pd.read_csv(csv_path)
df_list = pd.read_excel(list_path)
df_result = pd.read_excel(result_path)

# Report the column layout of each loaded table
print(" Files Loaded Successfully!")
for table_label, table in (
    ("Main Dataset", df_main),
    ("List", df_list),
    ("Result", df_result),
):
    print(f"{table_label} Columns:", table.columns.tolist())


✅ Files Loaded Successfully!
Main Dataset Columns: ['audio', 'transcript']
List Columns: ['Filled Pause', 'Repetition', 'False Start', 'Prolongation', 'Self-Correction']
Result Columns: ['disfluency_type', 'audio_segment_url', 'start_time (s)', 'end_time (s)', 'transcription_snippet', 'notes']


In [None]:
# Display the loaded manifest (columns: audio path and transcript text)
df_main

Unnamed: 0,audio,transcript
0,/content/drive/MyDrive/whisper_audio/split_aud...,"[\n {\n ""start"": 0.11,\n ""end"": 14.42,\..."
1,/content/drive/MyDrive/whisper_audio/split_aud...,अनुभव करके कुछ लिखना था तो वह तो बिना देखिए नह...
2,/content/drive/MyDrive/whisper_audio/split_aud...,"नीचे""\n },\n {\n ""start"": 42.47,\n ""en..."
3,/content/drive/MyDrive/whisper_audio/split_aud...,तो जब पता जैसी रात हुआ ना शाम मतलब छै सात में ...
4,/content/drive/MyDrive/whisper_audio/split_aud...,ता सब अपना अपना कैम्प डाल के रह रहा था और हम अ...
...,...,...
2495,/content/drive/MyDrive/whisper_audio/split_aud...,ें कही पे भी नहीं है तो अगर हम दो हजार पैतालीस...
2496,/content/drive/MyDrive/whisper_audio/split_aud...,""": ""जी जी""\n },\n {\n ""start"": 993.07,\n ..."
2497,/content/drive/MyDrive/whisper_audio/split_aud...,से बड़ा तो हाथ उनका जो आपके इलाके का डीएम होता...
2498,/content/drive/MyDrive/whisper_audio/split_aud...,हजार पैंतालीस तक अगर इसमें बदलाव नहीं आएगा लोग...


In [None]:
# =======================
#  Prepare Disfluency Terms
# =======================
# Flatten every column of the disfluency sheet into one collection of search
# terms. The column a term came from is not kept.
unique_terms = set()

for col in df_list.columns:
    # Drop empty cells, normalise to str and trim surrounding whitespace
    words = df_list[col].dropna().astype(str).str.strip()
    # Skip blank entries: an empty string is a substring of every transcript,
    # so it would flag every row as a match in the search step.
    unique_terms.update(word for word in words if word)

# The set removes duplicates; sorting makes the order identical on every run.
# Iteration order of a set of strings varies between interpreter sessions,
# which would otherwise change the preview below and the row order of
# everything derived from this list.
disfluency_terms = sorted(unique_terms)

print(f" Total Disfluency Terms Loaded: {len(disfluency_terms)}")
print(disfluency_terms[:10])

✅ Total Disfluency Terms Loaded: 193
['एक्चुअली बनाते हुए मतलब मान के चलिए', 'हम्म्म', 'ह ह ह', 'जी-हाँ-हाँ-जी-हाँ', 'कुछ ना कुछ तो मेरे तो', 'बस बस', 'आ रही आ रही आ रही', 'आह—नहीं—', 'आप अ', 'अच्छ्छ्छा']


In [None]:
# =======================
#  Search Disfluencies in Transcripts
# =======================
# Collects one record per (segment, matched term) pair.
# Matching is a plain substring test, so a short term also matches when it
# occurs inside a longer word.
results = []

for idx, row in tqdm(df_main.iterrows(), total=len(df_main)):
    audio_path = row['audio']
    raw_transcript = row['transcript']

    # A missing transcript would otherwise be stringified to "nan" and
    # searched as if it were real text; there is nothing to match in it.
    if pd.isna(raw_transcript):
        continue
    transcript = str(raw_transcript).strip()

    # Find which disfluency terms appear (empty terms would match everything)
    matched_terms = [term for term in disfluency_terms if term and term in transcript]

    if matched_terms:
        recording_id = os.path.basename(audio_path)
        # store all matches for this segment
        for term in matched_terms:
            results.append({
                "recording_id": recording_id,
                "disfluency": term,
                "audio_path": audio_path,
                "transcript": transcript
            })


100%|██████████| 2500/2500 [00:00<00:00, 5492.05it/s]


In [None]:
# =======================
# Create DataFrame of Results
# =======================
# Tabulate the records gathered by the search loop: one row per match
df_out = pd.DataFrame(results)
match_count = len(df_out)
print(f"Found Disfluency Segments: {match_count}")

if match_count == 0:
    # Nothing matched: skip writing the CSV and hint at the likely cause
    print(" No disfluencies found. Check if words match exactly with transcripts.")
else:
    # Persist the structured dataset
    output_csv = "/content/drive/MyDrive/Detected_Disfluencies.csv"
    df_out.to_csv(output_csv, index=False)
    print(f" Structured dataset saved at: {output_csv}")

Found Disfluency Segments: 2583
 Structured dataset saved at: /content/drive/MyDrive/Detected_Disfluencies.csv


In [None]:
# Display the matches: one row per (audio file, matched disfluency term) pair
df_out

Unnamed: 0,recording_id,disfluency,audio_path,transcript
0,825780_audio_seg0.wav,अच्छ,/content/drive/MyDrive/whisper_audio/split_aud...,"[\n {\n ""start"": 0.11,\n ""end"": 14.42,\..."
1,825780_audio_seg0.wav,वो तो,/content/drive/MyDrive/whisper_audio/split_aud...,"[\n {\n ""start"": 0.11,\n ""end"": 14.42,\..."
2,825780_audio_seg6.wav,हम्,/content/drive/MyDrive/whisper_audio/split_aud...,"""speaker_id"": 245746,\n ""text"": ""हम्म हम लो..."
3,825780_audio_seg6.wav,हम्म,/content/drive/MyDrive/whisper_audio/split_aud...,"""speaker_id"": 245746,\n ""text"": ""हम्म हम लो..."
4,825780_audio_seg7.wav,अं,/content/drive/MyDrive/whisper_audio/split_aud...,थे तो मेको समझ में आ नहीं रहा था कि क्या बोल र...
...,...,...,...,...
2578,840781_audio_seg34.wav,अच्छ,/content/drive/MyDrive/whisper_audio/split_aud...,से बड़ा तो हाथ उनका जो आपके इलाके का डीएम होता...
2579,840781_audio_seg34.wav,अं,/content/drive/MyDrive/whisper_audio/split_aud...,से बड़ा तो हाथ उनका जो आपके इलाके का डीएम होता...
2580,840781_audio_seg35.wav,बिल,/content/drive/MyDrive/whisper_audio/split_aud...,हजार पैंतालीस तक अगर इसमें बदलाव नहीं आएगा लोग...
2581,840781_audio_seg36.wav,अच्छ,/content/drive/MyDrive/whisper_audio/split_aud...,र इकट्ठा करके अपनी फैम मतलब ज्यादा बहुत ज्यादा...


In [None]:
# =======================
# Step 8: Clip Audio Segments
# =======================
# Since timestamps aren't provided, we export entire segment audio
# Each row = one utterance containing at least one disfluency
#
# For every row of df_out a WAV copy of the whole source segment is written to
# output_audio_dir as "<row index>_<term>.wav". The resulting path (or the
# marker "NOT_FOUND" / "ERROR") is stored in the "segmented_audio_path" column.

segmented_paths = []

# Several rows usually share one audio file (one row per matched term), so the
# most recently decoded file is kept and reused instead of decoding it again.
cached_path = None
cached_audio = None

for i, row in tqdm(df_out.iterrows(), total=len(df_out)):
    audio_path = row['audio_path']
    # A path separator inside the term would make the output path point into
    # a sub-directory that does not exist; replace it in the file name.
    dis_term = str(row['disfluency']).replace("/", "_").replace("\\", "_")

    if not os.path.exists(audio_path):
        segmented_paths.append("NOT_FOUND")
        continue

    try:
        if audio_path != cached_path:
            cached_audio = AudioSegment.from_file(audio_path)
            # Only remember the path once decoding has succeeded
            cached_path = audio_path
        out_path = os.path.join(output_audio_dir, f"{i}_{dis_term}.wav")
        # export() hands back the open output file handle; close it right away
        cached_audio.export(out_path, format="wav").close()
        segmented_paths.append(out_path)
    except Exception as e:
        # Best effort: report the failure, mark the row, and keep going
        print(f"⚠️ Error processing {audio_path}: {e}")
        segmented_paths.append("ERROR")

df_out["segmented_audio_path"] = segmented_paths

# Save final dataset with segmented clip paths
final_csv = "/content/Final_Disfluency_Segments.csv"
df_out.to_csv(final_csv, index=False)

print(f" Final Dataset saved at: {final_csv}")
print(f" Segmented clips stored in: {output_audio_dir}")

100%|██████████| 2583/2583 [14:08<00:00,  3.04it/s]

✅ Final Dataset saved at: /content/Final_Disfluency_Segments.csv
🎧 Segmented clips stored in: /content/disfluency_segments





In [None]:
# Read the saved CSV back from disk and preview its first 50 rows
pd.read_csv(final_csv).head(50)

Unnamed: 0,recording_id,disfluency,audio_path,transcript,segmented_audio_path
0,825780_audio_seg0.wav,अच्छ,/content/drive/MyDrive/whisper_audio/split_aud...,"[\n {\n ""start"": 0.11,\n ""end"": 14.42,\...",/content/disfluency_segments/0_अच्छ.wav
1,825780_audio_seg0.wav,वो तो,/content/drive/MyDrive/whisper_audio/split_aud...,"[\n {\n ""start"": 0.11,\n ""end"": 14.42,\...",/content/disfluency_segments/1_वो तो.wav
2,825780_audio_seg6.wav,हम्,/content/drive/MyDrive/whisper_audio/split_aud...,"""speaker_id"": 245746,\n ""text"": ""हम्म हम लो...",/content/disfluency_segments/2_हम्.wav
3,825780_audio_seg6.wav,हम्म,/content/drive/MyDrive/whisper_audio/split_aud...,"""speaker_id"": 245746,\n ""text"": ""हम्म हम लो...",/content/disfluency_segments/3_हम्म.wav
4,825780_audio_seg7.wav,अं,/content/drive/MyDrive/whisper_audio/split_aud...,थे तो मेको समझ में आ नहीं रहा था कि क्या बोल र...,/content/disfluency_segments/4_अं.wav
5,825727_audio_seg1.wav,हा हा,/content/drive/MyDrive/whisper_audio/split_aud...,"},\n {\n ""start"": 34.37,\n ""end"": 34.88...",/content/disfluency_segments/5_हा हा.wav
6,825727_audio_seg2.wav,बिल,/content/drive/MyDrive/whisper_audio/split_aud...,",\n ""speaker_id"": 291038,\n ""text"": ""बिल...",/content/disfluency_segments/6_बिल.wav
7,825727_audio_seg3.wav,आह,/content/drive/MyDrive/whisper_audio/split_aud...,"id"": 291038,\n ""text"": ""जी जी जी आह""\n },\...",/content/disfluency_segments/7_आह.wav
8,825727_audio_seg3.wav,हा हा,/content/drive/MyDrive/whisper_audio/split_aud...,"id"": 291038,\n ""text"": ""जी जी जी आह""\n },\...",/content/disfluency_segments/8_हा हा.wav
9,825727_audio_seg4.wav,अच्छ,/content/drive/MyDrive/whisper_audio/split_aud...,"""end"": 140.36,\n ""speaker_id"": 291038,\n ...",/content/disfluency_segments/9_अच्छ.wav
