## Data loading and cleaning

In [26]:
import pandas as pd
from pathlib import Path
import torch
import time
import os
import re
import whisperx
from tqdm import tqdm
from whisperx.audio import load_audio
from whisperx.alignment import load_align_model, align
import json

# Tell Python where to find ffmpeg installed via brew
os.environ["PATH"] += os.pathsep + "/opt/homebrew/bin"

In [27]:
# import data from Byrd et al., 2023 – Study 2
data = pd.read_excel('../Data/Byrd_2023_Study_2.xlsx')

In [28]:
# Define root directory
root_dir = Path('../Data/Byrd_2023_Study_2_recordings/')

# List of tuples (id, path)
rows = []

# Walk through all subdirectories and find .mp3 files
for mp3_file in root_dir.rglob('*.mp3'):
    file_id = mp3_file.stem.split('-')[0]  # Remove suffix after '-' from stem (no extension)
    file_path = str(mp3_file.resolve())    # Absolute path
    rows.append((file_id, file_path))

# Create DataFrame
audio_data = pd.DataFrame(rows, columns=['id', 'path'])

In [None]:
# Uncomment the next line to only process the first 5 files (for testing)
#audio_data = pd.DataFrame(rows, columns=['id','path']).head(5)  # test first 5

# Device & model dtype
device       = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if torch.cuda.is_available() else "float32"

# Load ASR model
print("Loading WhisperX model…")
model = whisperx.load_model(
    "deepdml/faster-whisper-large-v3-turbo-ct2",
    device,
    compute_type=compute_type,
    language="en"
)

# Load alignment model
print("Loading alignment model…")
align_model, metadata = load_align_model(language_code="en", device=device)

# Prepare storage & timers
transcriptions   = []
all_word_segments = []
backup_every     = 50
output_file      = "../Data/audio_data_with_transcriptions_and_alignment.pkl"
start            = time.time()

for idx, path in enumerate(tqdm(audio_data['path'], desc="Files", unit="file")):
    print(f"\n---\nFile {idx+1}/{len(audio_data)}: {path}")
    try:
        # 1) Load audio
        audio = load_audio(path)         # np.ndarray @ 16 kHz

        # 2) ASR
        result   = model.transcribe(audio, 16000)
        segments = result["segments"]    # List[dict]

        # 3) Join text
        text = " ".join(s["text"] for s in segments)
        transcriptions.append(text)

        # 4) Forced alignment (word timestamps)
        word_segments = align(
            segments,      # transcript
            align_model,   # model
            metadata,      # metadata dict
            audio,         # waveform
            device=device  # cpu / cuda
        )
        all_word_segments.append(word_segments["word_segments"])

    except Exception as e:
        print(" ERROR:", e)
        transcriptions.append(None)
        all_word_segments.append(None)

    # Backup
    if (idx+1) % backup_every == 0:
        df = audio_data.loc[:idx].copy()
        df['transcription']  = transcriptions
        df['word_segments']  = all_word_segments
        df.to_pickle(f"backup_{os.path.basename(output_file)}")

# Final save
audio_data['transcription']   = transcriptions
audio_data['word_segments']   = all_word_segments
audio_data.to_pickle(output_file)

elapsed = time.time() - start
print(f"\nDone in {elapsed/60:.2f} min; avg {elapsed/len(audio_data):.2f}s/file")
print("Saved to", output_file)

In [33]:
audio_data = pd.read_pickle("../Data/audio_data_with_transcriptions_and_alignment.pkl")

In [34]:
# --- Clean checker ---
def check_cleaning(text):
    if not isinstance(text, str):
        return False  # Skip non-string entries

    # Match words repeated 8+ times, separated by any non-word characters (space, comma, etc.)
    pattern_words = r'\b(\w+)([\W_]+\1){7,}\b'
    pattern_letters = r'\b(\w)([\W_]+\1){7,}\b'

    if re.search(pattern_words, text, flags=re.IGNORECASE) or re.search(pattern_letters, text, flags=re.IGNORECASE):
        print("\nFaulty transcription detected:")
        print(text)
        print("-" * 50)
        return True

    return False

# --- Load audio file paths ---
rows = []
for mp3_file in root_dir.rglob('*.mp3'):
    file_id = mp3_file.stem.split('-')[0]
    file_path = str(mp3_file.resolve())
    rows.append((file_id, file_path))
audio_paths = pd.DataFrame(rows, columns=['id', 'path'])

# --- Ethnicity mapping ---
ethnicity_map = {
    1: 'American Indian or Native American',
    2: 'Pacific Islander',
    3: 'White',
    4: 'Black',
    5: 'Hispanic or Latino'
}

# --- Prepare rows ---
rows = []
demographic_cols = ['Age', 'Gender', 'Household Income', 'Familiar']
audio_cols = [col for col in data.columns if col.endswith(' Audio')]

for idx, row in data.iterrows():
    subject_id = idx + 1
    ethnicity = ethnicity_map.get(row['Q21 Data'], 'Other')

    for q_num, audio_col in enumerate(audio_cols, start=1):
        audio_pos = data.columns.get_loc(audio_col)
        code_col = data.columns[audio_pos - 3]
        response_col = data.columns[audio_pos - 2]
        transcription_old_col = data.columns[audio_pos - 1]
        d_col = data.columns[audio_pos + 1]
        c_col = data.columns[audio_pos + 2]
        lure_consideration_col = data.columns[audio_pos + 3]

        audio_url = row[audio_col]
        audio_id = None
        if isinstance(audio_url, str) and 'play/' in audio_url:
            audio_id = audio_url.split('play/')[1]

        transcription_new = None
        word_segments = None
        mp3_path = None
        if audio_id in audio_data['id'].values:
            matched = audio_data.loc[audio_data['id'] == audio_id]
            if 'transcription' in matched.columns:
                transcription_new = matched.iloc[0]['transcription']
            if 'word_segments' in matched.columns:
                word_segments = matched.iloc[0]['word_segments']
            mp3_path = audio_paths.loc[audio_paths['id'] == audio_id, 'path'].values[0] if audio_id in audio_paths['id'].values else None

        # ❗ Skip faulty transcriptions
        if check_cleaning(transcription_new):
            continue

        lure_consideration = 1 if row.get(lure_consideration_col) == 1 else 0

        row_dict = {
            'subject_id': subject_id,
            'question': q_num,
            'response': row.get(code_col, None),
            'response_text': row.get(response_col, None),
            'transcription_old': row.get(transcription_old_col, None),
            'transcription_new': transcription_new,
            'word_segments': word_segments,
            'audio_url': audio_url,
            'mp3_path': mp3_path,
            'reconsidered_initial_resp': row.get(d_col, None),
            'verbalized_reasons': row.get(c_col, None),
            'lure_consideration': lure_consideration,
            'ethnicity': ethnicity
        }

        for col in demographic_cols:
            row_dict[col] = row[col]

        rows.append(row_dict)

# --- Create DataFrame ---
data_long = pd.DataFrame(rows).reset_index(drop=True)

# --- CRT Questions ---
question_info = {
    1: {"question_text": "Mary’s father has 5 daughters but no sons—Nana, Nene, Nini, Nono. What is the fifth daughter’s name probably?", "correct_answer": "Mary", "lured_answer": "Nunu"},
    2: {"question_text": "If you were running a race, and you passed the person in 2nd place, what place would you be in now?", "correct_answer": "2nd", "lured_answer": "1st"},
    3: {"question_text": "It’s a stormy night and a plane crashes - in which country do you bury the survivors?", "correct_answer": "Don't bury survivors", "lured_answer": "Burial location"},
    4: {"question_text": "A monkey, a squirrel, and a bird are racing to the top of a coconut tree. Who will get the banana first?", "correct_answer": "No banana on coconut tree", "lured_answer": "Any animal"},
    5: {"question_text": "In a one-storey pink house with everything pink, what colour were the stairs probably?", "correct_answer": "No stairs", "lured_answer": "Pink"},
    6: {"question_text": "How many of each animal did Moses put on the ark?", "correct_answer": "None", "lured_answer": "Two"},
    7: {"question_text": "The wind blows west. An electric train runs east. In which direction does the smoke blow?", "correct_answer": "No smoke", "lured_answer": "West"},
    8: {"question_text": "If you have only one match and you walk into a dark room with an oil lamp, a newspaper and wood—which thing would you light first?", "correct_answer": "Match", "lured_answer": "Oil lamp / Newspaper / Wood"},
    9: {"question_text": "Would it be ethical for a man to marry the sister of his widow?", "correct_answer": "Not possible", "lured_answer": "Yes / No"},
    10: {"question_text": "Which sentence is correct: (a) 'the yolk of the egg are white' or (b) 'the yolk of the egg is white'?", "correct_answer": "The yolk is yellow", "lured_answer": "b"}
}
data_long['question_text'] = data_long['question'].map(lambda x: question_info[x]['question_text'])
data_long['correct_answer'] = data_long['question'].map(lambda x: question_info[x]['correct_answer'])
data_long['lured_answer'] = data_long['question'].map(lambda x: question_info[x]['lured_answer'])

# --- Rename and reorder ---
data_long = data_long.rename(columns={
    'Age': 'age',
    'Gender': 'gender',
    'Household Income': 'household_income',
    'Familiar': 'familiar'
})

column_order = [
    'subject_id', 'question',
    'question_text', 'correct_answer', 'lured_answer',
    'response', 'response_text', 'transcription_old', 'transcription_new', 'word_segments',
    'audio_url', 'mp3_path',
    'reconsidered_initial_resp', 'verbalized_reasons', 'lure_consideration',
    'ethnicity', 'age', 'gender', 'household_income', 'familiar'
]
data_long = data_long[column_order]

# --- Clean booleans ---
for col in ['reconsidered_initial_resp', 'verbalized_reasons']:
    data_long[col] = data_long[col].replace({'Y': 1, 'N': 0})
data_long['familiar'] = data_long['familiar'].fillna(0).replace({'Y': 1}).astype(int)

# Step 1: Print how many rows were skipped
total_possible = len(data) * len(audio_cols)
final_count = len(data_long)
removed_count = total_possible - final_count
print(f"Skipped {removed_count} row(s) during creation due to faulty transcriptions.")

# Step 4: Convert word_segments to JSON-safe string with native floats
def clean_word_segments(segment_list):
    if isinstance(segment_list, list):
        return json.dumps([
            {
                "word": w["word"],
                "start": float(w["start"]),
                "end": float(w["end"]),
                "score": float(w["score"])
            } for w in segment_list
        ])
    return None

data_long['word_segments'] = data_long['word_segments'].apply(clean_word_segments)

# Step 5: Save the cleaned DataFrame
data_long.to_csv("../Data/data_long.csv", index=False, encoding="utf-8-sig")
print("✅ Cleaned data saved to ../Data/data_long.csv")


🚨 Faulty transcription detected:
 Which sentence is correct? A, B, R, Y, B, U, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z.
--------------------------------------------------

🚨 Faulty transcription detected:
 Mary's father has five daughters, but no sons. Nana, Nini, Nini, Nono. What is the fifth daughter's name, probably? This looks like right away vowels, so A-E-I-O-U. So I would guess the fifth daughter's name is Nunu. This actually made me think of a...  a kid's song from when I was a kid by a Canadian singer called Fred Penner, and he had a song called Bananas and Benonos, so it went through the vowels. It was like, A, like, T, A, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  New news.
----------------------------

  data_long['familiar'] = data_long['familiar'].fillna(0).replace({'Y': 1}).astype(int)
