## Data loading and cleaning

In [39]:
import pandas as pd
from pathlib import Path
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from tqdm import tqdm
import time
import os
import re

# Tell Python where to find ffmpeg installed via brew.
# The directory is appended only when it is not already on PATH, so re-running
# this cell does not keep growing the variable, and a missing PATH variable
# does not raise KeyError.
ffmpeg_dir = "/opt/homebrew/bin"
current_path = os.environ.get("PATH", "")
if ffmpeg_dir not in current_path.split(os.pathsep):
    os.environ["PATH"] = (
        f"{current_path}{os.pathsep}{ffmpeg_dir}" if current_path else ffmpeg_dir
    )

In [4]:
# Load the Study 2 spreadsheet from Byrd et al., 2023 into `data`.
study2_workbook = Path('../Data/Byrd_2023_Study_2.xlsx')
data = pd.read_excel(study2_workbook)

In [5]:
def collect_audio_files(root):
    """Return ``(id, absolute_path)`` tuples for every .mp3 file below ``root``.

    The id is the part of the file name (without extension) that precedes the
    first '-'. Files are visited in sorted path order so that the resulting
    row order is the same on every machine and every run; ``rglob`` on its own
    yields entries in whatever order the filesystem returns them.

    Parameters
    ----------
    root : str or pathlib.Path
        Directory that is searched recursively.

    Returns
    -------
    list of tuple(str, str)
        One ``(id, absolute path)`` pair per .mp3 file; empty if none is found.
    """
    records = []
    for mp3_file in sorted(Path(root).rglob('*.mp3')):
        file_id = mp3_file.stem.split('-')[0]  # drop everything after the first '-'
        records.append((file_id, str(mp3_file.resolve())))
    return records


# Root directory that holds the recordings (searched recursively)
root_dir = Path('../Data/Byrd_2023_Study_2_recordings/')

# List of tuples (id, path)
rows = collect_audio_files(root_dir)

# One row per recording
audio_data = pd.DataFrame(rows, columns=['id', 'path'])

In [29]:
# Whisper setup: use the first CUDA GPU with float16 when CUDA is available,
# otherwise fall back to the CPU with float32.
use_cuda = torch.cuda.is_available()
device = "cuda:0" if use_cuda else "cpu"
torch_dtype = torch.float16 if use_cuda else torch.float32

model_id = "openai/whisper-large-v3-turbo"

# Load the checkpoint, then move it to the selected device.
model_load_options = {
    "torch_dtype": torch_dtype,
    "low_cpu_mem_usage": True,
    "use_safetensors": True,
}
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, **model_load_options)
model = model.to(device)

# The processor supplies both the tokenizer and the feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

asr_options = {
    "model": model,
    "tokenizer": processor.tokenizer,
    "feature_extractor": processor.feature_extractor,
    "chunk_length_s": 30,
    "batch_size": 2,  # Increase only if lots of RAM/VRAM
    "torch_dtype": torch_dtype,
    "device": device,
}
pipe = pipeline("automatic-speech-recognition", **asr_options)

# Transcribe every file listed in audio_data['path'] with `pipe`, writing a
# partial backup every `backup_every` files and the full result at the end.

# One entry per file, in the same order as audio_data; None marks a failure.
transcriptions = []

# Backup settings
backup_every = 50  # Save every 50 files
output_file = "../Data/audio_data_with_transcriptions.csv"

# Build the backup name from the file name only. Prefixing the whole string
# ("backup_" + "../Data/...") would point into a directory called "backup_.."
# that does not exist, and the first backup would abort the run.
output_path = Path(output_file)
backup_path = output_path.with_name(f"backup_{output_path.name}")
# Make sure the target directory exists before hours of work are spent.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Timer start
start = time.time()

# Transcription Loop
for idx, path in enumerate(tqdm(audio_data['path'], desc="Transcribing all audio", unit="file")):
    try:
        result = pipe(path, generate_kwargs={"language": "english"})
        transcriptions.append(result['text'])
    except Exception as e:
        # Best effort: record the failure and keep going with the next file.
        # tqdm.write keeps the message from breaking the progress bar.
        tqdm.write(f"Error transcribing {path}: {e}")
        transcriptions.append(None)

    # Auto-save partial results every N files
    if (idx + 1) % backup_every == 0:
        tqdm.write(f"Saving backup at file {idx + 1}")
        # Work on a slice copy so audio_data itself is only modified once,
        # after the loop has finished.
        partial = audio_data.iloc[:idx + 1].assign(transcription=transcriptions)
        partial.to_csv(backup_path, index=False)

# Final transcription assignment
audio_data['transcription'] = transcriptions

# Final save
audio_data.to_csv(output_path, index=False)

# Timer end
end = time.time()
elapsed_time = end - start

print("\nTranscription completed!")
print(f"Elapsed time: {elapsed_time / 60:.2f} minutes")
if len(audio_data) > 0:  # avoid ZeroDivisionError when no audio file was found
    print(f"Average per file: {elapsed_time / len(audio_data):.2f} seconds")
print(f"Results saved to: {output_file}")

Device set to use cpu


Saving backup at file 50




Saving backup at file 100




Saving backup at file 150




Saving backup at file 200




Saving backup at file 250




Saving backup at file 300




Saving backup at file 350




Saving backup at file 400




Saving backup at file 450




Saving backup at file 500




Saving backup at file 550




Saving backup at file 600




Saving backup at file 650


Transcribing all audio:  32%|███▏      | 700/2158 [1:14:56<2:36:34,  6.44s/file]

Saving backup at file 700




Saving backup at file 750




Saving backup at file 800




Saving backup at file 850




Saving backup at file 900




Saving backup at file 950




Saving backup at file 1000




Saving backup at file 1050




Saving backup at file 1100




Saving backup at file 1150




Saving backup at file 1200




Saving backup at file 1250


Transcribing all audio:  60%|██████    | 1300/2158 [2:23:53<5:34:22, 23.38s/file]

Saving backup at file 1300




Saving backup at file 1350




Saving backup at file 1400




Saving backup at file 1450




Saving backup at file 1500




Saving backup at file 1550




Saving backup at file 1600




Saving backup at file 1650




Saving backup at file 1700


Transcribing all audio:  81%|████████  | 1750/2158 [3:26:27<37:11,  5.47s/file]

Saving backup at file 1750




Saving backup at file 1800




Saving backup at file 1850




Saving backup at file 1900




Saving backup at file 1950




Saving backup at file 2000




Saving backup at file 2050


Transcribing all audio:  97%|█████████▋| 2100/2158 [4:15:12<13:01, 13.47s/file]

Saving backup at file 2100


Transcribing all audio: 100%|█████████▉| 2150/2158 [4:22:42<01:32, 11.59s/file]

Saving backup at file 2150


Transcribing all audio: 100%|██████████| 2158/2158 [4:24:13<00:00,  7.35s/file]


Transcription completed!
Elapsed time: 264.23 minutes
Average per file: 7.35 seconds
Results saved to: audio_data_with_transcriptions.csv





In [79]:
# Reshape the wide `data` frame (one row per subject) into long format
# (one row per subject and question) and attach the new transcriptions.

# Ethnicity recoding mapping; any code not listed here becomes 'Other'
ethnicity_map = {
    1: 'American Indian or Native American',
    2: 'Pacific Islander',
    3: 'White',
    4: 'Black',
    5: 'Hispanic or Latino'
}

# Prepare rows for long format
rows = []

demographic_cols = ['Age', 'Gender', 'Household Income', 'Familiar']
audio_cols = [col for col in data.columns if col.endswith(' Audio')]

# The columns belonging to a question are located by their position relative
# to that question's ' Audio' column. These positions do not depend on the
# row, so they are resolved once here instead of once per subject.
question_columns = []
for q_num, audio_col in enumerate(audio_cols, start=1):
    audio_pos = data.columns.get_loc(audio_col)
    question_columns.append({
        'question': q_num,
        'audio': audio_col,
        'code': data.columns[audio_pos - 3],
        'response': data.columns[audio_pos - 2],            # data_1 → Response
        'transcription_old': data.columns[audio_pos - 1],
        'd': data.columns[audio_pos + 1],
        'c': data.columns[audio_pos + 2],
        'lure_consideration': data.columns[audio_pos + 3],  # data_2 → LureConsideration
    })

# id → transcription lookup built once. setdefault keeps the FIRST row for a
# repeated id, matching a `.values[0]` lookup on the filtered frame.
transcription_by_id = {}
for file_id, text in zip(audio_data['id'], audio_data['transcription']):
    transcription_by_id.setdefault(file_id, text)

for idx, row in data.iterrows():
    subject_id = idx + 1
    ethnicity = ethnicity_map.get(row['Q21 Data'], 'Other')
    demographics = {col: row[col] for col in demographic_cols}

    for cols in question_columns:
        audio_url = row[cols['audio']]

        # The recording id is whatever follows 'play/' in the URL
        audio_id = None
        if isinstance(audio_url, str) and 'play/' in audio_url:
            audio_id = audio_url.split('play/')[1]

        # None when there is no URL or no matching recording
        transcription_new = transcription_by_id.get(audio_id)

        lure_consideration = 1 if row.get(cols['lure_consideration']) == 1 else 0

        row_dict = {
            'subject_id': subject_id,
            'question': cols['question'],
            'response': row.get(cols['code'], None),
            'response_text': row.get(cols['response'], None),
            'transcription_old': row.get(cols['transcription_old'], None),
            'transcription_new': transcription_new,
            'audio_url': audio_url,
            'reconsidered_initial_resp': row.get(cols['d'], None),
            'verbalized_reasons': row.get(cols['c'], None),
            'lure_consideration': lure_consideration,
            'ethnicity': ethnicity
        }
        row_dict.update(demographics)

        rows.append(row_dict)

# Create final long format dataframe
data_long = pd.DataFrame(rows).reset_index(drop=True)

# Replacements for mojibake: UTF-8 punctuation that was decoded with the wrong
# single-byte encoding (the 'â€…' and '‚Ä…' families below).
# ORDER MATTERS: the entries are applied one after another in insertion order,
# so a sequence that is a prefix of another one must come AFTER the longer one.
# The bare 'â€' therefore comes after every other 'â€x' entry; placed earlier
# it would rewrite 'â€¦' and 'â€˜' into '"¦' / '"˜' before their own entries run.
replace_dict = {
    'â€™': "'",    # apostrophe / right single quote
    'â€˜': "'",    # left single quote
    'â€“': '-',    # dash
    'â€œ': '"',    # opening quote
    'â€¦': '...',  # ellipsis
    'â€': '"',     # closing quote (must follow all other 'â€x' sequences)
    'Ã©': 'é',     # é character
    '‚Äôs': "'s",  # possessive
    '‚Äù': '"',
    '‚Äú': '"',
    '‚Äô': "'",
}

def repair_mojibake(value):
    """Apply every `replace_dict` substitution to one cell value.

    Non-string values (numbers, None, NaN) are returned unchanged. Running
    `.str.replace` on a whole object column would instead turn every
    non-string cell of a mixed-type column into NaN.
    """
    if not isinstance(value, str):
        return value
    for wrong, right in replace_dict.items():
        value = value.replace(wrong, right)
    return value


# Repair the text cells of every object-typed column of data_long
for col in data_long.select_dtypes(include=['object']).columns:
    data_long[col] = data_long[col].map(repair_mojibake)

def clean_transcription(text):
    """Collapse runaway repetitions in a transcription.

    A word (or single character) that is followed by ten or more
    space-separated copies of itself, compared case-insensitively, is reduced
    to a single occurrence. Whenever the text changes, the before/after pair
    is printed. Values that are not strings are returned untouched.
    """
    if not isinstance(text, str):
        return text  # nothing to clean on non-string values

    runaway_patterns = (
        r'\b(\w+)( \1){10,}\b',  # repeated words, e.g. "no no no no ..."
        r'\b(\w)( \1){10,}\b',   # repeated single characters, e.g. "Z Z Z Z ..."
    )

    result = text
    for pattern in runaway_patterns:
        result = re.sub(pattern, r'\1', result, flags=re.IGNORECASE)

    if result != text:
        print("Cleaning:")
        print("Before:", text)
        print("After:", result)
        print("-" * 50)

    return result

# Run the repetition clean-up over every new transcription
cleaned_transcriptions = data_long['transcription_new'].apply(clean_transcription)
data_long['transcription_new'] = cleaned_transcriptions

# CRT Question Info
# Static lookup keyed by the question number (1-10). Each entry holds the
# question wording plus a short label for the correct answer and for the
# lured answer.
# NOTE(review): the answer values look like descriptive labels rather than
# strings to match verbatim (e.g. "Any animal", "Yes / No") — confirm before
# comparing them with participant responses.
question_info = {
    1: {
        'question_text': "Mary’s father has 5 daughters but no sons—Nana, Nene, Nini, Nono. What is the fifth daughter’s name probably?",
        'correct_answer': "Mary",
        'lured_answer': "Nunu"
    },
    2: {
        'question_text': "If you were running a race, and you passed the person in 2nd place, what place would you be in now?",
        'correct_answer': "2nd",
        'lured_answer': "1st"
    },
    3: {
        'question_text': "It’s a stormy night and a plane crashes - in which country do you bury the survivors?",
        'correct_answer': "Don't bury survivors",
        'lured_answer': "Burial location"
    },
    4: {
        'question_text': "A monkey, a squirrel, and a bird are racing to the top of a coconut tree. Who will get the banana first?",
        'correct_answer': "No banana on coconut tree",
        'lured_answer': "Any animal"
    },
    5: {
        'question_text': "In a one-storey pink house with everything pink, what colour were the stairs probably?",
        'correct_answer': "No stairs",
        'lured_answer': "Pink"
    },
    6: {
        'question_text': "How many of each animal did Moses put on the ark?",
        'correct_answer': "None",
        'lured_answer': "Two"
    },
    7: {
        'question_text': "The wind blows west. An electric train runs east. In which direction does the smoke blow?",
        'correct_answer': "No smoke",
        'lured_answer': "West"
    },
    8: {
        'question_text': "If you have only one match and you walk into a dark room with an oil lamp, a newspaper and wood—which thing would you light first?",
        'correct_answer': "Match",
        'lured_answer': "Oil lamp / Newspaper / Wood"
    },
    9: {
        'question_text': "Would it be ethical for a man to marry the sister of his widow?",
        'correct_answer': "Not possible",
        'lured_answer': "Yes / No"
    },
    10: {
        'question_text': "Which sentence is correct: (a) 'the yolk of the egg are white' or (b) 'the yolk of the egg is white'?",
        'correct_answer': "The yolk is yellow",
        'lured_answer': "b"
    }
}

# Attach the question wording and the correct / lured answer labels to each
# row by looking the row's question number up in question_info.
for info_field in ('question_text', 'correct_answer', 'lured_answer'):
    data_long[info_field] = data_long['question'].map(
        lambda q_num, field=info_field: question_info[q_num][field]
    )

# Rename variables for clarity and convention (snake_case).
# NOTE(review): 'Deliberate' and 'Consciousness' do not appear among the
# columns created when data_long is built in this notebook, so those two
# entries are presumably no-ops — confirm before removing them.
snake_case_names = {
    'Age': 'age',
    'Gender': 'gender',
    'Household Income': 'household_income',
    'Familiar': 'familiar',
    'Deliberate': 'reconsidered_initial_resp',
    'Consciousness': 'verbalized_reasons',
}
data_long = data_long.rename(columns=snake_case_names)

# Final column layout: identifiers, question info, responses and
# transcriptions, audio link, coded variables, demographics.
column_order = [
    'subject_id', 'question',
    'question_text', 'correct_answer', 'lured_answer',
    'response', 'response_text', 'transcription_old', 'transcription_new',
    'audio_url',
    'reconsidered_initial_resp', 'verbalized_reasons', 'lure_consideration',
    'ethnicity', 'age', 'gender', 'household_income', 'familiar',
]
data_long = data_long.loc[:, column_order]

# Recode the two Y/N coded variables to 1/0
yes_no_codes = {'Y': 1, 'N': 0}
for flag_col in ('reconsidered_initial_resp', 'verbalized_reasons'):
    data_long[flag_col] = data_long[flag_col].replace(yes_no_codes)

# Recode familiar: missing → 0, 'Y' → 1, then cast to int.
# NOTE(review): any other string (e.g. 'N') would make astype(int) raise —
# confirm the column only ever holds 'Y' or blanks.
data_long['familiar'] = (
    data_long['familiar']
    .fillna(0)
    .replace({'Y': 1})
    .astype(int)
)

# Save clean version (UTF-8 with BOM)
data_long.to_csv('../Data/data_long.csv', index=False, encoding='utf-8-sig')

Cleaning:
Before:  Mary's father has five daughters but no sons. No no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no