In [None]:
%load_ext autoreload
%autoreload 2

In [12]:
import os
import re
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
from tqdm import tqdm

from enfify.config import EXTERNAL_DATA_DIR, INTERIM_DATA_DIR

In [13]:
# Seed NumPy's global RNG so the np.random.randint draws used further down
# (clip-start jitter, tamper cut length and cut position) are repeatable.
np.random.seed(42)

def pad_number_in_filename(filename, n_digits):
    """Zero-pad every run of digits found in ``filename``.

    Each maximal digit sequence is left-padded with ``0`` until it is at least
    ``n_digits`` characters long; sequences that are already that long (or
    longer) are left as they are. Non-digit characters are untouched.

    Args:
        filename: Text to process (e.g. ``"HC1"``).
        n_digits: Minimum width of each digit run after padding.

    Returns:
        The text with all digit runs padded (e.g. ``"HC01"`` for ``n_digits=2``).
    """
    return re.sub(r"\d+", lambda m: m.group(0).zfill(n_digits), filename)

In [14]:
# Carioca 1 original files
# Recursively collect every .wav below the "BASE CARIOCA 1" folder, then keep
# only the recordings whose file name is HC<number>.wav or MC<number>.wav.
files_pattern = str(EXTERNAL_DATA_DIR / "Carioca" / "BASE CARIOCA 1" / "**" / "*.wav")
all_files = sorted(glob(files_pattern, recursive=True))
regex_pattern = r"^(HC|MC)\d+\.wav$"  # Pattern to match filenames like HC1.wav, HC10.wav, HC15.wav
pattern = re.compile(regex_pattern)

files = [f for f in all_files if pattern.match(os.path.basename(f))]

# Fail with an explicit message instead of an IndexError on files[0] when the
# dataset is missing or the directory layout differs from what is expected.
if not files:
    raise FileNotFoundError(
        f"No files matching {regex_pattern!r} found under {files_pattern!r} "
        f"({len(all_files)} .wav files were scanned)"
    )

print(f"{len(files)} matching files found: {os.path.basename(files[0])} - {os.path.basename(files[-1])}")

50 matching files found: HC1.wav - MC9.wav


In [15]:
NUM_CLIPS = 10
CLIP_LENGTH = 10  # seconds

# Output layout: <INTERIM_DATA_DIR>/Carioca1/{authentic,tampered}/
interim_dir = INTERIM_DATA_DIR / "Carioca1"
authentic_dir = interim_dir / "authentic"
tampered_dir = interim_dir / "tampered"

os.makedirs(authentic_dir, exist_ok=True)
os.makedirs(tampered_dir, exist_ok=True)

# For every source recording: cut NUM_CLIPS clips of CLIP_LENGTH seconds at
# jittered, roughly evenly spaced positions, save each clip as "authentic", and
# save a copy with one random contiguous segment removed as "tampered".
# All randomness comes from NumPy's global RNG (seeded in an earlier cell).
for file in tqdm(files):
    basename = os.path.splitext(os.path.basename(file))[0]
    # Zero-padded stem (e.g. HC1 -> HC01); used for the output file names.
    new_basename = pad_number_in_filename(basename, 2)
    rate, data = wavfile.read(file)

    # Convert stereo to mono by averaging channels. np.mean returns float64, so
    # cast back to the source sample dtype (rounding integer PCM first);
    # otherwise wavfile.write would emit a float WAV holding integer-range values.
    if data.ndim == 2:
        sample_dtype = data.dtype
        data = np.mean(data, axis=1)
        if np.issubdtype(sample_dtype, np.integer):
            data = np.rint(data)
        data = data.astype(sample_dtype)

    # segmenting clips
    clip_mlen = round(CLIP_LENGTH * rate)  # clip length in samples
    if len(data) < clip_mlen:
        # A recording shorter than one clip cannot yield full-length clips.
        tqdm.write(f"Skipping {os.path.basename(file)}: shorter than {CLIP_LENGTH} s")
        continue

    # Evenly spaced start positions covering the whole recording.
    clip_starts = np.linspace(0, len(data) - clip_mlen, NUM_CLIPS)

    if NUM_CLIPS > 2:
        # overlap > 0: neighbouring clips overlap by that many samples;
        # overlap < 0: there is a gap of that size between neighbouring clips.
        overlap = clip_mlen - (clip_starts[1] - clip_starts[0])
        # Jitter the interior starts by up to half the overlap (or half the gap).
        # Using |overlap| keeps low < high for long recordings, where the
        # original bounds were inverted and randint raised ValueError.
        half_span = abs(overlap) / 2
        jitter_low = int(np.floor(-half_span))
        jitter_high = int(np.floor(half_span))
        if jitter_low < jitter_high:
            clip_starts[1:-1] += np.random.randint(jitter_low, jitter_high, NUM_CLIPS - 2)

    # Keep every clip fully inside the recording and convert to sample indices.
    clip_starts = np.clip(clip_starts, 0, len(data) - clip_mlen).astype(int)

    clips = [data[start:start + clip_mlen].copy() for start in clip_starts]

    for i, clip in enumerate(tqdm(clips, leave=False)):
        # Save authentic data
        filename = f"{new_basename}-{i:02}.wav"
        wavfile.write(authentic_dir / filename, rate, clip)

        # Save tampered data: delete one contiguous segment whose length is
        # drawn from [clip_mlen // 10, clip_mlen // 5) at a random position.
        filename = f"{new_basename}-{i:02}-tamp.wav"
        cutlen = np.random.randint(clip_mlen // 10, clip_mlen // 5)
        cut_start = np.random.randint(0, clip_mlen - cutlen)
        tampered_clip = np.delete(clip, slice(cut_start, cut_start + cutlen))
        wavfile.write(tampered_dir / filename, rate, tampered_clip)

    

  rate, data = wavfile.read(file)
100%|██████████| 50/50 [00:03<00:00, 13.01it/s]


In [16]:
# Sanity check: number of .wav files currently present in the authentic output folder.
authentic_wav_paths = glob(str(authentic_dir / "*.wav"))
print(len(authentic_wav_paths))

500
