In [1]:
import glob
import pickle
import os
import shutil

from tqdm import tqdm

from tamper_utils import *
from utils import *
import random

### Create Tampered and Non-tampered Queries

In [None]:
# Clean reference recordings (first 6 seconds of silence already removed).
# Sorted because glob returns files in arbitrary, filesystem-dependent order and the
# seeded sampling further down must see the files in the same order on every run.
# Restricted to *.wav because every consumer of this list opens the paths with `wave`.
CLEAN_PATHS = sorted(glob.glob('daps-mp3/clean/*.wav'))

In [None]:
# Pin the global RNG so the random query start offsets drawn below are repeatable
random.seed(a=42)

In [None]:
# Read the audio format from one reference recording.
# The context manager closes the reader; the original left the file handle open.
with wave.open('daps-mp3/clean/f1_script1_clean.wav') as file:
    SAMPLE_RATE = file.getframerate()   # frames per second
    SAMPLE_WIDTH = file.getsampwidth()  # bytes per sample

In [None]:
# Length of a 10-second query, measured in bytes of raw frame data
# NOTE(review): the channel count is not part of this product, so this presumably assumes mono audio — confirm
L = SAMPLE_WIDTH * SAMPLE_RATE * 10

In [None]:
# Sample 10 ten-second queries from every clean recording and log where each one came from.
# NOTE: the "start(frames)"/"end(frames)" log columns hold byte offsets into the raw
# frame data (they index `frames`, a bytes object); the header is kept as-is because
# the tamper cell reads those column names.
queries_path = 'daps-mp3/queries/'
os.makedirs(queries_path, exist_ok=True)
logpath = 'daps-mp3/queries/log.csv'
with open(logpath, 'w') as fout:
    fout.write("query_no,start(frames),end(frames),orig_audio\n")

    # Sorted so the seeded random draws are paired with the same files on every run,
    # regardless of the order glob happened to return them in.
    for file in tqdm(sorted(CLEAN_PATHS)):
        filename = os.path.splitext(os.path.basename(file))[0]

        # Context manager closes the reader (the original leaked one handle per file)
        with wave.open(file, 'rb') as f:
            frames = f.readframes(f.getnframes())
            SAMPLE_RATE = f.getframerate()
            SAMPLE_WIDTH = f.getsampwidth()
            PARAMS = f.getparams()

        L = 10 * SAMPLE_RATE * SAMPLE_WIDTH  # 10 seconds, in bytes
        if len(frames) < L:
            # randint would fail with an opaque "empty range" error; say what is wrong instead
            raise ValueError(f'{file} is shorter than 10 seconds; cannot sample a query from it')

        for i in range(10):
            # Random start offset; drawn in 2-byte units and doubled so it is always even
            sample_start = random.randint(0, (len(frames) - L) // 2) * 2
            sample_frame = frames[sample_start:sample_start + L]

            # Write the excerpt with the same wave parameters as its source recording
            with wave.open(os.path.join(queries_path, f'query_{i}_{filename}.wav'), 'wb') as obj:
                obj.setparams(PARAMS)
                obj.writeframes(sample_frame)

            fout.write(f'{i},{sample_start},{sample_start + L},{file}\n')

In [None]:
# Generate queries with different bitrates
bitrates = ["64k", "128k", "256k"]
bitrate_suffixes = tuple(f'-{b}.wav' for b in bitrates)

# Only the uncompressed originals are encoded. After a first run the folder also holds
# the "-<bitrate>.wav" round-trips written below; without this filter a re-run would
# re-encode those and create "-64k-64k" style files.
query_paths = [p for p in glob.glob('daps-mp3/queries/*.wav')
               if not p.endswith(bitrate_suffixes)]

for bitrate in bitrates:
    for querypath in tqdm(sorted(query_paths)):
        queryname = os.path.splitext(os.path.basename(querypath))[0]
        mp3_path = f'daps-mp3/queries/{queryname}-{bitrate}.mp3'

        sound = AudioSegment.from_file(querypath)
        sound.export(mp3_path, format="mp3", bitrate=bitrate)

        # Convert back to wav to be able to calculate deltas and hashprints
        sound_wav = AudioSegment.from_file(mp3_path)
        sound_wav.export(f'daps-mp3/queries/{queryname}-{bitrate}.wav', format="wav")

In [None]:
# Uncompressed queries only. The folder also contains the "-<bitrate>.wav" round-trips,
# and the tamper cell derives the reference path from the query name, so a
# "...-64k.wav" query would point at a non-existent file in daps-mp3/clean/.
query_paths = [p for p in glob.glob('daps-mp3/queries/*.wav')
               if not p.endswith(('-64k.wav', '-128k.wav', '-256k.wav'))]

In [None]:
# Tamper: for every query and every tamper length l, write an inserted, a deleted and a
# replaced variant plus a log row for each (and one "NONE" row for the untampered query).
random.seed(42)
ONE_SEC = 1 * SAMPLE_RATE * SAMPLE_WIDTH  # one second of audio, in bytes
queries_log = pd.read_csv('daps-mp3/queries/log.csv')

# Only uncompressed queries can be tampered: the reference path is rebuilt from the
# query name, so a "-<bitrate>.wav" variant would resolve to a file that does not exist.
base_query_paths = sorted(p for p in query_paths
                          if not p.endswith(('-64k.wav', '-128k.wav', '-256k.wav')))


def _sample_start_outside(n_bytes, seg_len, lo, hi):
    """Draw an even byte offset in [0, n_bytes - seg_len] that is not inside [lo, hi].

    Offsets are drawn in 2-byte units and doubled, then redrawn until they fall
    outside the excluded range, exactly as the inline loops did before.
    """
    start = random.randint(0, (n_bytes - seg_len) // 2) * 2
    while lo <= start <= hi:
        start = random.randint(0, (n_bytes - seg_len) // 2) * 2
    return start


for l in [0.25, 0.5, 1, 2, 4]:
    L = int(l*SAMPLE_RATE*SAMPLE_WIDTH)  # l seconds, in bytes

    # Bounds for the tamper position inside the 10-second query, in 2-byte units.
    # int() is required: for fractional l these products are floats, and
    # random.randint rejects non-integer arguments (TypeError on Python >= 3.12).
    max_start = int((10 - l) * SAMPLE_RATE * SAMPLE_WIDTH // 2)        # seconds [0, 10-l]
    del_min_start = SAMPLE_RATE * SAMPLE_WIDTH // 2                    # second 1
    del_max_start = int((10 - l - 1) * SAMPLE_RATE * SAMPLE_WIDTH // 2)  # second 10-l-1

    tampered_path = f'daps-mp3/tampered{l}/'
    os.makedirs(tampered_path, exist_ok=True)
    log_path = tampered_path + 'log.csv'

    with open(log_path, 'w') as fout:
        fout.write("tamper_type,query_no,time_instant,seg_start,seg_end,orig_audio\n")

        for querypath in tqdm(base_query_paths):
            # Query files are named query_<no>_<reference name>.wav
            queryname = os.path.splitext(os.path.basename(querypath))[0]
            query_no = int(queryname.split('_')[1])
            orig_audio = '_'.join(queryname.split('_')[2:]) + '.wav'
            orig_audio_path = 'daps-mp3/clean/' + orig_audio

            # Context manager closes the reader (the original leaked one handle per query)
            with wave.open(orig_audio_path, 'rb') as f:
                frames = f.readframes(f.getnframes())

            # Where this query was cut from its reference; .item() raises unless exactly one row matches
            query_log = queries_log[(queries_log['orig_audio'] == orig_audio_path)
                                    & (queries_log['query_no'] == query_no)]
            query_start, query_end = query_log['start(frames)'].item(), query_log['end(frames)'].item()

            ### UNTAMPERED ###
            fout.write(f'NONE,{query_no},None,None,None,{orig_audio}\n')

            ### INSERTION ###
            # select an l second segment, at least one second away from the original segment
            sample_start = _sample_start_outside(len(frames), L,
                                                 query_start - (L + ONE_SEC), query_end + ONE_SEC)
            filler = frames[sample_start:sample_start + L]

            # select start time btw seconds: [0, 10-l]
            time_instant = random.randint(0, max_start) * 2

            insert_segment(querypath, filler, time_instant, tampered_path+f'/ins_{query_no}_'+orig_audio, L)

            fout.write(f'INS,{query_no},{time_instant},{sample_start},{sample_start+L},{orig_audio}\n')

            ### DELETION ###
            # select start time btw seconds: [1, 10-l-1]
            time_instant = random.randint(del_min_start, del_max_start) * 2

            delete_segment(querypath, time_instant, tampered_path+f'/del_{query_no}_'+orig_audio, L)

            fout.write(f'DEL,{query_no},{time_instant},None,None,{orig_audio}\n')

            ### REPLACEMENT ###
            # select an l second segment that does not overlap the original segment
            sample_start = _sample_start_outside(len(frames), L, query_start - L, query_end)
            filler = frames[sample_start:sample_start + L]

            # select start time btw seconds: [0, 10-l]
            time_instant = random.randint(0, max_start) * 2

            replace_segment(querypath, filler, time_instant, tampered_path+f'/rep_{query_no}_'+orig_audio, L)

            fout.write(f'REP,{query_no},{time_instant},{sample_start},{sample_start+L},{orig_audio}\n')


In [None]:
# Generate tampered queries with different bitrates
bitrates = ["64k", "128k", "256k"]
bitrate_suffixes = tuple(f'-{b}.wav' for b in bitrates)

for l in [0.25, 0.5, 1, 2, 4]:
    # Only the uncompressed tampered files are encoded; on a re-run the folder already
    # holds the "-<bitrate>.wav" round-trips and re-encoding them would create
    # "-64k-64k" style files.
    query_paths = [p for p in glob.glob(f'daps-mp3/tampered{l}/*.wav')
                   if not p.endswith(bitrate_suffixes)]
    for bitrate in bitrates:
        for querypath in tqdm(sorted(query_paths)):
            queryname = os.path.splitext(os.path.basename(querypath))[0]
            mp3_path = f'daps-mp3/tampered{l}/{queryname}-{bitrate}.mp3'

            sound = AudioSegment.from_file(querypath)
            sound.export(mp3_path, format="mp3", bitrate=bitrate)

            # Convert back to wav to be able to calculate deltas and hashprints
            sound_wav = AudioSegment.from_file(mp3_path)
            sound_wav.export(f'daps-mp3/tampered{l}/{queryname}-{bitrate}.wav', format="wav")

### Get Deltas, Hashprints, and Best Offsets

In [2]:
# Collect the wav files of the reference set, the query set and each tampered set
(references, queries,
 tampered025, tampered05, tampered1, tampered2, tampered4) = map(glob.glob, (
    'daps-mp3/clean/*.wav',
    'daps-mp3/queries/*.wav',
    'daps-mp3/tampered0.25/*.wav',
    'daps-mp3/tampered0.5/*.wav',
    'daps-mp3/tampered1/*.wav',
    'daps-mp3/tampered2/*.wav',
    'daps-mp3/tampered4/*.wav',
))

In [None]:
# Load the configuration object that is handed to get_filter and get_hps_and_deltas below.
# NOTE(review): ConfigObj comes in through a star import; presumably configobj.ConfigObj — confirm
cfgObj = ConfigObj('./cfg_files/projev.cfg')

In [3]:
# Compute one filter per reference recording, keyed by the file name without extension
masks = dict()
for ref in tqdm(references):
    refname = ref.rsplit('/', 1)[-1][:-len('.wav')]
    mask = get_filter(ref, cfgObj)
    masks.update({refname: mask})

In [4]:
# Persist the computed filters so they can be reloaded without recomputing them
filters_file = open('daps-mp3/filters.pkl', 'wb')
try:
    pickle.dump(masks, filters_file)
finally:
    filters_file.close()

In [7]:
# Reload the filters stored in daps-mp3/filters.pkl.
# NOTE: unpickling can execute arbitrary code — only load a file you produced yourself.
with open('daps-mp3/filters.pkl', 'rb') as filters_file:
    masks = pickle.loads(filters_file.read())

In [None]:
# Compute hashprints and deltas for every reference recording and store them as .npy files
for out_dir in ('daps-mp3/hashprints/refs/', 'daps-mp3/deltas/refs/'):
    os.makedirs(out_dir, exist_ok=True)

for ref in tqdm(references):
    # File name without directory and without the 4-character extension
    refname = ref.rsplit('/', 1)[-1][:-4]
    # Each reference is processed with its own previously computed filter
    hps, C = get_hps_and_deltas(ref, cfgObj, maskMatrix=masks[refname])
    np.save(f'daps-mp3/hashprints/refs/{refname}', hps)
    np.save(f'daps-mp3/deltas/refs/{refname}', C)

In [None]:
# Save query deltas and hashprints, and record the best offset of every query
# against the hashprints of its reference recording.
folders = [queries, tampered025, tampered05, tampered1, tampered2, tampered4]
offsets = {'queries': {}, 'tampered0.25': {}, 'tampered0.5': {}, 'tampered1': {}, 'tampered2': {}, 'tampered4': {}}
for folder in folders:
    if not folder:
        # Nothing was globbed for this set; folder[0] below would raise IndexError
        continue
    # Paths look like daps-mp3/<foldername>/<file>.wav
    foldername = folder[0].split('/')[1]
    os.makedirs(f'daps-mp3/hashprints/{foldername}/', exist_ok=True)
    os.makedirs(f'daps-mp3/deltas/{foldername}/', exist_ok=True)
    for file in tqdm(folder):
        filename = os.path.splitext(os.path.basename(file))[0]
        # Names look like <kind>_<no>_<reference name>[-<bitrate>]. Taking the part before
        # the first '-' handles files with and without a bitrate suffix; the original
        # split('-')[-2] raised IndexError on names without one.
        refname = '_'.join(filename.split('-')[0].split('_')[2:5])
        hps, C = get_hps_and_deltas(file, cfgObj, maskMatrix=masks[refname])
        np.save(f'daps-mp3/hashprints/{foldername}/{filename}', hps)
        np.save(f'daps-mp3/deltas/{foldername}/{filename}', C)
        ref_hps = np.load(f'daps-mp3/hashprints/refs/{refname}.npy')
        offset = find_offset(hps, ref_hps)
        offsets[foldername][filename] = offset

In [None]:
# Log best offsets: one CSV row per query file, grouped by the folder it came from.
with open('daps-mp3/best_offsets.csv', 'w') as fout:
    fout.write("folder,filename,offset\n")
    # `offsets` maps folder name -> {filename: offset}. The inner dict has to be
    # iterated; the original called .keys() on the folder-name string, which raises
    # AttributeError.
    for folder, folder_offsets in offsets.items():
        for filename, offset in folder_offsets.items():
            # No space after the comma, so the filename column has no leading blank
            fout.write(f'{folder},{filename},{offset}\n')

### Split into Train and Test

In [33]:
# Reference Deltas: files starting with f1-f5 / m1-m5 go to train, f6-f10 / m6-m10 to test
train_refs, test_refs = [], []
for i in range(1, 6):
    for prefix in 'fm':
        train_refs += glob.glob(f'daps-mp3/deltas/refs/{prefix}{i}_*')
        test_refs += glob.glob(f'daps-mp3/deltas/refs/{prefix}{i+5}_*')

In [36]:
# Copy the reference deltas into their train / test directories
ref_delta_targets = {
    'daps-mp3/train/deltas/refs/': train_refs,
    'daps-mp3/test/deltas/refs/': test_refs,
}
# Create both destinations first, then copy, in the same order as before
for dest in ref_delta_targets:
    os.makedirs(dest, exist_ok=True)
for dest, paths in ref_delta_targets.items():
    for path in paths:
        shutil.copy(path, dest)

In [41]:
# Reference Hashprints: files starting with f1-f5 / m1-m5 go to train, f6-f10 / m6-m10 to test
train_refs, test_refs = [], []
for i in range(1, 6):
    for prefix in 'fm':
        train_refs += glob.glob(f'daps-mp3/hashprints/refs/{prefix}{i}_*')
        test_refs += glob.glob(f'daps-mp3/hashprints/refs/{prefix}{i+5}_*')

In [None]:
# Copy the reference hashprints into their train / test directories
ref_hashprint_targets = {
    'daps-mp3/train/hashprints/refs/': train_refs,
    'daps-mp3/test/hashprints/refs/': test_refs,
}
# Create both destinations first, then copy, in the same order as before
for dest in ref_hashprint_targets:
    os.makedirs(dest, exist_ok=True)
for dest, paths in ref_hashprint_targets.items():
    for path in paths:
        shutil.copy(path, dest)

In [10]:
# Sub-folders that hold query features: the untampered set plus one per tamper length
query_folders = ['queries'] + [f'tampered{seconds}' for seconds in (0.25, 0.5, 1, 2, 4)]

In [18]:
# Query Deltas: names containing f1-f5 / m1-m5 go to train, f6-f10 / m6-m10 to test
train_lst, test_lst = [], []
for i in range(1, 6):
    for folder in query_folders:
        for prefix in 'fm':
            train_lst += glob.glob(f'daps-mp3/deltas/{folder}/*{prefix}{i}_*')
            test_lst += glob.glob(f'daps-mp3/deltas/{folder}/*{prefix}{i+5}_*')

In [None]:
# Copy every query delta file into <split>/deltas/<its original sub-folder>/
for split_name, split_paths in (('train', train_lst), ('test', test_lst)):
    for path in split_paths:
        # Paths look like daps-mp3/deltas/<folder>/<file>, so index 2 is the sub-folder
        folder = path.split('/')[2]
        dest = f'daps-mp3/{split_name}/deltas/{folder}/'
        os.makedirs(dest, exist_ok=True)
        shutil.copy(path, dest)

In [None]:
# Query Hashprints: names containing f1-f5 / m1-m5 go to train, f6-f10 / m6-m10 to test
train_lst, test_lst = [], []
for i in range(1, 6):
    for folder in query_folders:
        for prefix in 'fm':
            train_lst += glob.glob(f'daps-mp3/hashprints/{folder}/*{prefix}{i}_*')
            test_lst += glob.glob(f'daps-mp3/hashprints/{folder}/*{prefix}{i+5}_*')

In [None]:
# Copy every query hashprint file into <split>/hashprints/<its original sub-folder>/
for split_name, split_paths in (('train', train_lst), ('test', test_lst)):
    for path in split_paths:
        # Paths look like daps-mp3/hashprints/<folder>/<file>, so index 2 is the sub-folder
        folder = path.split('/')[2]
        dest = f'daps-mp3/{split_name}/hashprints/{folder}/'
        os.makedirs(dest, exist_ok=True)
        shutil.copy(path, dest)