In [None]:
import os
from pydub import AudioSegment
import random
import librosa
import librosa.display
import soundfile as sf
import shutil

### 1. Time Filtering

In [None]:
import os
from pydub import AudioSegment

def delete_short_audio_files(folder_path, medium_duration_min=5, medium_duration_max=7):
    """Delete audio files whose duration falls outside an accepted range.

    Despite the name, over-long files are removed as well as short ones:
    every ``.wav`` / ``.flac`` entry directly inside ``folder_path`` whose
    duration is strictly below ``medium_duration_min`` or strictly above
    ``medium_duration_max`` is deleted with ``os.remove``. The extension is
    matched case-insensitively and the scan is not recursive.

    Deletion is permanent. Entries that cannot be decoded or removed are
    reported on stdout and left in place.

    Args:
        folder_path: Directory to scan.
        medium_duration_min: Shortest duration to keep, in seconds.
        medium_duration_max: Longest duration to keep, in seconds.

    Raises:
        ValueError: If ``medium_duration_min`` is greater than
            ``medium_duration_max``. No duration can satisfy such a range,
            so every audio file in the folder would be deleted.
    """
    if medium_duration_min > medium_duration_max:
        raise ValueError(
            "medium_duration_min must not exceed medium_duration_max "
            f"(got {medium_duration_min} > {medium_duration_max})"
        )

    def get_audio_length(file_path):
        """Return the duration of ``file_path`` in seconds."""
        audio = AudioSegment.from_file(file_path)
        # len() of the loaded segment is in milliseconds; convert to seconds.
        return len(audio) / 1000.0

    for filename in os.listdir(folder_path):
        # Lower-case before matching so that e.g. "CLIP.WAV" is not skipped.
        if not filename.lower().endswith(('.wav', '.flac')):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            duration = get_audio_length(file_path)

            if duration < medium_duration_min or duration > medium_duration_max:
                os.remove(file_path)
                print(f"Deleted {filename} (duration: {duration:.2f} seconds)")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole pass.
            print(f"Could not process {filename}: {e}")


In [None]:
# Prune the folder below, keeping only clips whose duration lies inside the
# [minute_min, minute_max] window. This deletes files permanently.
folder_path = 'D:/DEEPFAKE_DETECTION/DATASETS/ENGLISH/DATA/ASVspoof2021_DF_eval_part03/flac'

# NOTE: despite the `minute_` prefix, these bounds are compared against
# durations that delete_short_audio_files computes in seconds.
minute_min, minute_max = 3.83, 6.8

delete_short_audio_files(
    folder_path,
    medium_duration_min=minute_min,
    medium_duration_max=minute_max,
)

### 2. Segment Splitting

In [None]:
def split_audio(input_file, output_folder, segment_duration, prefix=None):
    """Split an audio file into consecutive, equally long WAV segments.

    Segments are written to ``output_folder`` as ``<prefix>_<index>.wav``.
    ``index`` starts at 1 and is zero-padded to the number of digits in the
    segment count, so the files sort in playback order. Only whole segments
    are produced: a trailing remainder shorter than ``segment_duration`` is
    discarded.

    Args:
        input_file: Path of the audio file to split.
        output_folder: Directory to write the segments to; created
            (including parents) when it does not exist.
        segment_duration: Length of each segment in milliseconds. Must be at
            least 1; fractional values are truncated to an integer.
        prefix: Base name for the output files. Defaults to the name of
            ``input_file`` without its extension (``nu8.wav`` -> ``nu8``),
            so different inputs no longer share one hard-coded prefix.

    Raises:
        ValueError: If ``segment_duration`` is smaller than 1 ms.
    """
    # Normalise up front: a float count would make range() fail later on.
    segment_duration = int(segment_duration)
    if segment_duration <= 0:
        raise ValueError("segment_duration must be a positive number of milliseconds")

    if prefix is None:
        prefix = os.path.splitext(os.path.basename(input_file))[0]

    os.makedirs(output_folder, exist_ok=True)

    audio = AudioSegment.from_file(input_file)
    # Floor division: only full-length segments are exported.
    num_segments = len(audio) // segment_duration
    index_width = len(str(num_segments))

    for i in range(num_segments):
        start_time = i * segment_duration
        segment = audio[start_time:start_time + segment_duration]
        output_file = os.path.join(
            output_folder, f"{prefix}_{str(i + 1).zfill(index_width)}.wav"
        )
        segment.export(output_file, format="wav")

    print(f"Audio file split into {num_segments} segments.")

In [None]:
# Cut the source recording into fixed-length clips stored in a sibling folder.
input_file = "../DATASETS/TTS/GiongGIA_TTSV2/nu/SOURCE/nu8.wav"
output_folder = "../DATASETS/TTS/GiongGIA_TTSV2/nu/SOURCE/nu8"
Audio_Segment = 5000  # segment length in milliseconds: 5000 ms -> 5 s

split_audio(
    input_file=input_file,
    output_folder=output_folder,
    segment_duration=Audio_Segment,
)

### 3. Shuffle

In [None]:
def shuffle_folder(folder_path, seed=None):
    """Randomise the sort order of a folder's entries by prefixing their names.

    Every entry returned by ``os.listdir(folder_path)`` is renamed in place
    to ``<NNNNN>_<original name>``, where ``NNNNN`` is a random integer in
    the range 10000-99999. Sorting the folder by name afterwards therefore
    yields a random order. Calling the function again adds another prefix.

    A prefix is redrawn whenever the resulting name already exists in the
    folder, so a rename can never replace another entry.

    Args:
        folder_path: Directory whose entries are renamed.
        seed: Optional seed. When given, a dedicated ``random.Random(seed)``
            is used so the renaming is reproducible; when ``None`` the
            module-level ``random`` generator is used.
    """
    rng = random if seed is None else random.Random(seed)

    # Sort first so that a given seed produces the same result no matter in
    # which order the operating system lists the directory.
    files = sorted(os.listdir(folder_path))
    rng.shuffle(files)

    for filename in files:
        old_file = os.path.join(folder_path, filename)

        # os.rename may silently overwrite an existing destination, so keep
        # drawing prefixes until the target name is free.
        while True:
            random_prefix = str(rng.randint(10000, 99999))
            new_file = os.path.join(folder_path, f"{random_prefix}_{filename}")
            if not os.path.exists(new_file):
                break

        os.rename(old_file, new_file)


### 4. Label

In [None]:
# Snapshot the folder's entries in sorted order; the label-writing cell
# iterates over `file_names`.
folder_path = '../PROJECT/1.English_EN/run/Runner'
file_names = os.listdir(folder_path)
file_names.sort()


In [None]:
# Write one label line per file in the form "GIONG <name> - - <class>".
# The class comes from the suffix of the file name without its extension:
#   *_b -> bonafide, *_s -> spoof.
# Names with neither suffix get no label line; they are collected so the
# omission is reported instead of passing silently.
unlabeled_files = []

# Explicit UTF-8 keeps the output independent of the platform's locale.
with open('file_label_run.txt', 'w', encoding='utf-8') as f:
    for file_name in file_names:
        file_name_no_ext = os.path.splitext(file_name)[0]

        if file_name_no_ext.endswith('_b'):
            f.write(f"GIONG {file_name_no_ext} - - bonafide\n")
        elif file_name_no_ext.endswith('_s'):
            f.write(f"GIONG {file_name_no_ext} - - spoof\n")
        else:
            unlabeled_files.append(file_name)

print("File list created successfully.")
if unlabeled_files:
    # Show only a short preview so a large folder does not flood the output.
    print(
        f"Warning: {len(unlabeled_files)} file(s) without a '_b' or '_s' suffix "
        f"were not labelled, e.g. {unlabeled_files[:10]}"
    )
