In [None]:
from google.colab import drive

# Directory inside the Colab VM where Google Drive is attached; every dataset
# path used later in the notebook starts with this prefix.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)


Mounted at /content/drive


## Preprocessing Report

This notebook performs the following preprocessing steps on your audio dataset:

1.  **Mounting Google Drive**: The notebook starts by mounting your Google Drive to access the audio files stored there.
2.  **Installing Libraries**: The necessary library (`pydub`) for audio manipulation and `ffmpeg` for audio format conversion are installed.
3.  **Listing Files**: The files within the main audio dataset folder are listed to provide an overview of the raw data.
4.  **Splitting into 5-second clips**: Initial splitting of the audio files in `/content/drive/MyDrive/Audio Dataset CSE400` into 5-second WAV clips, organized into subfolders within a `5sec_dataset` folder.
5.  **Counting 5-second clips**: The total number of 5-second WAV clips generated is counted and displayed.
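6.  **Splitting 'Normal Child' audio into 1-second clips (first attempt)**: Audio files from the 'Normal Child' folder are split into WAV clips, organized into one subfolder per recording under `/content/drive/MyDrive/1sec_dataset/Normal Child`. This first attempt writes outside the 'Normal Child' directory and is superseded by the corrected split in step 10.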
7.  **Counting 1-second clips in 'Audio Dataset CSE400/1sec_dataset'**: The total number of 1-second WAV clips in the `1sec_dataset` folder within `Audio Dataset CSE400` is counted.
8.  **Verifying File Structure**: The file structure within the `1sec_dataset` folder is displayed to confirm the organization of the split audio clips.
9.  **Counting WAV files in parent folders**: The total number of WAV files under the main `Audio Dataset CSE400` folder and under the `Normal Child` folder is counted, one total per folder. The search is recursive, so each total includes any clip datasets stored inside that folder (for example, `1sec_dataset` and `5sec_dataset` inside `Audio Dataset CSE400`).
10. **Correcting 'Normal Child' 1-second splitting**: The code in cell `EBa-gGf7OS39` correctly splits the 'Normal Child' audio into 1-second clips within a `1sec_dataset` subfolder inside 'Normal Child'.
11. **Counting 'Normal Child' 1-second clips**: The total number of 1-second WAV clips specifically within the `/content/drive/MyDrive/Normal Child/1sec_dataset` folder is confirmed.
12. **Creating Metadata CSV**: A `combined_metadata.csv` file is generated, listing the filepaths and corresponding labels ('autistic' or 'non_autistic') for all the 1-second WAV clips created from both the main audio dataset and the 'Normal Child' dataset.
13. **Verifying clip counts per label**: The number of clips for the 'autistic' and 'non_autistic' labels is counted by walking the two `1sec_dataset` folders, to confirm the class distribution.
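14. **Splitting 'Audio Dataset CSE400' into 1-second clips**: The code in cell `a0IvZJfXZNb3` splits the audio files in the main `Audio Dataset CSE400` folder into 1-second WAV clips inside its `1sec_dataset` subfolder, so that both categories use the same clip length (the last clip of each recording may be shorter than 1 second when the recording is not a whole number of seconds long). Although this cell appears last in the notebook, its output is what steps 7, 8, 12, and 13 read, so it must be run before them.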

These steps prepare the audio data for further analysis or model training by standardizing the clip length and creating a metadata file for easy access and labeling.

In [None]:
# Install the audio dependencies used by the splitting cells below.
# - %pip (rather than !pip) installs into the environment of the running kernel.
# - The pydub version is pinned so that a re-run installs the same release.
# - apt-get with -y answers the confirmation prompt automatically; without it the
#   install aborts in a notebook whenever ffmpeg actually has to be installed.
%pip install pydub==0.25.1
!apt-get install -y ffmpeg


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [None]:
import os

# Root folder on the mounted Drive that holds the raw recordings.
folder_path = '/content/drive/MyDrive/Audio Dataset CSE400'

# os.listdir returns the name of every entry (files and subfolders alike),
# in arbitrary order.
files = os.listdir(folder_path)

print("Files in folder:")
for entry_name in files:
    print(entry_name)


Files in folder:
Recording_18.m4a
Recording_19.m4a
Recording_16.m4a
Recording_11.m4a
Recording_5.m4a
Recording_8.m4a
Recording_14.m4a
Recording_13.m4a
Recording_7.m4a
Recording_4.m4a
Recording_10.m4a
Recording_12.m4a
Recording_17.m4a
Recording_15.m4a
Recording_9.m4a
Recording_6.m4a
1sec_dataset
5sec_dataset


In [None]:
from pydub import AudioSegment
import os

# Raw recordings live directly in input_folder; the clips go into a
# '5sec_dataset' subfolder, one sub-subfolder per recording.
input_folder = '/content/drive/MyDrive/Audio Dataset CSE400'
output_folder = os.path.join(input_folder, '5sec_dataset')
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    # Skip everything that is not an .m4a recording (including the dataset subfolders).
    if not filename.endswith(".m4a"):
        continue

    audio = AudioSegment.from_file(os.path.join(input_folder, filename), format="m4a")

    chunk_length_ms = 5 * 1000  # clip length: 5 seconds, in milliseconds

    # One output subfolder per recording, named after the file without its extension.
    base_name, _extension = os.path.splitext(filename)
    file_output_folder = os.path.join(output_folder, base_name)
    os.makedirs(file_output_folder, exist_ok=True)

    # Consecutive, non-overlapping windows; the last window is cut short by the
    # slice when the recording length is not a multiple of the clip length.
    chunk_starts = range(0, len(audio), chunk_length_ms)
    for i, chunk_start in enumerate(chunk_starts):
        chunk_end = chunk_start + chunk_length_ms
        audio[chunk_start:chunk_end].export(f"{file_output_folder}/clip_{i:04d}.wav", format="wav")

    print(f"✅ Done splitting: {filename}")


✅ Done splitting: Recording_18.m4a
✅ Done splitting: Recording_19.m4a
✅ Done splitting: Recording_16.m4a
✅ Done splitting: Recording_11.m4a
✅ Done splitting: Recording_5.m4a
✅ Done splitting: Recording_8.m4a
✅ Done splitting: Recording_14.m4a
✅ Done splitting: Recording_13.m4a
✅ Done splitting: Recording_7.m4a
✅ Done splitting: Recording_4.m4a
✅ Done splitting: Recording_10.m4a
✅ Done splitting: Recording_12.m4a
✅ Done splitting: Recording_17.m4a
✅ Done splitting: Recording_15.m4a
✅ Done splitting: Recording_9.m4a
✅ Done splitting: Recording_6.m4a


In [None]:
import os

# Folder that holds the 5-second clips, one subfolder per recording.
base_folder = '/content/drive/MyDrive/Audio Dataset CSE400/5sec_dataset'

# Walk the whole tree and count every file whose name ends in '.wav'.
total_clips = sum(
    1
    for _subdir, _dirnames, filenames in os.walk(base_folder)
    for name in filenames
    if name.endswith('.wav')
)

print(f"🎧 Total clips in dataset: {total_clips}")


🎧 Total clips in dataset: 462


In [None]:
from pydub import AudioSegment
import os

# Source recordings of the 'Normal Child' group and the base folder that
# receives their clips (one subfolder per recording).
normal_input_folder = '/content/drive/MyDrive//Normal Child'
normal_output_base = '/content/drive/MyDrive//1sec_dataset/Normal Child'

# Set the clip length in this cell. It used to be inherited from whichever cell
# last assigned `chunk_length_ms`; the 5-second split sets it to 5000, so this
# '1sec_dataset' folder silently received 5-second clips (or the cell raised
# NameError on a fresh kernel).
chunk_length_ms = 1 * 1000  # 1 second, in milliseconds

os.makedirs(normal_output_base, exist_ok=True)

for filename in os.listdir(normal_input_folder):
    if filename.endswith(".m4a"):
        audio_path = os.path.join(normal_input_folder, filename)
        audio = AudioSegment.from_file(audio_path, format="m4a")

        # One output subfolder per recording, named after the file without its extension.
        base_name = os.path.splitext(filename)[0]
        output_folder = os.path.join(normal_output_base, base_name)
        os.makedirs(output_folder, exist_ok=True)

        # Consecutive, non-overlapping windows; the last one may be shorter
        # than chunk_length_ms because the slice stops at the end of the audio.
        for i, chunk_start in enumerate(range(0, len(audio), chunk_length_ms)):
            chunk = audio[chunk_start:chunk_start + chunk_length_ms]
            chunk.export(f"{output_folder}/clip_{i:04d}.wav", format="wav")

        print(f"✅ Normal child audio split: {filename}")


✅ Normal child audio split: Recording_28.m4a
✅ Normal child audio split: Recording_23.m4a
✅ Normal child audio split: Recording_30.m4a
✅ Normal child audio split: Recording_27.m4a
✅ Normal child audio split: Recording_29.m4a
✅ Normal child audio split: Recording_26.m4a
✅ Normal child audio split: Recording_25.m4a
✅ Normal child audio split: Recording_24.m4a
✅ Normal child audio split: Recording_21.m4a
✅ Normal child audio split: Recording_22.m4a


In [None]:
import os

dataset_path = '/content/drive/MyDrive/Audio Dataset CSE400/1sec_dataset'
total_clips = 0

for subdir, _, files in os.walk(dataset_path):
    for file in files:
        if file.endswith('.wav'):
            total_clips += 1

print(f"🎧 Total number of 1-second clips: {total_clips}")


🎧 Total number of 1-second clips: 2284


In [None]:
import os

path = '/content/drive/MyDrive/Audio Dataset CSE400/1sec_dataset'

for root, dirs, files in os.walk(path):
    print(f"📁 Folder: {root}")
    for file in files:
        print(f"    - {file}")


📁 Folder: /content/drive/MyDrive/Audio Dataset CSE400/1sec_dataset
📁 Folder: /content/drive/MyDrive/Audio Dataset CSE400/1sec_dataset/Recording_18
    - clip_0000.wav
    - clip_0001.wav
    - clip_0002.wav
    - clip_0003.wav
    - clip_0004.wav
    - clip_0005.wav
    - clip_0006.wav
    - clip_0007.wav
    - clip_0008.wav
    - clip_0009.wav
    - clip_0010.wav
    - clip_0011.wav
    - clip_0012.wav
    - clip_0013.wav
    - clip_0014.wav
    - clip_0015.wav
    - clip_0016.wav
    - clip_0017.wav
    - clip_0018.wav
    - clip_0019.wav
    - clip_0020.wav
    - clip_0021.wav
    - clip_0022.wav
    - clip_0023.wav
    - clip_0024.wav
    - clip_0025.wav
    - clip_0026.wav
    - clip_0027.wav
    - clip_0028.wav
    - clip_0029.wav
    - clip_0030.wav
    - clip_0031.wav
    - clip_0032.wav
    - clip_0033.wav
    - clip_0034.wav
    - clip_0035.wav
    - clip_0036.wav
    - clip_0037.wav
    - clip_0038.wav
    - clip_0039.wav
    - clip_0040.wav
    - clip_0041.wav
    - clip_00

In [None]:
# Shell cross-check of the clip count: list every *.wav path below the folder and count the lines.
!find "/content/drive/MyDrive/Audio Dataset CSE400/1sec_dataset" -name "*.wav" | wc -l


2284


In [None]:
# Count every .wav file anywhere under Audio Dataset CSE400. find searches
# recursively, so clips in nested folders (e.g. 1sec_dataset, 5sec_dataset) are included.
!find "/content/drive/MyDrive/Audio Dataset CSE400" -name "*.wav" | wc -l

# Count every .wav file anywhere under Normal Child (same recursive search).
!find "/content/drive/MyDrive/Normal Child" -name "*.wav" | wc -l


2746
1463


In [None]:
from pydub import AudioSegment
import os

# 'Normal Child' recordings are read from normal_input_folder; their clips are
# written to a '1sec_dataset' subfolder of that same directory.
normal_input_folder = '/content/drive/MyDrive/Normal Child'
normal_output_folder = '/content/drive/MyDrive/Normal Child/1sec_dataset'
os.makedirs(normal_output_folder, exist_ok=True)

chunk_length_ms = 1 * 1000  # clip length: 1 second, in milliseconds

for filename in os.listdir(normal_input_folder):
    # Only .m4a recordings are processed; other entries are ignored.
    if not filename.endswith(".m4a"):
        continue

    audio = AudioSegment.from_file(os.path.join(normal_input_folder, filename), format="m4a")

    # One output subfolder per recording, named after the file without its extension.
    output_folder = os.path.join(normal_output_folder, os.path.splitext(filename)[0])
    os.makedirs(output_folder, exist_ok=True)

    # Consecutive, non-overlapping windows; the slice truncates the final
    # window at the end of the recording.
    clip_starts = range(0, len(audio), chunk_length_ms)
    for i, chunk_start in enumerate(clip_starts):
        clip = audio[chunk_start:chunk_start + chunk_length_ms]
        clip.export(f"{output_folder}/clip_{i:04d}.wav", format="wav")

    print(f"✅ Normal child audio split: {filename}")


✅ Normal child audio split: Recording_28.m4a
✅ Normal child audio split: Recording_23.m4a
✅ Normal child audio split: Recording_30.m4a
✅ Normal child audio split: Recording_27.m4a
✅ Normal child audio split: Recording_29.m4a
✅ Normal child audio split: Recording_26.m4a
✅ Normal child audio split: Recording_25.m4a
✅ Normal child audio split: Recording_24.m4a
✅ Normal child audio split: Recording_21.m4a
✅ Normal child audio split: Recording_22.m4a


In [None]:
# Count the .wav clips below Normal Child/1sec_dataset: find lists each matching path
# (searching subfolders too) and wc -l counts the lines.
!find "/content/drive/MyDrive/Normal Child/1sec_dataset" -name "*.wav" | wc -l


1463


In [None]:
import csv
import os

# Label -> root folder of the 1-second clips that carry that label.
base_folders = {
    'autistic': '/content/drive/MyDrive/Audio Dataset CSE400/1sec_dataset',
    'non_autistic': '/content/drive/MyDrive/Normal Child/1sec_dataset'
}

metadata_path = '/content/drive/MyDrive/combined_metadata.csv'

# Fail before touching the CSV if a label folder is missing: os.walk() yields
# nothing for a nonexistent path, which would otherwise silently produce a
# metadata file that lacks an entire class.
missing_folders = [folder for folder in base_folders.values() if not os.path.isdir(folder)]
if missing_folders:
    raise FileNotFoundError(f"Clip folder(s) not found: {missing_folders}")

# newline='' is what the csv module expects for files it writes; the explicit
# encoding keeps the file independent of the platform's default locale.
with open(metadata_path, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['filepath', 'label'])

    for label, base_folder in base_folders.items():
        for root, dirs, files in os.walk(base_folder):
            # Sorting dirs in place fixes the order in which os.walk descends;
            # together with sorted(files) this makes the row order reproducible.
            dirs.sort()
            for file in sorted(files):
                if file.endswith('.wav'):
                    filepath = os.path.join(root, file)
                    writer.writerow([filepath, label])

print(f"✅ Metadata CSV saved at: {metadata_path}")


✅ Metadata CSV saved at: /content/drive/MyDrive/combined_metadata.csv


In [None]:
import os

def count_clips(folder):
    """Return how many files ending in '.wav' exist anywhere below `folder`.

    The tree is traversed with os.walk, so files in nested subfolders are
    included. A folder that does not exist yields a count of 0.
    """
    return sum(
        1
        for _root, _dirnames, filenames in os.walk(folder)
        for name in filenames
        if name.endswith('.wav')
    )

# Roots of the 1-second clip trees for the two labels.
autistic_folder = '/content/drive/MyDrive/Audio Dataset CSE400/1sec_dataset'
non_autistic_folder = '/content/drive/MyDrive/Normal Child/1sec_dataset'

print(f"Autistic clips: {count_clips(autistic_folder)}")
print(f"Non-autistic clips: {count_clips(non_autistic_folder)}")


Autistic clips: 2284
Non-autistic clips: 1463


In [None]:
from pydub import AudioSegment
import os

# Raw recordings live directly in input_folder; the 1-second clips go into a
# '1sec_dataset' subfolder, one sub-subfolder per recording.
input_folder = '/content/drive/MyDrive/Audio Dataset CSE400'
output_folder = os.path.join(input_folder, '1sec_dataset')
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    # Skip everything that is not an .m4a recording (including the dataset subfolders).
    if not filename.endswith(".m4a"):
        continue

    audio = AudioSegment.from_file(os.path.join(input_folder, filename), format="m4a")

    chunk_length_ms = 1 * 1000  # clip length: 1 second, in milliseconds

    # One output subfolder per recording, named after the file without its extension.
    base_name, _extension = os.path.splitext(filename)
    file_output_folder = os.path.join(output_folder, base_name)
    os.makedirs(file_output_folder, exist_ok=True)

    # Consecutive, non-overlapping windows; the slice truncates the final
    # window at the end of the recording.
    window_starts = range(0, len(audio), chunk_length_ms)
    for i, chunk_start in enumerate(window_starts):
        window_end = chunk_start + chunk_length_ms
        audio[chunk_start:window_end].export(f"{file_output_folder}/clip_{i:04d}.wav", format="wav")

    print(f"✅ Done splitting: {filename}")


✅ Done splitting: Recording_18.m4a
✅ Done splitting: Recording_19.m4a
✅ Done splitting: Recording_16.m4a
✅ Done splitting: Recording_11.m4a
✅ Done splitting: Recording_5.m4a
✅ Done splitting: Recording_8.m4a
✅ Done splitting: Recording_14.m4a
✅ Done splitting: Recording_13.m4a
✅ Done splitting: Recording_7.m4a
✅ Done splitting: Recording_4.m4a
✅ Done splitting: Recording_10.m4a
✅ Done splitting: Recording_12.m4a
✅ Done splitting: Recording_17.m4a
✅ Done splitting: Recording_15.m4a
✅ Done splitting: Recording_9.m4a
✅ Done splitting: Recording_6.m4a
