In [1]:
!pip install librosa soundfile numpy tqdm -q

In [31]:
from google.colab import files
import os
import shutil
import librosa
import soundfile as sf
import numpy as np
from tqdm import tqdm

In [32]:
# Ask the user for the dataset archive; `uploaded` maps each uploaded
# filename to its content.
print("üì§ Select your violent_audio_dataset.zip file")
uploaded = files.upload()

# Make sure base folder exists
os.makedirs('/content/dataset', exist_ok=True)

# Extract any uploaded zip into /content/dataset.
# The extension check is case-insensitive so "DATA.ZIP" is not silently skipped.
zip_names = [name for name in uploaded if name.lower().endswith('.zip')]
if not zip_names:
    # Without this message an upload containing no archive would look like
    # a successful run that produced an empty dataset folder.
    print("No .zip archive was uploaded; nothing was extracted.")

for filename in zip_names:
    print(f"üì¶ Extracting {filename}...")
    # The format is passed explicitly because unpack_archive's own extension
    # lookup would reject an upper-case ".ZIP" suffix.
    shutil.unpack_archive(filename, '/content/dataset', 'zip')
    print("‚úÖ Extracted!")

# Show what ended up in the dataset folder (sorted for a stable listing).
print("\nüìÅ Contents of /content/dataset:")
for item in sorted(os.listdir('/content/dataset')):
    print("  ", item)


üì§ Select your violent_audio_dataset.zip file


Saving violent_audio_dataset.zip to violent_audio_dataset (1).zip
üì¶ Extracting violent_audio_dataset (1).zip...
‚úÖ Extracted!

üìÅ Contents of /content/dataset:
   content
   violent_audio
   violent_audio_preprocessed


In [33]:
# Folder names whose presence marks a directory as the dataset's category root.
KNOWN_CATEGORIES = {
    'screams_human_distress',
    'gunshots_firearms',
    'glass_breaking',
    'explosions_blasts',
    'sirens_alarms',
    'fighting_impact',
    'aggressive_speech',
    'crash_collision',
}

print("üîç Searching for category folders...\n")
# The third item yielded by os.walk is bound to `_filenames`, not `files`:
# naming it `files` would overwrite the `files` module imported from
# google.colab and break any later `files.upload()` / `files.download()` call.
for root, dirs, _filenames in os.walk('/content/dataset'):
    # Report every directory that contains at least one known category name
    if KNOWN_CATEGORIES.intersection(dirs):
        print("‚úÖ Found category root folder:")
        print("   ", root)
        print("   Subfolders:", dirs)


üîç Searching for category folders...

‚úÖ Found category root folder:
    /content/dataset/content/datasets/violent_audio
   Subfolders: ['explosions_blasts', 'sirens_alarms', 'glass_breaking', 'gunshots_firearms', 'screams_human_distress', 'fighting_impact', 'aggressive_speech', 'crash_collision']


In [35]:
def preprocess_audio(input_path, output_path, target_sr=16000):
    """
    Convert one audio file to a peak-normalized mono recording.

    Steps:
    - Load `input_path` with librosa, resampled to `target_sr` and mixed to mono
    - Divide the samples by their peak absolute value (a tiny epsilon keeps
      the division defined when the signal is all zeros)
    - Write the result to `output_path` with soundfile at `target_sr`

    Args:
        input_path: Path of the audio file to read.
        output_path: Path of the file to write (callers in this notebook
            pass a `.wav` path).
        target_sr: Sample rate used for both loading and writing.

    Returns:
        True when the file was written; False when any step raised, in which
        case the error is printed instead of propagated.
    """
    try:
        waveform, _ = librosa.load(input_path, sr=target_sr, mono=True)
        peak = np.max(np.abs(waveform))
        normalized = waveform / (peak + 1e-8)
        sf.write(output_path, normalized, target_sr)
    except Exception as err:
        # Best-effort: report the failure and let the caller keep going.
        print(f"    ‚ùå Error on {input_path}: {err}")
        return False
    return True

# IMPORTANT: use the path you just saw
dataset_path = "/content/dataset/content/datasets/violent_audio"
output_path = "/content/dataset/violent_audio_preprocessed"

# Input formats to convert; compared against the lower-cased filename so
# files such as "CLIP.WAV" or "shot.Mp3" are not silently skipped.
AUDIO_EXTENSIONS = ('.mp3', '.wav', '.m4a')

os.makedirs(output_path, exist_ok=True)

print("=" * 70)
print("üéµ AUDIO PREPROCESSING STARTED")
print("=" * 70)

total_files = 0
successful = 0

# Category folders under your root
categories = sorted(
    d for d in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, d))
)

print(f"\nüìä Processing {len(categories)} categories\n")

for category in categories:
    category_path = os.path.join(dataset_path, category)
    output_category = os.path.join(output_path, category)
    # Mirror the category folder structure under the output root.
    os.makedirs(output_category, exist_ok=True)

    # Sorted so the processing order does not depend on os.listdir's
    # arbitrary ordering.
    audio_files = sorted(
        f for f in os.listdir(category_path)
        if f.lower().endswith(AUDIO_EXTENSIONS)
    )

    print(f"üìÇ {category}: {len(audio_files)} files")

    for audio_file in tqdm(audio_files, desc="  Processing", leave=False):
        total_files += 1
        input_file = os.path.join(category_path, audio_file)
        # Every output is a .wav named after the input's stem.
        # NOTE: two inputs in the same category that differ only by extension
        # (e.g. "a.mp3" and "a.wav") map to the same output file, so the
        # later one overwrites the earlier one.
        output_file = os.path.join(
            output_category,
            os.path.splitext(audio_file)[0] + '.wav'
        )

        # preprocess_audio reports its own errors and returns False on failure.
        if preprocess_audio(input_file, output_file):
            successful += 1

print("\n" + "=" * 70)
print("‚úÖ PREPROCESSING COMPLETE!")
print("=" * 70)
print(f"Total files: {total_files}")
print(f"‚úÖ Successful: {successful}")
# Make failures visible in the summary instead of leaving them implicit.
print(f"Failed: {total_files - successful}")
print(f"üìÅ Saved to: {output_path}")
print("=" * 70)


üéµ AUDIO PREPROCESSING STARTED

üìä Processing 8 categories

üìÇ aggressive_speech: 76 files




üìÇ crash_collision: 93 files




üìÇ explosions_blasts: 130 files




üìÇ fighting_impact: 108 files




üìÇ glass_breaking: 129 files




üìÇ gunshots_firearms: 91 files




üìÇ screams_human_distress: 151 files




üìÇ sirens_alarms: 126 files


                                                               


‚úÖ PREPROCESSING COMPLETE!
Total files: 904
‚úÖ Successful: 904
üìÅ Saved to: /content/dataset/violent_audio_preprocessed




In [36]:
# Sanity check: report how many .wav files each category folder of the
# preprocessed output contains.
base = "/content/dataset/violent_audio_preprocessed"
print("üìÅ Categories and file counts:\n")
for cat in sorted(os.listdir(base)):
    category_dir = os.path.join(base, cat)
    # Skip anything at the top level that is not a category folder.
    if not os.path.isdir(category_dir):
        continue
    wav_count = sum(
        1 for entry in os.listdir(category_dir) if entry.endswith(".wav")
    )
    print(f"- {cat}: {wav_count} files")


üìÅ Categories and file counts:

- aggressive_speech: 76 files
- crash_collision: 93 files
- explosions_blasts: 130 files
- fighting_impact: 108 files
- glass_breaking: 129 files
- gunshots_firearms: 91 files
- screams_human_distress: 151 files
- sirens_alarms: 126 files


In [41]:
from google.colab import files
import shutil

# Bundle the preprocessed folder into /content/violent_audio_preprocessed.zip.
# Archive entries are stored relative to root_dir, so the zip contains a
# top-level "violent_audio_preprocessed" folder.
shutil.make_archive(
    base_name="/content/violent_audio_preprocessed",  # ".zip" is appended
    format="zip",
    root_dir="/content/dataset",
    base_dir="violent_audio_preprocessed",
)

# Hand the finished archive to the browser for download.
files.download("/content/violent_audio_preprocessed.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úÖ Download started
