In [1]:
import os

# Define all paths
BASE_DIR = r"D:\illegal-logging-detector"
DATA_DIR = os.path.join(BASE_DIR, "data", "raw")

categories = ["chainsaw", "truck", "machinery", "birds", "rain", "wind", "forest_ambience"]

# Check all folders exist
print("Checking folder structure...")
all_good = True
for cat in categories:
    path = os.path.join(DATA_DIR, cat)
    if os.path.exists(path):
        print(f"‚úÖ {cat}")
    else:
        print(f"‚ùå {cat} - MISSING")
        all_good = False

if all_good:
    print("\n‚úÖ All folders exist, ready for data collection!")
else:
    print("\n‚ùå Some folders are missing, run mkdir commands again")

Checking folder structure...
‚úÖ chainsaw
‚úÖ truck
‚úÖ machinery
‚úÖ birds
‚úÖ rain
‚úÖ wind
‚úÖ forest_ambience

‚úÖ All folders exist, ready for data collection!


In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

CLIENT_ID = os.getenv("FREESOUND_CLIENT_ID")
API_KEY = os.getenv("FREESOUND_API_KEY")

if CLIENT_ID and API_KEY:
    print("‚úÖ API keys loaded successfully")
    print(f"‚úÖ Client ID starts with: {CLIENT_ID[:4]}...")
    print(f"‚úÖ API Key starts with: {API_KEY[:4]}...")
else:
    print("‚ùå Keys not found, check your .env file")

‚úÖ API keys loaded successfully
‚úÖ Client ID starts with: SxK8...
‚úÖ API Key starts with: EVAu...


In [2]:
import requests
import os
import time
from dotenv import load_dotenv

load_dotenv()
API_KEY = os.getenv("FREESOUND_API_KEY")

BASE_DIR = r"D:\illegal-logging-detector"
DATA_DIR = os.path.join(BASE_DIR, "data", "raw")

# Search terms for each category
search_queries = {
    "chainsaw":        "chainsaw forest",
    "truck":           "logging truck engine",
    "machinery":       "heavy machinery outdoor",
    "birds":           "forest birds ambience",
    "rain":            "rain forest outdoor",
    "wind":            "wind forest trees",
    "forest_ambience": "forest ambience nature"
}

# How many clips per category
CLIPS_PER_CATEGORY = 30  # start small, we can always download more

def search_freesound(query, num_results=30):
    url = "https://freesound.org/apiv2/search/text/"
    params = {
        "query": query,
        "token": API_KEY,
        "fields": "id,name,previews,duration",
        "filter": "duration:[3 TO 30]",  # only 3-30 second clips
        "page_size": num_results,
        "format": "json"
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json().get("results", [])
    else:
        print(f"‚ùå Error {response.status_code}: {response.text}")
        return []

def download_clip(preview_url, save_path):
    try:
        r = requests.get(preview_url, stream=True)
        with open(save_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)
        return True
    except Exception as e:
        print(f"‚ùå Download failed: {e}")
        return False

# Main download loop
for category, query in search_queries.items():
    print(f"\nüìÇ Downloading: {category}")
    folder = os.path.join(DATA_DIR, category)
    results = search_freesound(query, CLIPS_PER_CATEGORY)
    
    if not results:
        print(f"‚ùå No results found for {category}")
        continue

    downloaded = 0
    for sound in results:
        preview_url = sound["previews"].get("preview-hq-mp3") or sound["previews"].get("preview-lq-mp3")
        if not preview_url:
            continue
        
        filename = f"{category}_{sound['id']}.mp3"
        save_path = os.path.join(folder, filename)
        
        if os.path.exists(save_path):
            print(f"  ‚è≠Ô∏è Already exists: {filename}")
            downloaded += 1
            continue
        
        success = download_clip(preview_url, save_path)
        if success:
            downloaded += 1
            print(f"  ‚úÖ {downloaded}/{len(results)} {filename}")
        
        time.sleep(0.3)  # be polite to the API
    
    print(f"‚úÖ {category} done: {downloaded} clips downloaded")

print("\nüéâ All downloads complete!")


üìÇ Downloading: chainsaw
  ‚úÖ 1/3 chainsaw_669382.mp3
  ‚úÖ 2/3 chainsaw_670300.mp3
  ‚úÖ 3/3 chainsaw_328097.mp3
‚úÖ chainsaw done: 3 clips downloaded

üìÇ Downloading: truck
‚ùå No results found for truck

üìÇ Downloading: machinery
‚ùå No results found for machinery

üìÇ Downloading: birds
  ‚úÖ 1/30 birds_800712.mp3
  ‚úÖ 2/30 birds_467418.mp3
  ‚úÖ 3/30 birds_464477.mp3
  ‚úÖ 4/30 birds_221758.mp3
  ‚úÖ 5/30 birds_234534.mp3
  ‚úÖ 6/30 birds_632754.mp3
  ‚úÖ 7/30 birds_698356.mp3
  ‚úÖ 8/30 birds_456122.mp3
  ‚úÖ 9/30 birds_619325.mp3
  ‚úÖ 10/30 birds_278870.mp3
  ‚úÖ 11/30 birds_616231.mp3
  ‚úÖ 12/30 birds_485371.mp3
  ‚úÖ 13/30 birds_484811.mp3
  ‚úÖ 14/30 birds_386869.mp3
  ‚úÖ 15/30 birds_427085.mp3
  ‚úÖ 16/30 birds_570492.mp3
  ‚úÖ 17/30 birds_585902.mp3
  ‚úÖ 18/30 birds_723913.mp3
  ‚úÖ 19/30 birds_241995.mp3
  ‚úÖ 20/30 birds_241994.mp3
  ‚úÖ 21/30 birds_642763.mp3
  ‚úÖ 22/30 birds_558198.mp3
  ‚úÖ 23/30 birds_632346.mp3
  ‚úÖ 24/30 birds_649959.mp3
  ‚úÖ 25/30 

In [3]:
# Fixed search queries for failed categories
failed_queries = {
    "chainsaw": ["chainsaw", "chain saw cutting", "wood cutting saw"],
    "truck":    ["truck engine", "diesel engine", "heavy vehicle"],
    "rain":     ["rain", "rainfall", "rain drops outdoor"]
}

for category, queries in failed_queries.items():
    folder = os.path.join(DATA_DIR, category)
    existing = len(os.listdir(folder))
    print(f"\nüìÇ {category} (currently has {existing} clips)")
    
    downloaded = existing
    for query in queries:
        if downloaded >= 30:
            break
            
        print(f"  üîç Searching: {query}")
        results = search_freesound(query, 20)
        
        if not results:
            print(f"  ‚ùå No results for: {query}")
            continue
        
        for sound in results:
            if downloaded >= 30:
                break
                
            preview_url = sound["previews"].get("preview-hq-mp3") or sound["previews"].get("preview-lq-mp3")
            if not preview_url:
                continue
            
            filename = f"{category}_{sound['id']}.mp3"
            save_path = os.path.join(folder, filename)
            
            if os.path.exists(save_path):
                continue
            
            success = download_clip(preview_url, save_path)
            if success:
                downloaded += 1
                print(f"  ‚úÖ {downloaded}/30 {filename}")
            
            time.sleep(0.3)
    
    print(f"‚úÖ {category} done: {downloaded} clips total")

print("\nüéâ Fix downloads complete!")


üìÇ chainsaw (currently has 3 clips)
  üîç Searching: chainsaw
  ‚úÖ 4/30 chainsaw_453248.mp3
  ‚úÖ 5/30 chainsaw_453255.mp3
  ‚úÖ 6/30 chainsaw_453254.mp3
  ‚úÖ 7/30 chainsaw_453242.mp3
  ‚úÖ 8/30 chainsaw_453253.mp3
  ‚úÖ 9/30 chainsaw_453247.mp3
  ‚úÖ 10/30 chainsaw_453241.mp3
  ‚úÖ 11/30 chainsaw_185576.mp3
  ‚úÖ 12/30 chainsaw_453259.mp3
  ‚úÖ 13/30 chainsaw_431740.mp3
  ‚úÖ 14/30 chainsaw_185578.mp3
  ‚úÖ 15/30 chainsaw_431739.mp3
  ‚úÖ 16/30 chainsaw_185580.mp3
  ‚úÖ 17/30 chainsaw_453256.mp3
  ‚úÖ 18/30 chainsaw_431742.mp3
  ‚úÖ 19/30 chainsaw_810390.mp3
  ‚úÖ 20/30 chainsaw_810391.mp3
  ‚úÖ 21/30 chainsaw_505193.mp3
  ‚úÖ 22/30 chainsaw_94718.mp3
  ‚úÖ 23/30 chainsaw_531550.mp3
  üîç Searching: chain saw cutting
  ‚úÖ 24/30 chainsaw_463736.mp3
  ‚úÖ 25/30 chainsaw_463732.mp3
  ‚úÖ 26/30 chainsaw_463729.mp3
  ‚úÖ 27/30 chainsaw_463733.mp3
  ‚úÖ 28/30 chainsaw_463735.mp3
  ‚úÖ 29/30 chainsaw_425515.mp3
  ‚úÖ 30/30 chainsaw_523434.mp3
‚úÖ chainsaw done: 30 clips total

üìÇ t

In [4]:
import os

BASE_DIR = r"D:\illegal-logging-detector"
DATA_DIR = os.path.join(BASE_DIR, "data", "raw")

categories = ["chainsaw", "truck", "machinery", "birds", "rain", "wind", "forest_ambience"]

print("üìä Dataset Summary:")
print("-" * 30)
total = 0
for cat in categories:
    folder = os.path.join(DATA_DIR, cat)
    count = len(os.listdir(folder))
    total += count
    status = "‚úÖ" if count >= 20 else "‚ö†Ô∏è" if count > 0 else "‚ùå"
    print(f"{status} {cat}: {count} clips")

print("-" * 30)
print(f"üìÅ Total clips: {total}")

üìä Dataset Summary:
------------------------------
‚úÖ chainsaw: 30 clips
‚úÖ truck: 30 clips
‚ùå machinery: 0 clips
‚úÖ birds: 30 clips
‚úÖ rain: 30 clips
‚úÖ wind: 30 clips
‚úÖ forest_ambience: 30 clips
------------------------------
üìÅ Total clips: 180


In [5]:
machinery_queries = [
    "excavator engine",
    "bulldozer outdoor",
    "generator machine outdoor",
    "tractor engine field",
    "heavy equipment motor"
]

category = "machinery"
folder = os.path.join(DATA_DIR, category)
downloaded = 0

print(f"üìÇ Downloading: {category}")

for query in machinery_queries:
    if downloaded >= 30:
        break
    
    print(f"  üîç Searching: {query}")
    results = search_freesound(query, 10)
    
    if not results:
        print(f"  ‚ùå No results for: {query}")
        continue
    
    for sound in results:
        if downloaded >= 30:
            break
        
        preview_url = sound["previews"].get("preview-hq-mp3") or sound["previews"].get("preview-lq-mp3")
        if not preview_url:
            continue
        
        filename = f"{category}_{sound['id']}.mp3"
        save_path = os.path.join(folder, filename)
        
        if os.path.exists(save_path):
            continue
        
        success = download_clip(preview_url, save_path)
        if success:
            downloaded += 1
            print(f"  ‚úÖ {downloaded}/30 {filename}")
        
        time.sleep(0.3)

print(f"\n‚úÖ machinery done: {downloaded} clips total")


üìÇ Downloading: machinery
  üîç Searching: excavator engine
  ‚úÖ 1/30 machinery_606943.mp3
  ‚úÖ 2/30 machinery_606941.mp3
  ‚úÖ 3/30 machinery_332633.mp3
  ‚úÖ 4/30 machinery_332632.mp3
  ‚úÖ 5/30 machinery_346035.mp3
  üîç Searching: bulldozer outdoor
  ‚ùå No results for: bulldozer outdoor
  üîç Searching: generator machine outdoor
  ‚ùå No results for: generator machine outdoor
  üîç Searching: tractor engine field
  ‚úÖ 6/30 machinery_57548.mp3
  üîç Searching: heavy equipment motor
  ‚úÖ 7/30 machinery_256807.mp3
  ‚úÖ 8/30 machinery_256809.mp3

‚úÖ machinery done: 8 clips total


In [6]:
extra_machinery_queries = [
    "engine running",
    "motor running outdoor",
    "diesel motor",
    "power tool outdoor",
    "industrial machine",
    "crane machine",
    "forklift engine",
    "compressor machine",
    "sawmill",
    "wood chipper"
]

category = "machinery"
folder = os.path.join(DATA_DIR, category)
downloaded = len(os.listdir(folder))
print(f"Starting from {downloaded} existing clips")

for query in extra_machinery_queries:
    if downloaded >= 30:
        break
    
    print(f"  üîç Searching: {query}")
    results = search_freesound(query, 15)
    
    if not results:
        print(f"  ‚ùå No results for: {query}")
        continue
    
    for sound in results:
        if downloaded >= 30:
            break
        
        preview_url = sound["previews"].get("preview-hq-mp3") or sound["previews"].get("preview-lq-mp3")
        if not preview_url:
            continue
        
        filename = f"{category}_{sound['id']}.mp3"
        save_path = os.path.join(folder, filename)
        
        if os.path.exists(save_path):
            continue
        
        success = download_clip(preview_url, save_path)
        if success:
            downloaded += 1
            print(f"  ‚úÖ {downloaded}/30 {filename}")
        
        time.sleep(0.3)

print(f"\n‚úÖ machinery final count: {downloaded} clips")

Starting from 8 existing clips
  üîç Searching: engine running
  ‚úÖ 9/30 machinery_495384.mp3
  ‚úÖ 10/30 machinery_495383.mp3
  ‚úÖ 11/30 machinery_181565.mp3
  ‚úÖ 12/30 machinery_325817.mp3
  ‚úÖ 13/30 machinery_181564.mp3
  ‚úÖ 14/30 machinery_50898.mp3
  ‚úÖ 15/30 machinery_186962.mp3
  ‚úÖ 16/30 machinery_107038.mp3
  ‚úÖ 17/30 machinery_209864.mp3
  ‚úÖ 18/30 machinery_14599.mp3
  ‚úÖ 19/30 machinery_440508.mp3
  ‚úÖ 20/30 machinery_700145.mp3
  ‚úÖ 21/30 machinery_319152.mp3
  ‚úÖ 22/30 machinery_182329.mp3
  ‚úÖ 23/30 machinery_325807.mp3
  üîç Searching: motor running outdoor
  ‚úÖ 24/30 machinery_839672.mp3
  ‚úÖ 25/30 machinery_839671.mp3
  ‚úÖ 26/30 machinery_839670.mp3
  ‚úÖ 27/30 machinery_839669.mp3
  ‚úÖ 28/30 machinery_839668.mp3
  ‚úÖ 29/30 machinery_839666.mp3
  ‚úÖ 30/30 machinery_839667.mp3

‚úÖ machinery final count: 30 clips


In [7]:
extra_machinery_queries = [
    "engine running",
    "motor running outdoor",
    "diesel motor",
    "power tool outdoor",
    "industrial machine",
    "crane machine",
    "forklift engine",
    "compressor machine",
    "sawmill",
    "wood chipper"
]

category = "machinery"
folder = os.path.join(DATA_DIR, category)
downloaded = len(os.listdir(folder))
print(f"Starting from {downloaded} existing clips")

for query in extra_machinery_queries:
    if downloaded >= 30:
        break
    
    print(f"  üîç Searching: {query}")
    results = search_freesound(query, 15)
    
    if not results:
        print(f"  ‚ùå No results for: {query}")
        continue
    
    for sound in results:
        if downloaded >= 30:
            break
        
        preview_url = sound["previews"].get("preview-hq-mp3") or sound["previews"].get("preview-lq-mp3")
        if not preview_url:
            continue
        
        filename = f"{category}_{sound['id']}.mp3"
        save_path = os.path.join(folder, filename)
        
        if os.path.exists(save_path):
            continue
        
        success = download_clip(preview_url, save_path)
        if success:
            downloaded += 1
            print(f"  ‚úÖ {downloaded}/30 {filename}")
        
        time.sleep(0.3)

print(f"\n‚úÖ machinery final count: {downloaded} clips")

Starting from 30 existing clips

‚úÖ machinery final count: 30 clips


In [8]:
print("üìä Final Dataset Summary:")
print("-" * 30)
total = 0
for cat in categories:
    folder = os.path.join(DATA_DIR, cat)
    count = len(os.listdir(folder))
    total += count
    status = "‚úÖ" if count >= 20 else "‚ö†Ô∏è" if count > 0 else "‚ùå"
    print(f"{status} {cat}: {count} clips")

print("-" * 30)
print(f"üìÅ Total clips: {total}")
print(f"\n{'‚úÖ Dataset ready!' if total >= 180 else '‚ö†Ô∏è Need more clips'}")

üìä Final Dataset Summary:
------------------------------
‚úÖ chainsaw: 30 clips
‚úÖ truck: 30 clips
‚úÖ machinery: 30 clips
‚úÖ birds: 30 clips
‚úÖ rain: 30 clips
‚úÖ wind: 30 clips
‚úÖ forest_ambience: 30 clips
------------------------------
üìÅ Total clips: 210

‚úÖ Dataset ready!
