In [2]:
import requests
import os
import time
import json
from datetime import datetime

# Unsplash API access key, read from the UNSPLASH_ACCESS_KEY environment
# variable so the credential is never stored in the notebook itself.
# NOTE: a key that was previously pasted into a shared notebook should be
# treated as leaked and rotated in the Unsplash developer dashboard.
ACCESS_KEY = os.environ.get("UNSPLASH_ACCESS_KEY", "")
if not ACCESS_KEY:
    # Warn instead of raising so the cell can still define the functions below.
    print("WARNING: UNSPLASH_ACCESS_KEY is not set; Unsplash API requests will be rejected.")

def scrape_unsplash_violence_dataset(num_violence=300, num_normal=300):
    """
    Build a two-class image dataset from Unsplash photo search.

    Creates violence_dataset/{violence,normal,metadata}, then calls
    download_images_by_queries once per class with a fixed list of search
    terms. Progress and a final tally are printed, and a JSON summary is
    written to violence_dataset/metadata/download_summary.json.

    Args:
        num_violence: target number of images for the violence class.
        num_normal: target number of images for the normal class.

    Returns:
        None. Results are reported through stdout and files on disk.
    """
    search_endpoint = "https://api.unsplash.com/search/photos"
    request_headers = {
        "Authorization": f"Client-ID {ACCESS_KEY}",
        "User-Agent": "Mozilla/5.0"
    }

    # Make sure every output directory exists before any download starts
    for folder in (
        "violence_dataset/violence",
        "violence_dataset/normal",
        "violence_dataset/metadata",
    ):
        os.makedirs(folder, exist_ok=True)

    # Search terms used to collect the violence/fight class
    fight_terms = [
        "street fight",
        "people fighting",
        "physical fight",
        "combat action",
        "aggressive contact",
        "boxing match",
        "MMA fight",
        "crowd violence",
        "fight scene"
    ]

    # Search terms used to collect the normal/non-violence class
    everyday_terms = [
        "people walking",
        "crowd normal",
        "people talking",
        "street scene",
        "people standing",
        "urban scene",
        "people gathering",
        "city street"
    ]

    rule = "=" * 60

    print(rule)
    print("VIOLENCE DETECTION DATASET SCRAPER")
    print(rule)
    print(f"Target: {num_violence} violence + {num_normal} normal images")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(rule)

    # Violence class first
    violence_count = download_images_by_queries(
        search_endpoint, request_headers, fight_terms,
        num_violence, "violence_dataset/violence",
        label="VIOLENCE"
    )

    print("\n" + rule)

    # Then the normal class
    normal_count = download_images_by_queries(
        search_endpoint, request_headers, everyday_terms,
        num_normal, "violence_dataset/normal",
        label="NORMAL"
    )

    total_count = violence_count + normal_count
    total_target = num_violence + num_normal

    print("\n" + rule)
    print("DOWNLOAD COMPLETE!")
    print(rule)
    print(f"✓ Violence images: {violence_count}/{num_violence}")
    print(f"✓ Normal images: {normal_count}/{num_normal}")
    print(f"✓ Total images: {total_count}/{total_target}")
    print(f"✓ Location: violence_dataset/")
    print(rule)

    # Persist a small machine-readable record of this run
    summary = {
        "total_images": total_count,
        "violence_images": violence_count,
        "normal_images": normal_count,
        "downloaded_at": datetime.now().isoformat(),
        "source": "Unsplash API",
        "api_key_used": "Yes"
    }

    with open("violence_dataset/metadata/download_summary.json", "w") as summary_file:
        json.dump(summary, summary_file, indent=2)

    print(f"✓ Summary saved to: violence_dataset/metadata/download_summary.json\n")


def _save_photo(photo, index, query, label, output_folder, metadata_folder):
    """Download one search result and write its image and metadata files.

    Every field needed for the metadata record is read from ``photo`` before
    any network or disk work, so a malformed result never leaves an image on
    disk without its metadata.

    Returns:
        True when both files were written, False when the image request did
        not return HTTP 200. Lookup, network and file errors propagate to the
        caller.
    """
    img_url = photo['urls']['regular']
    photo_id = photo['id']
    photographer = photo['user']['name']
    unsplash_link = photo['links']['html']

    img_response = requests.get(img_url, timeout=15)
    if img_response.status_code != 200:
        return False

    filename = f"{label}_{index:04d}.jpg"
    with open(os.path.join(output_folder, filename), 'wb') as f:
        f.write(img_response.content)

    metadata = {
        "filename": filename,
        "label": label,
        "photo_id": photo_id,
        "photographer": photographer,
        "query": query,
        "image_url": img_url,
        "unsplash_link": unsplash_link,
        "downloaded_at": datetime.now().isoformat()
    }
    metadata_file = os.path.join(metadata_folder, f"{label}_{index:04d}.json")
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2)

    return True


def download_images_by_queries(base_url, headers, queries, target_count, output_folder, label="",
                               max_per_query=50, metadata_folder="violence_dataset/metadata"):
    """
    Download images using multiple search queries.

    Queries are tried in order; each contributes at most ``max_per_query``
    images, and the whole call stops once ``target_count`` images are saved.
    A photo id that was already saved during this call is skipped, so a photo
    returned by several queries is stored only once.

    Args:
        base_url: search endpoint URL.
        headers: HTTP headers sent with every search request.
        queries: search strings, tried in order.
        target_count: total number of images to save across all queries.
        output_folder: directory that receives ``{label}_NNNN.jpg`` files.
        label: prefix for file names; also stored in each metadata record.
        max_per_query: cap on images taken from a single query.
        metadata_folder: directory that receives ``{label}_NNNN.json`` files.

    Returns:
        Number of images saved (each with a matching metadata file).
    """
    # Search statuses that will repeat for every later request (bad key,
    # forbidden / rate limit exceeded), so trying further queries is pointless.
    fatal_statuses = (401, 403, 429)

    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(metadata_folder, exist_ok=True)

    downloaded = 0
    seen_ids = set()

    for query in queries:
        if downloaded >= target_count:
            break

        print(f"\n[{label}] Searching: '{query}'")

        page = 1
        query_downloaded = 0

        while downloaded < target_count and query_downloaded < max_per_query:
            params = {
                "query": query,
                "per_page": 30,
                "page": page,
                "order_by": "relevant"
            }

            try:
                print(f"  └─ Page {page}...", end=" ", flush=True)
                response = requests.get(base_url, headers=headers, params=params, timeout=10)

                if response.status_code in fatal_statuses:
                    print(f"❌ Error {response.status_code} (authorization or rate limit) - aborting remaining queries")
                    return downloaded

                if response.status_code != 200:
                    print(f"❌ Error {response.status_code}")
                    break

                data = response.json()
                results = data.get('results', [])

                if not results:
                    print("No more images")
                    break

                print(f"Found {len(results)} images", end="")

                for photo in results:
                    if downloaded >= target_count or query_downloaded >= max_per_query:
                        break

                    try:
                        photo_id = photo['id']
                        if photo_id in seen_ids:
                            # Already saved via an earlier page or query
                            continue
                        saved = _save_photo(
                            photo, downloaded, query, label,
                            output_folder, metadata_folder
                        )
                    except Exception:
                        # Best effort: one bad photo must not stop the run
                        saved = False

                    if not saved:
                        print("x", end="", flush=True)
                        continue

                    seen_ids.add(photo_id)
                    downloaded += 1
                    query_downloaded += 1
                    print(".", end="", flush=True)
                    time.sleep(0.3)  # Gentle delay

                print(f" ✓ ({query_downloaded}/{max_per_query})")

                # Skip the extra request that would only return an empty page
                total_pages = data.get('total_pages')
                if isinstance(total_pages, int) and page >= total_pages:
                    break

                page += 1
                time.sleep(0.5)

            except Exception as e:
                print(f"❌ Error: {str(e)[:40]}")
                break

    return downloaded


# Run the scraper with a target of 300 violence + 300 normal images.
# This performs live network requests and writes files under violence_dataset/.
scrape_unsplash_violence_dataset(num_violence=300, num_normal=300)


VIOLENCE DETECTION DATASET SCRAPER
Target: 300 violence + 300 normal images
Started: 2025-11-28 17:02:29

[VIOLENCE] Searching: 'street fight'
  └─ Page 1... Found 30 images.............................. ✓ (30/50)
  └─ Page 2... Found 30 images.................... ✓ (50/50)

[VIOLENCE] Searching: 'people fighting'
  └─ Page 1... Found 30 images.............................. ✓ (30/50)
  └─ Page 2... Found 30 images.................... ✓ (50/50)

[VIOLENCE] Searching: 'physical fight'
  └─ Page 1... Found 30 images.............................. ✓ (30/50)
  └─ Page 2... Found 30 images.................... ✓ (50/50)

[VIOLENCE] Searching: 'combat action'
  └─ Page 1... Found 30 images.............................. ✓ (30/50)
  └─ Page 2... Found 30 images.................... ✓ (50/50)

[VIOLENCE] Searching: 'aggressive contact'
  └─ Page 1... Found 1 images. ✓ (1/50)
  └─ Page 2... No more images

[VIOLENCE] Searching: 'boxing match'
  └─ Page 1... Found 30 images...........................

In [3]:
# Download the dataset as ZIP file

import shutil
import os

# Bundle the violence_dataset folder into violence_dataset.zip in the working directory
print("Compressing dataset...")
shutil.make_archive('violence_dataset', 'zip', root_dir='.', base_dir='violence_dataset')

# Report the archive size, converted from bytes to megabytes
archive_bytes = os.path.getsize('violence_dataset.zip')
size_mb = archive_bytes / (1024 * 1024)
print(f"\n✓ Dataset compressed!")
print(f"File size: {size_mb:.2f} MB")
print("\nReady to download: violence_dataset.zip")


Compressing dataset...

✓ Dataset compressed!
File size: 114.66 MB

Ready to download: violence_dataset.zip


In [6]:
import shutil
from google.colab import files

# Rebuild the archive from the dataset folder, then pass it to Colab's files.download
shutil.make_archive("violence_dataset", 'zip', root_dir='.', base_dir="violence_dataset")
files.download("violence_dataset.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>