In [2]:
import os
import subprocess
import zipfile
from pathlib import Path
import pandas as pd

# Create directories
os.makedirs('datasets/raw', exist_ok=True)
os.makedirs('datasets/processed', exist_ok=True)
os.makedirs('datasets/metadata', exist_ok=True)

In [3]:
def download_kaggle_dataset(dataset_path, output_name):
    """
    Download and extract Kaggle dataset
    """
    output_dir = f'datasets/raw/{output_name}'
    os.makedirs(output_dir, exist_ok=True)
    
    print(f"\n{'='*60}")
    print(f"Downloading: {dataset_path}")
    print(f"Output: {output_dir}")
    print('='*60)
    
    result = subprocess.run(
        ['kaggle', 'datasets', 'download', '-d', dataset_path, '-p', output_dir],
        capture_output=True,
        text=True
    )
    
    if result.returncode == 0:
        print(f"✅ Downloaded successfully")
        
        # Unzip all zip files
        zip_files = list(Path(output_dir).glob('*.zip'))
        for zip_file in zip_files:
            print(f"Extracting {zip_file.name}...")
            try:
                with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                    zip_ref.extractall(output_dir)
                zip_file.unlink()
                print(f"✅ Extracted and removed zip")
            except Exception as e:
                print(f"⚠️  Error extracting: {e}")
        
        # Show contents
        print(f"\nContents of {output_dir}:")
        items = list(Path(output_dir).iterdir())
        for item in items[:10]:  # Show first 10 items
            size = item.stat().st_size / (1024*1024)  # MB
            print(f"  - {item.name} ({size:.2f} MB)")
        if len(items) > 10:
            print(f"  ... and {len(items)-10} more items")
        
        return True
    else:
        print(f"❌ Error downloading: {result.stderr}")
        return False

In [4]:
# Download the main datasets we identified
datasets = {
    'nsfw_detection': 'jjeevanprakash/nsfw-detection',
    'violence_safe': 'kartikeybartwal/graphical-violence-and-safe-images-dataset',
    'hate_speech': 'thedevastator/hate-speech-and-offensive-language-detection',
}

results = {}

for name, kaggle_path in datasets.items():
    success = download_kaggle_dataset(kaggle_path, name)
    results[name] = success
    
print("\n" + "="*60)
print("DOWNLOAD SUMMARY")
print("="*60)
for name, success in results.items():
    status = "✅ Success" if success else "❌ Failed"
    print(f"{name}: {status}")


Downloading: jjeevanprakash/nsfw-detection
Output: datasets/raw/nsfw_detection
✅ Downloaded successfully
Extracting nsfw-detection.zip...
✅ Extracted and removed zip

Contents of datasets/raw/nsfw_detection:
  - out (0.00 MB)

Downloading: kartikeybartwal/graphical-violence-and-safe-images-dataset
Output: datasets/raw/violence_safe
✅ Downloaded successfully
Extracting graphical-violence-and-safe-images-dataset.zip...
✅ Extracted and removed zip

Contents of datasets/raw/violence_safe:
  - Graphically Violent Images (0.00 MB)
  - Graphically Safe Images (0.07 MB)

Downloading: thedevastator/hate-speech-and-offensive-language-detection
Output: datasets/raw/hate_speech
✅ Downloaded successfully
Extracting hate-speech-and-offensive-language-detection.zip...
✅ Extracted and removed zip

Contents of datasets/raw/hate_speech:
  - train.csv (2.30 MB)

DOWNLOAD SUMMARY
nsfw_detection: ✅ Success
violence_safe: ✅ Success
hate_speech: ✅ Success


In [5]:
# Check what we got
print("\nExploring downloaded datasets...\n")

for dataset_name in ['nsfw_detection', 'violence_safe', 'hate_speech']:
    dataset_path = Path(f'datasets/raw/{dataset_name}')
    
    if not dataset_path.exists():
        print(f"⚠️  {dataset_name} not found")
        continue
    
    print(f"\n{'='*60}")
    print(f"Dataset: {dataset_name}")
    print('='*60)
    
    # Count files by extension
    file_types = {}
    all_files = list(dataset_path.rglob('*'))
    
    for f in all_files:
        if f.is_file():
            ext = f.suffix.lower()
            file_types[ext] = file_types.get(ext, 0) + 1
    
    print(f"Total items: {len(all_files)}")
    print(f"File types: {file_types}")
    
    # Look for common patterns
    subdirs = [d for d in dataset_path.iterdir() if d.is_dir()]
    if subdirs:
        print(f"Subdirectories: {[d.name for d in subdirs[:5]]}")
    
    # Check for CSV files
    csv_files = list(dataset_path.rglob('*.csv'))
    if csv_files:
        print(f"\nCSV files found: {[f.name for f in csv_files]}")
        # Load first CSV to see structure
        df = pd.read_csv(csv_files[0])
        print(f"\nFirst CSV structure ({csv_files[0].name}):")
        print(f"  Rows: {len(df)}")
        print(f"  Columns: {list(df.columns)}")
        print(f"\nFirst few rows:")
        print(df.head(3))


Exploring downloaded datasets...


Dataset: nsfw_detection
Total items: 27741
File types: {'.jpg': 27532, '.png': 199}
Subdirectories: ['out']

Dataset: violence_safe
Total items: 1170
File types: {'.jpg': 64, '.jpeg': 899, '.png': 198, '.webp': 1, '.gif': 5, '.py': 1}
Subdirectories: ['Graphically Violent Images', 'Graphically Safe Images']

Dataset: hate_speech
Total items: 1
File types: {'.csv': 1}

CSV files found: ['train.csv']

First CSV structure (train.csv):
  Rows: 24783
  Columns: ['count', 'hate_speech_count', 'offensive_language_count', 'neither_count', 'class', 'tweet']

First few rows:
   count  hate_speech_count  offensive_language_count  neither_count  class  \
0      3                  0                         0              3      2   
1      3                  0                         3              0      1   
2      3                  0                         3              0      1   

                                               tweet  
0  !!! RT @mayasolov

In [6]:
# Save metadata about what we downloaded
metadata = {
    'datasets': []
}

for name, kaggle_path in datasets.items():
    dataset_path = Path(f'datasets/raw/{name}')
    
    if dataset_path.exists():
        all_files = list(dataset_path.rglob('*'))
        image_files = [f for f in all_files if f.suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif']]
        
        metadata['datasets'].append({
            'name': name,
            'kaggle_path': kaggle_path,
            'local_path': str(dataset_path),
            'total_files': len(all_files),
            'image_count': len(image_files),
            'downloaded': True
        })
    else:
        metadata['datasets'].append({
            'name': name,
            'kaggle_path': kaggle_path,
            'downloaded': False
        })

import json
with open('datasets/metadata/download_info.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("\n✅ Metadata saved to datasets/metadata/download_info.json")

# Print summary
print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)
for ds in metadata['datasets']:
    if ds['downloaded']:
        print(f"\n{ds['name']}:")
        print(f"  Images: {ds['image_count']}")
        print(f"  Total files: {ds['total_files']}")
    else:
        print(f"\n{ds['name']}: ❌ Not downloaded")


✅ Metadata saved to datasets/metadata/download_info.json

FINAL SUMMARY

nsfw_detection:
  Images: 27731
  Total files: 27741

violence_safe:
  Images: 1166
  Total files: 1170

hate_speech:
  Images: 0
  Total files: 1
