# 🚀 AI Visual Search - Custom Dataset Generator

Since public datasets are broken/private, we will **create our own**!

This notebook downloads images directly from the web for the parts YOU want.

**Parts Included**: Battery, Brake Pads, Engine, Alternator, etc.

---

## Setup Instructions

1. **Enable GPU**: Runtime → Change runtime type → GPU → Save
2. **Run all cells** in order
3. **Wait for download** (takes ~5-10 mins)

## 1️⃣ Install Dependencies

In [None]:
!pip install -q tensorflow pillow matplotlib scikit-learn bing-image-downloader

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import os
from bing_image_downloader import downloader

# Print the TensorFlow version and the GPU devices TensorFlow can see
# (an empty list means training will run on the CPU).
print(f"TensorFlow version: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU Available: {gpu_devices}")

## 2️⃣ Define Your Car Parts

Add any car part you want to this list!

In [None]:
# Search queries, one per dataset category. Every entry starts with "car "
# so the image search stays on automotive parts; append more to extend
# the dataset.
CAR_PARTS = [
    # Electrical
    "car battery",
    # Brakes
    "car brake pads",
    # Engine and charging
    "car engine",
    "car alternator",
    "car radiator",
    "car spark plug",
    # Filters
    "car oil filter",
    "car air filter",
    # Body, wheels, and cabin
    "car headlight",
    "car tire",
    "car steering wheel",
    # Drivetrain, suspension, and exhaust
    "car turbocharger",
    "car suspension shock absorber",
    "car exhaust pipe",
    "car transmission gearbox",
]

num_categories = len(CAR_PARTS)
print(f"We will download images for {num_categories} categories.")

## 3️⃣ Download Images (Automated)

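This downloads up to 60 images per category; the next step splits them 80/20 into training and test sets (about 48 train / 12 test per category).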

In [None]:
# Create dataset directory (same effect as `mkdir -p`, without needing a shell).
os.makedirs('dataset/train', exist_ok=True)

# Request 60 images per category. The organize step splits whatever was
# downloaded 80/20, i.e. about 48 train / 12 test when all 60 are valid.
IMAGES_PER_PART = 60

# Queries whose download raised, so the summary below can name them.
failed_parts = []

for part in CAR_PARTS:
    print(f"\n⬇️ Downloading {part}...")

    try:
        # Images land in dataset_raw/<query>/.
        downloader.download(
            part,
            limit=IMAGES_PER_PART,
            output_dir='dataset_raw',
            # False keeps Bing's adult-content filter ON; unfiltered results
            # have no place in a car-parts training set.
            adult_filter_off=False,
            force_replace=False,
            timeout=60,
            verbose=False,
        )
    except Exception as exc:
        # A failure for one query (network error, blocked request, ...) must
        # not throw away the categories that can still be downloaded.
        failed_parts.append(part)
        print(f"WARNING: skipping {part}: {exc}")

if failed_parts:
    print(f"\nWARNING: download failed for: {', '.join(failed_parts)}")
else:
    print("\n✓ All images downloaded!")

## 4️⃣ Organize & Clean Data

In [None]:
import shutil
from PIL import Image

# Create train/test structure
!mkdir -p dataset/train dataset/test

def is_valid_image(path):
    """Return True if Pillow can open *path* and it passes ``Image.verify()``.

    Args:
        path: Filesystem path of the candidate image file.

    Returns:
        bool: ``True`` when the file opens and verifies without error,
        ``False`` when opening or verification raises.
    """
    try:
        # The context manager closes the underlying file handle even when
        # verify() raises, so scanning many files does not leak descriptors.
        with Image.open(path) as img:
            img.verify()
    except Exception:
        # Pillow signals bad files with several unrelated exception types
        # (OSError, SyntaxError, ValueError, ...), hence the broad catch.
        # `except Exception` still lets KeyboardInterrupt/SystemExit through,
        # which a bare `except:` would swallow.
        return False
    return True

import random

# Fixed seed so re-running the cell reproduces the same train/test split.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

for part in CAR_PARTS:
    # Class folder name: drop only a *leading* 'car ' (a plain str.replace
    # would also mangle names that contain 'car ' elsewhere), then use
    # underscores instead of spaces.
    base_name = part[len('car '):] if part.startswith('car ') else part
    clean_name = base_name.replace(' ', '_')

    # The downloader writes each query's images to dataset_raw/<query>/.
    src_dir = f'dataset_raw/{part}'
    if not os.path.isdir(src_dir):
        print(f"{clean_name}: no downloaded images found, skipping")
        continue

    # os.listdir() order is arbitrary, so sort first to make the seeded
    # shuffle below deterministic.
    candidates = sorted(
        name for name in os.listdir(src_dir)
        if name.lower().endswith(IMAGE_EXTENSIONS)
    )
    valid_images = [
        name for name in candidates
        if is_valid_image(os.path.join(src_dir, name))
    ]
    if not valid_images:
        # Do not create the class folders: Keras' flow_from_directory counts
        # every subdirectory as a class, even an empty one.
        print(f"{clean_name}: no valid images, skipping")
        continue

    # Shuffle before splitting so the test set is a random sample rather
    # than whatever happens to come last in the listing.
    random.Random(SPLIT_SEED).shuffle(valid_images)

    # Split 80/20, but always keep at least one image for training.
    split_idx = max(1, int(len(valid_images) * TRAIN_FRACTION))
    train_imgs = valid_images[:split_idx]
    test_imgs = valid_images[split_idx:]

    # Copy (not move) the files so dataset_raw stays intact for re-runs.
    # Both class folders are always created so train and test expose the
    # same set of classes.
    for subset, names in (('train', train_imgs), ('test', test_imgs)):
        dest_dir = f'dataset/{subset}/{clean_name}'
        os.makedirs(dest_dir, exist_ok=True)
        for name in names:
            shutil.copy(os.path.join(src_dir, name), os.path.join(dest_dir, name))

    print(f"{clean_name}: {len(train_imgs)} train, {len(test_imgs)} test")

print("\n✓ Dataset organized!")

## 5️⃣ Prepare Data Generators

In [None]:
# Configuration
IMG_SIZE = 224    # square input side in pixels; also used for the model's input_shape
BATCH_SIZE = 32
EPOCHS = 20

# Pixels are deliberately NOT rescaled to [0, 1]. The model built below is
# tf.keras.applications.EfficientNetB0, which contains its own input
# normalization and expects raw pixel values in the [0, 255] range; adding
# `rescale=1./255` here would feed it inputs ~255x too small and cripple the
# pretrained features. Inference code must likewise pass unscaled RGB images.

# Data augmentation (training images only)
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    zoom_range=0.2,
    brightness_range=[0.8, 1.2]
)

# Validation images are passed through unmodified so the metrics reflect
# real, un-augmented inputs.
val_datagen = tf.keras.preprocessing.image.ImageDataGenerator()

# Class labels are inferred from the subdirectory names under each root.
train_generator = train_datagen.flow_from_directory(
    'dataset/train',
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical'
)

val_generator = val_datagen.flow_from_directory(
    'dataset/test',
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical'
)

# class_indices maps class name -> integer label index.
num_classes = len(train_generator.class_indices)
print(f"\nClasses ({num_classes}): {list(train_generator.class_indices.keys())}")

## 6️⃣ Build Model

In [None]:
# Transfer learning: an ImageNet-pretrained EfficientNetB0 without its
# classification top acts as a frozen feature extractor; only the small head
# stacked on it is trained.
input_shape = (IMG_SIZE, IMG_SIZE, 3)
base_model = tf.keras.applications.EfficientNetB0(
    include_top=False, weights='imagenet', input_shape=input_shape
)
base_model.trainable = False

# Classification head: pool the feature map to a vector, then one hidden
# layer with dropout on both sides, then a softmax over the classes.
head_layers = [
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
]
model = tf.keras.Sequential([base_model] + head_layers)

# Categorical cross-entropy matches the one-hot labels produced by the
# generators' class_mode='categorical'.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

## 7️⃣ Train

In [None]:
# Keep the best-scoring model on disk while training runs. Note that the
# checkpoint tracks val_accuracy whereas early stopping tracks val_loss, so
# 'best_model.h5' and the weights restored into `model` after an early stop
# may come from different epochs.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)

# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the weights of its best val_loss epoch.
early_stop_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

callbacks = [checkpoint_cb, early_stop_cb]

# `history` holds the per-epoch metrics for later inspection or plotting.
history = model.fit(
    train_generator,
    epochs=EPOCHS,
    validation_data=val_generator,
    callbacks=callbacks
)

print("\n✓ Training complete!")

## 8️⃣ Save & Download

In [None]:
import json

# Save model
model.save('visual_search_model.h5')

# Save labels: invert class name -> index into index -> class name so a
# predicted index can be looked up directly. json.dump writes the integer
# keys as strings ("0", "1", ...).
class_labels = {index: name for name, index in train_generator.class_indices.items()}
with open('class_labels.json', 'w', encoding='utf-8') as f:
    json.dump(class_labels, f, indent=2)

# Save info
model_info = {
    'model': 'EfficientNet-B0',
    'num_classes': num_classes,
    'categories': list(train_generator.class_indices.keys())
}
with open('model_info.json', 'w', encoding='utf-8') as f:
    json.dump(model_info, f, indent=2)

artifacts = ('visual_search_model.h5', 'class_labels.json', 'model_info.json')

# google.colab exists only inside a Colab runtime. Elsewhere the files are
# already saved in the working directory, so skip the browser download
# instead of crashing on the import.
try:
    from google.colab import files
except ImportError:
    files = None

if files is None:
    print(f"\nNot running in Colab; files saved locally: {', '.join(artifacts)}")
else:
    for artifact in artifacts:
        files.download(artifact)
    print("\n✓ All files downloaded!")