#Task 1 Data Engineer

##1.Google Colab Setup

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Create a directory for your project data in Google Drive
import os
project_path = '/content/drive/My Drive/AnimalSubspeciesDataset'
os.makedirs(project_path, exist_ok=True)
%cd {project_path}

Mounted at /content/drive
/content/drive/My Drive/AnimalSubspeciesDataset


##2.Data Collection (Web Crawling)

**Web scraping techniques can be used to gather a huge number of photos for particular classes.  Jmd_imagescraper (DuckDuckGo Image Scraper) is a handy library for Google Colab that makes the procedure easier.**

**Steps for Data Collection:**

*   Install jmd_imagescraper:

In [2]:
!pip install jmd_imagescraper

Collecting jmd_imagescraper
  Downloading jmd_imagescraper-1.0.2-py3-none-any.whl.metadata (2.2 kB)
Collecting bs4 (from jmd_imagescraper)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets->jmd_imagescraper)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jmd_imagescraper-1.0.2-py3-none-any.whl (12 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, bs4, jmd_imagescraper
Successfully installed bs4-0.0.2 jedi-0.19.2 jmd_imagescraper-1.0.2


*   Import necessary libraries:

In [3]:
from pathlib import Path
from jmd_imagescraper.core import * #

##3.Define your classes and download images: For each of your chosen animal subspecies

In [4]:
root = Path().cwd()/"images" # This will create an 'images' folder in your project_path
os.makedirs(root, exist_ok=True) # Ensure the root directory exists

# Class 1: Golden Retriever
duckduckgo_search(root, "Golden_Retriever", "golden retriever dog", max_results=1500)

# Class 2: Siamese Cat
duckduckgo_search(root, "Siamese_Cat", "siamese cat", max_results=1500)

# Class 3: Bengal Tiger
duckduckgo_search(root, "Bengal_Tiger", "bengal tiger animal", max_results=1500)

# Class 4: African Elephant
duckduckgo_search(root, "African_Elephant", "african elephant animal", max_results=1500)

# Class 5: Emperor Penguin
duckduckgo_search(root, "Emperor_Penguin", "emperor penguin bird", max_results=1500)

# Class 6: Red Panda
duckduckgo_search(root, "Red_Panda", "red panda animal", max_results=1500)

# Class 7: Blue Jay
duckduckgo_search(root, "Blue_Jay", "blue jay bird", max_results=1500)

# Class 8: Grizzly Bear
duckduckgo_search(root, "Grizzly_Bear", "grizzly bear animal", max_results=1500)

Duckduckgo search: golden retriever dog
Downloading results into /content/drive/MyDrive/AnimalSubspeciesDataset/images/Golden_Retriever


Duckduckgo search: siamese cat
Downloading results into /content/drive/MyDrive/AnimalSubspeciesDataset/images/Siamese_Cat


Duckduckgo search: bengal tiger animal
Downloading results into /content/drive/MyDrive/AnimalSubspeciesDataset/images/Bengal_Tiger


Duckduckgo search: african elephant animal
Downloading results into /content/drive/MyDrive/AnimalSubspeciesDataset/images/African_Elephant


Duckduckgo search: emperor penguin bird
Downloading results into /content/drive/MyDrive/AnimalSubspeciesDataset/images/Emperor_Penguin


Duckduckgo search: red panda animal
Downloading results into /content/drive/MyDrive/AnimalSubspeciesDataset/images/Red_Panda


Duckduckgo search: blue jay bird
Downloading results into /content/drive/MyDrive/AnimalSubspeciesDataset/images/Blue_Jay


Duckduckgo search: grizzly bear animal
Downloading results into /content/drive/MyDrive/AnimalSubspeciesDataset/images/Grizzly_Bear


[PosixPath('/content/drive/MyDrive/AnimalSubspeciesDataset/images/Grizzly_Bear/001_42d83d78.jpg'),
 PosixPath('/content/drive/MyDrive/AnimalSubspeciesDataset/images/Grizzly_Bear/002_b28d8fc9.jpg'),
 PosixPath('/content/drive/MyDrive/AnimalSubspeciesDataset/images/Grizzly_Bear/003_660442b6.jpg'),
 PosixPath('/content/drive/MyDrive/AnimalSubspeciesDataset/images/Grizzly_Bear/004_c9753496.jpg'),
 PosixPath('/content/drive/MyDrive/AnimalSubspeciesDataset/images/Grizzly_Bear/005_d1b4bf9d.jpg'),
 PosixPath('/content/drive/MyDrive/AnimalSubspeciesDataset/images/Grizzly_Bear/006_c72de8bd.jpg'),
 PosixPath('/content/drive/MyDrive/AnimalSubspeciesDataset/images/Grizzly_Bear/007_0050b866.jpg'),
 PosixPath('/content/drive/MyDrive/AnimalSubspeciesDataset/images/Grizzly_Bear/008_e1bb6536.jpg'),
 PosixPath('/content/drive/MyDrive/AnimalSubspeciesDataset/images/Grizzly_Bear/009_915b3601.jpg'),
 PosixPath('/content/drive/MyDrive/AnimalSubspeciesDataset/images/Grizzly_Bear/010_e946ee31.jpg'),
 PosixPath

##4.Data Standardization (Image Preprocessing)

**Neural networks typically require images to be of a uniform size.**

**Steps for Data Standardization:**

*   Install Pillow

In [5]:
!pip install Pillow



*  Import necessary libraries:

In [None]:
from PIL import Image
import glob
import os

*  Define target size: A common size for image classification models is 224x224 or 128x128.

In [None]:
IMG_WIDTH = 224
IMG_HEIGHT = 224
TARGET_SIZE = (IMG_WIDTH, IMG_HEIGHT)

*  Batch Resize Images:

In [None]:
input_base_dir = Path().cwd()/"images" # Where your downloaded images are
output_base_dir = Path().cwd()/"processed_images"
os.makedirs(output_base_dir, exist_ok=True)

for class_folder in os.listdir(input_base_dir):
    class_input_path = os.path.join(input_base_dir, class_folder)
    class_output_path = os.path.join(output_base_dir, class_folder)
    os.makedirs(class_output_path, exist_ok=True)

    if os.path.isdir(class_input_path):
        print(f"Processing images in: {class_folder}")
        for img_file in glob.glob(os.path.join(class_input_path, '*.*')):
            try:
                with Image.open(img_file) as img:
                    img = img.resize(TARGET_SIZE)
                    # Save with original filename in the new processed directory
                    img_name = os.path.basename(img_file)
                    img.save(os.path.join(class_output_path, img_name))
            except Exception as e:
                print(f"Could not process {img_file}: {e}")
print("Image standardization complete.")

Processing images in: Golden_Retriever
Processing images in: Siamese_Cat
Processing images in: Bengal_Tiger
Processing images in: African_Elephant
Processing images in: Emperor_Penguin
Processing images in: Red_Panda
Processing images in: Blue_Jay
Processing images in: Grizzly_Bear
Image standardization complete.


##5.Creating Dataset (Splitting into Train, Validation, Test)

**Steps for Splitting the Dataset:**

*   Install split_folders:

In [None]:
!pip install split_folders

Collecting split_folders
  Downloading split_folders-0.5.1-py3-none-any.whl.metadata (6.2 kB)
Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split_folders
Successfully installed split_folders-0.5.1


*  Import the library:

In [None]:
import splitfolders

*  Perform the split:

**Common ratios are 70% for training, 15% for validation, and 15% for testing.**

In [None]:
input_folder = Path().cwd()/"processed_images"
output_folder = Path().cwd()/"dataset_splits" # This will be the output directory for your splits

# Split with a ratio of 70% training, 15% validation, 15% testing
# set 'seed' for reproducibility
# 'group_prefix' is useful if you have multiple datasets and want to prefix the output folders
splitfolders.ratio(input_folder, output=output_folder, seed=42, ratio=(0.7, 0.15, 0.15), group_prefix=None)

print("Dataset splitting complete.")

Copying files: 3476 files [01:25, 40.69 files/s]

Dataset splitting complete.





##6.Verification

In [None]:
import os

split_base_path = Path().cwd()/"dataset_splits"

for split_type in ['train', 'val', 'test']:
    split_path = os.path.join(split_base_path, split_type)
    print(f"\n--- {split_type.upper()} SET ---")
    total_images_in_split = 0
    for class_name in os.listdir(split_path):
        class_path = os.path.join(split_path, class_name)
        if os.path.isdir(class_path):
            num_images = len(os.listdir(class_path))
            print(f"  {class_name}: {num_images} images")
            total_images_in_split += num_images
    print(f"Total images in {split_type} set: {total_images_in_split}")


--- TRAIN SET ---
  Golden_Retriever: 263 images
  Siamese_Cat: 272 images
  Bengal_Tiger: 322 images
  African_Elephant: 308 images
  Emperor_Penguin: 294 images
  Red_Panda: 312 images
  Blue_Jay: 343 images
  Grizzly_Bear: 317 images
Total images in train set: 2431

--- VAL SET ---
  Golden_Retriever: 56 images
  Siamese_Cat: 58 images
  Bengal_Tiger: 69 images
  African_Elephant: 66 images
  Emperor_Penguin: 63 images
  Red_Panda: 67 images
  Blue_Jay: 73 images
  Grizzly_Bear: 67 images
Total images in val set: 519

--- TEST SET ---
  Golden_Retriever: 57 images
  Siamese_Cat: 59 images
  Bengal_Tiger: 69 images
  African_Elephant: 66 images
  Emperor_Penguin: 63 images
  Red_Panda: 68 images
  Blue_Jay: 75 images
  Grizzly_Bear: 69 images
Total images in test set: 526
