# 🔍 Data Exploration
Explore the waste classification dataset

In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 🔍 Data Exploration\n",
    "Explore the waste classification dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import os\n",
    "import sys\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from PIL import Image\n",
    "from pathlib import Path\n",
    "\n",
    "# Add parent directory to path\n",
    "sys.path.append('..')\n",
    "from utils.visualization import plot_class_distribution\n",
    "\n",
    "# Set style\n",
    "sns.set_style('whitegrid')\n",
    "plt.rcParams['figure.figsize'] = (12, 8)\n",
    "\n",
    "print('‚úÖ Imports successful')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Dataset path\n",
    "DATA_DIR = '../data/raw'\n",
    "\n",
    "# Check if data exists\n",
    "if not os.path.exists(DATA_DIR):\n",
    "    print('‚ùå Dataset not found!')\n",
    "    print(f'Please download dataset to: {DATA_DIR}')\n",
    "else:\n",
    "    print(f'‚úÖ Dataset found at: {DATA_DIR}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Dataset Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Count images per class\n",
    "class_counts = {}\n",
    "total_images = 0\n",
    "\n",
    "for class_name in os.listdir(DATA_DIR):\n",
    "    class_path = os.path.join(DATA_DIR, class_name)\n",
    "    if os.path.isdir(class_path):\n",
    "        count = len([f for f in os.listdir(class_path) \n",
    "                    if f.lower().endswith(('.jpg', '.jpeg', '.png'))])\n",
    "        class_counts[class_name] = count\n",
    "        total_images += count\n",
    "\n",
    "print(f'Total Images: {total_images}')\n",
    "print(f'Number of Classes: {len(class_counts)}')\n",
    "print('\\nImages per class:')\n",
    "for class_name, count in sorted(class_counts.items(), key=lambda x: x[1], reverse=True):\n",
    "    print(f'  {class_name}: {count}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Plot class distribution\n",
    "plot_class_distribution(DATA_DIR)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Sample Images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Display sample images from each class\n",
    "fig, axes = plt.subplots(2, 4, figsize=(16, 8))\n",
    "axes = axes.ravel()\n",
    "\n",
    "for idx, class_name in enumerate(list(class_counts.keys())[:8]):\n",
    "    class_path = os.path.join(DATA_DIR, class_name)\n",
    "    images = [f for f in os.listdir(class_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]\n",
    "    \n",
    "    if images:\n",
    "        sample_img = Image.open(os.path.join(class_path, images[0]))\n",
    "        axes[idx].imshow(sample_img)\n",
    "        axes[idx].set_title(f'{class_name}\\n({class_counts[class_name]} images)', fontsize=12)\n",
    "        axes[idx].axis('off')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Image Dimensions Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Analyze image dimensions\n",
    "dimensions = []\n",
    "sample_size = 100  # Sample 100 images per class\n",
    "\n",
    "for class_name in list(class_counts.keys()):\n",
    "    class_path = os.path.join(DATA_DIR, class_name)\n",
    "    images = [f for f in os.listdir(class_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]\n",
    "    \n",
    "    for img_name in images[:sample_size]:\n",
    "        try:\n",
    "            img = Image.open(os.path.join(class_path, img_name))\n",
    "            dimensions.append({\n",
    "                'class': class_name,\n",
    "                'width': img.width,\n",
    "                'height': img.height,\n",
    "                'aspect_ratio': img.width / img.height\n",
    "            })\n",
    "        except:\n",
    "            pass\n",
    "\n",
    "df_dims = pd.DataFrame(dimensions)\n",
    "print(df_dims.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Plot dimension distributions\n",
    "fig, axes = plt.subplots(1, 3, figsize=(18, 5))\n",
    "\n",
    "axes[0].hist(df_dims['width'], bins=50, edgecolor='black')\n",
    "axes[0].set_title('Width Distribution')\n",
    "axes[0].set_xlabel('Width (pixels)')\n",
    "axes[0].set_ylabel('Frequency')\n",
    "\n",
    "axes[1].hist(df_dims['height'], bins=50, edgecolor='black')\n",
    "axes[1].set_title('Height Distribution')\n",
    "axes[1].set_xlabel('Height (pixels)')\n",
    "axes[1].set_ylabel('Frequency')\n",
    "\n",
    "axes[2].hist(df_dims['aspect_ratio'], bins=50, edgecolor='black')\n",
    "axes[2].set_title('Aspect Ratio Distribution')\n",
    "axes[2].set_xlabel('Aspect Ratio')\n",
    "axes[2].set_ylabel('Frequency')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Recommendations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "print('üìä Dataset Recommendations:')\n",
    "print(f'1. Total images: {total_images}')\n",
    "print(f'2. Number of classes: {len(class_counts)}')\n",
    "print(f'3. Recommended image size: 224x224 or 299x299')\n",
    "print(f'4. Data augmentation: Strongly recommended')\n",
    "print(f'5. Train/Val/Test split: 70/20/10')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pathlib import Path

# Add the parent directory (relative to the current working directory) to the
# import path so the `utils` package can be imported below.
# Guarded so that re-running this cell does not append a duplicate entry to
# sys.path every time.
if '..' not in sys.path:
    sys.path.append('..')
from utils.visualization import plot_class_distribution

# Plot defaults shared by every figure in this notebook: seaborn's
# 'whitegrid' theme and a 12x8 inch default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

print('‚úÖ Imports successful')

In [None]:
# Root folder of the raw dataset, relative to the current working directory
DATA_DIR = '../data/raw'

# Tell the reader up front whether that folder is present
if os.path.exists(DATA_DIR):
    print(f'‚úÖ Dataset found at: {DATA_DIR}')
else:
    print('‚ùå Dataset not found!')
    print(f'Please download dataset to: {DATA_DIR}')

## 1. Dataset Statistics

In [None]:
# Count images per class.
# Every sub-directory of DATA_DIR is treated as one class; its count is the
# number of entries whose name ends (case-insensitively) in .jpg/.jpeg/.png.
class_counts = {}

# os.listdir() returns entries in arbitrary order. Sorting keeps the insertion
# order of class_counts (which later cells iterate over) and the order of ties
# in the printout below identical from run to run.
for class_name in sorted(os.listdir(DATA_DIR)):
    class_path = os.path.join(DATA_DIR, class_name)
    if not os.path.isdir(class_path):
        continue  # plain files sitting next to the class folders are ignored
    class_counts[class_name] = sum(
        1 for f in os.listdir(class_path)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

# Derived from the dict so the total can never drift from the per-class counts
total_images = sum(class_counts.values())

print(f'Total Images: {total_images}')
print(f'Number of Classes: {len(class_counts)}')
print('\nImages per class:')
# Largest classes first
for class_name, count in sorted(class_counts.items(), key=lambda x: x[1], reverse=True):
    print(f'  {class_name}: {count}')

In [None]:
# Plot class distribution.
# plot_class_distribution is the helper imported from utils.visualization; it
# is given the dataset root directory.
# NOTE(review): presumably it scans the class folders and draws the per-class
# image counts itself - confirm against the helper's implementation.
plot_class_distribution(DATA_DIR)

## 2. Sample Images

In [None]:
# Display one sample image for each of the first 8 classes on a 2x4 grid
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.ravel()

for idx, class_name in enumerate(list(class_counts.keys())[:8]):
    class_path = os.path.join(DATA_DIR, class_name)
    # Sorted so the image chosen as the sample is the same on every run
    # (os.listdir() order is arbitrary).
    images = sorted(f for f in os.listdir(class_path)
                    if f.lower().endswith(('.jpg', '.jpeg', '.png')))

    if images:
        # The context manager closes the underlying file once the pixels have
        # been handed to matplotlib, instead of leaving the handle open.
        with Image.open(os.path.join(class_path, images[0])) as sample_img:
            axes[idx].imshow(sample_img)
        axes[idx].set_title(f'{class_name}\n({class_counts[class_name]} images)', fontsize=12)

# Hide ticks and frames on every panel, including panels left unused when
# there are fewer than 8 classes or a class folder contains no images.
for ax in axes:
    ax.axis('off')

plt.tight_layout()
plt.show()

## 3. Image Dimensions Analysis

In [None]:
# Analyze image dimensions on a subset of each class
dimensions = []
skipped = []  # (path, exception) for every file that could not be measured
sample_size = 100  # Use at most the first 100 images (by file name) per class

for class_name in list(class_counts.keys()):
    class_path = os.path.join(DATA_DIR, class_name)
    # Sorted so the same files are measured on every run
    # (os.listdir() order is arbitrary).
    images = sorted(f for f in os.listdir(class_path)
                    if f.lower().endswith(('.jpg', '.jpeg', '.png')))

    for img_name in images[:sample_size]:
        img_path = os.path.join(class_path, img_name)
        try:
            # Only the header is needed for the size; the context manager
            # releases the file handle immediately afterwards.
            with Image.open(img_path) as img:
                dimensions.append({
                    'class': class_name,
                    'width': img.width,
                    'height': img.height,
                    'aspect_ratio': img.width / img.height
                })
        except Exception as exc:
            # Still best-effort (one bad file must not stop the analysis), but
            # no longer swallows KeyboardInterrupt/SystemExit and no longer
            # hides how many files were dropped.
            skipped.append((img_path, exc))

if skipped:
    first_path, first_error = skipped[0]
    print(f'Skipped {len(skipped)} unreadable image(s); first: {first_path} ({first_error!r})')

# Explicit columns keep the frame's schema intact even when no image could be
# measured, so describe() and later column lookups do not fail on an empty frame.
df_dims = pd.DataFrame(dimensions, columns=['class', 'width', 'height', 'aspect_ratio'])
print(df_dims.describe())

In [None]:
# Three side-by-side histograms: width, height and aspect ratio of the
# sampled images held in df_dims
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (column in df_dims, panel title, x-axis label) for each panel, left to right
panels = [
    ('width', 'Width Distribution', 'Width (pixels)'),
    ('height', 'Height Distribution', 'Height (pixels)'),
    ('aspect_ratio', 'Aspect Ratio Distribution', 'Aspect Ratio'),
]

for panel_idx, (column, title, xlabel) in enumerate(panels):
    axes[panel_idx].hist(df_dims[column], bins=50, edgecolor='black')
    axes[panel_idx].set_title(title)
    axes[panel_idx].set_xlabel(xlabel)
    axes[panel_idx].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 4. Recommendations

In [None]:
print('üìä Dataset Recommendations:')
print(f'1. Total images: {total_images}')
print(f'2. Number of classes: {len(class_counts)}')
print(f'3. Recommended image size: 224x224 or 299x299')
print(f'4. Data augmentation: Strongly recommended')
print(f'5. Train/Val/Test split: 70/20/10')