In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import Libraries


In [None]:

import os
import glob
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
import torchvision.transforms as T


### Define Dataset Paths

In [None]:

# Dataset root under the Kaggle input directory; every other path derives from it.
base_dir = "/kaggle/input/py-crack"
_segmentation_root = os.path.join(base_dir, "Segmentation")

# Classification images, segmentation source images, and segmentation ground truth.
class_dir, seg_img_dir, seg_mask_dir = (
    os.path.join(base_dir, "Classification"),
    os.path.join(_segmentation_root, "Original image"),
    os.path.join(_segmentation_root, "Ground truth"),
)


### List All Folders and Files

In [None]:

# Report, for every directory below base_dir, how many files sit directly in it.
for folder, _subdirs, names in os.walk(base_dir):
    file_total = len(names)
    print(f"📁 {folder}: {file_total} files")


### Count Images per Class

In [None]:


# Count the .jpg/.png images found (recursively) under each class folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the table rows and the bars in the same order on every run.
class_counts = {}
for class_name in sorted(os.listdir(class_dir)):
    path = os.path.join(class_dir, class_name)
    if not os.path.isdir(path):
        continue  # ignore stray files sitting next to the class folders
    files = [
        found
        for pattern in ("*.jpg", "*.png")
        for found in glob.glob(os.path.join(path, "**", pattern), recursive=True)
    ]
    class_counts[class_name] = len(files)

# One row per class folder: its name and how many images it holds.
df_class = pd.DataFrame(list(class_counts.items()), columns=["Class", "Image_Count"])
print(df_class)

plt.figure(figsize=(6,4))
sns.barplot(data=df_class, x="Class", y="Image_Count", palette="Set2", dodge=False)
plt.title("Class Distribution")
plt.ylabel("Number of Images")
plt.show()


### Display Sample Images per Class

In [None]:
# Show up to `num_samples` randomly chosen images for each class, one row per class.
num_samples = 3
plt.figure(figsize=(num_samples*3, len(df_class)*3))

for i, class_name in enumerate(df_class["Class"]):
    class_path = os.path.join(class_dir, class_name)
    img_files = glob.glob(os.path.join(class_path, "**", "*.jpg"), recursive=True) + \
                glob.glob(os.path.join(class_path, "**", "*.png"), recursive=True)

    # A class folder without any .jpg/.png would make random selection raise;
    # report it and leave that row of the grid empty instead.
    if not img_files:
        print(f"No .jpg/.png images found for class '{class_name}'; leaving its row empty.")
        continue

    # Sample without replacement so the same picture is never shown twice in a row.
    picks = random.sample(img_files, min(num_samples, len(img_files)))
    # Put the class name over the middle of the images actually drawn, so the
    # label is still shown when fewer than `num_samples` images are available.
    title_col = (len(picks) - 1) // 2

    for j, img_path in enumerate(picks):
        plt.subplot(len(df_class), num_samples, i*num_samples + j + 1)
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.axis("off")
        if j == title_col:
            plt.title(class_name)

plt.tight_layout()
plt.show()


### Analyze Image Sizes and Aspect Ratios

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# Collect per-image geometry (width, height, aspect ratio) and per-channel
# mean / standard deviation over a subset of the classification images.
widths, heights, aspect_ratios = [], [], []
means, stds = [], []

all_imgs = glob.glob(os.path.join(class_dir, "**/*.jpg"), recursive=True) + \
            glob.glob(os.path.join(class_dir, "**/*.png"), recursive=True)

# Cap the work at 500 images for speed. A plain [:500] slice would take whatever
# the two glob calls list first (every .jpg hit precedes every .png hit, and hits
# tend to be grouped by folder), so the statistics could describe a single class.
# A seeded random subset of the sorted list is representative and reproducible.
max_imgs = 500
if len(all_imgs) > max_imgs:
    all_imgs = random.Random(42).sample(sorted(all_imgs), max_imgs)

for img_path in all_imgs:
    img = cv2.imread(img_path)
    if img is None:
        continue  # cv2.imread returns None for files it cannot decode
    h, w, c = img.shape
    widths.append(w)
    heights.append(h)
    aspect_ratios.append(w/h)
    # Reducing over the two spatial axes leaves one value per colour channel.
    means.append(np.mean(img, axis=(0,1)))
    stds.append(np.std(img, axis=(0,1)))

if not means:
    # Nothing could be read: skip the plots rather than draw empty/NaN charts.
    print(f"No readable images found under {class_dir}; skipping size and color plots.")
    mean_rgb = np.full(3, np.nan)
else:
    fig, axs = plt.subplots(1, 3, figsize=(15, 4))
    sns.histplot(widths, bins=20, ax=axs[0], color="orange"); axs[0].set_title("Width Distribution")
    sns.histplot(heights, bins=20, ax=axs[1], color="green"); axs[1].set_title("Height Distribution")
    sns.histplot(aspect_ratios, bins=20, ax=axs[2], color="blue"); axs[2].set_title("Aspect Ratio Distribution")
    plt.show()

    # NOTE: despite its name, mean_rgb is in OpenCV's channel order (blue, green,
    # red), which is why the bars below are labelled in that order.
    mean_rgb = np.mean(np.array(means), axis=0)
    plt.bar(['Blue', 'Green', 'Red'], mean_rgb)
    plt.title('Average Color Channel Intensity (BGR)')
    plt.show()



### Segmentation Image-Mask Pairs

In [None]:
# Show the first few segmentation images (top row) above their masks (bottom row).
# Images and masks are paired purely by position in the two sorted file lists.
img_files = sorted(glob.glob(os.path.join(seg_img_dir, "*.jpg")) + glob.glob(os.path.join(seg_img_dir, "*.png")))
mask_files = sorted(glob.glob(os.path.join(seg_mask_dir, "*.jpg")) + glob.glob(os.path.join(seg_mask_dir, "*.png")))

if len(img_files) != len(mask_files):
    # Positional pairing is only trustworthy when both folders line up one-to-one.
    print(f"Warning: {len(img_files)} images but {len(mask_files)} masks; "
          "pairs shown below may be misaligned.")

# Bound the loop by BOTH lists so a missing mask cannot raise IndexError.
n_pairs = min(3, len(img_files), len(mask_files))
# Title the middle of the columns actually drawn (column 1 when all 3 exist).
title_col = (n_pairs - 1) // 2

plt.figure(figsize=(8,4))
for i in range(n_pairs):
    with Image.open(img_files[i]) as img:
        plt.subplot(2, 3, i+1)
        plt.imshow(img)
    plt.axis("off")
    if i == title_col: plt.title("Original Image")

    with Image.open(mask_files[i]) as mask:
        plt.subplot(2, 3, i+4)
        plt.imshow(mask, cmap="gray")
    plt.axis("off")
    if i == title_col: plt.title("Ground Truth Mask")

plt.tight_layout()
plt.show()


### Random Samples from Classification

In [None]:
# Show up to three random images from each of the two classification folders,
# "With crack" on the top row and "Without crack" on the bottom row.
crack_files = glob.glob(os.path.join(class_dir, "With crack", "*"))
no_crack_files = glob.glob(os.path.join(class_dir, "Without crack", "*"))

per_class = 3
crack_samples = random.sample(crack_files, min(per_class, len(crack_files)))
no_crack_samples = random.sample(no_crack_files, min(per_class, len(no_crack_files)))
samples = crack_samples + no_crack_samples

plt.figure(figsize=(10,4))
# Each sample keeps the label of the folder it came from. Deriving the label and
# grid slot from the position in `samples` (i < 3) mislabels "Without crack"
# images whenever fewer than three "With crack" files exist.
labelled_rows = [("With crack", crack_samples), ("Without crack", no_crack_samples)]
for row, (label, paths) in enumerate(labelled_rows):
    for col, img_path in enumerate(paths):
        plt.subplot(2, per_class, row*per_class + col + 1)
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.axis("off")
        plt.title(label)
plt.tight_layout()
plt.show()


### Preview Image Augmentations

In [None]:
# Pick one random "With crack" image and preview a handful of augmentations on it.
sample_img = Image.open(random.choice(crack_files))
img_w, img_h = sample_img.size  # PIL reports size as (width, height)

# Display name -> transform, in the order the panels are drawn.
augmentations = {
    "Original": T.Compose([]),
    "Horizontal Flip": T.RandomHorizontalFlip(p=1),
    "Vertical Flip": T.RandomVerticalFlip(p=1),
    # The crop is resized back to the source image's own (height, width).
    "Random Crop": T.RandomResizedCrop(size=(img_h, img_w), scale=(0.8,1.0)),
    "Color Jitter": T.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
}

n_panels = len(augmentations)
plt.figure(figsize=(12,4))
for panel, name in enumerate(augmentations, start=1):
    aug_img = augmentations[name](sample_img)
    plt.subplot(1, n_panels, panel)
    plt.imshow(aug_img)
    plt.title(name)
    plt.axis("off")
plt.tight_layout()
plt.show()
