# Training Notebook (XRay Classification)

This notebook trains a baseline CNN on XRay images using the labels you created in `inspect_dataset.ipynb`.

Requirements: `tensorflow`, `pandas`, `numpy`.

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
# Hyperparameters (centralized)
THRESHOLD = 0.036
IMG_SIZE = (224, 224)
BATCH_SIZE = 16
EPOCHS = 5

# Model architecture
CONV_FILTERS = [16, 32, 64]
DENSE_UNITS = 64

In [3]:
# Load labeled data (rebuild mapping here so notebook is standalone)
import re
from pathlib import Path
import pandas as pd

csv_path = Path("CrackVoid Ratios") / "Xray Void Ratio.csv"
df = pd.read_csv(csv_path, sep=";")
df["Void rate"] = pd.to_numeric(df["Void rate"], errors="coerce")

df["label"] = (df["Void rate"] >= THRESHOLD).astype(int)

def key_from_row(r):
    return (r["Led Type"], int(r["Panel"]), int(r["LED Number"]))

lookup = {key_from_row(r): r for _, r in df.iterrows()}

xray_root = Path("XRay") / "XRay"
rows = []

for img_path in xray_root.rglob("*.jpg"):
    name = img_path.name
    parts = name.split("_")
    if len(parts) < 5:
        continue
    led_type = parts[1]
    panel = None
    led_num = None
    for part in parts:
        if part.startswith("Panel"):
            panel = int(re.sub(r"[^\d]", "", part))
        if part.startswith("LED"):
            led_num = int(re.sub(r"[^\d]", "", part))
    if panel is None or led_num is None:
        continue
    key = (led_type, panel, led_num)
    if key in lookup:
        r = lookup[key]
        rows.append({
            "path": str(img_path),
            "label": int(r["label"]),
            "void_rate": float(r["Void rate"]),
            "panel": panel,
        })

labeled = pd.DataFrame(rows)
labeled.head()


Unnamed: 0,path,label,void_rate,panel
0,XRay\XRay\FC-GB1\XRay_FC-GB1_SAC105_Panel1_LED...,0,0.031611,1
1,XRay\XRay\FC-GB1\XRay_FC-GB1_SAC105_Panel1_LED...,0,0.018718,1
2,XRay\XRay\FC-GB1\XRay_FC-GB1_SAC105_Panel1_LED...,0,0.005978,1
3,XRay\XRay\FC-GB1\XRay_FC-GB1_SAC105_Panel1_LED...,0,0.015036,1
4,XRay\XRay\FC-GB1\XRay_FC-GB1_SAC105_Panel1_LED...,0,0.023375,1


In [4]:
labeled["label"].value_counts()
labeled["void_rate"].describe()


count    1800.000000
mean        0.028636
std         0.023569
min         0.000000
25%         0.010268
50%         0.023420
75%         0.039848
max         0.107209
Name: void_rate, dtype: float64

In [5]:
# Train/val/test split with panel-level split to reduce leakage
from sklearn.model_selection import train_test_split

panels = labeled["panel"].unique()
train_panels, temp_panels = train_test_split(panels, test_size=0.3, random_state=42)
val_panels, test_panels = train_test_split(temp_panels, test_size=0.5, random_state=42)

train_df = labeled[labeled["panel"].isin(train_panels)].reset_index(drop=True)
val_df = labeled[labeled["panel"].isin(val_panels)].reset_index(drop=True)
test_df = labeled[labeled["panel"].isin(test_panels)].reset_index(drop=True)

train_df["label"].value_counts(), val_df["label"].value_counts(), test_df["label"].value_counts()

(label
 0    645
 1    255
 Name: count, dtype: int64,
 label
 0    375
 1     75
 Name: count, dtype: int64,
 label
 0    255
 1    195
 Name: count, dtype: int64)

In [6]:
# Create tf.data pipelines
def load_image(path, label):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMG_SIZE)
    img = tf.cast(img, tf.float32) / 255.0
    return img, label

def make_ds(df, shuffle=True):
    paths = df["path"].values
    labels = df["label"].values
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df), seed=42)
    ds = ds.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    return ds

train_ds = make_ds(train_df, shuffle=True)
val_ds = make_ds(val_df, shuffle=False)
test_ds = make_ds(test_df, shuffle=False)

In [7]:
# Baseline CNN
model = models.Sequential([
    layers.Input(shape=(*IMG_SIZE, 3)),
    layers.Conv2D(CONV_FILTERS[0], 3, activation="relu"),
    layers.MaxPooling2D(),
    layers.Conv2D(CONV_FILTERS[1], 3, activation="relu"),
    layers.MaxPooling2D(),
    layers.Conv2D(CONV_FILTERS[2], 3, activation="relu"),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(DENSE_UNITS, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

model.summary()

In [8]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
)

model.evaluate(test_ds)

Epoch 1/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 271ms/step - accuracy: 0.7144 - loss: 0.5433 - val_accuracy: 0.7067 - val_loss: 0.4439
Epoch 2/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 213ms/step - accuracy: 0.8133 - loss: 0.3845 - val_accuracy: 0.6911 - val_loss: 0.4353
Epoch 3/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 207ms/step - accuracy: 0.8567 - loss: 0.3558 - val_accuracy: 0.7356 - val_loss: 0.4939
Epoch 4/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 207ms/step - accuracy: 0.8589 - loss: 0.3278 - val_accuracy: 0.7689 - val_loss: 0.4886
Epoch 5/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 210ms/step - accuracy: 0.8589 - loss: 0.3180 - val_accuracy: 0.7578 - val_loss: 0.4475
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 142ms/step - accuracy: 0.7489 - loss: 0.7468


[0.746765673160553, 0.7488889098167419]