# BY: Hrishikesh Dalal

In [None]:
# Cell 1: Install required packages
!pip install -q kaggle
!pip install -q tensorflow==2.14.0   # compatible; adjust if needed
!pip install -q shap==0.42.1
!pip install -q lime
!pip install -q scikit-image
!pip install -q matplotlib seaborn scikit-learn


[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.14.0 (from versions: 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0, 2.17.1, 2.18.0rc0, 2.18.0rc1, 2.18.0rc2, 2.18.0, 2.18.1, 2.19.0rc0, 2.19.0, 2.19.1, 2.20.0rc0, 2.20.0)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow==2.14.0[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m402.7/402.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for shap (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lime (setup.py) ... [?25l[?25hdone


In [None]:
# Cell 2: Kaggle auth and download dataset
import os
from google.colab import files

# Upload kaggle.json if not already present
if not os.path.exists('/root/.kaggle/kaggle.json'):
    print("Please upload kaggle.json (Kaggle API token).")
    uploaded = files.upload()  # use the Colab file-picker
    for fn in uploaded:
        if fn == 'kaggle.json':
            os.makedirs('/root/.kaggle', exist_ok=True)
            !cp kaggle.json /root/.kaggle/kaggle.json
            !chmod 600 /root/.kaggle/kaggle.json

# Replace below with the dataset you want; the paper's author dataset:
KAGGLE_DATASET = "mdismielhossenabir/hieroglyphs-handwriting-letter-recognition"

# Download and unzip
!kaggle datasets download -d $KAGGLE_DATASET -p /content --unzip
print("Dataset downloaded to /content")


Please upload kaggle.json (Kaggle API token).


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/mdismielhossenabir/hieroglyphs-handwriting-letter-recognition
License(s): apache-2.0
Downloading hieroglyphs-handwriting-letter-recognition.zip to /content
  0% 0.00/643k [00:00<?, ?B/s]
100% 643k/643k [00:00<00:00, 972MB/s]
Dataset downloaded to /content


In [None]:
# Cell 3: List files to see structure
import os, glob
base = "/content"
base_depth = base.count('/')
# The third tuple element is named `filenames` rather than `files` so the
# `google.colab.files` module imported in Cell 2 is not overwritten.
for root, dirs, filenames in os.walk(base):
    # show top-level files/folders only (base and its direct children)
    print(root, " | dirs:", len(dirs), " files:", len(filenames))
    print("  -", dirs[:20])
    if root.count('/') > base_depth:
        # Direct child of base: clear `dirs` in place so os.walk does not
        # descend further; deeper levels were never printed anyway.
        dirs[:] = []


/content  | dirs: 3  files: 1
  - ['.config', 'New folder', 'sample_data']
/content/.config  | dirs: 2  files: 8
  - ['logs', 'configurations']
/content/New folder  | dirs: 3  files: 0
  - ['f', 'h', 'kh']
/content/sample_data  | dirs: 0  files: 6
  - []


In [None]:
import shutil, random
from pathlib import Path

# The Kaggle archive unpacks its class folders (f, h, kh, ...) into
# "/content/New folder" (see the directory listing output of Cell 3);
# "/content/dataset" does not exist and raised FileNotFoundError.
DATA_ROOT = Path("/content/New folder")
OUT_DIR = Path("/content/hieroglyphs_split")
IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png"}  # compared case-insensitively

if not DATA_ROOT.is_dir():
    raise FileNotFoundError(
        f"Dataset folder not found: {DATA_ROOT}. Check the extracted folder name under /content."
    )

# Start from a clean output tree so re-running the cell never mixes old and new splits.
if OUT_DIR.exists():
    shutil.rmtree(OUT_DIR)
for split in ["train", "val", "test"]:
    (OUT_DIR/split).mkdir(parents=True)

random.seed(42)

# Each subfolder in DATA_ROOT is a class (f, h, kh, etc.)
# Directory listings have no guaranteed order, so sort before the seeded shuffle;
# otherwise the same seed can still produce different splits between runs.
for cls in sorted(DATA_ROOT.iterdir()):
    if not cls.is_dir():
        continue
    imgs = sorted(
        p for p in cls.iterdir()
        if p.is_file() and p.suffix.lower() in IMAGE_SUFFIXES
    )
    random.shuffle(imgs)
    n = len(imgs)
    n_train = int(0.8*n)
    n_val = int(0.1*n)

    # 80% train, 10% val, remainder test
    train_imgs = imgs[:n_train]
    val_imgs = imgs[n_train:n_train+n_val]
    test_imgs = imgs[n_train+n_val:]

    # `split_imgs` (not `files`) keeps the google.colab `files` module from Cell 2 intact.
    for split, split_imgs in zip(["train","val","test"], [train_imgs,val_imgs,test_imgs]):
        dest = OUT_DIR/split/cls.name
        dest.mkdir(parents=True, exist_ok=True)
        for p in split_imgs:
            shutil.copy(p, dest/p.name)

    print(f"Processed class {cls.name}: {len(train_imgs)} train, {len(val_imgs)} val, {len(test_imgs)} test")

print("✅ Created train/val/test at", OUT_DIR)


FileNotFoundError: [Errno 2] No such file or directory: '/content/dataset'

In [None]:
# Cell 5: Keras data generators with augmentation (resize to 100x100 as paper)
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

IMG_SIZE = (100, 100)
BATCH_SIZE = 32

# Random transforms applied to training images only.
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)

# Validation and test images are only rescaled to [0, 1].
test_val_datagen = ImageDataGenerator(rescale=1./255)

# Arguments shared by all three directory iterators.
flow_options = dict(target_size=IMG_SIZE, class_mode='categorical')

train_gen = train_datagen.flow_from_directory(
    OUT_DIR/"train", batch_size=BATCH_SIZE, shuffle=True, **flow_options
)
val_gen = test_val_datagen.flow_from_directory(
    OUT_DIR/"val", batch_size=BATCH_SIZE, shuffle=False, **flow_options
)
# One image per batch, unshuffled, so batch i is the i-th test file.
test_gen = test_val_datagen.flow_from_directory(
    OUT_DIR/"test", batch_size=1, shuffle=False, **flow_options
)

num_classes = train_gen.num_classes
class_indices = train_gen.class_indices
print("num_classes:", num_classes)
first_ten_classes = dict(list(class_indices.items())[:10])
print("class_indices sample:", first_ten_classes)


In [None]:
# Cell 6: Model definition — 4 conv blocks + FC layers, L2 reg, BatchNorm, Dropout optional
from tensorflow.keras import layers, models, regularizers, optimizers

def create_light_cnn(input_shape=(100,100,3), num_classes=num_classes, l2=1e-4):
    """Build the light CNN: four conv blocks followed by two dense blocks.

    Args:
        input_shape: shape passed to the Keras ``Input`` layer.
        num_classes: number of units in the final softmax layer.
        l2: L2 penalty factor applied to the kernels of every conv layer and
            the two hidden dense layers (the output layer is not regularized).

    Returns:
        An uncompiled ``models.Model`` mapping the input to class probabilities.
    """
    weight_penalty = regularizers.l2(l2)
    net_input = layers.Input(shape=input_shape)

    # Conv blocks: Conv2D(3x3, same padding, ReLU) -> BatchNorm -> MaxPool.
    features = net_input
    for n_filters in (32, 64, 128, 128):
        features = layers.Conv2D(
            n_filters, (3,3), padding='same', activation='relu',
            kernel_regularizer=weight_penalty,
        )(features)
        features = layers.BatchNormalization()(features)
        features = layers.MaxPooling2D()(features)

    # Classifier head: Dense(ReLU) -> BatchNorm -> Dropout, twice.
    features = layers.Flatten()(features)
    for n_units, drop_rate in ((256, 0.5), (128, 0.4)):
        features = layers.Dense(
            n_units, activation='relu', kernel_regularizer=weight_penalty
        )(features)
        features = layers.BatchNormalization()(features)
        features = layers.Dropout(drop_rate)(features)

    probabilities = layers.Dense(num_classes, activation='softmax')(features)

    return models.Model(inputs=net_input, outputs=probabilities)

# Instantiate the network for RGB images of the configured size and print its layers.
img_height, img_width = IMG_SIZE
model = create_light_cnn(input_shape=(img_height, img_width, 3))
model.summary()


In [None]:
# Cell 7: Compile and set callbacks
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

checkpoint_path = "/content/best_model.h5"

# Save the model to checkpoint_path only when val_accuracy improves.
best_checkpoint = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', mode='max', save_best_only=True
)
# Halve the learning rate after 5 epochs without val_loss improvement.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7, verbose=1
)
# Stop after 12 epochs without val_loss improvement and restore the best weights.
early_stop = EarlyStopping(
    monitor='val_loss', patience=12, restore_best_weights=True
)

callbacks = [best_checkpoint, lr_on_plateau, early_stop]


In [None]:
# Cell 8: Train the model
EPOCHS = 8  # you can increase

# Fit on the augmented training iterator, validating on val_gen after each epoch.
fit_options = dict(validation_data=val_gen, epochs=EPOCHS, callbacks=callbacks)
history = model.fit(train_gen, **fit_options)


In [None]:
# Cell 9: Plot metrics and load best model
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model

# Left panel: loss curves; right panel: accuracy curves.
# Each entry is (key in history.history, legend label).
panel_curves = [
    [('loss', 'train_loss'), ('val_loss', 'val_loss')],
    [('accuracy', 'train_acc'), ('val_accuracy', 'val_acc')],
]
fig, axes = plt.subplots(1, 2, figsize=(12,4))
for ax, curves in zip(axes, panel_curves):
    for history_key, legend_label in curves:
        ax.plot(history.history[history_key], label=legend_label)
    ax.legend()
plt.show()

# load best
model = load_model(checkpoint_path)


In [None]:
# Cell 10: Evaluate on test set — predictions, classification report, confusion matrix, ROC/AUC
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize

# test_gen was created with shuffle=False, so a single batched predict call
# returns scores in the same order as test_gen.classes. This replaces one
# model.predict call per image, which was slow and flooded the output.
test_gen.reset()
y_score = np.asarray(model.predict(test_gen, verbose=0))
y_true = np.asarray(test_gen.classes)
y_pred = np.argmax(y_score, axis=1)

labels = list(class_indices.keys())
label_ids = list(range(num_classes))

# Passing labels= explicitly keeps the report and the matrix aligned with
# target_names even when a class has no samples in the test split.
print("Classification Report:")
print(classification_report(y_true, y_pred, labels=label_ids, target_names=labels, zero_division=0))

cm = confusion_matrix(y_true, y_pred, labels=label_ids)
plt.figure(figsize=(10,8))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted"); plt.ylabel("True")
plt.show()

# ROC/AUC (one-vs-rest)
y_bin = label_binarize(y_true, classes=label_ids)
if num_classes == 2:
    # label_binarize returns a single column for two classes; expand it to one
    # column per class so the per-class indexing below stays valid.
    y_bin = np.hstack([1 - y_bin, y_bin])

aucs = []
for i in range(num_classes):
    # AUC is undefined when the test set has only positives or only negatives
    # for a class; skip that class instead of discarding every AUC.
    if np.unique(y_bin[:, i]).size < 2:
        print(f"Skipping AUC for class {labels[i]!r}: only one label value present in the test set")
        continue
    try:
        aucs.append(roc_auc_score(y_bin[:, i], y_score[:, i]))
    except ValueError as e:
        print(f"AUC calculation failed for class {labels[i]!r}:", e)

print("Per-class AUC sample (first 10):", aucs[:10])
if aucs:
    print("Macro AUC:", np.mean(aucs))
else:
    print("AUC calculation failed (maybe small samples per class): no class had both positive and negative samples")


In [None]:
# Cell 11: SHAP explanations (DeepExplainer for TF models)
import os
import shap
import numpy as np
import matplotlib.pyplot as plt

N_BACKGROUND = 30  # number of training images used as the SHAP background
N_EXPLAIN = 5      # number of test images to explain

# Build a small background dataset (e.g., 30 train images)
bg_imgs = []
for x, y in train_gen:
    bg_imgs.extend(x)
    if len(bg_imgs) >= N_BACKGROUND:
        break
bg = np.array(bg_imgs[:N_BACKGROUND])

# choose a few test images to explain; cap at the test-set size so a small
# test split does not raise an out-of-range error
n_explain = min(N_EXPLAIN, len(test_gen))
if n_explain == 0:
    raise ValueError("Test set is empty; there is nothing to explain.")
# test_gen has batch_size=1 and shuffle=False, so batch i holds test file i
test_imgs = np.array([test_gen[i][0][0] for i in range(n_explain)])
test_paths = [test_gen.filepaths[i] for i in range(n_explain)]

# Use GradientExplainer or DeepExplainer depending on model backend
# Try DeepExplainer (works with many TF models)
explainer = shap.DeepExplainer(model, bg)
shap_values = explainer.shap_values(test_imgs)

# Visualize SHAP for first image (shows per-class shap values)
idx = 0
plt.figure(figsize=(8,4))
plt.suptitle(f"Original: {os.path.basename(test_paths[idx])}")
plt.subplot(1,2,1)
plt.imshow(test_imgs[idx])
plt.axis('off')

# aggregate absolute shap across color channels for the predicted class
pred_class = np.argmax(model.predict(test_imgs[idx:idx+1]), axis=1)[0]
if isinstance(shap_values, list):
    # list with one (N, H, W, C) array per class (format of the pinned shap 0.42.1)
    sv = shap_values[pred_class][idx]  # HxWxC
else:
    # NOTE(review): newer shap releases appear to return a single array with
    # the class axis last, i.e. (N, H, W, C, num_classes) — confirm if upgrading
    sv = np.asarray(shap_values)[idx, ..., pred_class]  # HxWxC
sv_abs = np.mean(np.abs(sv), axis=2)
plt.subplot(1,2,2)
plt.imshow(sv_abs, cmap='coolwarm')
plt.title(f"SHAP (class {pred_class})")
plt.axis('off')
plt.colorbar()
plt.show()


In [None]:
# Cell 12: LIME image explanations
from lime import lime_image
from skimage.segmentation import mark_boundaries
from tensorflow.keras.applications.imagenet_utils import preprocess_input

# Named `lime_explainer` so the SHAP `explainer` from Cell 11 is not overwritten.
lime_explainer = lime_image.LimeImageExplainer()

def predict_fn(images):
    """Return class probabilities for a batch of uint8 HxWx3 images.

    LIME calls this with perturbed copies of the explained image; the pixels
    are rescaled to [0, 1] to match the generators' rescale=1./255.
    """
    imgs = np.asarray(images, dtype='float32') / 255.0
    # verbose=0: LIME calls this repeatedly; progress bars would flood the output
    return model.predict(imgs, verbose=0)

# explain first test image
test_img = test_imgs[0]
# Round before casting: test_img holds pixel/255 as float32, so test_img*255 can
# land just below the original integer and a plain astype('uint8') truncates it
# to one level lower.
test_img_uint8 = np.clip(np.rint(test_img * 255), 0, 255).astype('uint8')
explanation = lime_explainer.explain_instance(test_img_uint8,
                                              classifier_fn=predict_fn,
                                              top_labels=5, hide_color=0, num_samples=200)

# get image and mask for top predicted label
temp, mask = explanation.get_image_and_mask(explanation.top_labels[0], positive_only=True, num_features=5, hide_rest=False)
plt.figure(figsize=(6,6))
plt.imshow(mark_boundaries(temp/255.0, mask))
plt.axis('off')
plt.title("LIME positive regions")
plt.show()


In [None]:
# Cell 13: Save model
# Persist the model currently bound to `model` as a single HDF5 file.
final_model_path = "/content/hieroglyph_light_cnn.h5"
model.save(final_model_path)
print("Saved model to " + final_model_path)


In [None]:
# Downgrade numpy to a version compatible with shap
!pip install numpy==1.24.4

After installing the compatible NumPy version, restart the runtime so the new version is actually loaded, rerun the earlier cells, and then rerun the SHAP cell (`GnZIHIfFMUeg`) to see if the error is resolved.