COnvert to IMG

In [1]:
import tifffile as tiff
from PIL import Image
import os

root_dir = "test"

for folder in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder)
    if os.path.isdir(folder_path):
        for filename in os.listdir(folder_path):
            if filename.lower().endswith(".tif") or filename.lower().endswith(".tiff"):
                filepath = os.path.join(folder_path, filename)
                try:
                    image_array = tiff.imread(filepath)
                    img = Image.fromarray(image_array)
                    new_filename = filename.rsplit(".", 1)[0] + ".jpg"
                    img.convert("RGB").save(os.path.join(folder_path, new_filename))
                    print(f"Converted: {filename} -> {new_filename}")
                except Exception as e:
                    print(f"Failed to convert {filename}: {e}")


Converted: 0000001863.tif -> 0000001863.jpg
Converted: 00001014.tif -> 00001014.jpg
Converted: 0000106842.tif -> 0000106842.jpg
Converted: 0000107404.tif -> 0000107404.jpg
Converted: 0000109229.tif -> 0000109229.jpg
Converted: 0000114851.tif -> 0000114851.jpg
Converted: 0000121283.tif -> 0000121283.jpg
Converted: 0000121284.tif -> 0000121284.jpg
Converted: 0000121291.tif -> 0000121291.jpg
Converted: 00001216_1217.tif -> 00001216_1217.jpg
Converted: 0000121886.tif -> 0000121886.jpg
Converted: 0000125386.tif -> 0000125386.jpg
Converted: 0000125675.tif -> 0000125675.jpg
Converted: 0000126052.tif -> 0000126052.jpg
Converted: 0000126151.tif -> 0000126151.jpg
Converted: 0000126164.tif -> 0000126164.jpg
Converted: 0000126377.tif -> 0000126377.jpg
Converted: 0000126554.tif -> 0000126554.jpg
Converted: 0000126586.tif -> 0000126586.jpg
Converted: 0000127597.tif -> 0000127597.jpg
Converted: 0000128690.tif -> 0000128690.jpg
Converted: 0000136298.tif -> 0000136298.jpg
Converted: 0000139610.tif -> 0

<tifffile.TiffPages @85476> invalid offset to first page 85476


Converted: 2500017197_2500017204.tif -> 2500017197_2500017204.jpg
Failed to convert 2500126531_2500126536.tif: cannot write empty image as JPEG
Converted: 2501092768.tif -> 2501092768.jpg
Converted: 2501093860_2501093868.tif -> 2501093860_2501093868.jpg
Converted: 2501093874-g_2501093874-i.tif -> 2501093874-g_2501093874-i.jpg
Converted: 2501093886.tif -> 2501093886.jpg
Converted: 2501112551.tif -> 2501112551.jpg
Converted: 2501115627.tif -> 2501115627.jpg
Converted: 2501153543_2501153547.tif -> 2501153543_2501153547.jpg
Converted: 2501154343_2501154345.tif -> 2501154343_2501154345.jpg
Converted: 2501154388_2501154392.tif -> 2501154388_2501154392.jpg
Converted: 2501154465_2501154472.tif -> 2501154465_2501154472.jpg
Converted: 2501154661.tif -> 2501154661.jpg
Converted: 2501158073_8093.tif -> 2501158073_8093.jpg
Converted: 2501170379_0386.tif -> 2501170379_0386.jpg
Converted: 2501197516_2501197522.tif -> 2501197516_2501197522.jpg
Converted: 2501203107.tif -> 2501203107.jpg
Converted: 250

Data Cleaning

In [None]:
from PIL import Image, UnidentifiedImageError
import os

def is_valid_image(filepath):
    try:
        with Image.open(filepath) as img:
            img.verify()  
        return True
    except (UnidentifiedImageError, IOError, OSError):
        return False

image_folder = "./test"  

for root, dirs, files in os.walk(image_folder):
    for file in files:
        file_path = os.path.join(root, file)
        if not is_valid_image(file_path):
            print(f"Removing corrupted file: {file_path}")
            os.remove(file_path)


Removing corrupted file: ./test\scientific_publication\2500126531_2500126536.tif


Load and Preprocess

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

train_data = datagen.flow_from_directory(
    'test', 
    target_size=(224, 224), 
    batch_size=32,  
    class_mode='categorical',  
    subset='training', 
)

val_data = datagen.flow_from_directory(
    'test', 
    target_size=(224, 224), 
    batch_size=32,  
    class_mode='categorical',  
    subset='validation',  
)


Found 64001 images belonging to 16 classes.
Found 15991 images belonging to 16 classes.


Build The model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(pool_size=(2, 2)),
    
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(len(train_data.class_indices), activation='softmax') 
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


Train The Model

In [None]:
history = model.fit(
    train_data,
    epochs=10,  
    validation_data=val_data,
)


Epoch 1/10
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1432s[0m 715ms/step - accuracy: 0.4267 - loss: 1.9779 - val_accuracy: 0.4412 - val_loss: 2.0818
Epoch 2/10
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1460s[0m 730ms/step - accuracy: 0.7678 - loss: 0.7682 - val_accuracy: 0.4625 - val_loss: 2.3415
Epoch 3/10
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1369s[0m 684ms/step - accuracy: 0.9114 - loss: 0.2810 - val_accuracy: 0.4571 - val_loss: 3.2345
Epoch 4/10
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1325s[0m 662ms/step - accuracy: 0.9528 - loss: 0.1464 - val_accuracy: 0.4563 - val_loss: 3.7910
Epoch 5/10
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1318s[0m 659ms/step - accuracy: 0.9682 - loss: 0.0988 - val_accuracy: 0.4528 - val_loss: 3.7162
Epoch 6/10
[1m2001/2001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1335s[0m 667ms/step - accuracy: 0.9726 - loss: 0.0877 - val_accuracy: 0.4664 - val

Test The Model

In [None]:
val_loss, val_acc = model.evaluate(val_data)
print(f"Validation Accuracy: {val_acc*100:.2f}%")
print(f"Validation Loss: {val_loss:.4f}")


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 141ms/step - accuracy: 0.4355 - loss: 4.7837
Validation Accuracy: 43.94%
Validation Loss: 4.7257


Save the model

In [None]:
model.save("document_classifier.keras")  