In [25]:
import boto3
import zipfile
import os
import json
import sagemaker
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import classification_report, accuracy_score

### Data Preprocessing

In [2]:
# Define dataset path
s3_bucket = "id-classifier-images"
s3_key = "images.zip"
local_zip_path = "/tmp/images.zip"
dataset_dir = "/tmp/dataset"

# Download and extract dataset from S3
s3 = boto3.client("s3")
# s3.download_file(s3_bucket, s3_key, local_zip_path)

# with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
#     zip_ref.extractall(dataset_dir)

### Initialize SageMaker session and role

In [3]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()


### Define image parameters for model input

In [4]:
IMG_HEIGHT = 150  # Height of input images
IMG_WIDTH = 150   # Width of input images
BATCH_SIZE = 32   # Number of images per batch

### Create ImageDataGenerator for data augmentation and preprocessing

In [5]:
data_gen = ImageDataGenerator(rescale=1./255, validation_split=0.2)  # Normalize pixel values

### Load training dataset

In [6]:
unzip_dir = "/tmp/dataset/images"
train_generator = data_gen.flow_from_directory(
    unzip_dir,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='training'  # Use 80% of data for training
)

Found 800 images belonging to 10 classes.


### Load validation dataset

In [7]:
validation_generator = data_gen.flow_from_directory(
    unzip_dir,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='validation'  # Use 20% of data for validation
)

Found 200 images belonging to 10 classes.


### Define a simple Convolutional Neural Network (CNN) model

In [8]:
def create_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(len(train_generator.class_indices), activation='softmax')  # Output layer with softmax activation
    ])
    # Compile the model with categorical crossentropy loss and Adam optimizer
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

### Create and train the model

In [None]:
model = create_model()
print("Starting model training...")
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=5  # Train for 5 epochs
)

Starting model training...
Epoch 1/5


### Evaluate the model performance

In [27]:
# List of class names corresponding to the dataset classes
class_names = ['alb_id', 'aze_passport', 'esp_id', 'est_id', 'fin_id', 
               'grc_passport', 'lva_passport', 'rus_internalpassport', 
               'srb_passport', 'svk_id']

with open('class_names.json', 'w') as f:
    json.dump(class_names, f)

# ### Evaluate the model performance

# In[28]:


val_images, val_labels = next(iter(validation_generator))

# Generate predictions
predictions = model.predict(val_images)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(val_labels, axis=1)

# Print classification performance
print("Classification Report:")
print(classification_report(true_classes, predicted_classes, target_names=class_names, zero_division=1))
print("Accuracy:", accuracy_score(true_classes, predicted_classes))

# Map predicted class indices to class names for a better output
predicted_class_names = [class_names[i] for i in predicted_classes]
true_class_names = [class_names[i] for i in true_classes]

# Print the first few predictions for validation
for i in range(5):
    print(f"True label: {true_class_names[i]}, Predicted label: {predicted_class_names[i]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 335ms/step
Classification Report:
                      precision    recall  f1-score   support

              alb_id       0.00      0.00      0.00         1
        aze_passport       1.00      0.20      0.33         5
              esp_id       0.33      1.00      0.50         1
              est_id       0.00      0.00      0.00         3
              fin_id       1.00      0.50      0.67         2
        grc_passport       0.20      0.25      0.22         4
        lva_passport       0.50      0.25      0.33         4
rus_internalpassport       0.67      0.40      0.50         5
        srb_passport       0.00      0.00      0.00         5
              svk_id       1.00      0.50      0.67         2

            accuracy                           0.25        32
           macro avg       0.47      0.31      0.32        32
        weighted avg       0.48      0.25      0.30        32

Accuracy: 0.25
True label: rus_in

### Save the trained model

In [24]:
model.save("document_classifier.keras", save_format="keras")
model.save("document_classifier.h5", save_format="h5")
print("Model training completed and saved successfully.")

Model training completed and saved successfully.


In [23]:
s3_key_keras = "document_classifier.keras"
s3_key_h5 = "document_classifier.h5"
s3.upload_file("document_classifier.h5", s3_bucket, s3_key_keras)
s3.upload_file("document_classifier.keras", s3_bucket, s3_key_h5)

In [17]:
!jupyter nbconvert --to script ml_id_classifier_tf.ipynb

[NbConvertApp] Converting notebook ml_id_classifier_tf.ipynb to script
[NbConvertApp] Writing 3819 bytes to ml_id_classifier_tf.py
