# WORD LEVEL PREPROCESSING

## DATA PREPARATION

In [1]:
# Install Library : Tensorflow
pip install tensorflow



In [2]:
# Install Library : Keras
pip install keras



In [28]:
# Import Library
import os
import cv2
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import load_img, img_to_array
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Labels List
folder_path = '/content/drive/MyDrive/dataset_word_only/train_proccess'
file_names = os.listdir(folder_path)
print(file_names)

['tugas', 'tanya', 'saya', 'tidak', 'siapa', 'terlambat', 'tolong', 'terima kasih', 'tempat', 'selamat', 'sama-sama', 'sakit', 'paham', 'ibu', 'nama', 'maaf', 'halo', 'kamu', 'mau', 'pagi', 'makan', 'buat', 'buangairkecil', 'bantu', 'bapak', 'baca']


In [24]:
# Number of Training Dataset in each labels
base_path = '/content/drive/MyDrive/dataset_word_only/train_proccess'

def count_files_in_base_folder(folder_path):
    total_files = 0
    for subfolder_name in os.listdir(folder_path):
        subfolder_path = os.path.join(folder_path, subfolder_name)
        if os.path.isdir(subfolder_path):
            num_files = len(os.listdir(subfolder_path))
            print(f"Labels [{subfolder_name}] have {num_files} files")
            total_files += num_files
    return total_files

total_files = count_files_in_base_folder(base_path)
print(f"File number in each Library Folder: {total_files}")


Labels [tugas] have 82 files
Labels [tanya] have 103 files
Labels [saya] have 83 files
Labels [tidak] have 100 files
Labels [siapa] have 88 files
Labels [terlambat] have 102 files
Labels [tolong] have 100 files
Labels [terima kasih] have 89 files
Labels [tempat] have 117 files
Labels [selamat] have 80 files
Labels [sama-sama] have 85 files
Labels [sakit] have 103 files
Labels [paham] have 87 files
Labels [ibu] have 115 files
Labels [nama] have 81 files
Labels [maaf] have 87 files
Labels [halo] have 88 files
Labels [kamu] have 81 files
Labels [mau] have 87 files
Labels [pagi] have 89 files
Labels [makan] have 103 files
Labels [buat] have 93 files
Labels [buangairkecil] have 105 files
Labels [bantu] have 88 files
Labels [bapak] have 107 files
Labels [baca] have 101 files
File number in each Library Folder: 2444


In [8]:
# Copy folder into colab files
import shutil
shutil.copytree('/content/drive/MyDrive/dataset_word_only', '/content/dataset/images')

'/content/dataset/images'

## DATA PRE-PROCESSING

In [10]:
# Image pre-processing with OpenCV Libraries

def resize_images_in_folder(folder_path, target_size=(224, 224)):
    categories = os.listdir(folder_path)
    for category in categories:
        category_path = os.path.join(folder_path, category)
        if os.path.isdir(category_path):
            image_files = [f for f in os.listdir(category_path) if f.endswith(('.jpg', '.png', '.jpeg'))]
            for img_file in image_files:
                img_path = os.path.join(category_path, img_file)
                img = cv2.imread(img_path)
                if img is None:
                    print(f'Failed to read {img_path}')
                    continue
                img_resized = cv2.resize(img, target_size)
                cv2.imwrite(img_path, img_resized)

train_folder = '/content/drive/MyDrive/dataset_word_only/train_proccess'
val_folder = '/content/drive/MyDrive/dataset_word_only/validation_proccess'

resize_images_in_folder(train_folder)
resize_images_in_folder(val_folder)


In [11]:
# Trainning Image pre-processing with ImagedDataGenerator

train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=(0.8, 1.2),  # Sesuaikan kecerahan
    fill_mode='nearest'
)

In [12]:
# Validation Image pre-processing with ImagedDataGenerator

val_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(
    train_folder,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)
val_generator = val_datagen.flow_from_directory(
    val_folder,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)

Found 2444 images belonging to 26 classes.
Found 520 images belonging to 26 classes.


## MODELING

In [31]:
# Define Model by MobileNetV2 Transfer Learning
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
model = Sequential([
    base_model,
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(train_generator.num_classes, activation='softmax')
])

In [32]:
# Model Summary
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

In [33]:
# Define Checkpoint
checkpoint = ModelCheckpoint(
    filepath='model_checkpoint.keras',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=False,
    mode='min',
    verbose=1
)

# Model Trainning
history = model.fit(
    train_generator,
    epochs=100,
    validation_data=val_generator,
    callbacks=[checkpoint]
)

Epoch 1/100


KeyboardInterrupt: 

In [None]:
def plot_loss_acc(history):
    '''Plots the training and validation loss and accuracy from a history object'''
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    epochs = range(len(acc))

    fig, ax = plt.subplots(1,2, figsize=(12, 6))
    ax[0].plot(epochs, acc, 'bo', label='Training accuracy')
    ax[0].plot(epochs, val_acc, 'b', label='Validation accuracy')
    ax[0].set_title('Training and validation accuracy')
    ax[0].set_xlabel('epochs')
    ax[0].set_ylabel('accuracy')
    ax[0].legend()

    ax[1].plot(epochs, loss, 'bo', label='Training Loss')
    ax[1].plot(epochs, val_loss, 'b', label='Validation Loss')
    ax[1].set_title('Training and validation loss')
    ax[1].set_xlabel('epochs')
    ax[1].set_ylabel('loss')
    ax[1].legend()

    plt.show()

In [None]:
import matplotlib.pyplot as plt
plot_loss_acc(history)

In [None]:
# Convert model to H5
os.rename('model_checkpoint.keras', 'whord_model.h5')
model.save('word_model.h5')

## DATA TESTING

In [20]:
class_labels = sorted(os.listdir('/content/drive/MyDrive/dataset_word_only/test_proccess'))
print(class_labels)

['baca', 'bantu', 'bapak', 'buangairkecil', 'buat', 'halo', 'ibu', 'kamu', 'maaf', 'makan', 'mau', 'nama', 'pagi', 'paham', 'sakit', 'sama-sama', 'saya', 'selamat', 'siapa', 'tanya', 'tempat', 'terima kasih', 'terlambat', 'tidak', 'tolong', 'tugas']


In [25]:
# Load Model
model = load_model('word_model.h5')

# Load Class
dataset_path = '/content/drive/MyDrive/dataset_word_only/test_proccess'
class_labels = sorted(os.listdir(dataset_path))



In [26]:
# Test Image Pre-processing
image_path = '/content/drive/MyDrive/dataset_word_only/test_proccess/bapak/bapak1-00010_jpg.rf.3d13062be3b2a21c39ee5eaf159a2b02.jpg'
img = load_img(image_path, target_size=(224, 224))
img_array = img_to_array(img)
img_array = img_array / 255.0
img_array = np.expand_dims(img_array, axis=0)

In [27]:
# Prediction
predictions = model.predict(img_array)
predicted_class = np.argmax(predictions)
predicted_label = class_labels[predicted_class]
predicted_probability = np.max(predictions)

# Result
print(f"Predicted Class Index: {predicted_class}")
print(f"Predicted Class Label: {predicted_label}")
print(f"Predicted Probability: {predicted_probability:.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Predicted Class Index: 2
Predicted Class Label: bapak
Predicted Probability: 1.00
