<a href="https://colab.research.google.com/github/MDaniyalTariq/Urdu_Handwritten_Words_Recognition_through_CNN_Ensembling/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import os
import json
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Function to load and preprocess data from a specific JSON file (for each character)
def load_and_preprocess_data(dataset_folder):
    data = []
    labels = []
    character_labels = {}  # A dictionary to map character to a unique label

    # Get all JSON files in the folder
    files = os.listdir(dataset_folder)
    for idx, file_name in enumerate(files):
        if file_name.endswith('.json'):
            label = idx  # Unique label based on the order of the files
            character_labels[file_name] = label
            file_path = os.path.join(dataset_folder, file_name)
            try:
                with open(file_path, 'r') as file:
                    json_data = json.load(file)
                    features = []
                    # Flatten each stroke into a sequence of points
                    for stroke in json_data:
                        for point in stroke:
                            features.append([point['dx'], point['dy'], point['timestamp']])
                    data.append(features)
                    labels.append(label)  # Add the unique label for this character
            except json.JSONDecodeError as e:
                print(f"Error reading JSON file {file_path}: {e}")

    # Normalize the features (dx, dy, timestamp)
    scaler = StandardScaler()
    data = [scaler.fit_transform(np.array(seq)[:, :3]) for seq in data]  # Normalize dx, dy, timestamp

    # Pad sequences to ensure uniform length
    max_sequence_length = max(len(seq) for seq in data)
    padded_data = pad_sequences(data, maxlen=max_sequence_length, padding='post', dtype='float32')

    return np.array(padded_data), np.array(labels), character_labels

# Define a simple LSTM model for recognition
def create_model(input_shape, num_classes):
    model = Sequential()
    model.add(LSTM(128, activation='relu', input_shape=input_shape, return_sequences=True))
    model.add(LSTM(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))  # Multi-class classification
    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Function to train the model for all characters
def train_model_on_dataset(dataset_folder):
    if not os.path.exists(dataset_folder):
        raise FileNotFoundError(f"Dataset folder '{dataset_folder}' does not exist.")

    # Load and preprocess data
    X, y, character_labels = load_and_preprocess_data(dataset_folder)

    # Create the model
    model = create_model(X.shape[1:], len(character_labels))  # len(character_labels) is the number of unique characters

    # Train the model
    model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

    # Save the trained model
    model.save('urdu_character_model.h5')

    return model, character_labels

# Prediction function for user input (drawing)
def predict_character(user_input_strokes, model, character_labels):
    # Preprocess the user's input similar to how we preprocess training data
    scaler = StandardScaler()
    user_input = [scaler.fit_transform(np.array(user_input_strokes)[:, :3])]
    max_sequence_length = max(len(seq) for seq in user_input)
    padded_input = pad_sequences(user_input, maxlen=max_sequence_length, padding='post', dtype='float32')

    # Predict the character
    prediction = model.predict(padded_input)

    # Get the predicted class (the index with the highest probability)
    predicted_class = np.argmax(prediction)

    # Get the corresponding character from the labels
    predicted_character = list(character_labels.keys())[predicted_class]

    return predicted_character

# Define the path to your dataset folder
dataset_folder = '/content/dataset/json'

# Train the model
model, character_labels = train_model_on_dataset(dataset_folder)

# Example of how to predict a user input (you need to simulate user input here):
user_input_strokes = [
    {'dx': 224, 'dy': 119, 'timestamp': 0},
    {'dx': 224.33, 'dy': 119.66, 'timestamp': 0},
    {'dx': 224.66, 'dy': 121.33, 'timestamp': 0},
    {'dx': 225.33, 'dy': 124, 'timestamp': 17},
    {'dx': 225.66, 'dy': 127, 'timestamp': 17},
    # Add more points to simulate user's stroke data
]

# Predict the character based on the user's input
predicted_character = predict_character(user_input_strokes, model, character_labels)
print(f"Predicted Character: {predicted_character}")


Epoch 1/10


  super().__init__(**kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m888s[0m 125s/step - accuracy: 0.0000e+00 - loss: 5.5841 - val_accuracy: 0.0000e+00 - val_loss: 5.5876
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m907s[0m 124s/step - accuracy: 0.0000e+00 - loss: 5.5823 - val_accuracy: 0.0000e+00 - val_loss: 5.5935
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m932s[0m 127s/step - accuracy: 0.0068 - loss: 5.5809 - val_accuracy: 0.0000e+00 - val_loss: 5.5995
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m973s[0m 133s/step - accuracy: 0.0000e+00 - loss: 5.5793 - val_accuracy: 0.0000e+00 - val_loss: 5.6054
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m917s[0m 124s/step - accuracy: 0.0000e+00 - loss: 5.5778 - val_accuracy: 0.0000e+00 - val_loss: 5.6114
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m962s[0m 130s/step - accuracy: 0.0000e+00 - loss: 5.5763 - val_accuracy: 0.0000e+00 - val_loss: 5.6174
E



IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed