In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
import os
import cv2
import numpy as np
import skimage.io as io
from skimage.filters import threshold_otsu
from skimage.morphology import erosion, dilation
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

# Set the path to the folder containing the images

folder_path='drive/MyDrive/samples'

# Set the window size for splitting the images

window_width = 20
window_height = 100

# Function to preprocess an image using erosion and dilation

def preprocess_image(image):
    # gray image
    img = io.imread(image, as_gray=True)
    #plt.imshow(img, 'gray', aspect='auto')
    # binary image
    threshold = threshold_otsu(img)
    img_binary = img > threshold
    kernel = np.ones((2, 2), np.uint8)
    # dilated 1
    img_dilated = dilation(img_binary, kernel)
    # erosion 1
    img_eroded = erosion(img_dilated)

    kernel = np.ones((3, 1), np.uint8)
    # dilated 2
    img_dilated = dilation(img_binary, kernel)
    # erosion 2
    img_eroded = erosion(img_dilated)

    #dilation 3 
    kernel = np.ones((4,1), np.uint8)
    img_dilated_2 = dilation(img_eroded, kernel)
    return img_dilated_2


# Load and preprocess the images

preprocessed_images = []
preprocessed_character_images = []

# For storing the characters on each character image 
labels=[]

for filename in os.listdir(folder_path):
    image_path = os.path.join(folder_path, filename)
    image = preprocess_image(image_path)
    preprocessed_images.append(image)

    # Split each image into five character images
    height, width = image.shape[:2]
    for i in range(5):
        start_x = i * window_width
        end_x = (i + 1) * window_width
        character_image = image[:, start_x:end_x]
        preprocessed_character_images.append(character_image)
        labels.append(filename[i])

In [15]:
from sklearn.preprocessing import StandardScaler

# Flatten the preprocessed images into 1D arrays
feature_vectors = [image.flatten() for image in preprocessed_character_images]
feature_vectors = np.array(feature_vectors)

# Convert labels to numpy array
labels = np.array(labels)

# Split the data into train and validation sets
train_x,val_x,train_y,val_y = train_test_split(
    feature_vectors,labels, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Train Logistic Regression classifier
lr_classifier = LogisticRegression()
lr_classifier.fit(train_x, train_y)

# Train Support Vector Machine classifier
svm_classifier = SVC()
svm_classifier.fit(train_x, train_y)

# Train Random Forest classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(train_x, train_y)


In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate on validation data
lr_predictions = lr_classifier.predict(val_x)
svm_predictions = svm_classifier.predict(val_x)
rf_predictions = rf_classifier.predict(val_x)

# Calculate performance metrics
lr_accuracy = accuracy_score(val_y, lr_predictions)
lr_precision = precision_score(val_y, lr_predictions, average='weighted')
lr_recall = recall_score(val_y, lr_predictions, average='weighted')
lr_f1 = f1_score(val_y, lr_predictions, average='weighted')

svm_accuracy = accuracy_score(val_y, svm_predictions)
svm_precision = precision_score(val_y, svm_predictions, average='weighted')
svm_recall = recall_score(val_y, svm_predictions, average='weighted')
svm_f1 = f1_score(val_y, svm_predictions, average='weighted')

rf_accuracy = accuracy_score(val_y, rf_predictions)
rf_precision = precision_score(val_y, rf_predictions, average='weighted')
rf_recall = recall_score(val_y, rf_predictions, average='weighted')
rf_f1 = f1_score(val_y, rf_predictions, average='weighted')

# Print the performance metrics
print("Logistic Regression Performance:")
print("Accuracy:", lr_accuracy)
print("Precision:", lr_precision)
print("Recall:", lr_recall)
print("F1-Score:", lr_f1)

print("Support Vector Machine Performance:")
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1-Score:", svm_f1)


print("Random Forest Performance:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1-Score:", rf_f1)


Logistic Regression Performance:
Accuracy: 0.6232876712328768
Precision: 0.6343438869245378
Recall: 0.6232876712328768
F1-Score: 0.6209229817640419
Support Vector Machine Performance:
Accuracy: 0.7157534246575342
Precision: 0.7596878377890673
Recall: 0.7157534246575342
F1-Score: 0.7188536308417801
Random Forest Performance:
Accuracy: 0.7414383561643836
Precision: 0.7734579273651232
Recall: 0.7414383561643836
F1-Score: 0.7447581534061665


In [18]:
# ANSWER-1
# Here, random forest yields the best accuracy on the validation set.
# Accuracy is a commonly used metric to evaluate the performance of classification
# algorithms, especially when the classes are balanced. However, there are cases where
# accuracy alone, though, may not always be sufficient to assess a model's performance.
# In those cases, we would prefer using “F1 score” to evaluate the model’s performance.
# The F1 score is the harmonic mean of precision and recall. It provides a balanced
# evaluation of an model’s performance, especially in scenarios where both precision and
# recall are important.
# Precision measures the proportion of correctly predicted positive instances out of all
# instances predicted as positive, while recall measures the proportion of correctly
# predicted positive instances out of all actual positive instances. These metrics are
# particularly useful when dealing with imbalanced datasets, where the number of
# instances in one class is significantly higher than the other.

# ANSWER-3 
# Precision, recall and F1-score are shown above for all the three models

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [300],  # Number of trees
    'max_depth': [15],  # Maximum depth of trees
    'max_features': ['sqrt', 'log2']  # Maximum number of features to consider
}

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier()

# Perform grid search
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5)
grid_search.fit(train_x, train_y)

# Get the best hyperparameters found by grid search
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_rf_classifier = RandomForestClassifier(**best_params)
best_rf_classifier.fit(train_x, train_y)

# Predict on the test set
y_pred = best_rf_classifier.predict(val_x)

# Calculate accuracy
accuracy = accuracy_score(val_y, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7636986301369864


In [20]:
# ANSWER-2
# We changed n_estimator, maximum depth of the tree and max features which improved the accuracy of the model.
# Increasing the number of trees or n_estimators allowed the model to capture more diverse patterns 
# and reduce the impact of individual noisy or biased trees. This helped to improve the overall 
# accuracy of the model.
# The max-depth hyperparameter helped to balance the bias-variance trade-off in the model

In [21]:
from sklearn.metrics import confusion_matrix

# Assuming rf_predictions contains the predicted labels from the Random Forest classifier

confusion_matrix = confusion_matrix(val_y, rf_predictions)

print("Confusion Matrix:")
print(confusion_matrix)

# characters_list contains the list of characters corresponding to the labels

characters_accuracy = {}
characters_list = []
for i in range(ord('a'), ord('z')+1):
    characters_list.append(chr(i))


for i in range(len(characters_list)):
    true_positives = confusion_matrix[i, i]
    total_instances = np.sum(confusion_matrix[i, :])
    accuracy = true_positives / total_instances
    characters_accuracy[characters_list[i]] = accuracy

# Sorting characters based on accuracy in ascending order
characters_accuracy_sorted = sorted(characters_accuracy.items(), key=lambda x: x[1])
print(characters_accuracy_sorted)


Confusion Matrix:
[[15  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  1  0  0
   0  0]
 [ 0 18  0  0  0  1  0  2  1  0  0  0  0  0  0  0  0  0  2  0  0  0  0  0
   0  0]
 [ 0  0 18  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0  0  1
   0  0]
 [ 2  0  0 17  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  1  0  1
   0  0]
 [ 0  1  0  0 19  1  0  0  1  0  0  0  0  0  0  0  0  0  1  0  0  0  1  0
   0  0]
 [ 0  0  0  0  0 10  1  0  2  2  0  1  0  0  0  0  0  0  0  0  0  0  0  2
   0  0]
 [ 0  0  0  0  1  0 19  0  1  0  0  0  1  0  0  0  0  0  0  1  0  0  0  1
   0  0]
 [ 0  7  0  1  0  1  0 11  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0  1
   0  0]
 [ 0  0  0  0  0  0  0  0 18  2  0  0  0  0  0  0  0  0  1  0  0  0  0  0
   0  0]
 [ 0  1  0  0  0  0  0  0  0 25  0  2  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  1  2  0  0  0  0  0  3  0 14  0  0  0  0  0  0  0  0  0  0  1  1  0
   0  0]
 [ 0  0  1  0  0  0  0  0  1  1  0 19  1  0  0  0  0  0  0  0  0  0  

In [22]:
# ANSWER-4(A)
# From the values observed, Random Forest Classifier has highest value of accuracy,
# precision, recall, F1 score. So it is considered as Best Classifier.
# Its confusion matrix is given as above.
# ANSWER-4(B)
# From the values obserevd, the characters 'h','u','f' have the least accuracy.
# Note - the characters may vary for model to model as the accuracy value for lower 
# accuracy characters have minute difference in values.
# ANSWER-4(C)
# The reasons for this low accuracy could be-
# Unbalanced Classes - If the dataset has imbalanced class distribution, where some
#characters have significantly fewer instances compared to others, the classifier may
# struggle to learn and accurately predict the minority class. The model may be
# biased towards the majority class, resulting in lower accuracy for the minority
# characters.
# Limited Training Data - If there is an insufficient amount of training data available for
# certain characters, it can be challenging for the classifier to learn their distinguishing
# features accurately.
# Underfitting or Overfitting - The model may suffer from overfitting, where it
# memorizes the training data instead of learning general patterns, or underfitting,
# where the model is too simple to capture the complexities of the data. Both
# scenarios can result in reduced accuracy for certain characters.

In [23]:
# ANSWER - 5
# CNNs are composed of multiple layers that progressively learn and extract low-level
# features. CNNs are also capable of learning transition-invariant features. They can 
# identify the character even if they appear in different positions and orientations. 
# Due to these factors we think building a convolutional neural network model for 
# character recognition can give higher accuracy then classifiers.
# Below is the model we developed using basic nueral network algorithms that gave us a 
# little more accurate values than our best classifier. We can improve the basic 
# cnn model to get more higher accuracy values.

In [24]:
# ANSWER - 5
# Tried building a neural network  to enhance the accuracy of character recognition model.
import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Step 1: Initialize and fit the LabelEncoder for training labels

label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_y)

# Step 2: Initialize and fit the LabelEncoder for validation labels.

label_encoder = LabelEncoder()
val_labels_encoded = label_encoder.fit_transform(val_y)

# Step 3: Convert character labels to numerical values
# The train_labels_encoded and val_labels_encoded obtained from the LabelEncoder are 
# then converted into one-hot encoded vectors using the to_categorical function from Keras.
# This is necessary to represent the categorical labels in a format suitable for the model.
train_labels_onehot = to_categorical(train_labels_encoded)
val_labels_onehot = to_categorical(val_labels_encoded)

# Step 4: Pre processing the data and reshaping
train_x= train_x.reshape(-1,100,20,1)
val_x = val_x.reshape(-1,100,20,1)

# Step 5: Define the CNN model
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(100,20,1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(26, activation='softmax'))

# Step 6: Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 7: Train the model
model.fit(train_x, train_labels_onehot, batch_size=32, epochs=10, validation_data=(val_x, val_labels_onehot))

# Step 8: Evaluate the model on validation data
val_loss, val_accuracy = model.evaluate(val_x, val_labels_onehot)
print("Validation Accuracy:", val_accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation Accuracy: 0.7808219194412231
