In [1]:
import pandas as pd
import numpy as np
import warnings

# This is used to suppress all warnings for a cleaner output(for my clear view of the output..)
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Deep learning libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

print("Starting the machine learning pipeline for Alphabet Recognition...")
print("------------------------------------------------------------------")

print("Step 1: Data Exploration and Preprocessing")

# Seed TensorFlow's global generator so that weight initialisation and batch
# shuffling are repeatable from run to run (as far as the backend allows).
# The scikit-learn splitters below are already seeded via random_state=42.
tf.random.set_seed(42)

data = pd.read_csv("Alphabets_data.csv")
print(data.head())

# Basic shape facts; every column except the 'letter' target is a feature.
num_samples = data.shape[0]
num_features = data.shape[1] - 1
num_classes = data['letter'].nunique()
class_labels = data['letter'].unique()

print("\nSummary of the dataset:")
print(f"- Number of samples: {num_samples}")
print(f"- Number of features: {num_features}")
# map(str, ...) keeps the join working even if the labels were parsed as non-strings.
print(f"- Number of classes: {num_classes} ({', '.join(map(str, class_labels))})")

print("\nChecking for missing values:")
print(data.isnull().sum())

X = data.drop('letter', axis=1)
y = data['letter']

# Map the letter labels to integer ids, as required by
# 'sparse_categorical_crossentropy' further down.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("\nLabels have been successfully encoded into numerical format.")
print(f"Original labels: {label_encoder.classes_}")
print(f"Encoded labels: {np.unique(y_encoded)}")

# Hold out the test rows BEFORE fitting the scaler. Fitting StandardScaler on
# the full dataset would let the test set's mean/variance leak into training.
# 80% training, 20% testing
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)        # test rows reuse the training statistics
# Kept so that any later code referring to X_scaled still works; it is the full
# feature matrix transformed with the training-set statistics.
X_scaled = scaler.transform(X)
print("\nFeatures have been successfully normalized.")


print("Step 2: Model Implementation and Training.")

print("\nData split into training and testing sets:")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

def create_default_model():
    """Build and compile the baseline classifier.

    The network has one hidden Dense layer (64 units, ReLU) followed by a
    softmax output layer with ``num_classes`` units. The input width is taken
    from the module-level ``X_train`` and the number of outputs from the
    module-level ``num_classes``.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer,
        sparse categorical cross-entropy loss and the accuracy metric.
    """
    n_inputs = X_train.shape[1]
    network = Sequential([
        Dense(64, input_shape=(n_inputs,), activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Baseline: train the default architecture once and record its test-set predictions.
print("\nBuilding a default ANN model...")
default_model = create_default_model()

print("Training the default model...")
default_fit_options = {"epochs": 20, "batch_size": 32, "verbose": 0}
history = default_model.fit(X_train, y_train, **default_fit_options)
print("Default model training complete.")

# The softmax output gives one score per class for each test sample;
# the predicted label is the index of the highest score.
default_class_scores = default_model.predict(X_test, verbose=0)
y_pred_default = default_class_scores.argmax(axis=1)


print("Step 3: Hyperparameter Tuning (Manual Grid Search)")

# Search space: every combination of these values is cross-validated.
param_grid = {
    'epochs': [20, 30],
    'batch_size': [32, 64],
    'learning_rate': [0.001, 0.01],
    'hidden_layers': [1, 2],
    'neurons': [32, 64, 128],
    'activation': ['relu', 'tanh']
}


def build_model_from_params(params):
    """Build and compile a fresh, untrained network for one grid-search setting.

    Args:
        params: dict providing 'hidden_layers', 'neurons', 'activation' and
            'learning_rate' (the keys used in ``param_grid``).

    Returns:
        A compiled ``Sequential`` model with ``params['hidden_layers']`` Dense
        layers of ``params['neurons']`` units each, followed by a softmax layer
        with ``num_classes`` outputs.
    """
    network = Sequential()
    for _ in range(params['hidden_layers']):
        network.add(Dense(params['neurons'], activation=params['activation']))
    network.add(Dense(num_classes, activation='softmax'))

    network.compile(optimizer=keras.optimizers.Adam(learning_rate=params['learning_rate']),
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
    return network


best_accuracy = 0
best_params = {}
best_model = None

kf = KFold(n_splits=3, shuffle=True, random_state=42)

keys = param_grid.keys()
combinations = list(product(*param_grid.values()))

print(f"\nStarting manual grid search with {len(combinations)} combinations. This may take some minutes of our time approx 30 mins...")

for combo in combinations:
    params = dict(zip(keys, combo))

    fold_accuracies = []
    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

        # A new model per fold, so no weights carry over between folds.
        model = build_model_from_params(params)
        model.fit(X_train_fold, y_train_fold,
                  epochs=params['epochs'],
                  batch_size=params['batch_size'],
                  verbose=0)

        # evaluate() returns [loss, accuracy] because one metric was compiled;
        # only the accuracy may enter the average (appending the whole list
        # would mix the loss into the score).
        _, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)
        fold_accuracies.append(accuracy)

    mean_accuracy = np.mean(fold_accuracies)

    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_params = params

print("\nManual Grid Search completed.")
print(f"Best parameters found: {best_params}")
print(f"Best cross-validation accuracy achieved: {best_accuracy:.4f}")

print("\nTraining the final best model on the complete training set...")
# Rebuild from scratch: reusing a fold model would start from weights already
# fitted on a subset of the data, so the final model would not be a clean
# training run with the chosen epochs.
best_model = build_model_from_params(best_params)
best_model.fit(X_train, y_train, epochs=best_params['epochs'], batch_size=best_params['batch_size'], verbose=0)
y_pred_tuned = np.argmax(best_model.predict(X_test, verbose=0), axis=1)


print("Step 4: Evaluation of Models")

# Per-class precision / recall / F1 for both models on the same held-out test set.
class_names = label_encoder.classes_
report_default = classification_report(y_test, y_pred_default, target_names=class_names)
report_tuned = classification_report(y_test, y_pred_tuned, target_names=class_names)

report_sections = (
    ("\nEvaluation of the Default Model......", report_default),
    ("\nEvaluation of the Hyperparameter-Tuned Model...", report_tuned),
)
for section_heading, section_report in report_sections:
    print(section_heading)
    print(section_report)

# Closing narrative, emitted line by line exactly as separate print() calls would.
discussion_lines = (
    "\nDiscussion of Results....",
    "The classification reports above provide a detailed look at the performance of both models.",
    "\nAccuracy is a key metric, and it's calculated as the ratio of correct predictions to the total number of predictions.",
    "Precision, recall, and F1-score provide a more nuanced view, especially for multi-class problems.",
    "- Precision: What proportion of positive identifications was actually correct?",
    "- Recall: What proportion of actual positives was identified correctly?",
    "- F1-score: The harmonic mean of precision and recall, a balanced metric.",
    "\nComparing the two reports, you can observe the direct impact of hyperparameter tuning.",
    "Typically, the tuned model shows a significant improvement in overall accuracy and the other metrics.",
    "This demonstrates that fine-tuning parameters like the number of neurons, hidden layers, and the learning rate can lead to a more effective and robust model for this specific dataset.",
    "The Grid Search method systematically explored different combinations, finding the optimal set of parameters to maximize performance.",
    "\nEnd of the pipeline. Thank you!",
)
print("\n".join(discussion_lines))


Starting the machine learning pipeline for Alphabet Recognition...
------------------------------------------------------------------
Step 1: Data Exploration and Preprocessing

First 5 rows of the dataset:
  letter  xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  xybar  \
0      T     2     8      3       5      1     8    13      0      6      6   
1      I     5    12      3       7      2    10     5      5      4     13   
2      D     4    11      6       8      6    10     6      2      6     10   
3      N     7    11      6       6      3     5     9      4      6      4   
4      G     2     1      3       1      1     8     6      6      6      6   

   x2ybar  xy2bar  xedge  xedgey  yedge  yedgex  
0      10       8      0       8      0       8  
1       3       9      2       8      4      10  
2       3       7      3       7      3       9  
3       4      10      6      10      2       8  
4       5       9      1       7      5      10  

Summary of the da