In [9]:
# Import necessary libraries
import pandas as pd  # For data loading and manipulation
from sklearn.model_selection import train_test_split  # For train-test split
from sklearn.preprocessing import StandardScaler  # For feature normalization
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # Evaluation metrics
from sklearn.model_selection import GridSearchCV  # For grid search tuning

import tensorflow as tf
from tensorflow.keras.models import Sequential  # For building the ANN model
from tensorflow.keras.layers import Dense  # For adding layers to the ANN
# Import KerasClassifier from scikeras instead of tensorflow.keras.wrappers
from scikeras.wrappers import KerasClassifier  # Wrapper for Keras for hyperparameter tuning

In [10]:
# Step 1: Load and explore the dataset
data = pd.read_csv('/content/Alphabets_data.csv')  # Load data
print(data.head())  # Print first few rows to understand structure
print(data.info())  # Print data info to check types and missing values
print(data.describe())  # Get summary statistics for numerical features

  letter  xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  xybar  \
0      T     2     8      3       5      1     8    13      0      6      6   
1      I     5    12      3       7      2    10     5      5      4     13   
2      D     4    11      6       8      6    10     6      2      6     10   
3      N     7    11      6       6      3     5     9      4      6      4   
4      G     2     1      3       1      1     8     6      6      6      6   

   x2ybar  xy2bar  xedge  xedgey  yedge  yedgex  
0      10       8      0       8      0       8  
1       3       9      2       8      4      10  
2       3       7      3       7      3       9  
3       4      10      6      10      2       8  
4       5       9      1       7      5      10  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   letter  20000 non-null  object
 1   xbo

In [23]:
# prompt: Remove punctuation, special characters, numbers, and stop words. Lowercase all text to reduce vocabulary size.

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def preprocess_text(text):
  # Lowercase the text
  text = text.lower()
  # Remove punctuation and special characters
  text = re.sub(r'[^\w\s]', '', text)
  # Remove numbers
  text = re.sub(r'\d+', '', text)
  # Remove stop words
  stop_words = set(stopwords.words('english'))
  words = text.split()
  words = [word for word in words if word not in stop_words]
  text = ' '.join(words)
  return text

# Example usage with the provided code:
with open('/content/Alphabets_data.csv', 'r') as file:
    file_content = file.read()

preprocessed_content = preprocess_text(file_content)
preprocessed_content

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'letterxboxyboxwidthheightonpixxbarybarxbarybarxybarxybarxybarxedgexedgeyyedgeyedgex n g b j x g r f c j j h j c w h g l l x b g p g e x e x g v x w g r l j q q c l n c e x h l h e h q q q j j r n p c p n w v v e n w n h n h u c v k w e f h u c f z x n r b b u j f h e u x j k p c h l v u z v l b k h w f e b v c h u p z e n p h b u f j r e p k b h p k v f z q b k v b z l j z l j q n z e j l v x j n h p f x v p f p w p j e e b c p r u u u j z f v u b l u r g z w c k j c v f b n u q f f x b j h w n p l x g e n g v e k l g j f w k p c z x w b c f b p k u n q q z f n w u g g u k w k e v r z n x q n c q n k e x c j b r z k g b x v z g j v v j c u v z j k b k z f n p c z w p l u j j z u j b b q n b e e n q u h j g v u u z g l j k l x q f w k p v l b b f k c h b w u p b r n b q h j w f x z h k j n w u q v v r l v f b r j v x k h l j b n r z r k w p e e j l v p h r x p v n g r w u l q f v v f p u b q h x j v b u k w x u v h b j p z w f h b k n f v f r q p k r b w k g p c j p x l b e h p n p b j

In [12]:
# Step 2: Preprocess the data
# Assuming 'label' is the column with alphabet classes and others are features
X = data.drop(columns='letter')  # Features
y = data['letter']  # Target variable (alphabet labels)

In [13]:
# Normalize features
scaler = StandardScaler()  # Initialize scaler
X_scaled = scaler.fit_transform(X)  # Fit and transform features to have mean=0 and std=1


In [14]:
# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)  # Split data


In [15]:
# Step 4: Build the model
def create_model(hidden_layers=1, neurons=32, learning_rate=0.01):
    model = Sequential()  # Initialize sequential model
    model.add(Dense(neurons, input_shape=(X_train.shape[1],), activation='relu'))  # Input layer + first hidden layer

    # Add specified number of hidden layers
    for _ in range(hidden_layers - 1):
        model.add(Dense(neurons, activation='relu'))  # Add hidden layer with specified neurons

    model.add(Dense(len(y.unique()), activation='softmax'))  # Output layer with softmax for classification
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])  # Compile model
    return model

In [18]:
# Step 5: Hyperparameter tuning with Grid Search
# Define parameter grid for tuning
param_grid = {
    'model__hidden_layers': [1, 2],  # Varying the number of hidden layers
    'model__neurons': [32, 64],  # Varying the number of neurons
    'epochs': [10, 20],  # Varying the number of epochs
    'batch_size': [16, 32]  # Varying the batch size
}

In [19]:
# Wrap the model for use in scikit-learn
model = KerasClassifier(build_fn=create_model, verbose=0)

# Grid search for hyperparameter tuning
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=3)  # 3-fold cross-validation
grid_result = grid.fit(X_train, y_train)  # Fit grid search to training data

# Display best parameters and accuracy score from grid search
print("Best parameters found: ", grid_result.best_params_)
print("Best accuracy from grid search: ", grid_result.best_score_)


  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regu

Best parameters found:  {'batch_size': 16, 'epochs': 20, 'model__hidden_layers': 2, 'model__neurons': 64}
Best accuracy from grid search:  0.9268126009844119


In [20]:
# Step 6: Evaluate the model with best parameters on the test set
best_model = grid_result.best_estimator_  # Retrieve the best model
y_pred = best_model.predict(X_test)  # Make predictions on test data


In [21]:
# Compute and display evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")
print(f"Test Recall: {recall}")
print(f"Test F1 Score: {f1}")

Test Accuracy: 0.94725
Test Precision: 0.9483105908440221
Test Recall: 0.94725
Test F1 Score: 0.9474726912249626
