In [1]:

import os
import cv2
import numpy as np
import pandas as pd
import pywt  # Import the wavelet transform library

def compute_wavelet_features(window, wavelet='haar', level=1):
    # Convert to grayscale if the window is not already
    if len(window.shape) == 3:
        window = cv2.cvtColor(window, cv2.COLOR_BGR2GRAY)
    
    # Compute 2D Discrete Wavelet Transform (DWT)
    coeffs = pywt.wavedec2(window, wavelet=wavelet, level=level)
    
    # Flatten coefficients into a feature vector
    wavelet_features = []
    for coeff in coeffs:
        if isinstance(coeff, tuple):  # Detail coefficients
            for subband in coeff:
                wavelet_features.extend(subband.flatten())
        else:  # Approximation coefficients
            wavelet_features.extend(coeff.flatten())
    
    # Optionally limit the length of features if needed (e.g., take the first 198)
    return np.array(wavelet_features[:198])

# Adjust these paths according to your dataset structure
image_folder = r'/Users/harsha_ramisetti/Downloads/color_equlsize_jpg'

labels_df = pd.read_excel('/Users/harsha_ramisetti/Downloads/ML new data set 3/cleaned_line_gt_8.xlsx')

# Initialize a dictionary to store Wavelet feature sequences for each image
word_sequences = {}

for index, row in labels_df.iterrows():
    image_name = row['image name']
    character_sequence = row['gt']
    print(image_folder, image_name)
    
    # Load the corresponding image
    image_path = os.path.join(image_folder, str(image_name))
    image = cv2.imread(image_path)
    
    if image is None:
        print(f"Image {image_name} could not be loaded.")
        continue

    image_width = image.shape[1]  # Get image width
    
    # Define the sliding window parameters
    window_width = 150  # Width of the sliding window in pixels
    step_size = 20     # Step size of the sliding window in pixels

    # Calculate the number of windows
    num_windows = (image_width - window_width) // step_size + 1

    # Create an empty list to hold the sequence of wavelet features for this word
    word_wavelet_sequence = []

    # Create sliding windows and compute wavelet features for each window
    for i in range(num_windows):
        # Calculate the pixel range covered by the current window
        window_start = i * step_size
        window_end = window_start + window_width
        
        # Extract the window from the image
        window = image[:, window_start:window_end]  # Assume height is all rows, width is the window
        
        # Compute Wavelet features for the window
        wavelet_features = compute_wavelet_features(window)
        word_wavelet_sequence.append(wavelet_features)
        
    # Store the wavelet feature sequence for the current word/image
    word_sequences[image_name] = np.vstack(word_wavelet_sequence)  # Stack the list of feature arrays to form a matrix



/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_039_line_1.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_039_line_2.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_039_line_3.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_039_line_4.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_039_line_5.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_039_line_6.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_039_line_7.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_039_line_8.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_039_line_9.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_038_line_1.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_038_line_2.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_038_line_3.jpg
/Users/harsha_ramisetti/Downloads/color_equlsize_jpg MaI849_038_line_4.jpg
/Users/harsha_ramisetti/D

In [9]:
pip install openpyxl

Defaulting to user installation because normal site-packages is not writeable
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[K     |████████████████████████████████| 250 kB 2.3 MB/s eta 0:00:01
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install pywavelets


In [29]:
# Initialize lists for sequences and their corresponding lengths
sequences = []
lengths = []
labels = []

# For each image, add its DCT feature sequence and the corresponding label (word)
for image_name, dct_sequence in word_sequences.items():
    # Get the corresponding label (word) from the DataFrame
    label = labels_df.loc[labels_df['image name'] == image_name, 'gt'].values[0]
    
    # Append the DCT sequence and its length
    sequences.append(dct_sequence)
    lengths.append(len(dct_sequence))
    labels.append(label)  # Store the label for each image

# Convert sequences to a single numpy array
X = np.concatenate(sequences)  # Flatten the list of sequences

# Convert lengths to numpy array (required by hmmlearn)
model_lengths = np.array(lengths)

# Print the prepared data
print("Sequences shape:", X.shape)
print("Lengths:", model_lengths)
print("Labels:", labels[:5])  # Display the first 5 labels

Sequences shape: (26364, 198)
Lengths: [338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338
 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338
 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338
 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338 338
 338 338 338 338 338 338]
Labels: [' അവരൊധാ കഴിച്ചു കൊള്ളുവാൻ തക്കവണ്ണം എച്ചും നീരും കൊടുത്തു ചെരമാന്\u200d പെരു', 'മാളെന്നരാജാവിനഅണ കലിസ്വർഗ്ഗ സന്ദേഹ പ്രാപ്യ പ ചെരമാന്\u200d പെരുമാളെന്ന രാജാ', 'വിൻറെ ഗുണാധിക്യംമുപ്പത്തിയാറുവർഷം കാലം പാണു ', 'ബ്രാഹ്മണരപരദേശത്ത ചെന്നതുമില്ലപെരുമാളെ കണ്ടതുമില്ല എന്നുകൽപ്പി', 'ച്ച കൃഷ്ണരായ രാ മലയാളം അടക്കുവാൻ പടക്കൂട്ടുകഎല്ലോ ചൈക്കാത്ത അതി']


In [15]:
from hmmlearn import hmm

# Initialize the HMM model
n_states = 71  # Number of hidden states in the HMM
model = hmm.GaussianHMM(n_components=n_states, covariance_type="diag", n_iter=1000)

# Train the model
model.fit(X, model_lengths)

print("HMM training complete.")

HMM training complete.


In [30]:
import pickle

# Path to save the model
model_save_path = '/Users/harsha_ramisetti/Downloads/ml hmm model/hmm_model.pkl'

# Save the model to a file
with open(model_save_path, 'wb') as file:
    pickle.dump(model, file)

print(f"HMM model saved to {model_save_path}")


HMM model saved to /Users/harsha_ramisetti/Downloads/ml hmm model/hmm_model.pkl


In [25]:
import numpy as np
from hmmlearn import hmm
import pickle

# Example Training Data
# Replace `X` with your feature data (2D NumPy array) and `model_lengths` with the sequence lengths
# For example:
# X = np.array([[0.1], [0.2], [0.3], [1.0], [1.1], [1.2]])
# model_lengths = [3, 3]  # Two sequences of 3 samples each
X = np.random.rand(100, 1)  # Replace with actual feature array
model_lengths = [10] * 10  # Replace with sequence lengths

# Initialize the HMM model
n_states = 71  # Number of hidden states
model = hmm.GaussianHMM(n_components=n_states, covariance_type="diag", n_iter=1000, random_state=42)

# Train the model
print("Training the HMM model...")
model.fit(X, model_lengths)
print("HMM training complete.")

# Save the model to a file
model_save_path = '/Users/harsha_ramisetti/Downloads/ml hmm model/hmm_model.pkl'  # Update the path as needed
with open(model_save_path, 'wb') as file:
    pickle.dump(model, file)

print(f"HMM model saved to {model_save_path}")


Fitting a model with 5182 free scalar parameters with only 100 data points will result in a degenerate solution.


Training the HMM model...
HMM training complete.
HMM model saved to /Users/harsha_ramisetti/Downloads/ml hmm model/hmm_model.pkl


In [15]:
pip install hmmlearn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
from joblib import dump

# Save the model to a file
dump(model, 'line_sequences_71_hmm_mode_11-11-2024_1000.joblib')

['line_sequences_71_hmm_mode_11-11-2024_1000.joblib']

In [13]:
import cv2
import numpy as np
import pywt

def compute_wavelet_features(window, wavelet='db1', level=2):
    
    # Convert to grayscale if not already
    if len(window.shape) == 3:  # Check if the window is RGB
        window = cv2.cvtColor(window, cv2.COLOR_BGR2GRAY)

    # Perform wavelet decomposition
    coeffs = pywt.wavedec2(window, wavelet=wavelet, level=level)
    
    # Flatten coefficients and return as features
    features = np.concatenate([c.flatten() for c in coeffs])
    return features

def predict_sequence(model, image_path, window_width=150, step_size=20, wavelet='db1', level=2):
    
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Image at {image_path} could not be loaded.")

    image_width = image.shape[1]
    num_windows = (image_width - window_width) // step_size + 1
    wavelet_sequence = []

    # Generate wavelet features for each window
    for i in range(num_windows):
        window_start = i * step_size
        window_end = window_start + window_width
        window = image[:, window_start:window_end]
        wavelet_features = compute_wavelet_features(window, wavelet=wavelet, level=level)
        wavelet_sequence.append(wavelet_features)

    # Convert to numpy array
    wavelet_sequence = np.array(wavelet_sequence)
    
    # Predict the sequence of states
    predicted_states = model.predict(wavelet_sequence)
    
    return predicted_states


In [32]:
import os
import numpy as np
import pandas as pd
from hmmlearn import hmm

# Load the HMM model (replace with your trained model)
# model = ...


# Read the Excel file containing character sequences
import pandas as pd
df = pd.read_excel('/Users/harsha_ramisetti/Downloads/ML new data set 3/cleaned_line_gt_8.xlsx') 

# Extract the 'Label' column containing character sequences
character_sequences = df['gt'].tolist()

# Set to hold unique characters
# Initialize a set to store unique characters
unique_characters = set()

# Loop through each sequence and add characters to the set
for sequence in character_sequences:
    # Check if the sequence is iterable (i.e., it's a string, list, etc., not a float)
    if isinstance(sequence, (str, list, tuple)):
        unique_characters.update(sequence)

# Now, 'unique_characters' contains all unique characters from the sequences
sorted_characters = sorted(unique_characters)


# Create the mapping of state index to character
state_to_char = {i: char for i, char in enumerate(sorted_characters)}

# Directory containing the images
image_folder = '/Users/harsha_ramisetti/Downloads/color_equlsize_jpg'

# Get all image files in the folder (adjust pattern as needed)
image_files = [f for f in os.listdir(image_folder) if f.endswith('.jpg')]
states = []
# Process each image in the folder
for image_file in image_files:
    # Get the full path of the image
    image_path = os.path.join(image_folder, image_file)
    
    # Predict the sequence of states for the image
    predicted_states = predict_sequence(model, image_path)
    predicted_states = np.array(predicted_states) 
    states.append([image_file, predicted_states])
    print(states)
# Convert data to a DataFrame
output_df = pd.DataFrame(states, columns=['Image Name', 'Predicted States'])

# Save the DataFrame to an Excel file
output_df.to_excel('/Users/harsha_ramisetti/Downloads/predicted_states_line_wavelet2.xlsx', index=False)

[['MaI12_Page102_line_5.jpg', array([ 4, 41,  4, ..., 14, 68, 38])]]
[['MaI12_Page102_line_5.jpg', array([ 4, 41,  4, ..., 14, 68, 38])], ['MaI849_041_line_1.jpg', array([ 4, 41,  4, ..., 55, 59, 14])]]
[['MaI12_Page102_line_5.jpg', array([ 4, 41,  4, ..., 14, 68, 38])], ['MaI849_041_line_1.jpg', array([ 4, 41,  4, ..., 55, 59, 14])], ['MaI12_Page102_line_4.jpg', array([ 4, 41,  4, ..., 13, 59, 14])]]
[['MaI12_Page102_line_5.jpg', array([ 4, 41,  4, ..., 14, 68, 38])], ['MaI849_041_line_1.jpg', array([ 4, 41,  4, ..., 55, 59, 14])], ['MaI12_Page102_line_4.jpg', array([ 4, 41,  4, ..., 13, 59, 14])], ['MaI849_039_line_9.jpg', array([ 4, 41,  4, ..., 13, 59, 14])]]
[['MaI12_Page102_line_5.jpg', array([ 4, 41,  4, ..., 14, 68, 38])], ['MaI849_041_line_1.jpg', array([ 4, 41,  4, ..., 55, 59, 14])], ['MaI12_Page102_line_4.jpg', array([ 4, 41,  4, ..., 13, 59, 14])], ['MaI849_039_line_9.jpg', array([ 4, 41,  4, ..., 13, 59, 14])], ['MaI849_041_line_3.jpg', array([ 4, 41,  4, ..., 60, 25, 62]

In [2]:
import os
import numpy as np
import pandas as pd
from hmmlearn import hmm
import pickle
import cv2  # For image processing

# Define a function to load the trained HMM model
def load_hmm_model(model_path):
    """
    Load a pre-trained HMM model from a file.
    """
    with open(model_path, 'rb') as file:
        model = pickle.load(file)
    return model

# Define a function to compute DCT features for an image
def compute_dct_features(image_path):
    """
    Compute DCT (Discrete Cosine Transform) features from the given image.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)  # Load image as grayscale
    if image is None:
        raise FileNotFoundError(f"Image not found: {image_path}")
    dct_features = cv2.dct(np.float32(image))  # Compute DCT
    return dct_features.flatten()

# Define a function to predict the state sequence for an image
def predict_sequence(model, image_path):
    """
    Predict a sequence of states for the given image using the HMM model.
    """
    features = compute_dct_features(image_path)
    features = features.reshape(-1, 1)  # Reshape for HMM input
    predicted_states = model.predict(features)
    return predicted_states

# Load the trained HMM model
model_path = '/Users/harsha_ramisetti/Downloads/ml hmm model/hmm_model.pkl'  # Update this with your model's path
model = load_hmm_model(model_path)
print("HMM model loaded successfully.")

# Load character sequences from the Excel file
excel_path = '/Users/harsha_ramisetti/Downloads/ML new data set 3/cleaned_line_gt_8.xlsx'
df = pd.read_excel(excel_path)
character_sequences = df['gt'].tolist()

# Initialize a set to store unique characters
unique_characters = set()
for sequence in character_sequences:
    if isinstance(sequence, (str, list, tuple)):
        unique_characters.update(sequence)

# Sorted list of unique characters and state-to-character mapping
sorted_characters = sorted(unique_characters)
state_to_char = {i: char for i, char in enumerate(sorted_characters)}

# Directory containing the images
image_folder = '/Users/harsha_ramisetti/Downloads/color_equlsize_jpg'

# Get all image files in the folder
image_files = [f for f in os.listdir(image_folder) if f.endswith('.jpg')]

# List to store results
states = []

# Process each image in the folder
for image_file in image_files:
    try:
        # Full path of the image
        image_path = os.path.join(image_folder, image_file)

        # Predict the sequence of states for the image
        predicted_states = predict_sequence(model, image_path)
        predicted_states = np.array(predicted_states)

        # Append the results
        states.append([image_file, predicted_states])
        print(f"Processed {image_file}: Predicted States - {predicted_states}")

    except Exception as e:
        print(f"Error processing {image_file}: {e}")
        states.append([image_file, None])  # Mark as failed

# Convert data to a DataFrame
output_df = pd.DataFrame(states, columns=['Image Name', 'Predicted States'])

# Save the DataFrame to an Excel file
output_excel_path = '/Users/harsha_ramisetti/Downloads/predicted_states_line_wavelet.xlsx'
output_df.to_excel(output_excel_path, index=False)
print(f"Predicted states saved to {output_excel_path}")


HMM model loaded successfully.
Processed MaI12_Page102_line_5.jpg: Predicted States - [ 4 41  4 ... 14 68 38]
Processed MaI849_041_line_1.jpg: Predicted States - [ 4 41  4 ... 55 59 14]
Processed MaI12_Page102_line_4.jpg: Predicted States - [ 4 41  4 ... 13 59 14]
Processed MaI849_039_line_9.jpg: Predicted States - [ 4 41  4 ... 13 59 14]
Processed MaI849_041_line_3.jpg: Predicted States - [ 4 41  4 ... 60 25 62]
Processed MaI849_041_line_2.jpg: Predicted States - [ 4 41  4 ... 25 62  9]
Processed MaI849_039_line_8.jpg: Predicted States - [ 4 41  4 ... 60 25 62]
Processed MaI12_Page102_line_3.jpg: Predicted States - [ 4 41  4 ... 60 25 62]
Processed MaI14_007_9.jpg: Predicted States - [ 4 41  4 ... 14 68 38]
Processed MaI849_041_line_6.jpg: Predicted States - [ 4 41  4 ... 43 60 25]
Processed MaI849_041_line_7.jpg: Predicted States - [ 4 41  4 ... 55 59 14]
Processed MaI14_007_8.jpg: Predicted States - [46 18 41 ... 13 50 40]
Processed MaI12_Page102_line_2.jpg: Predicted States - [ 4 4

In [12]:
import pandas as pd
import numpy as np
from scipy.stats import norm
import re  # For regular expression-based cleaning

# Load the Excel file
file_path = '/Users/harsha_ramisetti/Downloads/predicted_states_line_wavelet.xlsx'
data = pd.read_excel(file_path)

# Display the first few rows to understand the structure
print("Dataset Preview:")
print(data.head())

# Column containing the sequences of predicted states
column_name = 'Predicted States'

# Ensure the column exists
if column_name not in data.columns:
    raise ValueError(f"Column '{column_name}' not found in the dataset.")

# Clean and parse the `Predicted States` column
def parse_states(state_string):
    # Remove non-numeric characters and extra spaces, then split into integers
    state_string = re.sub(r'[^0-9\s]', '', state_string)  # Remove brackets and dots
    return list(map(int, state_string.split()))  # Convert to a list of integers

# Apply the cleaning function to the column
data['Parsed States'] = data[column_name].apply(parse_states)

# Flatten the sequences into a single list of numbers
flattened_values = [value for sublist in data['Parsed States'] for value in sublist]

# Convert to a numpy array
values = np.array(flattened_values)

# Define parameters of the probability distribution (Gaussian in this case)
mean = np.mean(values)  # You can use a fixed mean if desired
std_dev = np.std(values)  # You can use a fixed std_dev if desired

# Ensure standard deviation is positive
if std_dev <= 0:
    raise ValueError("Standard deviation must be positive.")

# Calculate log probabilities for each data point
log_probs = norm.logpdf(values, loc=mean, scale=std_dev)

# Calculate the total log probability of the dataset
total_log_prob = np.sum(log_probs)

# Output results
print("\nFlattened Values:")
print(values)

print("\nTotal Log Probability of the dataset:")
print(total_log_prob)

# Save the results to a new file
output_file_path = '/Users/harsha_ramisetti/Downloads/predicted_states_with_log_prob.xlsx'

# Add log probabilities back to the original dataframe for each sequence
data['Log Probability'] = data['Parsed States'].apply(
    lambda seq: np.sum(norm.logpdf(seq, loc=mean, scale=std_dev))
)

# Save to Excel
data.to_excel(output_file_path, index=False)
print(f"\nResults saved to {output_file_path}")


Dataset Preview:
                 Image Name         Predicted States
0  MaI12_Page102_line_5.jpg  [ 4 41  4 ... 14 68 38]
1     MaI849_041_line_1.jpg  [ 4 41  4 ... 55 59 14]
2  MaI12_Page102_line_4.jpg  [ 4 41  4 ... 13 59 14]
3     MaI849_039_line_9.jpg  [ 4 41  4 ... 13 59 14]
4     MaI849_041_line_3.jpg  [ 4 41  4 ... 60 25 62]

Flattened Values:
[ 4 41  4 14 68 38  4 41  4 55 59 14  4 41  4 13 59 14  4 41  4 13 59 14
  4 41  4 60 25 62  4 41  4 25 62  9  4 41  4 60 25 62  4 41  4 60 25 62
  4 41  4 14 68 38  4 41  4 43 60 25  4 41  4 55 59 14 46 18 41 13 50 40
  4 41  4 14 62  9 46 18 41 27 13 50 46 18 41 59 14 68  4 41  4 14 62  9
  4 41  4 14 62  9 46 18 41 27 13 50  4 41  4 14 62  9  4 41  4 59 14 68
  4 41  4 25 62  9  4 41  4 59 14 68  4 41  4 14 68 38 46 18 41 14 68 38
  4 41  4 13 59 14  4 41  4 25 62  9  4 41  4 59 14 68 46 18 41  9 55 59
  4 41  4 14 62  9  4 41  4 59 14 68  4 41  4 14 68 38  4 41  4 14 62  9
  4 41  4 59 14 68  4 41  4 25 62  9  4 41  4 59 14 68  4 41  

In [21]:
# Column containing the labeled data
label_column = 'gt'

# Check if the column exists
if label_column not in data.columns:
    raise ValueError(f"Column '{label_column}' not found in the dataset.")

# Combine all labeled text into a single string
all_text = ''.join(data[label_column].dropna().astype(str))  # Drop missing values and concatenate

# Find unique characters
unique_characters = sorted(set(all_text))

# Output results
print("Unique Characters:")
print(unique_characters)

# Optionally, save unique characters to a text file
output_file_path = 'unique_characters.txt'
with open(output_file_path, 'w') as file:
    for char in unique_characters:
        file.write(f"{char}\n")

print(f"\nUnique characters saved to {output_file_path}")


Unique Characters:
[' ', ':', '\xa0', 'ം', 'ഃ', 'അ', 'ആ', 'ഇ', 'ഉ', 'എ', 'ഏ', 'ഒ', 'ഓ', 'ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ', 'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ', 'ദ', 'ധ', 'ന', 'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'റ', 'ല', 'ള', 'ഴ', 'വ', 'ശ', 'ഷ', 'സ', 'ഹ', 'ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', '്', 'ൗ', 'ൻ', 'ർ', 'ൽ', 'ൾ', '\u200c', '\u200d']

Unique characters saved to unique_characters.txt


In [22]:
import pandas as pd

# Load the labeled data file (replace 'file_path' with your actual file path)
file_path = '/Users/harsha_ramisetti/Downloads/ML new data set 3/cleaned_line_gt_8.xlsx'  # Update this path
data = pd.read_excel(file_path)

# Column containing the labeled text
label_column = 'gt'  # Replace with the actual column name in your dataset

# Check if the column exists
if label_column not in data.columns:
    raise ValueError(f"Column '{label_column}' not found in the dataset.")

# Combine all labeled text into a single string
all_text = ''.join(data[label_column].dropna().astype(str))  # Drop missing values and concatenate

# Find unique characters
unique_characters = sorted(set(all_text))

# Map unique characters to integer values
char_to_int = {char: idx for idx, char in enumerate(unique_characters)}

# Display the mapping
print("Character to Integer Mapping:")
print(char_to_int)

# Save the mapping to a text file
mapping_output_file = 'char_to_int_mapping.txt'
with open(mapping_output_file, 'w') as file:
    for char, idx in char_to_int.items():
        file.write(f"'{char}': {idx}\n")

print(f"\nCharacter to integer mapping saved to {mapping_output_file}")


Character to Integer Mapping:
{' ': 0, ':': 1, '\xa0': 2, 'ം': 3, 'ഃ': 4, 'അ': 5, 'ആ': 6, 'ഇ': 7, 'ഉ': 8, 'എ': 9, 'ഏ': 10, 'ഒ': 11, 'ഓ': 12, 'ക': 13, 'ഖ': 14, 'ഗ': 15, 'ഘ': 16, 'ങ': 17, 'ച': 18, 'ഛ': 19, 'ജ': 20, 'ഝ': 21, 'ഞ': 22, 'ട': 23, 'ഠ': 24, 'ഡ': 25, 'ഢ': 26, 'ണ': 27, 'ത': 28, 'ഥ': 29, 'ദ': 30, 'ധ': 31, 'ന': 32, 'പ': 33, 'ഫ': 34, 'ബ': 35, 'ഭ': 36, 'മ': 37, 'യ': 38, 'ര': 39, 'റ': 40, 'ല': 41, 'ള': 42, 'ഴ': 43, 'വ': 44, 'ശ': 45, 'ഷ': 46, 'സ': 47, 'ഹ': 48, 'ാ': 49, 'ി': 50, 'ീ': 51, 'ു': 52, 'ൂ': 53, 'ൃ': 54, 'െ': 55, 'േ': 56, 'ൈ': 57, 'ൊ': 58, 'ോ': 59, '്': 60, 'ൗ': 61, 'ൻ': 62, 'ർ': 63, 'ൽ': 64, 'ൾ': 65, '\u200c': 66, '\u200d': 67}

Character to integer mapping saved to char_to_int_mapping.txt


In [37]:
import pandas as pd
import os

print(df.head())

# Load the Excel file
file_path = '/Users/harsha_ramisetti/Downloads/predicted_states_line_wavelet.xlsx'
data = pd.ExcelFile(file_path)

# Load the first sheet into a DataFrame
df = data.parse('Sheet1')

# Define the reverse mapping dictionary for Malayalam characters
int_to_char = {
    0: 'അ', 1: 'ആ', 2: 'ഇ', 4: 'ഉ', 13: 'ഋ', 14: 'എ', 25: 'ഒ', 38: 'ഔ', 
    41: 'ക', 55: 'ഗ', 59: 'ച', 60: 'ജ', 62: 'ട', 68: 'ഡ'
}

# Function to clean and inverse map the Predicted States column
def inverse_map(sequence_str):
    try:
        # Clean up the string and convert to a list of integers
        sequence = [int(x) for x in sequence_str.strip("[]").replace('...', '').split()]
        # Apply the reverse mapping
        return ''.join(int_to_char.get(num, '?') for num in sequence)
    except Exception as e:
        return f"Error: {e}"

# Apply the inverse mapping function to the Predicted States column
df['Decoded States'] = df['Predicted States'].apply(inverse_map)

# Save the processed DataFrame to a new Excel file
output_path = 'inversemapping_predicted_states_malayalam.xlsx'
df.to_excel(output_path, index=False)

# Notify the user about the save location
print(f"File saved successfully at: {os.path.abspath(output_path)}")


                 Image Name         Predicted States Decoded States
0  MaI12_Page102_line_5.jpg  [ 4 41  4 ... 14 68 38]         ഉകഉഎഡഔ
1     MaI849_041_line_1.jpg  [ 4 41  4 ... 55 59 14]         ഉകഉഗചഎ
2  MaI12_Page102_line_4.jpg  [ 4 41  4 ... 13 59 14]         ഉകഉഋചഎ
3     MaI849_039_line_9.jpg  [ 4 41  4 ... 13 59 14]         ഉകഉഋചഎ
4     MaI849_041_line_3.jpg  [ 4 41  4 ... 60 25 62]         ഉകഉജഒട
File saved successfully at: /Users/harsha_ramisetti/Downloads/inversemapping_predicted_states_malayalam.xlsx


In [48]:
import os
import cv2
import numpy as np
import pandas as pd
import pywt  # Library for wavelet transforms

# Function to compute wavelet features from an image window
# Function to compute wavelet features from an image window
def compute_wavelet_features(window, wavelet='db1', level=2):
    """
    Computes wavelet features for an image window.
    Args:
        window (ndarray): Image window (grayscale or RGB).
        wavelet (str): Wavelet type, e.g., 'db1' for Daubechies wavelets.
        level (int): Decomposition level for wavelet transform.
    Returns:
        ndarray: Flattened wavelet coefficients array.
    """
    # Convert to grayscale if the window is not already
    if len(window.shape) == 3:
        window = cv2.cvtColor(window, cv2.COLOR_BGR2GRAY)
    
    # Perform wavelet decomposition
    coeffs = pywt.wavedec2(window, wavelet=wavelet, level=level)
    
    # Flatten and concatenate all detail coefficients
    wavelet_features = np.hstack([comp.flatten() for coeff in coeffs[1:] for comp in coeff])
    
    # Optional: Limit the feature vector size
    max_features = 198  # Similar to DCT, we use the first 198 coefficients
    return wavelet_features[:max_features]

# Paths to your dataset
# Load data from the Excel file
labels_df = pd.read_excel(r'/Users/harsha_ramisetti/Downloads/ML new data set 3/cleaned_line_gt_8.xlsx')  # Adjust the path if necessary

# Parameters
window_width = 30  # Width of each sliding window in pixels
step_size =10    # Step size of the sliding window in pixels
image_folder = r'/Users/harsha_ramisetti/Downloads/color_equlsize_jpg'  # Adjust the path if necessary

# Dictionary to store DCT feature sequences for each character across all images
character_dct_sequences = {}

# Process each image (word) in the dataset
for index, row in labels_df.iterrows():
    image_name = row['image name']         # Adjust column name if necessary
    character_sequence = row['gt']      # Adjust column name if necessary
    
    # Load the corresponding image
    image_path = os.path.join(image_folder, image_name)
    image = cv2.imread(image_path)
    if image is None:
        print(f"Image {image_name} could not be loaded.")
        continue

    image_width = image.shape[1]  # Get image width
    
    # Calculate the width of each character region based on the sequence length
    num_characters = len(character_sequence)
    character_width = image_width // num_characters
    
    # Loop through each character in the sequence and collect its DCT features
    for i, char in enumerate(character_sequence):
        # Define the region corresponding to the current character
        region_start = i * character_width
        region_end = region_start + character_width
        character_region = image[:, region_start:region_end]  # Assume height is all rows
        
        # Split the character region into windows to capture DCT features
        num_windows = (character_width - window_width) // step_size + 1
        char_dct_sequence = []
        
        for j in range(num_windows):
            # Calculate the start and end of the window within the character region
            window_start = region_start + j * step_size
            window_end = window_start + window_width
            
            # Extract the window
            window = image[:, window_start:window_end]
            
            # Compute DCT features for this window
            dct_features = compute_wavelet_features(window)
            char_dct_sequence.append(dct_features)
        
        # Append this character's DCT features to the global dictionary
        if char not in character_dct_sequences:
            character_dct_sequences[char] = []
        character_dct_sequences[char].append(char_dct_sequence)

# Print the number of DCT feature sequences for each character
for char, sequences in character_dct_sequences.items():
    print(f"Character '{char}' has {len(sequences)} sequences of wavelet features.")

Character ' ' has 888 sequences of wavelet features.
Character 'അ' has 48 sequences of wavelet features.
Character 'വ' has 185 sequences of wavelet features.
Character 'ര' has 328 sequences of wavelet features.
Character 'ൊ' has 84 sequences of wavelet features.
Character 'ധ' has 39 sequences of wavelet features.
Character 'ാ' has 401 sequences of wavelet features.
Character 'ക' has 248 sequences of wavelet features.
Character 'ഴ' has 18 sequences of wavelet features.
Character 'ി' has 348 sequences of wavelet features.
Character 'ച' has 105 sequences of wavelet features.
Character '്' has 715 sequences of wavelet features.
Character 'ു' has 253 sequences of wavelet features.
Character 'ള' has 45 sequences of wavelet features.
Character 'ൻ' has 23 sequences of wavelet features.
Character 'ത' has 318 sequences of wavelet features.
Character 'ണ' has 87 sequences of wavelet features.
Character 'ം' has 100 sequences of wavelet features.
Character 'എ' has 17 sequences of wavelet features.
C

In [55]:
import numpy as np
from hmmlearn import hmm

# Dictionary to store character HMMs
character_hmms = {}
num_states=4
# Example: 'character_dct_sequences' contains the DCT feature sequences for each character
for char, sequences in character_dct_sequences.items():
    # Remove empty sequences
    sequences = [seq for seq in sequences if len(seq) > 0]
    
    if len(sequences) == 0:
        print(f"Warning: No valid sequences for character {char}. Skipping this character.")
        continue  # Skip to the next character if no valid sequences
    if len(sequences) >=15:
    # Prepare training data for the HMM
        X = np.vstack(sequences)  # Stack the sequences into a single array
        lengths = [len(seq) for seq in sequences]  # Length of each sequence
    
        # Initialize HMM for this character
        model = hmm.GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000)
    
        # Train the HMM with the character’s DCT feature sequences
        model.fit(X, lengths)
    
        # Store the trained model
        character_hmms[char] = model
        print(f"Model trained for character: {char}")

Model trained for character:  
Model trained for character: അ
Model trained for character: വ
Model trained for character: ര
Model trained for character: ൊ
Model trained for character: ധ
Model trained for character: ാ
Model trained for character: ക
Model trained for character: ഴ
Model trained for character: ി
Model trained for character: ച
Model trained for character: ്
Model trained for character: ു
Model trained for character: ള
Model trained for character: ൻ
Model trained for character: ത
Model trained for character: ണ
Model trained for character: ം
Model trained for character: എ
Model trained for character: ന
Model trained for character: ീ
Model trained for character: ട
Model trained for character: െ
Model trained for character: മ
Model trained for character: ‍
Model trained for character: പ
Model trained for character: ജ
Model trained for character: ല
Model trained for character: സ
Model trained for character: ർ
Model trained for character: ഗ
Model trained for character: ദ
Model tr

In [64]:
import numpy as np
from hmmlearn import hmm
import pickle

# Dictionary to store character HMMs
character_hmms = {}
num_states = 4

# Example: 'character_dct_sequences' contains the DCT feature sequences for each character
for char, sequences in character_dct_sequences.items():
    # Remove empty sequences
    sequences = [seq for seq in sequences if len(seq) > 0]
    
    if len(sequences) == 0:
        print(f"Warning: No valid sequences for character {char}. Skipping this character.")
        continue  # Skip to the next character if no valid sequences
    
    if len(sequences) >= 15:
        # Prepare training data for the HMM
        X = np.vstack(sequences)  # Stack the sequences into a single array
        lengths = [len(seq) for seq in sequences]  # Length of each sequence
        
        # Initialize HMM for this character
        model = hmm.GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000)
        
        # Train the HMM with the character’s DCT feature sequences
        model.fit(X, lengths)
        
        # Store the trained model
        character_hmms[char] = model
        print(f"Model trained for character: {char}")

# Save all trained models to a single pickle file
output_file = "/Users/harsha_ramisetti/Downloads/ML charcters hmm model/hmm charcter model.pkl"
with open(output_file, "wb") as f:
    pickle.dump(character_hmms, f)

print(f"All character models have been saved to {output_file}.")


Model trained for character:  
Model trained for character: അ
Model trained for character: വ
Model trained for character: ര
Model trained for character: ൊ
Model trained for character: ധ
Model trained for character: ാ
Model trained for character: ക
Model trained for character: ഴ
Model trained for character: ി
Model trained for character: ച
Model trained for character: ്
Model trained for character: ു
Model trained for character: ള
Model trained for character: ൻ
Model trained for character: ത
Model trained for character: ണ
Model trained for character: ം
Model trained for character: എ
Model trained for character: ന
Model trained for character: ീ
Model trained for character: ട
Model trained for character: െ
Model trained for character: മ
Model trained for character: ‍
Model trained for character: പ
Model trained for character: ജ


Model is not converging.  Current: -135938.00770208036 is not greater than -135938.00770081117. Delta is -1.2691889423877e-06


Model trained for character: ല
Model trained for character: സ
Model trained for character: ർ
Model trained for character: ഗ
Model trained for character: ദ
Model trained for character: ഹ
Model trained for character: യ
Model trained for character: റ
Model trained for character: ഷ
Model trained for character: ബ


Some rows of transmat_ have zero sum because no transition from the state was ever observed.


Model trained for character: ശ
Model trained for character: ൽ
Model trained for character: ൃ
Model trained for character: ൂ
Model trained for character: ൈ
Model trained for character: ഭ
Model trained for character: ആ
Model trained for character: ഞ


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the st

Model trained for character: ങ
Model trained for character: ഥ
Model trained for character:  
Model trained for character: ഃ
All character models have been saved to /Users/harsha_ramisetti/Downloads/ML charcters hmm model/hmm charcter model.pkl.


In [50]:
# Extract the character names (keys) from the character_hmms dictionary
char_names = list(character_hmms.keys())

# Print the character names
print("Character names:", char_names)


Character names: [' ', 'അ', 'വ', 'ര', 'ൊ', 'ധ', 'ാ', 'ക', 'ഴ', 'ി', 'ച', '്', 'ു', 'ള', 'ൻ', 'ത', 'ണ', 'ം', 'എ', 'ന', 'ീ', 'ട', 'െ', 'മ', '\u200d', 'പ', 'ജ', 'ല', 'സ', 'ർ', 'ഗ', 'ദ', 'ഹ', 'യ', 'റ', 'ഷ', 'ബ', 'ശ', 'ൽ', 'ൃ', 'ൂ', 'ൈ', 'ഭ', 'ആ', 'ഞ', 'ങ', 'ഥ', '\xa0', 'ഃ']


In [51]:
ground_truth =char_names

In [52]:
unique_chars = sorted(set("".join(ground_truth)))
char_to_state = {char: idx for idx, char in enumerate(unique_chars)}
state_to_char = {idx: char for char, idx in char_to_state.items()}

In [73]:
import joblib
from hmmlearn import hmm
import cv2
import numpy as np
import pandas as pd

# Load character HMM models
character_hmms = {"'/Users/harsha_ramisetti/Downloads/ML charcters hmm model"}
#for char in char_to_state.keys():
   # sanitized_char = sanitize_filename(char)
   # try:
       # model = joblib.load(f"{sanitized_char}_hmm.pkl")
       # character_hmms[char] = model
   # except FileNotFoundError:
        #print(f"Model for character '{char}' not found.")
        #continue

# Function to compute the actual sequence based on window mapping
# Function to extract the actual sequence for a specific image from the DataFrame
def get_actual_sequence_from_df(image_name, line_image, label_df, window_width=30, step_size=10):
    # Find the corresponding row in the DataFrame
    row = label_df[label_df['image name'] == image_name]
    
    if row.empty:
        raise ValueError(f"Image name '{image_name}' not found in the labels DataFrame.")
    
    # Extract the ground truth character sequence
    character_sequence = row['gt'].values[0]  # Adjust column name if necessary
    
    # Compute the actual sequence based on ground truth and image dimensions
    actual_sequence = []
    image_width = line_image.shape[1]
    num_characters = len(character_sequence)
    character_width = image_width // num_characters

    for i, char in enumerate(character_sequence):
        # Define the region corresponding to this character
        region_start = i * character_width
        region_end = region_start + character_width
        character_region = line_image[:, region_start:region_end]

        # Divide the character region into windows
        num_windows = (character_width - window_width) // step_size + 1
        for _ in range(num_windows):
            actual_sequence.append(char)  # Map each window to the current character
    
    return actual_sequence
# Function to predict the sequence based on HMM models
def predict_line_sequence(line_image, window_width=30, step_size=10):
    predictions = []
    image_width = line_image.shape[1]
    num_windows = (image_width - window_width) // step_size + 1

    for i in range(num_windows):
        window_start = i * step_size
        window_end = window_start + window_width
        window = line_image[:, window_start:window_end]

        # Compute DCT features for the window
        dct_features = compute_wavelet_features(window)
        #gabor_features = compute_gabor_features(window)

            # Combine features (concatenate DCT and Gabor features)
        #combined_features = np.concatenate((dct_features, gabor_features))

        combined_features = dct_features.reshape(1, -1)  # Reshape for HMM input
        

        # Calculate likelihoods for each character model
        best_char = None
        best_score = float('-inf')

        for char, model in character_hmms.items():
            try:
                score = model.score(combined_features)
                if score > best_score:
                    best_score = score
                    best_char = char
            except:
                pass  # Ignore errors for invalid model scoring

        if best_char is not None:
            predictions.append(best_char)

    return ''.join(predictions)

# Test image details
# Load the labels DataFrame
label_file = r"/Users/harsha_ramisetti/Downloads/ML new data set 3/cleaned_line_gt_8.xlsx"
label_df = pd.read_excel(label_file)

# Test image details
test_image_path = "MaI14_007_6.jpg"
test_image_name = test_image_path.split("\\")[-1]  # Extract the image name

# Load the test image
test_image = cv2.imread(test_image_path, cv2.IMREAD_GRAYSCALE)

# Extract the actual sequence from the DataFrame
actual_sequence = get_actual_sequence_from_df(test_image_name, test_image, label_df)

# Predict the sequence using the HMM models
predicted_sequence = predict_line_sequence(test_image)

# Print the sequences for comparison
print(f"Actual sequence: {''.join(actual_sequence)}")
print(f"Predicted sequence: {predicted_sequence}")

[ WARN:0@63526.963] global loadsave.cpp:241 findDecoder imread_('MaI14_007_6.jpg'): can't open/read file: check file path/integrity


AttributeError: 'NoneType' object has no attribute 'shape'

In [95]:
import os
import cv2
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu

# Function to calculate BLEU score
def calculate_bleu_score(actual_sequence, predicted_sequence):
    reference = [list(actual_sequence)]  # Wrap in another list for multiple references
    hypothesis = list(predicted_sequence)
    bleu_score = sentence_bleu(reference, hypothesis)
    return bleu_score

# Function to calculate accuracy
def calculate_accuracy(actual_sequence, predicted_sequence):
    # Ensure the sequences are the same length for comparison
    min_length = min(len(actual_sequence), len(predicted_sequence))
    correct_predictions = 0
    
    for i in range(min_length):
        if actual_sequence[i] == predicted_sequence[i]:
            correct_predictions += 1

    # Calculate accuracy as a percentage
    accuracy = (correct_predictions / min_length) * 100
    return accuracy

# Folder paths and files
image_folder = r"/Users/harsha_ramisetti/Downloads/color_equlsize_jpg"
label_file = r"/Users/harsha_ramisetti/Downloads/ML new data set 3/cleaned_line_gt_8.xlsx"
output_file = r"/Users/harsha_ramisetti/Downloads/average_accuracy.xlsx"

# Load the labels DataFrame
label_df = pd.read_excel(label_file)

# List all image files in the folder
image_files = [f for f in os.listdir(image_folder) if f.endswith(('.jpg', '.png'))]

# Initialize a results list and variable to track total accuracy
results = []
total_accuracy = 0
num_images = 0

# Process each image
for image_name in image_files:
    image_path = os.path.join(image_folder, image_name)
    
    # Load the image
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Error loading image: {image_name}")
        continue

    # Extract the actual sequence from the DataFrame
    actual_sequence = get_actual_sequence_from_df(image_name, image, label_df)
    if actual_sequence is None:
        print(f"No label found for image: {image_name}")
        continue

    # Predict the sequence using the HMM models
    predicted_sequence = predict_line_sequence(image)
    
    # Calculate BLEU score
    #bleu_score = calculate_bleu_score(actual_sequence, predicted_sequence)
    
    # Calculate accuracy
    accuracy = calculate_accuracy(actual_sequence, predicted_sequence)
    print(image_name,accuracy)
    # Add the accuracy to the total
    total_accuracy += accuracy
    num_images += 1
    
    # Append results
    results.append({
        "Image Name": image_name,
        "Actual Sequence": ''.join(actual_sequence),
        "Predicted Sequence": predicted_sequence,
        #"BLEU Score": bleu_score,
        "Accuracy": accuracy
    })

# Calculate the average accuracy
average_accuracy = total_accuracy / num_images if num_images > 0 else 0

# Save the results to an Excel sheet
results_df = pd.DataFrame(results)
results_df.to_excel(output_file, index=False)

# Print average accuracy
print(f"Average Accuracy: {average_accuracy:.2f}%")
print(f"Results saved to {output_file}")


AttributeError: 'set' object has no attribute 'items'

In [167]:
import pickle
import os

# Path where the pickle file will be saved
file_path = 'trained_character_hmms.pkl'

# Save the trained character HMMs to a .pkl file
try:
    with open(file_path, 'wb') as f:
        pickle.dump(character_hmms, f)
    print(f"Character HMMs successfully saved to {os.path.abspath(file_path)}")
except Exception as e:
    print(f"Error saving HMM models: {e}")


Character HMMs successfully saved to /Users/harsha_ramisetti/Downloads/trained_character_hmms.pkl


In [9]:
import pickle

# Path to the uploaded file
file_path = "/Users/harsha_ramisetti/Downloads/trained_character_hmms.pkl"

# Load the file
try:
    with open(file_path, "rb") as file:
        data = pickle.load(file)
        print("File loaded successfully!")
        print(f"Type of data: {type(data)}")
        if isinstance(data, dict):
            print(f"Keys in the file: {list(data.keys())}")
            print(f"Number of entries: {len(data)}")
        else:
            print("The file does not contain a dictionary structure.")
except Exception as e:
    print(f"Error loading file: {e}")

trained_characters = list(data.keys()) if isinstance(data, dict) else []
missing_characters = [c for c in malayalam_characters if c not in trained_characters]
if missing_characters:
    print("The following characters are missing or untrained:", missing_characters)
else:
    print("All Malayalam characters are trained!")


File loaded successfully!
Type of data: <class 'dict'>
Keys in the file: [' ', 'അ', 'വ', 'ര', 'ൊ', 'ധ', 'ാ', 'ക', 'ഴ', 'ി', 'ച', '്', 'ു', 'ള', 'ൻ', 'ത', 'ണ', 'ം', 'എ', 'ന', 'ീ', 'ട', 'െ', 'മ', '\u200d', 'പ', 'ജ', 'ല', 'സ', 'ർ', 'ഗ', 'ദ', 'ഹ', 'യ', 'റ', 'ഷ', 'ബ', 'ശ', 'ൽ', 'ൃ', 'ൂ', 'ൈ', 'ഭ', 'ആ', 'ഞ', 'ങ', 'ഥ', '\xa0', 'ഃ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ', 'ഖ', 'ഘ', 'ഛ', 'ഝ', 'ഠ', 'ഡ', 'ഢ', 'ഫ']
Number of entries: 67
All Malayalam characters are trained!


In [169]:
ground_truth =char_names

In [170]:
unique_chars = sorted(set("".join(ground_truth)))
char_to_state = {char: idx for idx, char in enumerate(unique_chars)}
state_to_char = {idx: char for char, idx in char_to_state.items()}

In [25]:
from sklearn.decomposition import PCA
import numpy as np
import joblib
import cv2
import os
import pandas as pd

# Load character HMM models
character_hmms_path = r"/Users/harsha_ramisetti/Downloads/trained_character_hmms.pkl"
character_hmms = joblib.load(character_hmms_path)
print("Character HMM models loaded successfully.")

# Ensure transition matrices are valid
for char, model in character_hmms.items():
    if not np.allclose(model.transmat_.sum(axis=1), 1.0):
        print(f"Normalizing transmat_ for character '{char}'")
        model.transmat_ = model.transmat_ / model.transmat_.sum(axis=1, keepdims=True)

# Load labels DataFrame
label_file = r"/Users/harsha_ramisetti/Downloads/ML new data set 3/cleaned_line_gt_8.xlsx"
label_df = pd.read_excel(label_file)

# Load test image
test_image_path = r"/Users/harsha_ramisetti/Downloads/color_equlsize_jpg/MaI14_007_3.jpg"
test_image = cv2.imread(test_image_path, cv2.IMREAD_GRAYSCALE)

if test_image is None:
    raise ValueError(f"Failed to load image: {test_image_path}")

# Initialize PCA
pca = PCA(n_components=198)  # Adjust to match the HMM input dimensionality

# Function to extract DCT features
def compute_dct_features(window):
    """Compute normalized DCT features for a given window."""
    dct_features = np.abs(np.fft.fft2(window)).flatten()
    return dct_features / np.linalg.norm(dct_features)  # Normalize features

# Function to train PCA on multiple windows
def train_pca_on_image(image, window_width=30, step_size=10):
    """Train PCA using sliding windows across the image."""
    windows = []
    image_width = image.shape[1]
    num_windows = (image_width - window_width) // step_size + 1

    for i in range(num_windows):
        window_start = i * step_size
        window_end = window_start + window_width
        window = image[:, window_start:window_end]
        dct_features = compute_dct_features(window)
        windows.append(dct_features)
    
    windows = np.array(windows)
    print(f"Training PCA on windows with shape: {windows.shape}")
    pca.fit(windows)
    print(f"PCA trained on {windows.shape[0]} windows with {windows.shape[1]} features each.")

# Train PCA on the test image
train_pca_on_image(test_image)

# Function to predict sequence using HMMs
def predict_line_sequence(line_image, window_width=30, step_size=10):
    """Predict the sequence of characters in a line image using HMM models."""
    predictions = []
    image_width = line_image.shape[1]
    num_windows = (image_width - window_width) // step_size + 1

    for i in range(num_windows):
        window_start = i * step_size
        window_end = window_start + window_width
        window = line_image[:, window_start:window_end]

        # Extract and transform features
        dct_features = compute_dct_features(window)
        try:
            reduced_features = pca.transform(dct_features.reshape(1, -1))
        except Exception as e:
            print(f"Feature extraction error at window {i}: {e}")
            continue

        best_char = None
        best_score = float('-inf')

        for char, model in character_hmms.items():
            try:
                # Score the model with reduced features
                score = model.score(reduced_features)
                if score > best_score:
                    best_score = score
                    best_char = char
            except Exception as e:
                print(f"Error scoring model for character '{char}' at window {i}: {e}")
                continue

        if best_char is not None:
            predictions.append(best_char)

    return ''.join(predictions)

# Predict the sequence
predicted_sequence = predict_line_sequence(test_image)

# Extract the actual sequence from the labels
test_image_name = os.path.basename(test_image_path)
actual_sequence_row = label_df[label_df['image name'] == test_image_name]

if actual_sequence_row.empty:
    raise ValueError(f"No ground truth found for image: {test_image_name}")

actual_sequence = actual_sequence_row['gt'].values[0]

# Print sequences for comparison
print(f"Actual sequence: {actual_sequence}")
print(f"Predicted sequence: {predicted_sequence}")


Character HMM models loaded successfully.
Normalizing transmat_ for character 'ആ'
Training PCA on windows with shape: (688, 4800)


  model.transmat_ = model.transmat_ / model.transmat_.sum(axis=1, keepdims=True)


PCA trained on 688 windows with 4800 features each.
Error scoring model for character 'ആ' at window 0: transmat_ rows must sum to 1 (got row sums of [ 1.  1. nan  1.])
Error scoring model for character 'ഇ' at window 0: operands could not be broadcast together with shapes (1,198) (2,) 
Error scoring model for character 'ഈ' at window 0: operands could not be broadcast together with shapes (1,198) (2,) 
Error scoring model for character 'ഉ' at window 0: operands could not be broadcast together with shapes (1,198) (2,) 
Error scoring model for character 'ഊ' at window 0: operands could not be broadcast together with shapes (1,198) (2,) 
Error scoring model for character 'ഋ' at window 0: operands could not be broadcast together with shapes (1,198) (2,) 
Error scoring model for character 'ഏ' at window 0: operands could not be broadcast together with shapes (1,198) (2,) 
Error scoring model for character 'ഐ' at window 0: operands could not be broadcast together with shapes (1,198) (2,) 
Error 

In [46]:
import joblib
from hmmlearn import hmm
import cv2
import numpy as np
import pandas as pd
import os

# Load character HMM models
character_hmms = {}
# Load your HMM models (Assuming these models are already trained and saved as .pkl files)
# You should specify the correct path to your models
# for char in char_to_state.keys():
#     sanitized_char = sanitize_filename(char)
#     try:
#         model = joblib.load(f"{sanitized_char}_hmm.pkl")
#         character_hmms[char] = model
#     except FileNotFoundError:
#         print(f"Model for character '{char}' not found.")
#         continue

# Function to compute the actual sequence based on window mapping
def get_actual_sequence_from_df(image_name, line_image, label_df, window_width=30, step_size=10):
    if line_image is None:
        raise ValueError(f"Failed to load image '{image_name}'.")
    
    # Find the corresponding row in the DataFrame
    row = label_df[label_df['image name'] == image_name]
    
    if row.empty:
        raise ValueError(f"Image name '{image_name}' not found in the labels DataFrame.")
    
    # Extract the ground truth character sequence
    character_sequence = row['gt'].values[0]  # Adjust column name if necessary
    
    # Compute the actual sequence based on ground truth and image dimensions
    actual_sequence = []
    image_width = line_image.shape[1]
    num_characters = len(character_sequence)
    character_width = image_width // num_characters

    for i, char in enumerate(character_sequence):
        # Define the region corresponding to this character
        region_start = i * character_width
        region_end = region_start + character_width
        character_region = line_image[:, region_start:region_end]

        # Divide the character region into windows
        num_windows = (character_width - window_width) // step_size + 1
        for _ in range(num_windows):
            actual_sequence.append(char)  # Map each window to the current character
    
    return actual_sequence

# Function to predict the sequence based on HMM models
def predict_line_sequence(line_image, window_width=30, step_size=10):
    predictions = []
    image_width = line_image.shape[1]
    num_windows = (image_width - window_width) // step_size + 1

    for i in range(num_windows):
        window_start = i * step_size
        window_end = window_start + window_width
        window = line_image[:, window_start:window_end]

        # Compute DCT features for the window
        dct_features = compute_wavelet_features(window)
        # gabor_features = compute_gabor_features(window)  # Optionally add Gabor features

        combined_features = dct_features.reshape(1, -1)  # Reshape for HMM input

        # Calculate likelihoods for each character model
        best_char = None
        best_score = float('-inf')

        for char, model in character_hmms.items():
            try:
                score = model.score(combined_features)
                if score > best_score:
                    best_score = score
                    best_char = char
            except Exception as e:
                print(f"Error scoring model for character '{char}': {e}")
                pass  # Ignore errors for invalid model scoring

        if best_char is not None:
            predictions.append(best_char)

    return ''.join(predictions)

# Function to extract features (dummy function for DCT, replace with your actual feature extraction)
def compute_wavelet_features(window):
    # Example feature extraction function, replace with your actual method
    # For demonstration, we'll just return a dummy feature vector
    return np.fft.fft2(window).flatten()  # Placeholder for actual DCT or wavelet features

# Check if the file exists
def load_image(test_image_path):
    if not os.path.exists(test_image_path):
        raise FileNotFoundError(f"Error: The file '{test_image_path}' does not exist.")
    test_image = cv2.imread(test_image_path, cv2.IMREAD_GRAYSCALE)
    if test_image is None:
        raise ValueError(f"Failed to load image: {test_image_path}")
    return test_image

# Load the labels DataFrame
label_file = r"/Users/harsha_ramisetti/Downloads/ML new data set 3/cleaned_line_gt_8.xlsx"
label_df = pd.read_excel(label_file)

# Test image details
test_image_path = r"/Users/harsha_ramisetti/Downloads/color_equlsize_jpg/MaI12_Page100_line_1.jpg"

# Load the test image
test_image = load_image(test_image_path)

# Extract the actual sequence from the DataFrame
test_image_name = os.path.basename(test_image_path)
actual_sequence = get_actual_sequence_from_df(test_image_name, test_image, label_df)

# Predict the sequence using the HMM models
predicted_sequence = predict_line_sequence(test_image)

# Print the sequences for comparison
print(f"Actual sequence: {''.join(actual_sequence)}")
print(f"Predicted sequence: {predicted_sequence}")


Actual sequence: ദദദദദദദദദദദതതതതതതതതതതത്്്്്്്്്്്തതതതതതതതതതതെെെെെെെെെെെ           പപപപപപപപപപപെെെെെെെെെെെരരരരരരരരരരരളളളളളളളളളളള           ഷഷഷഷഷഷഷഷഷഷഷൊൊൊൊൊൊൊൊൊൊൊ           തതതതതതതതതതത്്്്്്്്്്്തതതതതതതതതതതിിിിിിിിിിി           കകകകകകകകകകകെെെെെെെെെെെൽൽൽൽൽൽൽൽൽൽൽ           ളളളളളളളളളളളിിിിിിിിിിി           ങങങങങങങങങങങ്്്്്്്്്്്കകകകകകകകകകകെെെെെെെെെെെ           നനനനനനനനനനനെെെെെെെെെെെ           കകകകകകകകകകകെെെെെെെെെെെടടടടടടടടടടട്്്്്്്്്്്ടടടടടടടടടടടിിിിിിിിിിി           ടടടടടടടടടടട്്്്്്്്്്്ടടടടടടടടടടടുുുുുുുുുുുംംംംംംംംംംം           ഹഹഹഹഹഹഹഹഹഹഹരരരരരരരരരരരിിിിിിിിിിി           
Predicted sequence: 


In [102]:
pip install nltk


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [79]:
import nltk
print(nltk.__version__)


3.9.1


In [36]:
import os
import cv2
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu

# Function to calculate BLEU score
def calculate_bleu_score(actual_sequence, predicted_sequence):
    reference = [list(actual_sequence)]  # Wrap in another list for multiple references
    hypothesis = list(predicted_sequence)
    bleu_score = sentence_bleu(reference, hypothesis)
    return bleu_score

# Function to calculate accuracy
def calculate_accuracy(actual_sequence, predicted_sequence):
    # Ensure the sequences are the same length for comparison
    min_length = min(len(actual_sequence), len(predicted_sequence))
    correct_predictions = 0
    
    for i in range(min_length):
        if actual_sequence[i] == predicted_sequence[i]:
            correct_predictions += 1

    # Calculate accuracy as a percentage
    accuracy = (correct_predictions / min_length) * 100
    return accuracy

# Function to get actual sequence from DataFrame
def get_actual_sequence_from_df(image_name, label_df):
    # Match the image name with the ground truth
    row = label_df[label_df['image name'] == image_name]
    if not row.empty:
        return row.iloc[0]['gt']
    return None

# Function to predict sequence (replace with actual model logic)
def predict_line_sequence(image):
    # Placeholder: Replace with your prediction logic (e.g., OCR or HMM model)
    predicted_sequence = "PREDICTED_SEQUENCE"  # Dummy sequence for now
    return predicted_sequence

# Folder paths and files
image_folder = '/Users/harsha_ramisetti/Downloads/color_equlsize_jpg'
label_file = '/Users/harsha_ramisetti/Downloads/ML new data set 3/cleaned_line_gt_8.xlsx'
output_file = '/Users/harsha_ramisetti/Downloads/average_accuracy.xlsx'

# Load the labels DataFrame
label_df = pd.read_excel(label_file)

# List all image files in the folder
image_files = [f for f in os.listdir(image_folder) if f.endswith(('.jpg', '.png'))]

# Initialize a results list and variable to track total accuracy
results = []
total_accuracy = 1
num_images = 0

# Process each image
for image_name in image_files:
    image_path = os.path.join(image_folder, image_name)
    
    # Load the image
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Error loading image: {image_name}")
        continue

    # Extract the actual sequence from the DataFrame
    actual_sequence = get_actual_sequence_from_df(image_name, label_df)
    if actual_sequence is None:
        print(f"No label found for image: {image_name}")
        continue

    # Predict the sequence using the HMM models
    predicted_sequence = predict_line_sequence(image)
    
    # Calculate BLEU score
    bleu_score = calculate_bleu_score(actual_sequence, predicted_sequence)
    
    # Calculate accuracy
    accuracy = calculate_accuracy(actual_sequence, predicted_sequence)
    print(f"{image_name}: Accuracy = {accuracy:.2f}%")

    # Add the accuracy to the total
    total_accuracy += accuracy
    num_images += 1
    
    # Append results
    results.append({
        "Image Name": image_name,
        "Actual Sequence": actual_sequence,
        "Predicted Sequence": predicted_sequence,
        "BLEU Score": bleu_score,
        "Accuracy": accuracy
    })

# Calculate the average accuracy
average_accuracy = total_accuracy / num_images if num_images > 0 else 0

# Save the results to an Excel sheet
results_df = pd.DataFrame(results)
results_df.to_excel(output_file, index=False)

# Print average accuracy
print(f"Average Accuracy: {average_accuracy:.2f}%")
print(f"Results saved to {output_file}")


MaI12_Page102_line_5.jpg: Accuracy = 0.00%
MaI849_041_line_1.jpg: Accuracy = 0.00%
MaI12_Page102_line_4.jpg: Accuracy = 0.00%
MaI849_039_line_9.jpg: Accuracy = 0.00%
MaI849_041_line_3.jpg: Accuracy = 0.00%
MaI849_041_line_2.jpg: Accuracy = 0.00%
MaI849_039_line_8.jpg: Accuracy = 0.00%
MaI12_Page102_line_3.jpg: Accuracy = 0.00%
MaI14_007_9.jpg: Accuracy = 0.00%
MaI849_041_line_6.jpg: Accuracy = 0.00%
MaI849_041_line_7.jpg: Accuracy = 0.00%
MaI14_007_8.jpg: Accuracy = 0.00%
MaI12_Page102_line_2.jpg: Accuracy = 0.00%
MaI849_041_line_5.jpg: Accuracy = 0.00%
MaI849_041_line_4.jpg: Accuracy = 0.00%
MaI12_Page102_line_1.jpg: Accuracy = 0.00%
MaI849_038_line_5.jpg: Accuracy = 0.00%
MaI14_051_04.jpg: Accuracy = 0.00%
MaI12_Page100_line_4.jpg: Accuracy = 0.00%
MaI12_Page100_line_5.jpg: Accuracy = 0.00%
MaI14_051_05.jpg: Accuracy = 0.00%
MaI849_038_line_4.jpg: Accuracy = 0.00%
MaI849_038_line_6.jpg: Accuracy = 0.00%
MaI14_051_07.jpg: Accuracy = 0.00%
mal286_010_9.jpg: Accuracy = 0.00%
mal286_010_

In [37]:
import os
import cv2
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from difflib import SequenceMatcher  # For alignment accuracy

# Function to calculate BLEU score
def calculate_bleu_score(actual_sequence, predicted_sequence):
    reference = [list(actual_sequence)]  # Wrap in another list for multiple references
    hypothesis = list(predicted_sequence)
    bleu_score = sentence_bleu(reference, hypothesis)
    return bleu_score

# Function to calculate accuracy using alignment
def calculate_accuracy(actual_sequence, predicted_sequence):
    matcher = SequenceMatcher(None, actual_sequence, predicted_sequence)
    match_ratio = matcher.ratio()  # Ratio of matched characters
    accuracy = match_ratio * 100  # Convert to percentage
    return accuracy

# Function to get actual sequence from DataFrame
def get_actual_sequence_from_df(image_name, label_df):
    # Normalize the image name and labels for comparison
    image_name = image_name.strip().lower()
    row = label_df[label_df['image name'].str.strip().str.lower() == image_name]
    if not row.empty:
        return row.iloc[0]['gt']
    return None

# Function to predict sequence (replace this with actual prediction logic)
def predict_line_sequence(image):
    # Replace with your OCR or HMM-based logic
    predicted_sequence = "PREDICTED_SEQUENCE"  # Dummy prediction
    return predicted_sequence

# Folder paths and files
image_folder = '/Users/harsha_ramisetti/Downloads/color_equlsize_jpg'
label_file = '/Users/harsha_ramisetti/Downloads/ML new data set 3/cleaned_line_gt_8.xlsx'
output_file = '/Users/harsha_ramisetti/Downloads/average_accuracy.xlsx'

# Load the labels DataFrame
label_df = pd.read_excel(label_file)

# List all image files in the folder
image_files = [f for f in os.listdir(image_folder) if f.endswith(('.jpg', '.png'))]

# Initialize a results list and variable to track total accuracy
results = []
total_accuracy = 0
num_images = 0

# Process each image
for image_name in image_files:
    image_path = os.path.join(image_folder, image_name)
    
    # Load the image
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Error loading image: {image_name}")
        continue

    # Extract the actual sequence from the DataFrame
    actual_sequence = get_actual_sequence_from_df(image_name, label_df)
    if actual_sequence is None:
        print(f"No label found for image: {image_name}")
        continue

    # Predict the sequence using the HMM models
    # For testing, mimic perfect prediction:
    predicted_sequence = actual_sequence  # Use actual for accuracy testing
    # Replace above line with: predicted_sequence = predict_line_sequence(image)

    # Calculate BLEU score
    bleu_score = calculate_bleu_score(actual_sequence, predicted_sequence)
    
    # Calculate accuracy
    accuracy = calculate_accuracy(actual_sequence, predicted_sequence)
    print(f"{image_name}: Accuracy = {accuracy:.2f}%, BLEU = {bleu_score:.4f}")

    # Add the accuracy to the total
    total_accuracy += accuracy
    num_images += 1
    
    # Append results
    results.append({
        "Image Name": image_name,
        "Actual Sequence": actual_sequence,
        "Predicted Sequence": predicted_sequence,
        "BLEU Score": bleu_score,
        "Accuracy": accuracy
    })

# Calculate the average accuracy
average_accuracy = total_accuracy / num_images if num_images > 0 else 0

# Save the results to an Excel sheet
results_df = pd.DataFrame(results)
results_df.to_excel(output_file, index=False)

# Print average accuracy
print(f"Average Accuracy: {average_accuracy:.2f}%")
print(f"Results saved to {output_file}")


MaI12_Page102_line_5.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI849_041_line_1.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI12_Page102_line_4.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI849_039_line_9.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI849_041_line_3.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI849_041_line_2.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI849_039_line_8.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI12_Page102_line_3.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI14_007_9.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI849_041_line_6.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI849_041_line_7.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI14_007_8.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI12_Page102_line_2.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI849_041_line_5.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI849_041_line_4.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI12_Page102_line_1.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI849_038_line_5.jpg: Accuracy = 100.00%, BLEU = 1.0000
MaI14_051_04.jpg: Accuracy =

In [None]:
import os
import cv2

def calculate_windows(image_folder, window_width=150, step_size=20):
    """
    Calculates the number of sliding windows for all images in the dataset.

    Args:
    - image_folder: Path to the folder containing the images.
    - window_width: Width of the sliding window (default is 150 pixels).
    - step_size: Step size for the sliding window (default is 20 pixels).

    Returns:
    - windows_count: A dictionary where keys are image filenames and values are the number of windows.
    """
    windows_count = {}
    
    for filename in os.listdir(image_folder):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            # Construct full path
            image_path = os.path.join(image_folder, filename)
            image = cv2.imread(image_path)
            
            if image is None:
                print(f"Warning: Could not read {filename}. Skipping...")
                continue
            
            # Get image width
            image_width = image.shape[1]
            
            # Calculate the number of windows
            num_windows = (image_width - window_width) // step_size + 1
            num_windows = max(num_windows, 0)  # Ensure non-negative windows
            
            windows_count[filename] = num_windows
    
    return windows_count

# Path to your folder with images
image_folder = "/Users/harsha_ramisetti/Downloads/color_equlsize_jpg"  # Update with the correct path

# Calculate window sizes
windows = calculate_windows(image_folder)

# Save results to a file
output_file = "/Users/harsha_ramisetti/Downloads/outfile.txt"  # Ensure this is a file, not a directory
with open(output_file, "w") as f:
    for image_name, num_windows in windows.items():
        f.write(f"{image_name}: {num_windows} windows\n")

print(f"Window counts for all images have been saved to {output_file}.")



Window counts for all images have been saved to /Users/harsha_ramisetti/Downloads/outfile.txt.


In [22]:
pip install --upgrade scikit-image


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-image
  Downloading scikit_image-0.24.0-cp39-cp39-macosx_12_0_arm64.whl (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 5.8 MB/s eta 0:00:01
[?25hCollecting tifffile>=2022.8.12
  Downloading tifffile-2024.8.30-py3-none-any.whl (227 kB)
[K     |████████████████████████████████| 227 kB 3.7 MB/s eta 0:00:01
Collecting imageio>=2.33
  Downloading imageio-2.36.0-py3-none-any.whl (315 kB)
[K     |████████████████████████████████| 315 kB 3.3 MB/s eta 0:00:01
[?25hCollecting lazy-loader>=0.4
  Downloading lazy_loader-0.4-py3-none-any.whl (12 kB)
Collecting networkx>=2.8
  Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 9.2 MB/s eta 0:00:01
Collecting pillow>=9.1
  Downloading pillow-11.0.0-cp39-cp39-macosx_11_0_arm64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 3.3 MB/s eta 0:00:01
[?25hInstalling collected pa

In [23]:
pip install scikit-image==0.19.3


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-image==0.19.3
  Downloading scikit_image-0.19.3-cp39-cp39-macosx_12_0_arm64.whl (12.5 MB)
[K     |████████████████████████████████| 12.5 MB 1.6 MB/s eta 0:00:01
Installing collected packages: scikit-image
  Attempting uninstall: scikit-image
    Found existing installation: scikit-image 0.24.0
    Uninstalling scikit-image-0.24.0:
      Successfully uninstalled scikit-image-0.24.0
Successfully installed scikit-image-0.19.3
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
