In [1]:

import numpy as np
import pandas as pd
import cv2
import os
from scipy.fftpack import dct

# Function to compute DCT features from an image window
def compute_dct_features(window, num_features=50):
    # Convert to grayscale if the window is not already
    if len(window.shape) == 3:
        window = cv2.cvtColor(window, cv2.COLOR_BGR2GRAY)
    
    # Flatten the window to a 1D array for DCT computation
    window_flat = window.flatten()
    
    # Compute DCT and take the first `num_features` coefficients
    dct_features = dct(window_flat, norm='ortho')[:num_features]
    
    return dct_features

# Load data from the Excel file
labels_df = pd.read_excel(r'selected_labels_5000 1.xlsx')  # Adjust the path if necessary

# Parameters
window_width = 30  # Width of each sliding window in pixels
step_size =30    # Step size of the sliding window in pixels
image_folder = r"C:\Users\pavan\Desktop\ML-B20\New folder\resized 1\resized"  # Adjust the path if necessary

# Dictionary to store DCT feature sequences for each character across all images
character_dct_sequences = {}

# Process each image (word) in the dataset
for index, row in labels_df.iterrows():
    image_name = row['image_name']         # Adjust column name if necessary
    character_sequence = row['gt']      # Adjust column name if necessary
    
    # Load the corresponding image
    image_path = os.path.join(image_folder, image_name)
    image = cv2.imread(image_path)
    if image is None:
        print(f"Image {image_name} could not be loaded.")
        continue

    image_width = image.shape[1]  # Get image width
    
    # Calculate the width of each character region based on the sequence length
    num_characters = len(character_sequence)
    character_width = image_width // num_characters
    
    # Loop through each character in the sequence and collect its DCT features
    for i, char in enumerate(character_sequence):
        # Define the region corresponding to the current character
        region_start = i * character_width
        region_end = region_start + character_width
        character_region = image[:, region_start:region_end]  # Assume height is all rows
        
        # Split the character region into windows to capture DCT features
        num_windows = (character_width - window_width) // step_size + 1
        char_dct_sequence = []
        
        for j in range(num_windows):
            # Calculate the start and end of the window within the character region
            window_start = region_start + j * step_size
            window_end = window_start + window_width
            
            # Extract the window
            window = image[:, window_start:window_end]
            
            # Compute DCT features for this window
            dct_features = compute_dct_features(window)
            char_dct_sequence.append(dct_features)
        
        # Append this character's DCT features to the global dictionary
        if char not in character_dct_sequences:
            character_dct_sequences[char] = []
        character_dct_sequences[char].append(char_dct_sequence)

# Print the number of DCT feature sequences for each character
for char, sequences in character_dct_sequences.items():
    print(f"Character '{char}' has {len(sequences)} sequences of DCT features.")

Character 'ജ' has 92 sequences of DCT features.
Character 'ീ' has 240 sequences of DCT features.
Character 'വ' has 1045 sequences of DCT features.
Character 'ി' has 2902 sequences of DCT features.
Character 'ത' has 1957 sequences of DCT features.
Character '്' has 7119 sequences of DCT features.
Character 'ന' has 2952 sequences of DCT features.
Character 'റ' has 685 sequences of DCT features.
Character 'െ' has 1065 sequences of DCT features.
Character 'അ' has 604 sequences of DCT features.
Character 'മ' has 1030 sequences of DCT features.
Character 'ഷ' has 212 sequences of DCT features.
Character 'ആ' has 147 sequences of DCT features.
Character 'ര' has 1583 sequences of DCT features.
Character 'ം' has 892 sequences of DCT features.
Character 'ഭ' has 123 sequences of DCT features.
Character 'ഉ' has 92 sequences of DCT features.
Character 'ണ' has 769 sequences of DCT features.
Character 'ട' has 1480 sequences of DCT features.
Character 'ാ' has 1781 sequences of DCT features.
Character 'യ

In [2]:
import numpy as np
from hmmlearn import hmm

# Dictionary to store character HMMs
character_hmms = {}
num_states=4
# Example: 'character_dct_sequences' contains the DCT feature sequences for each character
for char, sequences in character_dct_sequences.items():
    # Remove empty sequences
    sequences = [seq for seq in sequences if len(seq) > 0]
    
    if len(sequences) == 0:
        print(f"Warning: No valid sequences for character {char}. Skipping this character.")
        continue  # Skip to the next character if no valid sequences
    if len(sequences) >=15:
    # Prepare training data for the HMM
        X = np.vstack(sequences)  # Stack the sequences into a single array
        lengths = [len(seq) for seq in sequences]  # Length of each sequence
    
        # Initialize HMM for this character
        model = hmm.GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000)
    
        # Train the HMM with the character’s DCT feature sequences
        model.fit(X, lengths)
    
        # Store the trained model
        character_hmms[char] = model
        print(f"Model trained for character: {char}")


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.


Model trained for character: ജ
Model trained for character: ീ
Model trained for character: വ
Model trained for character: ി
Model trained for character: ത


Model is not converging.  Current: -526109.6866417305 is not greater than -526086.8344836573. Delta is -22.85215807321947


Model trained for character: ്


Model is not converging.  Current: -244254.02879332757 is not greater than -244245.85480989426. Delta is -8.17398343331297


Model trained for character: ന
Model trained for character: റ
Model trained for character: െ


Model is not converging.  Current: -72641.0917806681 is not greater than -72640.23588995337. Delta is -0.8558907147235004


Model trained for character: അ


Model is not converging.  Current: -44198.26353153431 is not greater than -44197.57405579686. Delta is -0.689475737446628
Model is not converging.  Current: -28170.703819009457 is not greater than -27685.173499217657. Delta is -485.53031979180014


Model trained for character: മ
Model trained for character: ഷ


Model is not converging.  Current: -14868.231504723783 is not greater than -14868.23062052068. Delta is -0.0008842031020321883


Model trained for character: ആ
Model trained for character: ര


Model is not converging.  Current: -9948.74953668102 is not greater than -9935.815729105594. Delta is -12.933807575425817


Model trained for character: ം
Model trained for character: ഭ
Model trained for character: ഉ


Model is not converging.  Current: -59521.64847200659 is not greater than -59521.609668139325. Delta is -0.03880386726814322
Model is not converging.  Current: -92164.96995803784 is not greater than -92162.52137349521. Delta is -2.448584542624303


Model trained for character: ണ
Model trained for character: ട


Model is not converging.  Current: -162895.4618082769 is not greater than -162870.23415860123. Delta is -25.227649675653083


Model trained for character: ാ
Model trained for character: യ
Model trained for character: ക


Model is not converging.  Current: -106479.51679985755 is not greater than -106323.1861930659. Delta is -156.3306067916419
Model is not converging.  Current: -30604.677592463886 is not greater than -30428.174565583708. Delta is -176.50302688017837


Model trained for character: ല
Model trained for character: ഞ


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the st

Model trained for character: ‍
Model trained for character: ള


Model is not converging.  Current: 10649.108271405514 is not greater than 10696.312793246581. Delta is -47.20452184106762


Model trained for character: എ
Model trained for character: ു
Model trained for character: ച
Model trained for character: ദ


Model is not converging.  Current: -24407.906360864537 is not greater than -24407.901415557986. Delta is -0.004945306551235262
Model is not converging.  Current: -27913.733698490083 is not greater than -26808.2953086068. Delta is -1105.438389883282


Model trained for character: സ
Model trained for character: .


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the st

Model trained for character: '


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Model is not converging.  Current: -37600.95079349791 is not greater than -37600.671203461825. Delta is -0.27959003608702915
Model is not converging.  Current: -6358.908045436154 is not greater than -6320.430844580069. Delta is -38.47720085608489
Model is not converging.  Current: -22299.04053174543 is not greater than -22288.733883291286. Delta is -10.306648454145034


Model trained for character: ങ
Model trained for character: ഖ
Model trained for character: ഴ


Model is not converging.  Current: 5537.401884797814 is not greater than 5537.408476030689. Delta is -0.006591232874598063


Model trained for character: ബ


Model is not converging.  Current: -63635.159570830736 is not greater than -63571.21190064008. Delta is -63.9476701906533


Model trained for character: പ
Model trained for character: ഡ
Model trained for character: ോ


Model is not converging.  Current: -17284.669815675934 is not greater than -17284.654936932777. Delta is -0.014878743157169083
Model is not converging.  Current: -7262.646682829322 is not greater than -7261.393315508188. Delta is -1.2533673211346468
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.


Model trained for character: ൊ
Model trained for character: ഗ
Model trained for character: ഒ
Model trained for character: ഇ


Model is not converging.  Current: -47847.50395407368 is not greater than -47847.42638630071. Delta is -0.077567772976181
Model is not converging.  Current: -16287.743052334734 is not greater than -16256.3919312245. Delta is -31.35112111023409


Model trained for character: േ
Model trained for character: ൂ
Model trained for character: ഹ


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Model is not converging.  Current: -12337.58882871622 is not greater t

Model trained for character: ഏ
Model trained for character: ശ


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
  a -= a_lse
Model is not converging.  Current: -inf is not greater than -4266.225231140509. Delta is -inf
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.


Model trained for character: ധ
Model trained for character: !
Model trained for character: ?


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the st

Model trained for character: ഥ
Model trained for character: :
Model trained for character: ,


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
  a -= a_lse
Model is not converging.  Current: -inf is not greater than -4139.96474355349. Delta is -inf


Model trained for character: -
Model trained for character: ‘
Model trained for character: ഈ


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.


Model trained for character: ഓ


In [3]:
# Extract the character names (keys) from the character_hmms dictionary
char_names = list(character_hmms.keys())

# Print the character names
print("Character names:", char_names)


Character names: ['ജ', 'ീ', 'വ', 'ി', 'ത', '്', 'ന', 'റ', 'െ', 'അ', 'മ', 'ഷ', 'ആ', 'ര', 'ം', 'ഭ', 'ഉ', 'ണ', 'ട', 'ാ', 'യ', 'ക', 'ല', 'ഞ', '\u200d', 'ള', 'എ', 'ു', 'ച', 'ദ', 'സ', '.', "'", 'ങ', 'ഖ', 'ഴ', 'ബ', 'പ', 'ഡ', 'ോ', 'ൊ', 'ഗ', 'ഒ', 'ഇ', 'േ', 'ൂ', 'ഹ', 'ഏ', 'ശ', 'ധ', '!', '?', 'ഥ', ':', ',', '-', '‘', 'ഈ', 'ഓ']


In [4]:
ground_truth =char_names

In [5]:
print("Ground truth list:", ground_truth)

Ground truth list: ['ജ', 'ീ', 'വ', 'ി', 'ത', '്', 'ന', 'റ', 'െ', 'അ', 'മ', 'ഷ', 'ആ', 'ര', 'ം', 'ഭ', 'ഉ', 'ണ', 'ട', 'ാ', 'യ', 'ക', 'ല', 'ഞ', '\u200d', 'ള', 'എ', 'ു', 'ച', 'ദ', 'സ', '.', "'", 'ങ', 'ഖ', 'ഴ', 'ബ', 'പ', 'ഡ', 'ോ', 'ൊ', 'ഗ', 'ഒ', 'ഇ', 'േ', 'ൂ', 'ഹ', 'ഏ', 'ശ', 'ധ', '!', '?', 'ഥ', ':', ',', '-', '‘', 'ഈ', 'ഓ']


In [6]:
unique_chars = sorted(set("".join(ground_truth)))
char_to_state = {char: idx for idx, char in enumerate(unique_chars)}
state_to_char = {idx: char for char, idx in char_to_state.items()}

In [7]:
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
 
# Calculate BLEU score for predictions
for char, model in character_hmms.items():
    try:
        sequences = character_dct_sequences[char]
        # Remove empty sequences
        sequences = [seq for seq in sequences if len(seq) > 0]
        # Check if we have any sequences left
        if len(sequences) >= 15:
            X = np.vstack(sequences)
            lengths = [len(seq) for seq in sequences]
            # Check for valid transition matrix
            row_sums = model.transmat_.sum(axis=1)
            if not np.allclose(row_sums, 1):
                print(f"Problem with transition matrix for character '{char}': row sums = {row_sums}")
                continue  # Skip this model and move to the next one
            # Get predicted states
            predicted_states = model.predict(X)
            mapped_predictions = [state_to_char[state] for state in predicted_states]
            # Calculate BLEU score
            bleu_score = sentence_bleu([ground_truth], mapped_predictions)
            print(f"BLEU Score for character '{char}': {bleu_score}")
    except ValueError as e:
        print(f"Error with character '{char}': {e}")

Problem with transition matrix for character 'ജ': row sums = [1. 1. 0. 1.]
BLEU Score for character 'ീ': 5.250084857547877e-232
BLEU Score for character 'വ': 3.7400828679455386e-232
BLEU Score for character 'ി': 3.400372698917353e-232
BLEU Score for character 'ത': 3.546957736134618e-232
BLEU Score for character '്': 2.6967209314349595e-232
BLEU Score for character 'ന': 3.1103854225893156e-232
BLEU Score for character 'റ': 4.317610965115914e-232
BLEU Score for character 'െ': 3.940299862695615e-232
BLEU Score for character 'അ': 3.8663092967161655e-232
BLEU Score for character 'മ': 4.20305087831312e-232
BLEU Score for character 'ഷ': 5.5754780543102235e-232
BLEU Score for character 'ആ': 3.998425477070395e-232
BLEU Score for character 'ര': 3.501324235538429e-232
BLEU Score for character 'ം': 3.8206362135222635e-232
BLEU Score for character 'ഭ': 5.6147157761712806e-232
BLEU Score for character 'ഉ': 4.7743047731569574e-232
BLEU Score for character 'ണ': 4.673102732696767e-232
BLEU Score for ch

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

In [9]:
# List of characters to remove from the character_hmms dictionary
characters_to_remove = ['ങ','ശ','!','?','ഒ','ഏ','ഥ','‘','ഈ','ഓ','ജ']
 
# Remove characters if they exist in the dictionary
for char in characters_to_remove:
    print (char)
    if char in character_hmms:
        del character_hmms[char]
        print(f"Deleted HMM model for character: {char}")
    else:
        print(f"Character {char} not found in character_hmms.")

ങ
Character ങ not found in character_hmms.
ശ
Character ശ not found in character_hmms.
!
Character ! not found in character_hmms.
?
Character ? not found in character_hmms.
ഒ
Character ഒ not found in character_hmms.
ഏ
Character ഏ not found in character_hmms.
ഥ
Character ഥ not found in character_hmms.
‘
Character ‘ not found in character_hmms.
ഈ
Character ഈ not found in character_hmms.
ഓ
Character ഓ not found in character_hmms.
ജ
Character ജ not found in character_hmms.


In [18]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
import numpy as np

# List to store results
results = []

# Loop through each character and its model
for char, model in character_hmms.items():
    sequences = character_dct_sequences[char]
    
    # Remove empty sequences
    sequences = [seq for seq in sequences if len(seq) > 0]
    
    # Check if we have enough sequences to proceed
    if len(sequences) >= 15:
        X = np.vstack(sequences)
        lengths = [len(seq) for seq in sequences]
        
        # Get predicted states and map to characters
        predicted_states = model.predict(X)
        mapped_predictions = [state_to_char[state] for state in predicted_states]
        
        # Calculate BLEU score
        bleu_score = sentence_bleu([ground_truth], mapped_predictions)
        
        # Store the results
        results.append({
            'Character': char,
            'Mapped Predictions': ''.join(mapped_predictions),
            'BLEU Score': bleu_score
        })

# Create a DataFrame to store results
results_df = pd.DataFrame(results)

# Save the results to an Excel file
output_path = r'C:\Users\pavan\Downloads\line_gt_7_dct.xlsx'
results_df.to_excel(output_path, index=False)

print(f"Results saved to {output_path}")


Results saved to C:\Users\pavan\Downloads\line_gt_7_dct.xlsx


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

In [22]:
for char, model in character_hmms.items():
    print(char,model)

ീ GaussianHMM(n_components=4, n_iter=1000)
വ GaussianHMM(n_components=4, n_iter=1000)
ി GaussianHMM(n_components=4, n_iter=1000)
ത GaussianHMM(n_components=4, n_iter=1000)
് GaussianHMM(n_components=4, n_iter=1000)
ന GaussianHMM(n_components=4, n_iter=1000)
റ GaussianHMM(n_components=4, n_iter=1000)
െ GaussianHMM(n_components=4, n_iter=1000)
അ GaussianHMM(n_components=4, n_iter=1000)
മ GaussianHMM(n_components=4, n_iter=1000)
ഷ GaussianHMM(n_components=4, n_iter=1000)
ആ GaussianHMM(n_components=4, n_iter=1000)
ര GaussianHMM(n_components=4, n_iter=1000)
ം GaussianHMM(n_components=4, n_iter=1000)
ഭ GaussianHMM(n_components=4, n_iter=1000)
ഉ GaussianHMM(n_components=4, n_iter=1000)
ണ GaussianHMM(n_components=4, n_iter=1000)
ട GaussianHMM(n_components=4, n_iter=1000)
ാ GaussianHMM(n_components=4, n_iter=1000)
യ GaussianHMM(n_components=4, n_iter=1000)
ക GaussianHMM(n_components=4, n_iter=1000)
ല GaussianHMM(n_components=4, n_iter=1000)
ഞ GaussianHMM(n_components=4, n_iter=1000)
‍ GaussianH

In [23]:
import joblib
import re
 
# Assuming your models are in a dictionary called `character_hmms`
# e.g., `character_hmms = {'െ': trained_model_1, 'ത': trained_model_2, ... }`
 
def sanitize_filename(char):
    # Replace invalid filename characters with an underscore or remove them
    return re.sub(r'[<>:"/\\|?*]', '_', char)
 
for char, model in character_hmms.items():
    # Sanitize character to create a valid filename
    sanitized_char = sanitize_filename(char)
    print(char,sanitized_char )
    # Save the trained HMM model to a file with the sanitized name
    joblib.dump(model, f"{sanitized_char}_hmm.pkl")

ീ ീ
വ വ
ി ി
ത ത
് ്
ന ന
റ റ
െ െ
അ അ
മ മ
ഷ ഷ
ആ ആ
ര ര
ം ം
ഭ ഭ
ഉ ഉ
ണ ണ
ട ട
ാ ാ
യ യ
ക ക
ല ല
ഞ ഞ
‍ ‍
ള ള
എ എ
ു ു
ച ച
ദ ദ
സ സ
. .
' '
ഖ ഖ
ഴ ഴ
ബ ബ
പ പ
ഡ ഡ
ോ ോ
ൊ ൊ
ഗ ഗ
ഇ ഇ
േ േ
ൂ ൂ
ഹ ഹ
ധ ധ
: _
, ,
- -


In [24]:
results_df 

Unnamed: 0,Character,Mapped Predictions,BLEU Score
0,ീ,----------------------------------------------...,5.250085e-232
1,വ,!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!...,3.740083e-232
2,ി,''''''''''''''''''''''''''''''''''''''''''''''...,3.400373e-232
3,ത,''''''''''''''''''''''''''''''''''''''''''''''...,3.546958e-232
4,്,''''''''''''''''''''''''''''''''''''''''''''''...,2.696721e-232
5,ന,",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...",3.1103849999999998e-232
6,റ,----------------------------------------------...,4.317611e-232
7,െ,!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!...,3.9403e-232
8,അ,!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!...,3.866309e-232
9,മ,''''''''''''''''''''''''''''''''''''''''''''''...,4.203051e-232


In [26]:
import pandas as pd
import numpy as np

# Read line sequences from Excel

# Assuming the line sequences are in a column named 'line_sequence' or similar
line_sequences = labels_df['gt'].tolist()

# Example line sequence (you can iterate through all lines in your dataset)
# line_sequence = ['ര', 'വ', 'ി', 'ട', 'ു']  # Example line sequence


In [27]:
# Get indices for each individual character, but skip characters not in char_to_index
line_sequence_indices = [char_to_state [individual_char] 
                         for char_group in line_sequences 
                         for individual_char in char_group 
                         if individual_char in char_to_state ]
print("Line sequence indices:", line_sequence_indices)

Line sequence indices: [22, 49, 42, 48, 27, 27, 56, 27, 48, 31, 56, 38, 52, 8, 31, 56, 27, 48, 35, 31, 48, 35, 48, 44, 27, 56, 27, 48, 31, 56, 38, 52, 9, 37, 7, 34, 7, 12, 26, 56, 24, 47, 36, 27, 56, 27, 56, 27, 48, 31, 56, 38, 52, 17, 39, 47, 42, 34, 42, 27, 56, 27, 52, 23, 47, 31, 56, 57, 8, 38, 48, 36, 47, 27, 52, 35, 17, 40, 52, 13, 31, 48, 17, 56, 17, 50, 39, 34, 48, 21, 56, 21, 27, 48, 39, 56, 57, 23, 47, 31, 56, 57, 29, 42, 27, 56, 27, 52, 45, 56, 27, 50, 27, 48, 21, 56, 21, 50, 4, 13, 31, 56, 38, 52, 1, 8, 20, 56, 20, 31, 52, 45, 50, 18, 17, 37, 35, 47, 36, 34, 47, 42, 31, 36, 48, 39, 56, 57, 35, 50, 41, 50, 17, 48, 23, 47, 31, 56, 57, 22, 31, 39, 56, 57, 33, 44, 49, 37, 56, 57, 32, 56, 37, 48, 31, 56, 57, 45, 48, 32, 56, 32, 39, 50, 7, 25, 55, 17, 56, 24, 38, 50, 7, 8, 36, 39, 56, 57, 32, 17, 56, 17, 17, 56, 17, 47, 37, 47, 26, 56, 4, 8, 42, 37, 50, 24, 52, 35, 17, 56, 17, 40, 56, 57, 8, 35, 56, 32, 37, 32, 56, 32, 48, 21, 56, 21, 50, 17, 54, 26, 56, 24, 50, 19, 7, 34, 49, 37,

In [30]:
import joblib
from hmmlearn import hmm
import cv2
# Load character HMM models
character_hmms = {}
for char in char_to_state.keys():
    sanitized_char = sanitize_filename(char)
    try:
        model = joblib.load(f"{sanitized_char}_hmm.pkl")
        character_hmms[char] = model
    except FileNotFoundError:
        print(f"Model for character '{char}' not found.")
        continue

# Function to predict line sequence based on HMMs
def predict_line_sequence(line_image, window_width=30, step_size=30):
    # Divide the line image into character-width segments
    predictions = []
    image_width = line_image.shape[1]
    num_windows = (image_width - window_width) // step_size + 1
    
    # Loop through each window and predict the character
    for i in range(num_windows):
        window_start = i * step_size
        window_end = window_start + window_width
        window = line_image[:, window_start:window_end]
        
        # Compute DCT features for the window
        dct_features = compute_dct_features(window)
        dct_features = dct_features.reshape(1, -1)  # Reshape for HMM input
        
        # Calculate likelihoods for each character model
        best_char = None
        best_score = float('-inf')
        
        for char, model in character_hmms.items():
            try:
                score = model.score(dct_features)
                if score > best_score:
                    best_score = score
                    best_char = char
            except:
                pass  # Ignore errors for invalid model scoring
        
        if best_char is not None:
            predictions.append(best_char)
    
    return ''.join(predictions)

# Load a test line image
test_image_path = r"D:\phd\OneDrive\AMRITA\color_window_double1 (1)\color_window_double1\MaI12_Page100_line_1.jpg_window_0.jpg"
test_image = cv2.imread(test_image_path, cv2.IMREAD_GRAYSCALE)

# Predict the character sequence in the test line image
predicted_sequence = predict_line_sequence(test_image)
print("Predicted line sequence:", predicted_sequence)

Model for character '!' not found.
Model for character 'ഈ' not found.
Model for character 'ഏ' not found.
Model for character 'ഒ' not found.
Model for character 'ഓ' not found.
Model for character '‘' not found.


KeyError: 'image name'

In [37]:
import joblib
from hmmlearn import hmm
import cv2
import numpy as np
import pandas as pd

# Load character HMM models
character_hmms = {}
for char in char_to_state.keys():
    sanitized_char = sanitize_filename(char)
    try:
        model = joblib.load(f"{sanitized_char}_hmm.pkl")
        character_hmms[char] = model
    except FileNotFoundError:
        print(f"Model for character '{char}' not found.")
        continue

# Function to compute the actual sequence based on window mapping
# Function to extract the actual sequence for a specific image from the DataFrame
def get_actual_sequence_from_df(image_name, line_image, label_df, window_width=30, step_size=30):
    # Find the corresponding row in the DataFrame
    row = label_df[label_df['image name'] == image_name]
    
    if row.empty:
        raise ValueError(f"Image name '{image_name}' not found in the labels DataFrame.")
    
    # Extract the ground truth character sequence
    character_sequence = row['Label'].values[0]  # Adjust column name if necessary
    
    # Compute the actual sequence based on ground truth and image dimensions
    actual_sequence = []
    image_width = line_image.shape[1]
    num_characters = len(character_sequence)
    character_width = image_width // num_characters

    for i, char in enumerate(character_sequence):
        # Define the region corresponding to this character
        region_start = i * character_width
        region_end = region_start + character_width
        character_region = line_image[:, region_start:region_end]

        # Divide the character region into windows
        num_windows = (character_width - window_width) // step_size + 1
        for _ in range(num_windows):
            actual_sequence.append(char)  # Map each window to the current character
    
    return actual_sequence
# Function to predict the sequence based on HMM models
def predict_line_sequence(line_image, window_width=30, step_size=10):
    predictions = []
    image_width = line_image.shape[1]
    num_windows = (image_width - window_width) // step_size + 1

    for i in range(num_windows):
        window_start = i * step_size
        window_end = window_start + window_width
        window = line_image[:, window_start:window_end]

        # Compute DCT features for the window
        dct_features = compute_dct_features(window)
        dct_features = dct_features.reshape(1, -1)

        # Calculate likelihoods for each character model
        best_char = None
        best_score = float('-inf')

        for char, model in character_hmms.items():
            try:
                score = model.score(dct_features)
                if score > best_score:
                    best_score = score
                    best_char = char
            except:
                pass  # Ignore errors for invalid model scoring

        if best_char is not None:
            predictions.append(best_char)

    return ''.join(predictions)

# Test image details
# Load the labels DataFrame
label_file = r"D:\phd\OneDrive\AMRITA\gt_Window (2).xlsx"
label_df = pd.read_excel(label_file)

# Test image details
test_image_path = r"D:\phd\OneDrive\AMRITA\color_window_double1 (1)\color_window_double1\MaI12_Page100_line_1.jpg_window_0.jpg"
test_image_name = test_image_path.split("\\")[-1]  # Extract the image name

# Load the test image
test_image = cv2.imread(test_image_path, cv2.IMREAD_GRAYSCALE)

# Extract the actual sequence from the DataFrame
actual_sequence = get_actual_sequence_from_df(test_image_name, test_image, label_df)

# Predict the sequence using the HMM models
predicted_sequence = predict_line_sequence(test_image)

# Print the sequences for comparison
print(f"Actual sequence: {''.join(actual_sequence)}")
print(f"Predicted sequence: {predicted_sequence}")


Actual sequence: െെെെെെെെെെ
Predicted sequence: """"െെറ""െെെറ""ആ്്ആആമകതചിഅെെെെ


In [40]:
from nltk.translate.bleu_score import sentence_bleu

# Function to calculate BLEU score
def calculate_bleu_score(actual_sequence, predicted_sequence):
    # BLEU expects the reference as a list of lists and hypothesis as a list
    reference = [list(actual_sequence)]  # Wrap in another list for multiple references
    hypothesis = list(predicted_sequence)
    bleu_score = sentence_bleu(reference, hypothesis)
    return bleu_score

# Load the labels DataFrame
label_file = r"D:\phd\OneDrive\AMRITA\gt_Window (2).xlsx"
label_df = pd.read_excel(label_file)

# Test image details
test_image_path = r"D:\phd\OneDrive\AMRITA\color_window_double1 (1)\color_window_double1\MaI14_051_07.jpg_window_23.jpg"
test_image_name = test_image_path.split("\\")[-1]  # Extract the image name

# Load the test image
test_image = cv2.imread(test_image_path, cv2.IMREAD_GRAYSCALE)

# Extract the actual sequence from the DataFrame
actual_sequence = get_actual_sequence_from_df(test_image_name, test_image, label_df)

# Predict the sequence using the HMM models
predicted_sequence = predict_line_sequence(test_image)

# Calculate BLEU score
bleu_score = calculate_bleu_score(actual_sequence, predicted_sequence)

# Print the results
print(f"Actual sequence: {''.join(actual_sequence)}")
print(f"Predicted sequence: {predicted_sequence}")
print(f"BLEU Score: {bleu_score}")


Actual sequence: തതതതതാാാാാ
Predicted sequence: ർൽൽഃഭദരജജചർണഗർർഷൊൊൂണണണഷഹീൊൈണൊൊ
BLEU Score: 0


In [32]:
import os
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu

# Function to calculate BLEU score
def calculate_bleu_score(actual_sequence, predicted_sequence):
    reference = [list(actual_sequence)]  # Wrap in another list for multiple references
    hypothesis = list(predicted_sequence)
    bleu_score = sentence_bleu(reference, hypothesis)
    return bleu_score

# Folder paths and files
image_folder = r"D:\phd\OneDrive\AMRITA\color_window_double1 (1)\color_window_double1"
label_file = r"D:\phd\OneDrive\AMRITA\gt_Window (2).xlsx"
output_file = r"D:\phd\BLEU_Scores.xlsx"

# Load the labels DataFrame
label_df = pd.read_excel(label_file)

# List all image files in the folder
image_files = [f for f in os.listdir(image_folder) if f.endswith(('.jpg', '.png'))]

# Initialize a results list
results = []

# Process each image
for image_name in image_files:
    image_path = os.path.join(image_folder, image_name)
    
    # Load the image
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Error loading image: {image_name}")
        continue

    # Extract the actual sequence from the DataFrame
    actual_sequence = get_actual_sequence_from_df(image_name, image, label_df)
    if actual_sequence is None:
        print(f"No label found for image: {image_name}")
        continue

    # Predict the sequence using the HMM models
    predicted_sequence = predict_line_sequence(image)
    
    # Calculate BLEU score
    bleu_score = calculate_bleu_score(actual_sequence, predicted_sequence)
    
    # Append results
    results.append({
        "Image Name": image_name,
        "Actual Sequence": ''.join(actual_sequence),
        "Predicted Sequence": predicted_sequence,
        "BLEU Score": bleu_score
    })

# Save the results to an Excel sheet
results_df = pd.DataFrame(results)
results_df.to_excel(output_file, index=False)

print(f"Results saved to {output_file}")


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

ValueError: Image name 'maI16_01_10.jpg_window_0.jpg' not found in the labels DataFrame.

In [35]:
import os
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
import cv2

# Function to calculate BLEU score
def calculate_bleu_score(actual_sequence, predicted_sequence):
    reference = [list(actual_sequence)]  # Wrap in another list for multiple references
    hypothesis = list(predicted_sequence)
    bleu_score = sentence_bleu(reference, hypothesis)
    return bleu_score

# Folder paths and files
image_folder = r"D:\phd\OneDrive\AMRITA\color_window_double1 (1)\color_window_double1"
label_file = r"D:\phd\OneDrive\AMRITA\gt_Window (2).xlsx"
output_file = r"D:\phd\BLEU_Scores.xlsx"

# Load the labels DataFrame
label_df = pd.read_excel(label_file)

# List all image files in the folder
image_files = [f for f in os.listdir(image_folder) if f.endswith(('.jpg', '.png'))]

# Initialize a results list
results = []

# Function to extract actual sequence from label DataFrame
def get_actual_sequence_from_df(image_name, label_df):
    try:
        row = label_df[label_df['image name'] == image_name]
        if row.empty:
            raise ValueError(f"Image name '{image_name}' not found in the labels DataFrame.")
        return row['Label'].values[0]
    except ValueError as e:
        print(e)  # Print the error message for the missing image
        return None  # Return None if the image name is not found

# Process each image
for image_name in image_files:
    image_path = os.path.join(image_folder, image_name)
    
    # Load the image
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Error loading image: {image_name}")
        continue

    # Extract the actual sequence from the DataFrame
    actual_sequence = get_actual_sequence_from_df(image_name, label_df)
    if actual_sequence is None:
        continue  # Skip this image if the actual sequence is not found

    # Predict the sequence using the HMM models
    predicted_sequence = predict_line_sequence(image)
    
    # Calculate BLEU score
    bleu_score = calculate_bleu_score(actual_sequence, predicted_sequence)
    
    # Append results
    results.append({
        "Image Name": image_name,
        "Actual Sequence": ''.join(actual_sequence),
        "Predicted Sequence": predicted_sequence,
        "BLEU Score": bleu_score
    })

# Save the results to an Excel sheet
results_df = pd.DataFrame(results)
results_df.to_excel(output_file, index=False)

print(f"Results saved to {output_file}")


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

Image name 'maI16_01_10.jpg_window_0.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_1.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_10.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_11.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_12.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_13.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_14.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_15.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_16.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_17.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_18.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_19.jpg' not found in the labels DataFrame.
Image name 'maI16_01_10.jpg_window_2.jpg' not found in the labels 

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

Image name 'mal286_010_1.jpg_window_0.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_1.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_10.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_11.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_12.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_13.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_14.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_15.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_16.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_17.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_18.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_19.jpg' not found in the labels DataFrame.
Image name 'mal286_010_1.jpg_window_2.jpg' not found i