In [None]:
class OCR_F(nn.Module):
    def __init__(self, charset_size, hidden_size, lstm_layers):
        assert(charset_size < 256)
        super(OCR_F, self).__init__()

        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 6, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv2d(6, 15, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv2d(15, 42, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.hidden_size = hidden_size
        self.lstm_layers = lstm_layers
        self.lstm = nn.LSTM(42 * 103 * 212, hidden_size, lstm_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size, charset_size)

    def forward(self, x): #[1, 825, 1697]
        x = self.pool1(F.relu(self.conv1(x))) #[6, 412, 848]
        x = self.pool2(F.relu(self.conv2(x))) #[15, 206, 424]
        x = self.pool3(F.relu(self.conv3(x))) #[42, 103, 212]
        x = x.view(x.size(0), -1)
        h0 = torch.zeros(self.lstm_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.lstm_layers, x.size(0), self.hidden_size).to(x.device)
        out, __ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return x

In [None]:
class OCR_S(nn.Module):
    def __init__(self, charset_size, hidden_size, lstm_layers):
        super(OCR_S, self).__init__()

        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 6, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv2d(6, 15, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv2d(15, 42, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # For sequence-to-sequence, we'll use an LSTM cell as the decoder
        self.decoder_lstm = nn.LSTMCell(charset_size, hidden_size * 2)
        self.character_prob = nn.Linear(hidden_size * 2, charset_size)

    def forward(self, x, max_output_length):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.pool3(F.relu(self.conv3(x)))
        x = x.view(x.size(0), -1)
        #x = x.view(x.size(0), 1, -1)  # Reshape for LSTM input
        
        h0 = torch.zeros(x.size(0), self.hidden_size * 2).to(x.device)
        c0 = torch.zeros(x.size(0), self.hidden_size * 2).to(x.device)

        hidden_states = [h0]
        cell_states = [c0]
        predictions = []

        for _ in range(max_output_length):
            input = hidden_states[-1]
            h, c = self.decoder_lstm(input, (hidden_states[-1], cell_states[-1]))
            hidden_states.append(h)
            cell_states.append(c)
            pred = self.character_prob(h)
            predictions.append(pred)

        return torch.stack(predictions, dim=1)

In [None]:
class TransformerEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(TransformerEncoder, self).__init__()
        self.linear_q = nn.Linear(input_size, hidden_size)
        self.linear_k = nn.Linear(input_size, hidden_size)
        self.linear_v = nn.Linear(input_size, hidden_size)
        self.linear_x = nn.Linear(input_size, hidden_size)
        self.attention = nn.MultiheadAttention(hidden_size, num_heads=4, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size)
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size)
        )
        self.norm = nn.LayerNorm(hidden_size)
    
    def forward(self, x):
        q, k, v = self.linear_q(x), self.linear_k(x), self.linear_v(x)
        x = self.norm(self.linear_x(x) + self.attention(q, k, v))
        x = self.norm(x + self.fc(x))
        return x

class MultiLayerTransformer(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(MultiLayerTransformer, self).__init__()
        self.layers = nn.ModuleList([
            TransformerEncoder(input_size, hidden_size) for _ in range(num_layers)
        ])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text, entities = self.data[index]
        return text, entities

def custom_collate(batch):
    texts, entities = zip(*batch)
    return texts, entities

# Your example data
example_list = [
    ["String 1", ["A", "B", "C", "D"]],
    ["String 2", ["E", "F", "G", "H"]],
    ["String 3", ["D", "F", "G", "H"]],
    ["String 4", ["F", "F", "G", "H"]],
    ["String 5", ["G", "F", "G", "H"]],
    ["String 6", ["H", "F", "G", "H"]],
    ["String 7", ["I", "F", "G", "H"]],
    ["String 8", ["J", "F", "G", "H"]],
    # Add more entries as needed
]

custom_dataset = CustomDataset(example_list)

custom_dataloader = DataLoader(
    custom_dataset, batch_size=3, collate_fn=custom_collate
)

for batch_texts, batch_entities in custom_dataloader:
    print("Batch Texts:", batch_texts)
    print("Batch Entities:", batch_entities)

Batch Texts: ('String 1', 'String 2', 'String 3')
Batch Entities: (['A', 'B', 'C', 'D'], ['E', 'F', 'G', 'H'], ['D', 'F', 'G', 'H'])
Batch Texts: ('String 4', 'String 5', 'String 6')
Batch Entities: (['F', 'F', 'G', 'H'], ['G', 'F', 'G', 'H'], ['H', 'F', 'G', 'H'])
Batch Texts: ('String 7', 'String 8')
Batch Entities: (['I', 'F', 'G', 'H'], ['J', 'F', 'G', 'H'])


In [None]:
class ExSet(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        img, text, entities = self.data[index]
        return text, entities

def custom_collate(batch):
    texts, entities = zip(*batch)
    return texts, entities

In [None]:
train_set = ExSet(train_data)
val_set = ExSet(val_data)
test_set = ExSet(test_data)

In [None]:
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15
train_data, temp_data = train_test_split(records, test_size=(1 - train_ratio))
val_data, test_data = train_test_split(temp_data, test_size=test_ratio / (val_ratio + test_ratio))
print("Training set len:", len(train_data))
print("Validation set len:", len(val_data))
print("Test set len:", len(test_data))

In [None]:
if 'records' in globals():
    del records
gc.collect()

In [25]:
# Sample string
main_string = "This is a sample string containing a substring."

# Substring to find (with spaces)
substring = "sub string"

# Remove spaces from both main string and substring for comparison
main_string_no_spaces = main_string.replace(" ", "")
substring_no_spaces = substring.replace(" ", "")

# Get the index where the substring (ignoring spaces) occurs in the modified main string
index = main_string_no_spaces.find(substring_no_spaces)

if index != -1:
    # Calculate the adjusted index considering spaces in the original string
    adjusted_index = 0
    spaces_count = 0
    for i in range(len(main_string)):
        if main_string[i] != " ":
            adjusted_index += 1
        else:
            spaces_count += 1
        if adjusted_index - spaces_count == index:
            break

    print(f"The substring '{substring}' (ignoring spaces) first occurs at index: {adjusted_index}")
else:
    print(f"The substring '{substring}' (ignoring spaces) does not exist in the main string.")



The substring 'sub string' (ignoring spaces) first occurs at index: 37


In [26]:
main_string[37:]

'substring.'

In [29]:
# Sample string
main_string = "This is a sample string containing a substring."

# Substring to find (with spaces)
substring = "sub string"

# Remove spaces from both main string and substring for comparison
main_string_no_spaces = main_string.replace(" ", "")
substring_no_spaces = substring.replace(" ", "")

# Get the index where the substring (ignoring spaces) occurs in the modified main string
start_index = main_string_no_spaces.find(substring_no_spaces)

if start_index != -1:
    end_index = start_index + len(substring_no_spaces)

    # Calculate the adjusted indexes considering spaces in the original string
    adjusted_end_index = 0
    spaces_count = 0

    for i in range(len(main_string)):
        if main_string[i] != " ":
            adjusted_end_index += 1
        else:
            spaces_count += 1
        if adjusted_end_index - spaces_count == end_index:
            break

    print(f"The substring '{substring}' (ignoring spaces) starts at index: {start_index}")
    print(f"The substring '{substring}' (ignoring spaces) ends at index: {adjusted_end_index}")
else:
    print(f"The substring '{substring}' (ignoring spaces) does not exist in the main string.")


The substring 'sub string' (ignoring spaces) starts at index: 30
The substring 'sub string' (ignoring spaces) ends at index: 40


In [30]:
len(main_string)

47

In [5]:
import torchtext.legacy

ModuleNotFoundError: No module named 'torchtext.legacy'

In [6]:
import torchtext

In [7]:
from torchtext.legacy import data

ModuleNotFoundError: No module named 'torchtext.legacy'

In [None]:
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=custom_collate)
for texts, ents in train_loader:
    print(len(texts), len(ents))
    break

In [9]:
import torch.nn.functional as F

In [19]:
test = torch.rand(3, 4, 2)
test = F.softmax(test, dim=1)
print(test.shape)

torch.Size([3, 4, 2])


In [20]:
test

tensor([[[0.2850, 0.2242],
         [0.1947, 0.1796],
         [0.2689, 0.4012],
         [0.2514, 0.1949]],

        [[0.1820, 0.2570],
         [0.3742, 0.1824],
         [0.1954, 0.3900],
         [0.2483, 0.1707]],

        [[0.2207, 0.2231],
         [0.2863, 0.3196],
         [0.2681, 0.1791],
         [0.2250, 0.2782]]])

In [21]:
test = test.argmax(dim=2)

In [22]:
test

tensor([[0, 0, 1, 0],
        [1, 0, 1, 0],
        [1, 1, 0, 1]])

In [None]:
text_field = Field(sequential=True, tokenize=lambda x: x, include_lengths=True, batch_first=True, use_vocab=True)
name_field = Field(sequential=False)
date_field = Field(sequential=False)
address_field = Field(sequential=False)
total_field = Field(sequential=False)

# Create examples from dataset
data_examples = [Example.fromlist([text, name, date, address, total], fields=[
    ('text', text_field), ('name', name_field), ('date', date_field), ('address', address_field), ('total', total_field)
    ]) for text, (name, date, address, total) in records]
data_set = Dataset(data_examples, fields=[('text', text_field), ('name', name_field), ('date', date_field), ('address', address_field), ('total', total_field)])

In [None]:
# Build vocab (if necessary)
text_field.build_vocab(data_set)
name_field.build_vocab(data_set)
date_field.build_vocab(data_set)
address_field.build_vocab(data_set)


vocab_stoi = text_field.vocab.stoi # so we don't have to rewrite sample_sequence
vocab_itos = text_field.vocab.itos # so we don't have to rewrite sample_sequence
vocab_size = len(text_field.vocab.itos)
vocab_size

In [5]:
def character_error_rate(reference, hypothesis):
    """
    Computes the Character Error Rate (CER) between two strings.

    Args:
    reference (str): The reference string.
    hypothesis (str): The hypothesis string.

    Returns:
    float: Character Error Rate between the two strings (between 0 and 1).
    """
    # Ensure strings are not empty
    if len(reference) == 0 and len(hypothesis) == 0:
        return 0.0
    elif len(reference) == 0 or len(hypothesis) == 0:
        return 1.0
    
    # Initialize variables for CER calculation
    n = len(reference)
    m = len(hypothesis)
    dp = [[0] * (m + 1) for _ in range(n + 1)]

    # Initialize DP matrix
    for i in range(n + 1):
        dp[i][0] = i

    for j in range(m + 1):
        dp[0][j] = j

    # Fill DP matrix
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)

    return dp[n][m] / n  # Normalized CER between 0 and 1


In [6]:
character_error_rate('sjlskclx', 'skdhjsoiucx')

1.0

In [4]:
def cer(reference, hypothesis):
    # Convert the sentences into character lists
    ref = list(reference)
    hyp = list(hypothesis)

    # Create a matrix of size (len(ref)+1) x (len(hyp)+1)
    d = np.zeros((len(ref) + 1) * (len(hyp) + 1), dtype=np.uint32)
    d = d.reshape((len(ref) + 1, len(hyp) + 1))

    # Initialize the first row and column to be the distance from the empty string
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j

    # Populate the rest of the matrix
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            if ref[i - 1] == hyp[j - 1]:
                cost = 0
            else:
                cost = 1
            d[i][j] = min(d[i - 1][j] + 1,      # deletion
                          d[i][j - 1] + 1,      # insertion
                          d[i - 1][j - 1] + cost)  # substitution

    # The CER is the cost of transforming hypothesis into reference divided by the number of characters in the reference
    cer_value = float(d[len(ref)][len(hyp)]) / len(ref)

    return cer_value

In [5]:
import numpy as np
cer("i alsiwoclzjvcx", "dkss_dsl iohk lsks")

1.0666666666666667