In [1]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F

Determine whether CUDA is available

In [2]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

Load and preprocess the dataset

In [3]:

# Define the custom dataset
class OCRDataset(Dataset):
    """Dataset of (text file, Excel file) pairs encoded as flat bit vectors.

    Each item is a pair ``(x_data, y_data)`` of 1-D float tensors of length
    ``bin_len``. ``x_data`` encodes the contents of the ``.txt`` file and
    ``y_data`` encodes ``DataFrame.to_string(index=False)`` of the matching
    Excel file. Both texts are truncated or space-padded to ``max_len``
    characters, and every character contributes exactly 8 bits (most
    significant bit first).

    Args:
        data_pairs: Sequence of ``(txt_path, excel_path)`` tuples.
        max_len: Number of characters every text is truncated/padded to.
        bin_len: Length of the returned bit vectors.
    """

    # Number of bits emitted per character.
    BITS_PER_CHAR = 8
    # Stand-in for characters whose code point does not fit in one byte.
    FALLBACK_CHAR = '?'

    def __init__(self, data_pairs, max_len, bin_len):
        self.data_pairs = data_pairs
        self.max_len = max_len
        self.bin_len = bin_len

    def __len__(self):
        """Return the number of (txt, xlsx) pairs."""
        return len(self.data_pairs)

    def __getitem__(self, idx):
        """Load pair ``idx`` from disk and return ``(x_data, y_data)`` tensors."""
        txt_path, excel_path = self.data_pairs[idx]

        # Read the text file
        with open(txt_path, 'r') as file:
            x_text = file.read()

        # Read the Excel file and render it as plain text
        y_frame = pd.read_excel(excel_path)
        y_text = y_frame.to_string(index=False)

        # Pad or truncate the text data to max_len
        x_text = x_text[:self.max_len].ljust(self.max_len)
        y_text = y_text[:self.max_len].ljust(self.max_len)

        # Convert characters to their binary values
        return self.text_to_binary(x_text), self.text_to_binary(y_text)

    def text_to_binary(self, text):
        """Encode ``text`` as a float tensor of exactly ``bin_len`` bits.

        Every character becomes 8 bits, most significant bit first. Characters
        whose code point exceeds 255 are replaced by ``FALLBACK_CHAR``;
        formatting them directly would emit more than 8 bits and shift every
        following character out of alignment. The result is truncated to
        ``bin_len`` and zero-padded if the text is too short to fill it.

        Args:
            text: String to encode.

        Returns:
            1-D ``torch.float`` tensor of length ``bin_len`` containing 0.0/1.0.
        """
        max_code = (1 << self.BITS_PER_CHAR) - 1
        fallback_code = ord(self.FALLBACK_CHAR)

        binary_data = []
        for char in text:
            code = ord(char)
            if code > max_code:
                code = fallback_code
            binary_data.extend(int(bit) for bit in format(code, '08b'))

        binary_data = binary_data[:self.bin_len]  # Ensure it is at most bin_len long
        # Pad with zero bits so every item has the same length and can be batched.
        binary_data.extend([0] * (self.bin_len - len(binary_data)))
        return torch.tensor(binary_data, dtype=torch.float)

# Directory containing the subdirectories
# NOTE: machine-specific absolute path; adjust it for your environment.
data_directory = '/home/laptop/ECGR_5105/Project/random_tables'

# Initialize list for valid data pairs
valid_data_pairs = []

# Loop through each subdirectory and find pairs of txt and xlsx files.
# os.listdir returns entries in arbitrary order, so both levels are sorted;
# otherwise the seeded split below would not be reproducible across runs.
for subdir in sorted(os.listdir(data_directory)):
    subdir_path = os.path.join(data_directory, subdir)
    if not os.path.isdir(subdir_path):
        continue
    for filename in sorted(os.listdir(subdir_path)):
        if not filename.endswith(".txt"):
            continue
        txt_path = os.path.join(subdir_path, filename)
        # Swap only the trailing extension; str.replace would also rewrite a
        # ".txt" that appears earlier in the file name.
        excel_filename = filename[:-len(".txt")] + ".xlsx"
        excel_path = os.path.join(subdir_path, excel_filename)

        if os.path.exists(excel_path):
            valid_data_pairs.append((txt_path, excel_path))

print(f"Total valid data pairs: {len(valid_data_pairs)}")

# Fail early with a clear message instead of an unrelated error further down.
if not valid_data_pairs:
    raise ValueError(f"No matching .txt/.xlsx pairs found under {data_directory!r}")

# Split the data into training and testing sets
train_data_pairs, test_data_pairs = train_test_split(valid_data_pairs, test_size=0.2, random_state=42)

# Define maximum sequence length and binary length.
# max_len is the longest *input* text; the dataset truncates targets to the
# same length, so a longer Excel rendering is cut off at max_len characters.
max_len = 0
for pair in valid_data_pairs:
    # Use a context manager so each file handle is closed right away.
    with open(pair[0], 'r') as txt_file:
        max_len = max(max_len, len(txt_file.read()))
bin_len = max_len * 8  # Each character is represented by 8 bits

# Create datasets
train_dataset = OCRDataset(train_data_pairs, max_len, bin_len)
test_dataset = OCRDataset(test_data_pairs, max_len, bin_len)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print("Data loading completed successfully.")



Total valid data pairs: 1249
Data loading completed successfully.


In [4]:
class ComplexCNN(nn.Module):
    """1-D convolutional network mapping a flat vector to a flat vector.

    The input of shape ``(batch, input_size)`` gets a channel dimension and
    passes through three stages of Conv1d -> ReLU -> BatchNorm1d ->
    max-pool(2). Each pooling halves the sequence length, so the flattened
    feature size fed to the first linear layer is ``256 * (input_size // 8)``.
    Two linear layers (with ReLU, batch norm and dropout in between) produce
    an output of shape ``(batch, output_size)``.

    Args:
        input_size: Length of each input vector.
        output_size: Length of each output vector.
        dropout_prob: Dropout probability applied before the last layer.
    """

    def __init__(self, input_size, output_size, dropout_prob=0.3):
        super().__init__()

        # Convolutional stages: 1 -> 64 -> 128 -> 256 channels
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, padding=1)
        self.batch_norm1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.batch_norm2 = nn.BatchNorm1d(128)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.batch_norm3 = nn.BatchNorm1d(256)

        # Three max-pools of width 2 shrink the length by a factor of 8
        pooled_len = input_size // 8
        self.fc1 = nn.Linear(256 * pooled_len, 128)
        self.batch_norm4 = nn.BatchNorm1d(128)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(128, output_size)

    def forward(self, x):
        """Run the network on ``x`` of shape ``(batch, input_size)``."""
        features = x.unsqueeze(1)  # Add channel dimension

        conv_stages = (
            (self.conv1, self.batch_norm1),
            (self.conv2, self.batch_norm2),
            (self.conv3, self.batch_norm3),
        )
        for conv, norm in conv_stages:
            activated = F.relu(conv(features))
            features = F.max_pool1d(norm(activated), 2)

        # Flatten channels and positions into one feature axis per sample
        flat = features.view(features.size(0), -1)
        hidden = self.batch_norm4(F.relu(self.fc1(flat)))
        return self.fc2(self.dropout(hidden))

In [5]:
# Build the network (inputs and targets are both bin_len long), report its
# parameter count, then move it to the selected device.
try:
    model = ComplexCNN(bin_len, output_size=bin_len)
    print(f"Model initialized successfully with {sum(p.numel() for p in model.parameters())} parameters.")
    model = model.to(device)
    print("Model moved to device successfully.")
except RuntimeError as e:
    # Show the message, then re-raise so the cell still fails visibly
    print(f"Error when moving model to device: {e}")
    raise


Model initialized successfully with 50926200 parameters.
Model moved to device successfully.


In [6]:
# Mean-squared-error loss: the output is treated as a regression target
criterion = nn.MSELoss()

# AdamW over all model parameters
optimizer = optim.AdamW(params=model.parameters(), lr=1e-3)

# Halve the learning rate once the monitored value (passed to scheduler.step)
# has stopped decreasing for more than `patience` consecutive steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=10,
    verbose=True,
)



In [7]:
# Function to train the model
def train_model(model, train_loader, criterion, optimizer, scheduler, num_epochs=50, val_loader=None):
    """Train ``model`` and track the mean per-batch loss of every epoch.

    Each epoch runs one pass over ``train_loader`` (with gradient-norm
    clipping at 1.0), then one no-grad pass over ``val_loader``. The mean
    validation loss is passed to ``scheduler.step`` and both losses are
    printed.

    Args:
        model: Network to train; batches are moved to the device of its
            parameters.
        train_loader: Sized iterable of ``(x_batch, y_batch)`` tensors.
        criterion: Loss function called as ``criterion(outputs, y_batch)``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        num_epochs: Number of epochs to run.
        val_loader: Sized iterable of validation batches. When ``None`` the
            notebook-level ``test_loader`` is used, as in earlier versions.

    Returns:
        Tuple ``(train_losses, val_losses)`` with one float per epoch each.

    Raises:
        ValueError: If either loader has no batches.
    """
    if val_loader is None:
        # Backward compatibility: this function used to read the global directly.
        val_loader = test_loader

    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    # Follow the model's own device instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(run_device).float(), y_batch.to(run_device).float()
            optimizer.zero_grad()
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            # Cap the gradient norm to keep individual updates bounded
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            running_loss += loss.item()

        epoch_loss = running_loss / len(train_loader)
        train_losses.append(epoch_loss)

        # Validation loss
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(run_device).float(), y_batch.to(run_device).float()
                outputs = model(x_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()

        val_loss /= len(val_loader)
        val_losses.append(val_loss)

        scheduler.step(val_loss)

        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}')

    return train_losses, val_losses


In [8]:
# Function to convert binary values back to text
def binary_to_text(binary_values):
    """Decode a flat array of bits into text, 8 bits per character.

    Values are cast to ``int`` and clipped to the range 0..1 before decoding.
    Without the clip, a rounded model output such as ``2`` or ``-1`` would
    produce a string that ``int(..., 2)`` rejects with ``ValueError``.

    Args:
        binary_values: 1-D NumPy array of bit values, most significant bit
            first within each group of 8. A trailing group shorter than
            8 bits is decoded as-is.

    Returns:
        The decoded string (empty for an empty array).
    """
    bits = binary_values.astype(int).clip(0, 1)
    chars = []
    for start in range(0, len(bits), 8):
        bit_string = ''.join(map(str, bits[start:start + 8]))
        chars.append(chr(int(bit_string, 2)))
    return ''.join(chars)

# Function to evaluate the model and print predictions
def evaluate_model(model, data_loader, max_examples=None):
    """Print input, predicted and actual text for samples in ``data_loader``.

    The model runs in eval mode without gradients. Predictions are clamped to
    [0, 1] and rounded so that every value is a valid bit before decoding;
    the raw outputs are unbounded and can otherwise round to values such as
    2 or -1, which cannot be decoded.

    Args:
        model: Trained network; batches are moved to the device of its
            parameters.
        data_loader: Iterable of ``(x_batch, y_batch)`` tensors of bits.
        max_examples: Optional cap on the number of samples printed.
            ``None`` (the default) prints every sample.
    """
    # Follow the model's own device instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    printed = 0
    model.eval()
    with torch.no_grad():
        for x_batch, y_batch in data_loader:
            x_batch, y_batch = x_batch.to(run_device).float(), y_batch.to(run_device).float()
            predicted_bits = model(x_batch).clamp(0, 1).round()

            for i in range(len(x_batch)):
                if max_examples is not None and printed >= max_examples:
                    return
                input_text = binary_to_text(x_batch[i].cpu().numpy())
                predicted_text = binary_to_text(predicted_bits[i].cpu().numpy())
                actual_text = binary_to_text(y_batch[i].cpu().numpy())
                print(f"Input: {input_text}")
                print(f"Predicted: {predicted_text}")
                print(f"Actual: {actual_text}\n")
                printed += 1


In [9]:
# Run the training loop and keep the per-epoch losses; on a runtime error
# (CUDA errors included) show the message and re-raise so the cell fails.
try:
    train_losses, val_losses = train_model(
        model,
        train_loader,
        criterion,
        optimizer,
        scheduler,
        num_epochs=150,
    )
except RuntimeError as e:
    print(f"Error during training: {e}")
    raise

Error during training: CUDA out of memory. Tried to allocate 188.00 MiB. GPU 0 has a total capacity of 5.78 GiB of which 36.06 MiB is free. Process 821772 has 3.75 GiB memory in use. Including non-PyTorch memory, this process has 1.35 GiB memory in use. Of the allocated memory 1.23 GiB is allocated by PyTorch, and 11.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


OutOfMemoryError: CUDA out of memory. Tried to allocate 188.00 MiB. GPU 0 has a total capacity of 5.78 GiB of which 36.06 MiB is free. Process 821772 has 3.75 GiB memory in use. Including non-PyTorch memory, this process has 1.35 GiB memory in use. Of the allocated memory 1.23 GiB is allocated by PyTorch, and 11.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Plotting the training loss and validation loss
# Two side-by-side panels: training loss on the left, validation loss on the right
fig, (train_ax, val_ax) = plt.subplots(1, 2, figsize=(12, 5))

train_ax.plot(train_losses, label='Train Loss')
train_ax.set_title('Training Loss')
train_ax.set_xlabel('Epochs')
train_ax.set_ylabel('Loss')
train_ax.legend()

val_ax.plot(val_losses, label='Validation Loss')
val_ax.set_title('Validation Loss')
val_ax.set_xlabel('Epochs')
val_ax.set_ylabel('Loss')
val_ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# Evaluate the model: print the decoded input, prediction and target text
# for the samples in the test loader
evaluate_model(model, test_loader)