In [None]:
import pandas as pd
import os
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import string
import matplotlib.pyplot as plt

# Report GPU availability and pick the device used for training.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
    # get_device_name raises when no CUDA device is usable, so only query
    # it when one is present; otherwise the CPU fallback below is never reached.
    print(torch.cuda.get_device_name(0))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on device {device}")

# Configuration constants
CSV_PATH = "hpt_dataset.csv"   # CSV file loaded into the dataframe
IMAGE_SIZE = (64, 64)          # size every image is resized to
BATCH_SIZE = 128               # samples per DataLoader batch
EPOCHS = 5                     # number of passes over the training set
LEARNING_RATE = 1e-3           # learning rate handed to the optimizer
MAX_WORD_LENGTH = 20           # words are padded / truncated to this length

# Load the dataset index from the CSV file.
# NOTE(review): read_csv's default NA parsing turns cells such as "NA" or
# "NULL" into NaN — confirm no labels in the 'word' column are affected.
df = pd.read_csv(CSV_PATH)

# Character vocabulary: A-Z first, then the padding and unknown-character tokens
char_to_idx = {
    symbol: position
    for position, symbol in enumerate([*string.ascii_uppercase, '<PAD>', '<UNK>'])
}
# Indices equal insertion positions, so enumerating the keys inverts the mapping
idx_to_char = dict(enumerate(char_to_idx))

# Word encoding helper
def encode_word(word, max_length=MAX_WORD_LENGTH):
    """Turn *word* into a list of exactly ``max_length`` character indices.

    The word is upper-cased, characters missing from ``char_to_idx`` map to
    ``<UNK>``, short words are right-padded with ``<PAD>`` and long words are
    truncated.
    """
    unknown = char_to_idx['<UNK>']
    indices = [char_to_idx.get(symbol, unknown) for symbol in word.upper()]
    missing = max_length - len(indices)
    if missing > 0:
        indices.extend([char_to_idx['<PAD>']] * missing)  # right-pad
    return indices[:max_length]

# Word decoding helper
def decode_word(encoded_word):
    """Map a sequence of character indices back to text, dropping ``<PAD>``."""
    pad_index = char_to_idx['<PAD>']
    symbols = []
    for index in encoded_word:
        if index == pad_index:
            continue
        symbols.append(idx_to_char[index])
    return ''.join(symbols)

# Dataset
class WordDataset(Dataset):
    """Dataset yielding ``(image, encoded_word)`` pairs from dataframe rows.

    Each row supplies the image location in its ``'path'`` field and the
    label text in its ``'word'`` field.
    """

    def __init__(self, dataframe, transform=None, max_length=MAX_WORD_LENGTH):
        self.dataframe, self.transform, self.max_length = dataframe, transform, max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sample = self.dataframe.iloc[idx]
        location, text = sample['path'], sample['word']

        # Load the image as single-channel grayscale
        picture = Image.open(location).convert('L')
        if self.transform:
            picture = self.transform(picture)

        # Encode the label as a fixed-length index tensor
        target = torch.tensor(encode_word(text, self.max_length), dtype=torch.long)
        return picture, target

# Image transforms
# Training-time augmentation. No horizontal flip: mirroring a word image
# reverses the letters while the label stays the same, which corrupts the
# targets. Borders exposed by translation are filled with white (255) to
# match the fill already used for rotation.
transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.RandomRotation(10, fill=255),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), fill=255),
    transforms.RandomResizedCrop(IMAGE_SIZE, scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

# Deterministic preprocessing for validation/test so that reported metrics
# are not perturbed by random augmentation.
eval_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

# Data split: 20% test, then 12.5% of the remaining 80% (10% overall) for
# validation, leaving 70% for training.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.125, random_state=42)

# Datasets and DataLoaders (only the training set is augmented and shuffled)
dataset_train = WordDataset(train_df, transform=transform)
dataset_val = WordDataset(val_df, transform=eval_transform)
dataset_test = WordDataset(test_df, transform=eval_transform)

dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False)

# CNN + LSTM model
class CNN_LSTM(nn.Module):
    """Convolutional feature extractor followed by an LSTM character decoder.

    The CNN takes a single-channel image and halves the spatial size three
    times; the LSTM input size of ``256 * 8 * 8`` therefore corresponds to
    64x64 inputs. The flattened feature vector is fed to the LSTM at every
    one of ``max_length`` time steps, and a linear layer maps each hidden
    state to ``num_chars`` logits.

    Args:
        num_chars: Number of output classes per character position.
        hidden_size: LSTM hidden state size.
        num_layers: Number of stacked LSTM layers.
        max_length: Number of character positions predicted per image.
    """

    def __init__(self, num_chars, hidden_size, num_layers, max_length):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Flatten()
        )
        self.lstm = nn.LSTM(256 * 8 * 8, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_chars)
        self.max_length = max_length

    def forward(self, x):
        """Return logits of shape ``(batch, max_length, num_chars)``."""
        # Extract one feature vector per image
        features = self.cnn(x)
        # Repeat the features so the LSTM receives them at every position
        features = features.unsqueeze(1).repeat(1, self.max_length, 1)

        # nn.LSTM starts from zero hidden/cell states when none are given and
        # allocates them on the input's device, so the module does not depend
        # on a module-level `device` variable.
        out, _ = self.lstm(features)
        return self.fc(out)

# Model setup
num_chars = len(char_to_idx)  # vocabulary size (letters + <PAD> + <UNK>)
hidden_size = 256
num_layers = 2
model = CNN_LSTM(
    num_chars=num_chars,
    hidden_size=hidden_size,
    num_layers=num_layers,
    max_length=MAX_WORD_LENGTH,
).to(device)

# Loss (padding positions are excluded), optimizer and LR scheduler
criterion = nn.CrossEntropyLoss(ignore_index=char_to_idx['<PAD>'])
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5)

# Accuracy helper
def calculate_accuracy(model, dataloader, pad_idx=None):
    """Return the per-character accuracy of *model* over *dataloader*.

    Padding positions are excluded from both the numerator and the
    denominator, mirroring the loss, which ignores ``<PAD>`` targets; the
    model is never trained to emit padding, so counting those positions
    would only deflate the score.

    Args:
        model: Module mapping an image batch to logits of shape
            ``(batch, seq_len, num_classes)``.
        dataloader: Iterable of ``(images, labels)`` batches, with labels of
            shape ``(batch, seq_len)``.
        pad_idx: Label index treated as padding. Defaults to
            ``char_to_idx['<PAD>']``.

    Returns:
        Fraction of non-padding characters predicted correctly, or ``0.0``
        when there are no such characters (e.g. an empty dataloader).
    """
    if pad_idx is None:
        pad_idx = char_to_idx['<PAD>']

    # Run on whatever device the model lives on (CPU for parameter-less models)
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(target_device), labels.to(target_device)
            predicted = model(images).argmax(dim=2)
            mask = labels != pad_idx  # real characters only
            total += mask.sum().item()
            correct += ((predicted == labels) & mask).sum().item()
    return correct / total if total else 0.0

# Per-epoch history collected during training
train_losses, val_accuracies, test_accuracies = [], [], []

# Model training
def train_model():
    """Train the global ``model`` for ``EPOCHS`` epochs.

    After each epoch the mean training loss and the validation/test
    accuracies are appended to ``train_losses``, ``val_accuracies`` and
    ``test_accuracies`` and printed, and the learning-rate scheduler is
    stepped on the validation error.
    """
    model.to(device)
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        for images, labels in dataloader_train:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            # Flatten (batch, seq, classes) so every character is one sample
            loss = criterion(outputs.view(-1, num_chars), labels.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Mean loss per batch; guard against an empty training loader
        avg_loss = total_loss / max(len(dataloader_train), 1)
        train_losses.append(avg_loss)

        # Accuracy on the validation and test sets
        val_accuracy = calculate_accuracy(model, dataloader_val)
        test_accuracy = calculate_accuracy(model, dataloader_test)
        val_accuracies.append(val_accuracy)
        test_accuracies.append(test_accuracy)

        # ReduceLROnPlateau only acts when stepped with a metric. The
        # scheduler runs in 'min' mode, so feed it the validation error.
        scheduler.step(1.0 - val_accuracy)

        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}, "
              f"Val Accuracy: {val_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")

# Prediction
def predict_word(image):
    """Return the word the global ``model`` predicts for one image tensor.

    A batch dimension is added to ``image``, the most likely class is taken
    at every position, and the indices are decoded with ``decode_word``.
    """
    model.eval()
    with torch.no_grad():
        batch = image.to(device).unsqueeze(0)
        logits = model(batch)
        best_indices = torch.argmax(logits, dim=2).squeeze(0)
        return decode_word(best_indices.cpu().numpy())

# Training
train_model()

# Example usage: predict only after training, so the printed word reflects
# the trained weights rather than a randomly initialised network.
image, _ = dataset_train[0]
predicted_word = predict_word(image)
print(f"Predicted: {predicted_word}")

# Training loss plot
# The x-axis is derived from the number of recorded values, not EPOCHS, so
# the plots still work when training stopped early (e.g. was interrupted).
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_losses) + 1), train_losses, label="Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss Over Epochs")
plt.legend()
plt.show()

# Validation and test accuracy plot
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(val_accuracies) + 1), val_accuracies, label="Validation Accuracy")
plt.plot(range(1, len(test_accuracies) + 1), test_accuracies, label="Test Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Validation and Test Accuracy Over Epochs")
plt.legend()
plt.show()

True
1
NVIDIA GeForce GTX 1650
Training on device cuda
Predicted: KKKKKKKKKKKKKKKKKKKK
Epoch 1/50, Loss: 2.9684
Epoch 2/50, Loss: 2.8965
Epoch 3/50, Loss: 2.8828


KeyboardInterrupt: 