In [2]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from torchvision import models
import time
import os
from torchvision.io import read_image
from PIL import Image
import torch.nn.functional as F

In [3]:
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # Resize images
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalizing the images
])

In [4]:
# Load the training and testing datasets
train_dataset = datasets.ImageFolder(root='../dataset/training', transform=transform)
test_dataset = datasets.ImageFolder(root='../dataset/testing', transform=transform)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [5]:
class BasicCNN(nn.Module):
    def __init__(self, num_classes):
        super(BasicCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        # Assuming your input images are 128x128, after three pooling layers, the size will be 128/(2^3) = 16
        # Thus, the output of the last conv layer would be [batch_size, 64, 16, 16]
        self.fc = nn.Linear(64 * 16 * 16, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(-1, 64 * 16 * 16)  # Flatten the output for the fully connected layer
        x = self.fc(x)
        return x

In [6]:
num_classes = len(train_dataset.classes)
model = BasicCNN(num_classes)

In [7]:
# Load the saved model state
model.load_state_dict(torch.load('../models/PhotoLingo_Base_v1.pth'))

# Put the model in evaluation mode if you are doing inference
model.eval()

BasicCNN(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=16384, out_features=5, bias=True)
)

In [8]:
# Checking to make sure we are using our GPU instead of CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [9]:
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the model on the test images: {100 * correct / total}%')



Accuracy of the model on the test images: 94.26944813829788%


In [10]:
import torch
import numpy as np
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

model.eval()  # Set model to evaluation mode
y_pred = []
y_true = []

# No gradients needed for evaluation
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        y_pred.extend(predicted.cpu().numpy())  # Append batch predictions
        y_true.extend(labels.cpu().numpy())  # Append true labels

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)

# Calculate precision, recall, F1-score and support for each class
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true, y_pred, average=None)

# Calculate overall accuracy
accuracy = np.sum(np.diag(conf_matrix)) / np.sum(conf_matrix)


print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Accuracy: {accuracy}%')
print(f'Precision (per class): {precision}')
print(f'Recall (per class): {recall}')
print(f'F1 Score (per class): {f1_score}')



Confusion Matrix:
[[ 1285     1    11    10   103]
 [    0  1114     6     8    52]
 [    8     7  1422    66   279]
 [    9     4    63  1731   229]
 [   88    35   239   161 17133]]
Accuracy: 0.9426944813829787%
Precision (per class): [0.92446043 0.95951766 0.81677197 0.87601215 0.96274444]
Recall (per class): [0.91134752 0.9440678  0.7979798  0.85019646 0.97037834]
F1 Score (per class): [0.91785714 0.95173003 0.80726653 0.86291127 0.96654632]


In [11]:
# Corrected specificity calculation for each class
specificity = []
num_classes = len(conf_matrix)  # Assuming this is 5 in your case
for i in range(num_classes):
    true_negative = np.sum(conf_matrix) - np.sum(conf_matrix[i, :]) - np.sum(conf_matrix[:, i]) + conf_matrix[i, i]
    false_positive = np.sum(conf_matrix[:, i]) - conf_matrix[i, i]
    specificity_class_i = true_negative / (true_negative + false_positive) if (true_negative + false_positive) != 0 else 0
    specificity.append(specificity_class_i)

# Rest of the code remains the same

# Printing results
print(f'Specificity (per class): {np.array(specificity)}')
print(f'Specificity (avg): {np.mean(np.array(specificity))}')

Specificity (per class): [0.99536506 0.99794616 0.98568351 0.98887779 0.89653558]
Specificity (avg): 0.9728816207963551


In [13]:
class UnlabeledDataset(Dataset):
    def __init__(self, directory, transform=None):
        self.directory = directory
        self.transform = transform
        self.images = os.listdir(directory)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = os.path.join(self.directory, self.images[idx])
        image = Image.open(img_path).convert('RGB')  # Load as PIL Image
        if self.transform:
            image = self.transform(image)
        return image, self.images[idx]

In [20]:
model.to(device)
model.eval()

test_dataset = UnlabeledDataset('../dataset/testing_ICDAR', transform=transform)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

predictions = []
for inputs, image_names in test_loader:
    inputs = inputs.to(device)
    outputs = model(inputs)
    _, preds = torch.max(outputs, 1)
    
    class_names = ['Arabic', 'Hindi', 'Japanese', 'Korean', 'Latin'] 
    predicted_classes = [class_names[p] for p in preds.cpu()]
    
    for img_name, prediction in zip(image_names, predicted_classes):
        predictions.append(f"{img_name},{prediction}")

In [21]:
with open('predictions.txt', 'w') as f:
    for line in predictions:
        f.write(line + '\n')