In [1]:
from networks import *
from data_handling import *
import pandas as pd
import numpy as np
from PIL import Image
import os
import torch
from torchvision import transforms
from torchvision.models.segmentation import deeplabv3_resnet50
from torch import nn
from skimage.filters import threshold_sauvola
from sklearn.metrics import precision_score,recall_score,f1_score,jaccard_score
import random

random.seed(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


***

## Function definitions

In [2]:
# This is a necessary transformation that has to be done on the images for the model to predict inputs
patches_transforms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)

def predict_input(pages:list, model:nn.Module):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    for i,page in enumerate(pages):
        for j,patch in enumerate(page.grid): # All pages are divided into smaller patches, it is these patches that we actually predict they are then resynthesised into the larger image

            patch_img = Image.fromarray(patch.img.astype('uint8'),'RGB')

            transformed_image = patches_transforms(patch_img)
            input_img = transformed_image.unsqueeze(0)
            input_img= input_img.to(device)

            with torch.no_grad(): # with no grad reduces runtime as no gradients are being monitored
                output = model(input_img)['out'][0]
            output_pred = output.argmax(0).byte()
            page.grid[j].output = output_pred.cpu().numpy() #retrieves the prediction from device
        pages[i].coarse_segmentation=page.reconstruct_prediction() # Recreates the image and stores it in the instance that was sent in.

def evaluate_models(model:nn.Module, weights_path:str,testing_pages:list,thresholding_technique,name:str):
    """Evaluates a model with different weights on a set of pages.
    Args:
        model (module.nn): an empty model with no weights
        weights_path (str): path to a directory full of weights for specified model
        testing_pages (list): a list containing the pages class
        thresholding_technique (lambda): a lambda function
        name (string): used to name the outputted file

    Returns:
        dictionary: contains f1-score,precision,recall and IoU for each trial
    """
    output = {'f1_score':[],'precision':[],'recall':[],'iou':[]}
    model_weights = os.listdir(weights_path)

    for weight in model_weights: # for each presaved weight test it on the pages
        model.load_state_dict(torch.load(weights_path+"/"+weight))
        predict_input(testing_pages,model)
        preds_gt = [[],[]]

        for i in range(len(testing_pages)):
            refine_image(testing_pages[i],thresholding_technique)
            preds_gt[0].append(testing_pages[i].refined_segmentation)
            preds_gt[1].append(testing_pages[i].precise_gt)

        pred_global = np.array(preds_gt[0]).flatten()
        gt_global = np.array(preds_gt[1]).flatten()
        # Uses scipy to calculate all the metrics sought after
        output['f1_score'].append(f1_score(gt_global,pred_global,average="weighted"))
        output['recall'].append(recall_score(gt_global,pred_global,average="weighted"))
        output['precision'].append(precision_score(gt_global,pred_global,average="weighted"))
        output['iou'].append(jaccard_score(gt_global,pred_global,average="weighted"))

    pd.DataFrame(output).to_csv(f'{name}output.csv')
    print(f"metrics saved to '{name}output.csv'")
    return output

def evaluate_models_by_manuscript(model:nn.Module, weights_path:str,num_pages:int,thresholding_technique,name:str):
    """Evaluates a model with different weights on the different manuscripts

    Args:
        model (module.nn): an empty model with no weights
        weights_path (str): path to a directory full of weights for specified model
        num_pages (int): number of pages per manuscript to evaluate on can be between 1-10, set to ten most of the time
        thresholding_technique (lambda): a lambda function
        name (string): used to name the outputted files
    """
    manuscripts = {"CS18":{}, "CS863":{}, "CB55":{}}
    for key in manuscripts.keys():
        manuscripts[key] = {'f1_score':[],'precision':[],'recall':[],'iou':[]}

    model_weights = os.listdir(weights_path) #should contain presaved weights for the model in question

    for weight in model_weights: #for each pretraiend weight test it on these pages
        model.load_state_dict(torch.load(weights_path+"/"+weight))
        preds_gt = {}
        for key in manuscripts.keys():
            preds_gt[key] = [[],[]]
            _,testing_pages,_ = generate_set(patch_size=224,resize=(1120, 1344),from_folders=(False,True,False),num_pages=num_pages,manuscripts=[key]) #returns a set of pages from the manuscript being evaluated
            predict_input(testing_pages,model)
            for i in range(len(testing_pages)):
                refine_image(testing_pages[i],thresholding_technique)
                preds_gt[key][0].append(testing_pages[i].refined_segmentation)
                preds_gt[key][1].append(testing_pages[i].precise_gt)

            pred_global = np.array(preds_gt[key][0]).flatten()
            gt_global = np.array(preds_gt[key][1]).flatten()
            manuscripts[key]['f1_score'].append(f1_score(gt_global,pred_global,average="weighted"))
            manuscripts[key]['recall'].append(recall_score(gt_global,pred_global,average="weighted"))
            manuscripts[key]['precision'].append(precision_score(gt_global,pred_global,average="weighted"))
            manuscripts[key]['iou'].append(jaccard_score(gt_global,pred_global,average="weighted"))
    for key in manuscripts.keys():
        pd.DataFrame(manuscripts[key]).to_csv(f'{name}_output_{key}.csv')
        print(f"metrics saved to '{name}_output_{key}.csv'")


## Testing
The cell below contains the parameters which can be adjusted during the test, threshold_phan and sauv are the thresholding function and each of them have parameters which can be adjusted

In [7]:
threshold_phan = lambda img: phansalkar(img=img,n=15,p=0.3,q=3,k=0.15,R=0.5)
threshold_sauv = lambda img: threshold_sauvola(img, window_size=15,k =0.05)
num_pages = 10 # any value between 1-10
_,testing_pages,_ = generate_set(patch_size=224,resize=(1120, 1344),from_folders=(False,True,False),num_pages=num_pages,filepath="D:/Skola/Avancerad AI/")
outputs = {}

TypeError: generate_set() got an unexpected keyword argument 'filepath'

In [4]:
#Saves all the metrics retrieved in several csv files. This is tedious but we could not find a neat solution
model = UNet(n_class=4)
model_weights_file_path = "D:\Hämtade Filer\models-20240320T180531Z-001\models"
evaluate_models_by_manuscript(model,model_weights_file_path,num_pages,threshold_phan,name="UNet_phan")
evaluate_models_by_manuscript(model,model_weights_file_path,num_pages,threshold_sauv,name="UNet_sauv")

model = deeplabv3_resnet50(num_classes=4)
model_weights_file_path = "modelsDeeplab"
evaluate_models_by_manuscript(model,model_weights_file_path,num_pages,threshold_phan,name="DeepLabV3_phan")
evaluate_models_by_manuscript(model,model_weights_file_path,num_pages,threshold_sauv,name="DeepLabV3_sauv")


metrics saved to 'UNet_phan_output_CS18.csv'
metrics saved to 'UNet_phan_output_CS863.csv'
metrics saved to 'UNet_phan_output_CB55.csv'
metrics saved to 'UNet_sauv_output_CS18.csv'
metrics saved to 'UNet_sauv_output_CS863.csv'
metrics saved to 'UNet_sauv_output_CB55.csv'
metrics saved to 'DeepLabV3_phan_output_CS18.csv'
metrics saved to 'DeepLabV3_phan_output_CS863.csv'
metrics saved to 'DeepLabV3_phan_output_CB55.csv'
metrics saved to 'DeepLabV3_sauv_output_CS18.csv'
metrics saved to 'DeepLabV3_sauv_output_CS863.csv'
metrics saved to 'DeepLabV3_sauv_output_CB55.csv'


The analysis is done on the overall manuscript for the sake of brevity and to fit within the time constraint, it would be interesting to examine each dataset (manuscript) by itself

In [5]:
model = deeplabv3_resnet50(num_classes=4)
model_weights_file_path = "modelsDeeplab"
outputs['DP']=evaluate_models(model,model_weights_file_path,testing_pages,threshold_phan,name="DeepLabV3_phan")
outputs['DS']=evaluate_models(model,model_weights_file_path,testing_pages,threshold_sauv,name="DeepLabV3_sauv")

model = UNet(n_class=4)
model_weights_file_path = "D:\Hämtade Filer\models-20240320T180531Z-001\models"
outputs['UP']=evaluate_models(model,model_weights_file_path,testing_pages,threshold_phan,name="UNet_phan")
outputs['US']=evaluate_models(model,model_weights_file_path,testing_pages,threshold_sauv,name="UNet_sauv")

metrics saved to 'DeepLabV3_phanoutput.csv'
metrics saved to 'DeepLabV3_sauvoutput.csv'
metrics saved to 'UNet_phanoutput.csv'
metrics saved to 'UNet_sauvoutput.csv'


Again, to fit within the perview of the course, only f1-scores are of concern, they're extracted out of the output dictionary to conduct the friedman tests and nemenyi later on if differences are found. It would have been fun to use the friedman test that was done in the machine learning course instead of a prebuilt one

In [None]:
f1_scores = {}
for key in outputs.keys():
    f1_scores[key] = outputs[key]['f1_score']

In [6]:
f1_scores= {    'DP': [0.9751, 0.9763, 0.9761, 0.9707, 0.9756, 0.9764, 0.9753, 0.9760, 0.9753, 0.9757],
    'DS': [0.9805, 0.9819, 0.9814, 0.9755, 0.9811, 0.9819, 0.9806, 0.9816, 0.9804, 0.9812],
    'UP': [0.9703, 0.9513, 0.9601, 0.9738, 0.9617, 0.9748, 0.9653, 0.9496, 0.9607, 0.9630],
    'US': [0.9752, 0.9560, 0.9647, 0.9787, 0.9663, 0.9798, 0.9703, 0.9534, 0.9653, 0.9677]}

In [7]:
from scipy.stats import friedmanchisquare
friedmanchisquare(f1_scores['DP'],f1_scores['DS'],f1_scores['UP'],f1_scores['US'])

FriedmanchisquareResult(statistic=23.639999999999986, pvalue=2.9698131231310194e-05)

a p-value of 2.96 is below the confidence interval of 0.05. Furthermore the friedman statistic is far above the require 7.68 for four populations with n = 10.

In [19]:
from scipy import stats
import scikit_posthocs as sp
data = []
names = {2:"UP",3:"US",0:"DP",1:"DS"} # Some renaming to make the table a bit more understandable :)
for key in f1_scores.keys():
    data.append(f1_scores[key])
data= np.array(data)
nemenyi = sp.posthoc_nemenyi_friedman(data.T)
nemenyi.rename(names,inplace=True)
nemenyi.rename(names,axis=1)

Unnamed: 0,DP,DS,UP,US
DP,1.0,0.109694,0.04628,0.9
DS,0.109694,1.0,0.001,0.04628
UP,0.04628,0.001,1.0,0.109694
US,0.9,0.04628,0.109694,1.0


* DP vs DS: not significant
* DP vs UP: significant
* DP vs US: not significant
* DS vs UP: significant
* DS vs US: significant
* UP vs US: not significant