# Calculate tile classes per png file

Once the CNN classifier has classified an image, it produces per-pixel probabilities as a PNG file. This workbook loads those per-pixel probabilities and then samples pixels from each tile to obtain a single class for the tile, so that the tile as a whole can be compared with the ground truth data.

In [1]:
def sample_size(tile_size):
    """Return how many pixels must be sampled from one square tile.

    For each expected class proportion the sample size for an unlimited
    population is computed (95% confidence, 5% margin of error) and then
    reduced with the finite population correction for a tile of
    ``tile_size ** 2`` pixels. The largest result across all proportions is
    rounded up to the next whole sample and returned.

    Args:
        tile_size: Edge length of the square tile in pixels (may be
            fractional when tiles are scaled).

    Returns:
        The required number of samples as an ``int``.

    Raises:
        ValueError: If ``tile_size`` is zero (the tile holds no pixels).
    """
    import math

    # Z score for a 95% confidence (from Z table)
    Z_score = 1.96
    # Calculate with a 5% margin of error
    margin_of_error = 0.05

    # Some selected expected population percentages
    population_with_attribute_water = 0.09
    population_with_attribute_foliage = 0.876
    population_with_attribute_road = 0.027
    population_with_attribute_building = 0.006

    # The population is every pixel of a square tile
    N = tile_size ** 2
    if N == 0:
        raise ValueError('tile_size must be non-zero')

    # Calc for all population percentages and keep the largest requirement
    largest_n = 0.0
    for p in (population_with_attribute_water,
              population_with_attribute_foliage,
              population_with_attribute_road,
              population_with_attribute_building):
        q = 1 - p
        # Calc required samples for an unlimited population
        n_0 = ((Z_score ** 2) * p * q) / (margin_of_error ** 2)
        # Now reduce this unbounded required sample count down by the known
        # population per tile (N pixels)
        n = n_0 / (1 + ((n_0 - 1) / N))
        largest_n = max(largest_n, n)

    # A fractional sample is not possible, so round up (never down) once,
    # after the largest requirement is known.
    return math.ceil(largest_n)

## Step 2: Calculate the class per tile in each probability file, save as csv

Each image will have a statistically significant sample of pixels chosen per tile (uniformly distributed), and the probabilities of those pixels will be aggregated to provide the class of the tile.

A CSV file will be saved per image, containing the class of every tile in that image.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import pandas as pd
import itertools
import os
import glob

def calc_class(foliage, water, building, road):
    """Return the name of the first category whose value exceeds 0.5.

    Categories are checked in the order foliage, water, building, road;
    'unknown' is returned when none of them is above 0.5.
    """
    ranked = (
        ('foliage', foliage),
        ('water', water),
        ('building', building),
        ('road', road),
    )
    for label, value in ranked:
        if value > 0.5:
            return label
    return 'unknown'

def calc_tile_classes(image_data, tile_size):
    """Derive one class per tile by randomly sampling the tile's pixels.

    ``image_data`` is converted to a 2-dimensional array indexed ``[y, x]``
    whose values are class indices (1 = foliage, 2 = water, 3 = road,
    4 = building; 0 is presumably the background - confirm against the
    classifier output). For every whole tile that fits in the image,
    ``sample_size(tile_size)`` pixel positions are drawn uniformly at random
    (using the global ``np.random`` state, so results vary between calls) and
    the class of each sampled pixel is counted.

    Args:
        image_data: 2-dimensional array-like of per-pixel class indices.
        tile_size: Edge length of a tile in pixels; may be fractional.

    Returns:
        A DataFrame with one row per tile and the columns ``tile_x``,
        ``tile_y``, one column per class index that was sampled anywhere in
        the image, and ``tile_class``. Columns 1-4 hold the fraction of the
        tile's class 1-4 samples that fell in that class (class 0 samples are
        excluded from the denominator); any other class column, such as 0,
        holds the raw sample count. ``tile_class`` is the name of the class
        whose fraction exceeds 0.5, or 'unsure' when there is none.

    Raises:
        ValueError: If the image is smaller than a single tile.
    """
    class_names = {1: 'foliage', 2: 'water', 3: 'road', 4: 'building'}

    img_data_arr = np.asarray(image_data)
    y_size, x_size = img_data_arr.shape
    required_samples = sample_size(tile_size)

    # Only whole tiles are classified; a partial tile at the right/bottom
    # edge is ignored.
    tiles_across = int(x_size / tile_size)
    tiles_down = int(y_size / tile_size)
    if tiles_across == 0 or tiles_down == 0:
        raise ValueError('image is smaller than a single tile')

    # Build a matrix of sample pixels across the image. Each row holds the
    # pixel position and the tile number it was drawn for.
    sample_blocks = []
    for tile_x, tile_y in itertools.product(range(tiles_across), range(tiles_down)):
        # Floor (not round) the draw: rounding could push a sample up to the
        # first pixel of the neighbouring tile, or off the edge of the image.
        x_s = np.floor(np.random.uniform(tile_x * tile_size,
                                         (tile_x * tile_size) + tile_size,
                                         required_samples)).astype(int)
        y_s = np.floor(np.random.uniform(tile_y * tile_size,
                                         (tile_y * tile_size) + tile_size,
                                         required_samples)).astype(int)
        # Guard against the upper bound being returned through float rounding
        x_s = np.minimum(x_s, x_size - 1)
        y_s = np.minimum(y_s, y_size - 1)
        sample_blocks.append(np.column_stack((
            x_s,
            y_s,
            np.full(required_samples, tile_x),
            np.full(required_samples, tile_y),
        )))

    sample_arr = np.vstack(sample_blocks)
    sample_df = pd.DataFrame(sample_arr, columns=['x', 'y', 'tile_x', 'tile_y'])
    # Look the class of each sampled pixel up directly in the image array
    # (row index is y, column index is x).
    sample_df['Class'] = img_data_arr[sample_arr[:, 1], sample_arr[:, 0]]

    # Count how often each class was sampled in each tile, then spread the
    # classes out into one column per class index.
    occurrences = (sample_df.groupby(['tile_x', 'tile_y', 'Class'])
                   .size()
                   .reset_index(name='Occurrences'))
    aggregated_samples = occurrences.pivot_table(index=['tile_x', 'tile_y'],
                                                 columns='Class',
                                                 values='Occurrences')
    aggregated_samples = aggregated_samples.reset_index()
    aggregated_samples = aggregated_samples.fillna(0)

    # Only classes 1-4 take part in the vote, and only those that actually
    # occurred have a column.
    present = [cls for cls in class_names if cls in aggregated_samples.columns]

    if present:
        # Total number of class 1-4 samples per tile
        sample_counts = aggregated_samples[present[0]].copy()
        for cls in present[1:]:
            sample_counts = sample_counts + aggregated_samples[cls]

        # Divide each class count by that total to give a fraction per tile.
        # A tile with no class 1-4 samples yields NaN and stays 'unsure'.
        for cls in present:
            aggregated_samples[cls] = aggregated_samples[cls] / sample_counts

    # Now calculate the class by inspecting the fractions. If any of the
    # categories is above 50% that is considered to be the predicted
    # category of that tile
    aggregated_samples['tile_class'] = 'unsure'
    for cls in present:
        aggregated_samples.loc[aggregated_samples[cls] > 0.5, 'tile_class'] = class_names[cls]

    return aggregated_samples

In [4]:
import numpy as np 
from PIL import Image
import re
# black background
# index 1 is foliage - green
# index 2 is water - blue
# index 3 is road - yellow        
# index 4 is building - pink

# Classify the tiles of every classifier output image and save one CSV per image.
processed_image_filenames = glob.glob('../../CNNTest/low_alt/*/*/*.jpg.out', recursive=True)
for file_name in processed_image_filenames:
    # glob may return a mix of '/' and '\' separators, so split on both
    path_parts = re.split(r'/|\\', file_name)

    head_tail = os.path.split(file_name)
    folder = os.path.basename(head_tail[0])
    filename = head_tail[1]
    # Strip both extensions ('.out' and then '.jpg') to get the bare image name
    pre, ext = os.path.splitext(filename)
    pre, ext = os.path.splitext(pre)

    # Read the pixel data and width, then release the file handle straight away
    with Image.open(file_name) as inspect_image:
        pixels = np.asarray(inspect_image)
        classified_width = inspect_image.width

    search_folder = '../../Texture_Repo/Donegal_Rural_Terrain_Textures/Test_Images/*/' + pre + '.jpg'
    match_files = glob.glob(search_folder)
    if not match_files:
        print('ERROR: Source file not found for', pre)
        continue

    # Measure original image width; only the header is needed, so close the
    # file as soon as the width has been read
    with Image.open(match_files[0]) as orig_image:
        original_width = orig_image.width

    # Scale tiles by the size differential between the original and classified image resolution
    tile_size = (classified_width / original_width) * 30
    print('Processing', file_name, 'orig width', original_width, 'classified width', classified_width,
          'tile size', tile_size)

    classified_classes = calc_tile_classes(pixels, tile_size)
    # path_parts[4] and path_parts[5] are the two folder levels matched by the
    # '*/*' part of the glob pattern above
    classified_classes.to_csv('../../TestPredictions/CNN/' + path_parts[4] + '_' + path_parts[5] + '_' + pre + '.csv', index=False)

Processing ../../CNNTest/low_alt\hires\test\DJI_01510.jpg.out orig width 3840 classified width 3840 tile size 30.0
Processing ../../CNNTest/low_alt\hires\test\DJI_01511000.jpg.out orig width 3840 classified width 3840 tile size 30.0
Processing ../../CNNTest/low_alt\hires\test\DJI_0151300.jpg.out orig width 3840 classified width 3840 tile size 30.0
Processing ../../CNNTest/low_alt\hires\test\DJI_0151600.jpg.out orig width 3840 classified width 3840 tile size 30.0
Processing ../../CNNTest/low_alt\hires\test\DJI_0151800.jpg.out orig width 3840 classified width 3840 tile size 30.0
Processing ../../CNNTest/low_alt\hires\test\DJI_01530.jpg.out orig width 3840 classified width 3840 tile size 30.0
Processing ../../CNNTest/low_alt\hires\test\DJI_01531100.jpg.out orig width 3840 classified width 3840 tile size 30.0
Processing ../../CNNTest/low_alt\hires\test\DJI_01531400.jpg.out orig width 3840 classified width 3840 tile size 30.0
Processing ../../CNNTest/low_alt\hires\test\DJI_01532100.jpg.out 

Processing ../../CNNTest/low_alt\hires\test_small\DJI_01551400.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\hires\test_small\DJI_01551700.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\hires\test_small\DJI_0155200.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\hires\test_small\DJI_01552300.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\hires\test_small\DJI_01552900.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\hires\test_small\DJI_01553500.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\hires\test_small\DJI_01553700.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\hires\test_small\DJI_0155400.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../C

Processing ../../CNNTest/low_alt\lowres\test_small\DJI_01531400.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\lowres\test_small\DJI_01532100.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\lowres\test_small\DJI_01532300.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\lowres\test_small\DJI_01532500.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\lowres\test_small\DJI_01532700.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\lowres\test_small\DJI_01532900.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\lowres\test_small\DJI_0153500.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing ../../CNNTest/low_alt\lowres\test_small\DJI_01550.jpg.out orig width 3840 classified width 800 tile size 6.25
Processing .