# Pre-process comparison data

Prepare the Ilastik and CNN classification results so that they can be compared statistically and the accuracy of the different approaches can be evaluated.

## Step 1: Combine all CNN and Ilastik csv files

Previous pre-processing has converted the classified data from Ilastik and the CNN into csv files. These files are now combined into a single dataframe that records which algorithm produced each row, the image size (small, medium, large), and the classification probabilities per tile coordinate for the 30 representative images.

In [22]:
import glob
import os
import pandas as pd

# Function to read one prediction csv file and tag every row with the
# algorithm and image that the predictions belong to
def load_prediction_file(file_name, algo_type, algo, image_size, image_id):
    """Load a prediction CSV and append its identifying metadata columns.

    Args:
        file_name: Path of the CSV file, passed straight to ``pd.read_csv``.
        algo_type: Value stored in the ``algorithm_type`` column.
        algo: Value stored in the ``algorithm`` column.
        image_size: Value stored in the ``image_size`` column.
        image_id: Value stored in the ``image_id`` column.

    Returns:
        The loaded dataframe with the four metadata columns added after the
        columns read from the file; every row carries the same values.
    """
    tagged = pd.read_csv(file_name).assign(
        algorithm_type=algo_type,
        algorithm=algo,
        image_size=image_size,
        image_id=image_id,
    )
    return tagged

def parse_cnn_prediction_name(pred_file):
    """Split a CNN prediction file path into its naming components.

    The file name (without folder and extension) is split on '_'; the first
    part is the algorithm, the second the image size, and the third and
    fourth parts joined by '_' form the image id.

    Returns:
        Tuple ``(algo, size, image)``.
    """
    stem = os.path.splitext(os.path.basename(pred_file))[0]
    parts = stem.split('_')
    return parts[0], parts[1], parts[2] + '_' + parts[3]


def parse_ilastik_prediction_name(pred_file):
    """Split an Ilastik prediction file path into its naming components.

    Example file name: test-large_DJI_0120100_Probabilities.csv
    The first '_'-separated part is the image size; the second and third
    parts joined by '_' form the image id.

    Returns:
        Tuple ``(size, image)``.
    """
    stem = os.path.splitext(os.path.basename(pred_file))[0]
    parts = stem.split('_')
    return parts[0], parts[1] + '_' + parts[2]


# Collect every loaded frame and concatenate once at the end. Calling
# pd.concat inside the loop re-copies all rows gathered so far for each file.
frames = []

# 1a: Combine CNN classification CSV files into a dataframe

all_cnn_prediction_files = glob.glob('../../TestPredictions/CNN/*.csv')

for pred_file in all_cnn_prediction_files:
    algo, size, image = parse_cnn_prediction_name(pred_file)
    frames.append(load_prediction_file(pred_file, 'cnn', algo, size, image))


# 1b: Add the Ilastik classification CSV files into a dataframe
all_ilastik_prediction_files = glob.glob('../../TestPredictions/Ilastik/*.csv')

for pred_file in all_ilastik_prediction_files:
    size, image = parse_ilastik_prediction_name(pred_file)
    frames.append(load_prediction_file(pred_file, 'ilastik', 'default', size, image))

# Row labels are kept as loaded (they restart for each file), exactly as the
# incremental concatenation produced. all_frames stays None when no
# prediction files were found.
all_frames = pd.concat(frames) if frames else None

all_frames
    

Unnamed: 0,tile_x,tile_y,0,1,2,3,4,tile_class,algorithm_type,algorithm,image_size,image_id,A,B,C,D
0,0,0,0.0,0.0,1.000000,0.000000,0.0,water,cnn,hires,test-large,DJI_0099200,,,,
1,0,1,0.0,0.0,1.000000,0.000000,0.0,water,cnn,hires,test-large,DJI_0099200,,,,
2,0,2,0.0,0.0,1.000000,0.000000,0.0,water,cnn,hires,test-large,DJI_0099200,,,,
3,0,3,0.0,0.0,0.887324,0.112676,0.0,water,cnn,hires,test-large,DJI_0099200,,,,
4,0,4,0.0,0.0,0.507042,0.492958,0.0,water,cnn,hires,test-large,DJI_0099200,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9211,127,67,,,,,,foliage,ilastik,default,test-small,DJI_0155600,0.822333,0.039333,0.055000,0.083333
9212,127,68,,,,,,foliage,ilastik,default,test-small,DJI_0155600,0.837187,0.075938,0.025625,0.061250
9213,127,69,,,,,,foliage,ilastik,default,test-small,DJI_0155600,0.591852,0.095926,0.252593,0.059630
9214,127,70,,,,,,foliage,ilastik,default,test-small,DJI_0155600,0.626552,0.146552,0.164828,0.062069


In [25]:
# 1c: Reformat the data into a consistent dataset
#
# The two algorithms label their probability columns differently:
#
#   Class      CNN column   Ilastik column
#   Unknown    0            (none)
#   Foliage    1            A
#   Water      2            B
#   Road       3            D
#   Building   4            C

# Output column -> (CNN column, Ilastik column)
class_columns = {
    'Foliage': ('1', 'A'),
    'Water': ('2', 'B'),
    'Building': ('4', 'C'),
    'Road': ('3', 'D'),
}

# Take the CNN value where present, otherwise fall back to the Ilastik value
for class_name, (cnn_col, ilastik_col) in class_columns.items():
    all_frames[class_name] = all_frames[cnn_col].combine_first(all_frames[ilastik_col])

# The algorithm-specific columns (including the CNN 'Unknown' column 0) are
# not carried into the comparison dataset
raw_columns = ['0', '1', '2', '3', '4', 'A', 'B', 'C', 'D']
all_frames_save = all_frames.drop(columns=raw_columns)

## Step 2: Enrich the data

Add the test case to the dataset, as well as the video id so that the video dataset can be joined into it

In [39]:
# 2a: Calculate the video ID (the first 8 characters of the image id)
all_frames_save['video_id'] = all_frames_save.image_id.str[:8]

# 2b: Find the test case (identified by the parent folder of the test image), and update the dataframe to hold it
test_images_pattern = '../../Texture_Repo/Donegal_Rural_Terrain_Textures/Test_Images/*/{}.jpg'
all_images = all_frames_save.image_id.unique()

test_case_by_image = {}
for image_id in all_images:
    matching_test_files = glob.glob(test_images_pattern.format(image_id))
    if not matching_test_files:
        # Fail with the offending image id instead of a bare IndexError
        raise FileNotFoundError('No test image found for image id {}'.format(image_id))
    test_file = matching_test_files[0]
    # Use os.path rather than splitting on '\\' so the parent folder is found
    # with either path separator (splitting on '\\' yields the whole path on
    # non-Windows systems)
    test_case_by_image[image_id] = os.path.basename(os.path.dirname(test_file))

# Every image id has an entry in the lookup, so each row receives its test case
all_frames_save['test_case'] = all_frames_save['image_id'].map(test_case_by_image)
    

## Step 3: Merge the ground truth data into the dataset

Adding the ground truth data to the dataset will allow the efficacy of the various prediction algorithms to be compared and contrasted

In [42]:
# 3a - load the ground truth data

ground_truth_data = pd.read_pickle('../../Texture_Repo/Donegal_Rural_Terrain_Textures/classified/all_data.pkl')
# Remove 12 null rows - an artifact of the classification app.
# The filter is applied to the frame loaded above; classified_data does not
# exist before this assignment, so it cannot be used on the right-hand side.
classified_data = ground_truth_data[ground_truth_data['xCoord'].notna()]

In [46]:
# 3b - Remove the duplicated image / tile sequences (some images were classified multiple times and they all show up)
# NOTE(review): presumably these two columns are what differ between repeat
# classifications of the same tile, which is why they are dropped before
# de-duplicating - confirm against the classification app output.
bookkeeping_columns = ['classificationSourceQueue', 'classifiedOn']
classified_data = (
    classified_data
    .drop(columns=bookkeeping_columns)
    .drop_duplicates()
)

In [60]:
# 3c - Now join the ground truth data into the dataframe with all the predictions in
prediction_keys = ['image_id', 'tile_x', 'tile_y']
ground_truth_keys = ['sourceImageName', 'xCoord', 'yCoord']

joined_data = (
    all_frames_save
    # Default (inner) join: only tiles present on both sides are kept
    .merge(classified_data, left_on=prediction_keys, right_on=ground_truth_keys)
    # Reformat the columns: the ground truth key columns duplicate the prediction keys
    .drop(columns=['xCoord', 'yCoord', 'sourceImageName'])
    .rename(columns={"category": "ground_truth"})
    # Replace all NaN values with zeroes
    .fillna(0)
)

# Inspect the data
joined_data

Unnamed: 0,tile_x,tile_y,tile_class,algorithm_type,algorithm,image_size,image_id,Foliage,Water,Building,Road,video_id,test_case,ground_truth
0,0,0,water,cnn,hires,test-large,DJI_0099200,0.000000,1.000000,0.000000,0.000000,DJI_0099,120m_altitude,road
1,0,0,road,cnn,hires,test-medium,DJI_0099200,0.000000,0.000000,0.000000,1.000000,DJI_0099,120m_altitude,road
2,0,0,road,cnn,hires,test-small,DJI_0099200,0.000000,0.000000,0.000000,1.000000,DJI_0099,120m_altitude,road
3,0,0,water,cnn,lowres,test-large,DJI_0099200,0.000000,1.000000,0.000000,0.000000,DJI_0099,120m_altitude,road
4,0,0,road,cnn,lowres,test-medium,DJI_0099200,0.000000,0.000000,0.257732,0.742268,DJI_0099,120m_altitude,road
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14017459,127,71,foliage,cnn,lowres,test-medium,DJI_0155400,1.000000,0.000000,0.000000,0.000000,DJI_0155,35m_altitude,foliage
14017460,127,71,foliage,cnn,lowres,test-small,DJI_0155400,1.000000,0.000000,0.000000,0.000000,DJI_0155,35m_altitude,foliage
14017461,127,71,foliage,ilastik,default,test-large,DJI_0155400,0.749568,0.006906,0.071655,0.171870,DJI_0155,35m_altitude,foliage
14017462,127,71,foliage,ilastik,default,test-medium,DJI_0155400,0.768778,0.011556,0.058000,0.161667,DJI_0155,35m_altitude,foliage


## Step 4: Merge the video information into the comparison dataframe

Load the video meta data csv file and merge it into the algorithm evaluation data

In [63]:
# Load the per-video metadata
video_meta = pd.read_csv('../../Texture_Repo/Donegal_Rural_Terrain_Textures/Video_Info.csv')

# Attach the metadata to every prediction row by matching video_id to File_ID
# (default inner join, so rows without a matching video are not kept)
compare_data = joined_data.merge(video_meta, left_on='video_id', right_on='File_ID')
compare_data

Unnamed: 0,tile_x,tile_y,tile_class,algorithm_type,algorithm,image_size,image_id,Foliage,Water,Building,...,video_id,test_case,ground_truth,Video_File,File_ID,Date,Time,Time_From_Daylight_Start,Time_To_Daylight_End,Conditions
0,0,0,water,cnn,hires,test-large,DJI_0099200,0.000000,1.000000,0.000000,...,DJI_0099,120m_altitude,road,DJI_0099.MP4,DJI_0099,03/05/2020,11:40,5.77,9.40,Partly_Cloudy
1,0,0,road,cnn,hires,test-medium,DJI_0099200,0.000000,0.000000,0.000000,...,DJI_0099,120m_altitude,road,DJI_0099.MP4,DJI_0099,03/05/2020,11:40,5.77,9.40,Partly_Cloudy
2,0,0,road,cnn,hires,test-small,DJI_0099200,0.000000,0.000000,0.000000,...,DJI_0099,120m_altitude,road,DJI_0099.MP4,DJI_0099,03/05/2020,11:40,5.77,9.40,Partly_Cloudy
3,0,0,water,cnn,lowres,test-large,DJI_0099200,0.000000,1.000000,0.000000,...,DJI_0099,120m_altitude,road,DJI_0099.MP4,DJI_0099,03/05/2020,11:40,5.77,9.40,Partly_Cloudy
4,0,0,road,cnn,lowres,test-medium,DJI_0099200,0.000000,0.000000,0.257732,...,DJI_0099,120m_altitude,road,DJI_0099.MP4,DJI_0099,03/05/2020,11:40,5.77,9.40,Partly_Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14017459,127,71,foliage,cnn,lowres,test-medium,DJI_0155400,1.000000,0.000000,0.000000,...,DJI_0155,35m_altitude,foliage,DJI_0155.MP4,DJI_0155,04/07/2020,14:06,9.12,8.08,Cloudy
14017460,127,71,foliage,cnn,lowres,test-small,DJI_0155400,1.000000,0.000000,0.000000,...,DJI_0155,35m_altitude,foliage,DJI_0155.MP4,DJI_0155,04/07/2020,14:06,9.12,8.08,Cloudy
14017461,127,71,foliage,ilastik,default,test-large,DJI_0155400,0.749568,0.006906,0.071655,...,DJI_0155,35m_altitude,foliage,DJI_0155.MP4,DJI_0155,04/07/2020,14:06,9.12,8.08,Cloudy
14017462,127,71,foliage,ilastik,default,test-medium,DJI_0155400,0.768778,0.011556,0.058000,...,DJI_0155,35m_altitude,foliage,DJI_0155.MP4,DJI_0155,04/07/2020,14:06,9.12,8.08,Cloudy


In [None]:
# Finally, write the merged comparison dataset to a pickle file so that the
# algorithm comparison can load it
output_path = '../../TestPredictions/evaluate_algo_data.pkl'
compare_data.to_pickle(output_path)