In [1]:
import pandas as pd
import os
import numpy as np
import math
from skimpy import clean_columns

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

In [2]:
# def ref_class_column(df):

#     map_class = df.pl_class
#     ref_class = []

#     for i in map_class.index:
#         if df.category_non_iceplant_vegetation.loc[i] == 100:
#             ref_class.append(0)
#         elif df.category_iceplant.loc[i] == 100:
#             ref_class.append(1)
#         elif df.category_low_ndvi_impervious_surface.loc[i] == 100:
#             ref_class.append(2)
#         elif df.category_water.loc[i] == 100:
#             ref_class.append(3)
#         else:
#             ref_class[j]= 100
            
#     return ref_class

In [3]:
def ref_class_column(df, map_col):
    """Encode the string class labels in ``df[map_col]`` as integer class codes.

    Despite its name, ``map_col`` is simply the name of the column whose
    labels are encoded (this notebook calls it with the reference column).

    Label -> code:
        'non-iceplant vegetation'        -> 0
        'iceplant'                       -> 1
        'low ndvi (impervious surface)'  -> 2
        'water'                          -> 3

    Any other value (including a missing value) is encoded as 100, and the
    index label of that row is printed so it can be inspected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    map_col : str
        Name of the column holding the string class labels.

    Returns
    -------
    list of int
        One code per row of ``df``, in row order.
    """
    class_codes = {
        'non-iceplant vegetation': 0,
        'iceplant': 1,
        'low ndvi (impervious surface)': 2,
        'water': 3,
    }
    unknown_code = 100

    ref_class = []
    # Walk the column once as (index label, value) pairs.  Looking each value
    # up with df[map_col].loc[i] returns a Series instead of a scalar when
    # index labels repeat, which made the comparisons raise ValueError.
    for i, label in df[map_col].items():
        code = class_codes.get(label)
        if code is None:
            print(i)  # report the row whose label was not recognized
            code = unknown_code
        ref_class.append(code)

    return ref_class

In [4]:
year = 2020

# Column names used by the cells below.
ref_col = 'category'  # reference (ground truth) label of each sample point
map_col = 'pl_class'  # class assigned to each sample point by the map

# Validation sample points.  skimpy's clean_columns is applied to the loaded
# frame; the column names above are the names it has after that cleaning.
file_name = 'ceo-AE5FP_2020_model_map_validation-sample-data-2023-02-02.csv'
df = pd.read_csv(os.path.join(os.getcwd(), file_name))
df = clean_columns(df)

# Pixel counts per map class, read from the current working directory.
file_name = 'modelAE5_FP_2020_rasters_2020_pixel_counts.csv'
pix_counts = pd.read_csv(os.path.join(os.getcwd(), file_name))

In [5]:
# Exclude the sample point with index label 186 from the analysis.
# NOTE(review): the reason for excluding this point is not recorded in the
# notebook -- please document it here.
# errors='ignore' keeps the cell idempotent: re-running it after the row has
# already been removed no longer raises KeyError.
df = df.drop(index=[186], errors='ignore')

In [6]:
# Integer-encode the reference labels (codes 0-3; 100 marks an unrecognized
# label) and show how many sample points fall in each reference class.
ref_class = ref_class_column(df, map_col=ref_col)
np.unique(ref_class, return_counts=True)


(array([0, 1, 2, 3]), array([236, 139, 119, 100]))

In [7]:
# Class assigned by the map to each sample point, as a NumPy array, followed
# by the number of sample points in each map class.
map_class = df.loc[:, map_col].to_numpy()
np.unique(map_class, return_counts=True)

(array([0, 1, 2, 3]), array([200, 199, 110,  85]))

In [8]:
# Orientation of the error matrix.
# sklearn's confusion_matrix(y_true, y_pred) returns C with
#     C[i, j] = number of samples known to be class i and predicted as class j
# (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html).
# The paper uses the opposite convention,
#     n[i, j] = number of samples predicted (mapped) as class i and known to be class j,
# so we take the transpose: rows of n are map classes, columns are reference classes.

n = confusion_matrix(ref_class, map_class, labels=range(4)).T
n

array([[170,   1,  20,   9],
       [ 51, 137,  11,   0],
       [ 15,   1,  85,   9],
       [  0,   0,   3,  82]])

In [9]:
# The pixel-count table was already read into `pix_counts` by the data-loading
# cell above; display it here instead of re-reading the same CSV through a
# second hard-coded copy of its file name.
pix_counts

Unnamed: 0,n_nonice_2020,n_ice_2020,n_ground_2020,n_water_2020,raster
0,36271293,5382187,111150412,62968690,modelAE5_FP_2020_merged_crs26910_S_2020
1,1122203,30004,1891593,2893071,modelAE5_FP_2020_merged_crs26910_W_2020
2,89669636,1123921,62587031,69125241,modelAE5_FP_2020_merged_crs26911_2020


In [10]:
# Total number of pixels: the four per-class pixel-count columns, each summed
# over all rows of pix_counts, added together.
total_pix = sum(
    sum(pix_counts[col])
    for col in ('n_nonice_2020', 'n_ice_2020', 'n_ground_2020', 'n_water_2020')
)

In [11]:
# Overall accuracy and user's accuracies, weighting each map class by its
# share of the mapped pixels.
#   n[i, j]   : sample points mapped as class i whose reference class is j
#   W[i]      : proportion of area (pixels) mapped as class i
#   n_idot[i] : sample points mapped as class i (row total of n)
#   U_hat[i]  : estimated user's accuracy of class i (precision: TP/(TP+FP))

# Pixel-count column of each class code, in class order 0..3.  The columns are
# looked up by name (as total_pix does) rather than by position, so W stays
# correct even if the column order of the CSV changes.
count_cols = ['n_nonice_2020', 'n_ice_2020', 'n_ground_2020', 'n_water_2020']

# 196 = 1.96 x 100: presumably the 95% normal quantile, scaled so that the
# interval half-widths are reported in percent.
CI95_PERCENT = 196

W = []
n_idot = []
U_hat = []

for i in range(0,4):
    W.append(sum(pix_counts[count_cols[i]]) / total_pix)
    n_idot.append(sum(n[i,:]))
    U_hat.append(n[i,i] / n_idot[i])

# overall accuracy: area-weighted sum of the per-class user's accuracies
OA = sum([W[i]*n[i,i]/n_idot[i] for i in range(0,4)])
print('overall accuracy:', OA*100)

# estimated variance of the overall accuracy -- paper equation (5);
# its square root is the standard error
var_O = sum([ W[i]**2 * U_hat[i] * (1-U_hat[i])/(n_idot[i]-1) for i in range(0,4)])
print('overall accuracy conf interval:', np.sqrt(var_O)*CI95_PERCENT, '\n')

# NOTE(review): the user's accuracies are printed as fractions (0-1) while the
# interval half-widths below are in percent.
print('users accuracy:', U_hat)

# estimated variance of each user's accuracy
var_U_hat = [U_hat[i] * (1-U_hat[i])/(n_idot[i]-1) for i in range(0,4)]
print('users accuracies conf interval:', CI95_PERCENT*np.sqrt(var_U_hat))

overall accuracy: 85.19281389053452
overall accuracy conf interval: 3.6243670717266014 

users accuracy: [0.85, 0.6884422110552764, 0.7727272727272727, 0.9647058823529412]
users accuracies conf interval: [4.9611759  6.45099237 7.86737053 3.94607246]


In [12]:
# Estimated proportion of the map area whose reference class is k -- equation (9):
#     p_dotk_hat[k] = sum over map classes i of  W[i] * n[i,k] / n_idot[i]
p_dotk_hat = [
    sum([W[i]*n[i,k]/n_idot[i] for i in range(0,4)])
    for k in range(0,4)
]

# Estimated producer's accuracy of each class k (sensitivity: TP/(TP+FN)):
# the estimated area proportion correctly mapped as k, divided by the
# estimated area proportion that really is k.
P_hat = [
    (W[k]*n[k,k]/n_idot[k]) / p_dotk_hat[k]
    for k in range(0,4)
]

print('producers accuracy:', P_hat)

producers accuracy: [0.8082402844924153, 0.6684417997754406, 0.8838657732434829, 0.8663598116981505]


In [17]:
# Step-by-step printed walkthrough of the producer's accuracy estimate for
# class 1 (iceplant).  Nothing new is estimated here: the cell only prints
# W, n, n_idot, p_dotk_hat and P_hat, which must already have been computed
# by the cells above.
# NOTE(review): this cell's execution count (17) does not follow the previous
# cell's (12); confirm the notebook still runs top to bottom on a fresh kernel.
print('WHAT WE WANT: estimate (pixels correctly classified as iceplant)/(pixels are iceplant) for whole map\n ')

# --- GOAL 1: denominator of the producer's accuracy, p_dotk_hat[1] ---
print('GOAL 1: estimate the fraction of the map that is really iceplant (class 1)')
print('Idea: for each class of pixels in map, estimate what fraction of it is really class 1')

print('Fraction of pixels from each class in map:')
print('W: ',W, '\n')

print('Fraction of samples that are class i in map and class 1 in reference:')
print('Information from sample about the fraction from each class in the map that is really class 1')
# column 1 of n (reference class 1) divided by the row totals of n
print('n[i,1]/n_idot[i]: ', [n[i,1]/n_idot[i] for i in range(0,4) ], '\n' )

print('Then we estimate the fraction of each class in the map that is really class 1:')
# per-map-class terms of the k = 1 sum; added together they give p_dotk_hat[1]
partial = [W[i]*n[i,1]/n_idot[i] for i in range(0,4) ]
print('W[i]*n[i,1]/n_idot[i] : ', partial)

print('Finally we add these to get an estimate of \n the total fraction of map that is really class 1')
print('p_dot1_hat: ', p_dotk_hat[1], '\n')

# --- GOAL 2: numerator of the producer's accuracy, W[1]*n[1,1]/n_idot[1] ---
print('GOAL 2: estimate the fraction of the map that is correctly classified as iceplant')
print('this is: fraction of map classified as class 1,\n multiplied by the sample estimate of the fraction of it is iceplant')
print('W[1]: ', W[1])
print('n[1,1]/n_idot[1]: ',n[1,1]/n_idot[1])
print('p_11_hat = (W[1]*n[1,1]/n_idot[1]): ', (W[1]*n[1,1]/n_idot[1]) )
# ratio of the two estimates above; this is the class-1 entry of P_hat
print('P_hat = p_11_hat/ p_dot1_hat: ', P_hat[1])
#p_dotk_hat.append( sum(partial))  # equation (9)

WHAT WE WANT: estimate (pixels correctly classified as iceplant)/(pixels are iceplant) for whole map
 
GOAL 1: estimate the fraction of the map that is really iceplant (class 1)
Idea: for each class of pixels in map, estimate what fraction of it is really class 1
Fraction of pixels from each class in map:
W:  [0.2860395334170426, 0.014713838683289604, 0.3953691894823195, 0.30387743841734827] 

Fraction of samples that are class i in map and class 1 in reference:
Information from sample about the fraction from each class in the map that is really class 1
n[i,1]/n_idot[i]:  [0.005, 0.6884422110552764, 0.00909090909090909, 0.0] 

Then we estimate the fraction of each class in the map that is really class 1:
W[i]*n[i,1]/n_idot[i] :  [0.0014301976670852128, 0.01012962763623455, 0.0035942653589301774, 0.0]
Finally we add these to get an estimate of 
 the total fraction of map that is really class 1
p_dot1_hat:  0.015154090662249941 

GOAL 2: estimate the fraction of the map that is correctly