In this notebook I add the false positives (FP) from the greentea model to the greentea train and test sets.

In [1]:
import pandas as pd
import os 
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

from joblib import dump

In [2]:
def accuracy_info_df(y_true, y_pred):
    """Summarize the accuracy of a binary classification in a one-row DataFrame.

    The two classes are taken in sorted order: the smaller label is the
    negative class (N) and the larger label is the positive class (P).
    When fewer than two distinct labels occur in `y_true` and `y_pred`
    together, the labels are assumed to be 0 (negative) and 1 (positive),
    so the confusion matrix is always 2x2.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as `y_true`.

    Returns
    -------
    pandas.DataFrame
        A single row with columns 'acc', 'prod_acc_P', 'prod_acc_N',
        'user_acc_P', 'user_acc_N' (percentages rounded to 2 decimals,
        NaN when the corresponding denominator is zero) and the raw
        counts 'TP', 'TN', 'FP', 'FN'.

    Raises
    ------
    ValueError
        If more than two distinct labels are present.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Fix the label pair explicitly so the confusion matrix stays 2x2 even
    # when one class is missing from the data.
    present = np.unique(np.concatenate([y_true.ravel(), y_pred.ravel()]))
    if present.size > 2:
        raise ValueError(
            'accuracy_info_df expects binary labels, got: ' + str(present.tolist())
        )
    labels = present if present.size == 2 else [0, 1]

    confmtx = confusion_matrix(y_true, y_pred, labels=labels)
    TN, FP = confmtx[0]
    FN, TP = confmtx[1]

    # Row sums of the confusion matrix are the true class sizes.
    N = TN + FP
    P = FN + TP

    def _pct(num, den):
        # Percentage rounded to 2 decimals; NaN (without a NumPy warning)
        # when the denominator is zero.
        return np.round(num / den * 100, 2) if den else np.nan

    # P's producer's accuracy (sensitivity) : TP/P
    PA_P = _pct(TP, P)

    # N's producer's accuracy (specificity) : TN/N
    PA_N = _pct(TN, N)

    # P's user's accuracy (precision P) : TP/(TP+FP)
    UA_P = _pct(TP, TP + FP)

    # N's user's accuracy (precision N) : TN/(TN+FN)
    UA_N = _pct(TN, TN + FN)

    # overall accuracy: (TP + TN)/(P + N)
    OA = _pct(TP + TN, P + N)

    D = {'acc':OA,
         'prod_acc_P':PA_P,
         'prod_acc_N':PA_N,
         'user_acc_P':UA_P,
         'user_acc_N':UA_N,
         'TP':TP, 'TN':TN,
         'FP':FP, 'FN':FN
        }
    df = pd.DataFrame([D])
    return df
# ----------------------------------------------------------------
# Feature columns used to build the model inputs: for every band/index the
# raw value followed by its `_avg13` and `_entr13` companions, then the two
# date features.
cols = [
    band + suffix
    for band in ('r', 'g', 'b', 'nir', 'ndvi')
    for suffix in ('', '_avg13', '_entr13')
] + ['month', 'day_in_year']


In [3]:
# Load the greentea13 train/test sets and the false positives file.
train = pd.read_csv('/home/jovyan/msai4earth-esa/iceplant_detection/data/greentea13/greentea13_train.csv')
test = pd.read_csv('/home/jovyan/msai4earth-esa/iceplant_detection/data/greentea13/greentea13_test.csv')

fp = pd.read_csv('greentea_FP.csv')

# Sanity check 1: count fully duplicated rows among the false positives.
fp_dup_flags = fp.duplicated()
print(np.unique(fp_dup_flags, return_counts=True), '\n')

# Sanity check 2: count repeated (x, y) pairs once the false positives are
# merged with the train and test sets.
merged_xy = pd.concat([fp, train, test])[['x','y']]
print(np.unique(merged_xy.duplicated(), return_counts=True), '\n')

(array([False]), array([760])) 

(array([False]), array([5974])) 



In [4]:
# Distribute false positives into train and test.
#
# For each percentage, keep that share of the false positives of every aoi,
# split the kept rows 70/30 between the train and test sets, retrain a
# random forest on the augmented train set and record its accuracy on the
# augmented test set.

aois = ['gaviota','capitan','campus_lagoon','carpinteria']
percentages = [100, 90, 80, 70, 60, 50, 40, 30, 20, 10]

results = []
rand_state = 80
for perc in percentages:

    test_samples = []
    train_samples = []

    for aoi in aois:

        # keep perc% of the false positives in that aoi
        df = fp[fp.aoi == aoi].sample(frac=perc/100, random_state=rand_state)

        # sample 70% of these for train set
        xtr_train = df.sample(frac=0.7, random_state=rand_state)
        train_samples.append(xtr_train)

        # pick remaining 30% for test set; dropping the train rows by index
        # label keeps the remainder in df's row order rather than in the
        # arbitrary iteration order of a set difference
        xtr_test = df.drop(index=xtr_train.index)
        test_samples.append(xtr_test)

    fp_test = pd.concat(test_samples)
    fp_train = pd.concat(train_samples)

    # augmented ("salted") sets: false positives followed by the original rows
    salt_test = pd.concat([fp_test, test])
    salt_train = pd.concat([fp_train, train])

    # ------------------------------
    # separate features (X) from labels (y)
    X_test = salt_test[cols].to_numpy()
    y_test = salt_test['iceplant'].to_numpy()

    X_train = salt_train[cols].to_numpy()
    y_train = salt_train['iceplant'].to_numpy()

    # fixed seed so every percentage is trained with the same forest settings
    rfc = RandomForestClassifier(n_estimators = 100, random_state = 42)
    rfc.fit(X_train, y_train)

    preds = rfc.predict(X_test)
    results.append(accuracy_info_df(y_test, preds))


# check counts
# print(fp_test.groupby(['aoi']).count().x, '\n')

# print(fp_train.groupby(['aoi']).count().x)


In [5]:
# Stack the per-percentage accuracy rows into one table, renumbering the rows
# from 0, and put the percentage in the first column.
R = pd.concat(results, ignore_index=True)
R.insert(0, 'percentage', percentages)
R

Unnamed: 0,percentage,acc,prod_acc_P,prod_acc_N,user_acc_P,user_acc_N,TP,TN,FP,FN
0,100,82.36,58.56,95.18,86.73,81.01,366,1105,56,259
1,90,82.14,58.88,94.91,86.38,80.79,368,1081,58,257
2,80,81.72,59.84,93.99,84.81,80.68,374,1048,67,251
3,70,83.59,64.32,94.6,87.2,82.26,402,1034,59,223
4,60,82.01,64.16,92.43,83.2,81.53,401,989,81,224
5,50,83.67,67.04,93.6,86.21,82.63,419,980,67,206
6,40,82.6,67.04,92.09,83.8,82.07,419,943,81,206
7,30,82.71,67.2,92.4,84.68,81.84,420,924,76,205
8,20,83.24,71.84,90.51,82.84,83.44,449,887,93,176
9,10,82.67,70.56,90.59,83.05,82.48,441,866,90,184


In [None]:
# aois = ['gaviota','capitan','campus_lagoon','carpinteria']
# percentages = [40]

# results = []
# rand_state = 80
# for perc in percentages:

#     test_samples = []
#     train_samples = []
    
#     for aoi in aois:

#         # select false positives in that aoi
#         df = fp[fp.aoi == aoi].sample(frac=perc/100, random_state=rand_state)

#         # sample 70% of these for train set
#         xtr_train = df.sample(frac=0.7, random_state=rand_state)
#         train_samples.append(xtr_train)

#         # pick remaining 30% for test set
#         xtr_test = df.loc[list(set(df.index) - set(xtr_train.index))]
#         test_samples.append(xtr_test)

#     fp_test = pd.concat(test_samples)
#     fp_train = pd.concat(train_samples)
    
#     # # double-check duplicates in merges
#     print(perc)
#     print(np.unique(pd.concat([fp_train, fp_test, train, test])[['x','y']].duplicated(), return_counts=True), '\n')    
        
#     salt_test =  pd.concat([fp_test, test])
#     salt_train =  pd.concat([fp_train, train])
    
#     salt_test.to_csv('salt_p'+str(perc)+'_test.csv', index=False)
#     salt_train.to_csv('salt_p'+str(perc)+'_train.csv', index=False)    

#     # ------------------------------    
#     # divide into train and test sets 
#     X_test = salt_test[cols].to_numpy()
#     y_test = salt_test.loc[:,'iceplant'].to_numpy()
    
#     X_train = salt_train[cols].to_numpy() 
#     y_train = salt_train.loc[:,'iceplant'].to_numpy() 
    
#     rfc = RandomForestClassifier(n_estimators = 100, random_state = 42)
#     rfc.fit(X_train, y_train)
#     dump(rfc, 'salt_p'+str(perc)+'_rfc.joblib')
    
#     preds = rfc.predict(X_test)
#     results.append(accuracy_info_df(y_test, preds))
    
# R = pd.concat(results).reset_index(drop=True)
# R['percentage'] = percentages
# R