In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import model_prep_and_evals as mpe

import time
from sklearn.metrics import confusion_matrix

In [2]:
# Read the held-out test set once and split it into features and labels.
# (Previously the same CSV was parsed twice, once per slice.)
test_set = pd.read_csv(os.path.join(os.getcwd(), 'data_sampling_workflow', 'test_set.csv'))

# Label-based slice: every column from 'r' through 'day_in_year', both inclusive.
X_test = test_set.loc[:, 'r':'day_in_year']
# Target column.
y_test = test_set.loc[:, 'iceplant']

In [3]:
# Load the training set and pull out the rows for each value of the
# `iceplant` label (1 and 0) into their own frames.
train_path = os.path.join(os.getcwd(), 'data_sampling_workflow', 'train_set.csv')
df = pd.read_csv(train_path)

is_ice = df['iceplant'] == 1
is_nonice = df['iceplant'] == 0
ice = df[is_ice]
nonice = df[is_nonice]

In [4]:
# Number of available training rows in each class.
n_ice, n_nonice = len(ice), len(nonice)

# Target percentages of iceplant rows in the training data.
perc = [10, 20, 30, 40, 50]

# Solving n / (n + n_nonice) = share / 100 for n gives the number of iceplant
# rows to draw so that they make up `share` percent of the training data.
samples = [int(share * n_nonice / (100 - share)) for share in perc]

# The fifth entry (the 50% target) is overridden to use every iceplant row.
samples[4] = n_ice

In [5]:
# One accumulator list per metric; the training loop appends one value per run.
# Confusion-matrix counts:
TN, FN, FP, TP = [], [], [], []
# Percent-valued scores (sensitivity, specificity, precision, accuracy):
sens, spec, prec, acc = [], [], [], []
# Seconds spent building and fitting each model:
times = []
# Additional metrics, currently not collected:
#gmean = []
#mcc = []
#f1 = []
#f2 = []

In [6]:
# Train one random forest per entry in `samples` and record its test-set metrics.
#
# NOTE: this cell only appends to the accumulator lists (TN, FP, ..., times).
# Re-run the cell that initialises them before re-running this one, otherwise
# the new results are appended after the previous run's.

# The test arrays are identical for every run, so convert them once.
X_test_arr = X_test.to_numpy()
y_true = y_test.to_numpy()

for n in samples:
    # Draw n iceplant rows (fixed seed) and stack them on top of all
    # non-iceplant rows; build the combined frame once and slice it twice.
    ice_sample = ice.sample(n = n, random_state=21)
    train = pd.concat([ice_sample, nonice], axis=0)
    X_train = train.loc[:,'r':'day_in_year'].to_numpy()
    y_train = train.loc[:,'iceplant'].to_numpy()

    # Time model construction + fitting only (prediction is excluded).
    t0 = time.time()
    rfc = RandomForestClassifier(n_estimators = 100, random_state = 42)
    rfc.fit(X_train, y_train)
    times.append(time.time() - t0)

    # ------------------------------------------------------
    y_pred = rfc.predict(X_test_arr)

    # labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns =
    # predicted class) even if one class is missing from y_true or y_pred,
    # so the unpacking below cannot fail with an IndexError.
    confmtx = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = confmtx.ravel()
    TN.append(tn)
    FP.append(fp)
    FN.append(fn)
    TP.append(tp)

    # All scores are percentages rounded to two decimals.
    prec.append(np.round(tp / (tp + fp) * 100, 2))               # TP / (TP + FP)
    acc.append(np.round((tp + tn) / y_true.shape[0] * 100, 2))   # (TP + TN) / (P + N)
    # Row sums of the confusion matrix are the per-class counts of y_true.
    sens.append(np.round(tp / (tp + fn) * 100, 2))               # TP / P
    spec.append(np.round(tn / (tn + fp) * 100, 2))               # TN / N
    # ------------------------------------------------------
    print('working')


working
working
working
working
working


In [7]:
# print('G-mean: ', round(np.sqrt(sens*spec),2))
# print('MCC: ', matthews_corrcoef(y_true,y_pred))
# print('F1-measure: ',  round(fbeta_score(y_true, y_pred, beta=1.0),5))
# print('F0.5-measure (min false positives): ',  round(fbeta_score(y_true, y_pred, beta=0.5),5))
# print('F2-measure (min false negatives)  : ',  round(fbeta_score(y_true, y_pred, beta=2.0),5))



In [12]:
# Gather the per-run metrics into a single table: one row per entry in
# `samples`, one column per accumulator list.
D = dict(
    TN=TN,
    FP=FP,
    FN=FN,
    TP=TP,
    prec=prec,
    acc=acc,
    sens=sens,
    spec=spec,
    times=times,
    n_ice=samples,
)
stats = pd.DataFrame(D)

# n_nonice is a single number, so pandas repeats it on every row.
stats['n_nonice'] = n_nonice

# Share of iceplant rows in each training set, as a whole-number percentage.
total_rows = stats['n_ice'] + stats['n_nonice']
stats['perc_ice'] = np.round(stats['n_ice'] / total_rows * 100)
stats

In [21]:
# Save the summary table to ./temp/stats.csv.
# Create the output folder first so the write does not fail on a fresh checkout.
out_dir = os.path.join(os.getcwd(), 'temp')
os.makedirs(out_dir, exist_ok=True)
stats.to_csv(os.path.join(out_dir, 'stats.csv'))

In [None]:
#sensitivity (TP/P)
#specificity (TN/N)
#precision   (TP/(TP+FP))
#accuracy    (TP + TN)/(P + N)

In [25]:
# Single run with iceplant rows making up 33% of the training data,
# reported through mpe.print_accuracy_info instead of the accumulator lists.
p = 33
# Number of iceplant rows needed so that n / (n + n_nonice) = p / 100.
n = int(p*n_nonice / (100-p))

# Draw n iceplant rows (fixed seed) and stack them on top of all
# non-iceplant rows; build the combined frame once and slice it twice.
ice_sample = ice.sample(n = n, random_state=21)
train = pd.concat([ice_sample, nonice], axis=0)
X_train = train.loc[:,'r':'day_in_year'].to_numpy()
y_train = train.loc[:,'iceplant'].to_numpy()

rfc = RandomForestClassifier(n_estimators = 100, random_state = 42)
rfc.fit(X_train, y_train)

# ------------------------------------------------------
y_pred = rfc.predict(X_test.to_numpy())
y_true = y_test.to_numpy()

mpe.print_accuracy_info(y_true, y_pred)
# ------------------------------------------------------
print('working')


true negatives: 82285     false positives: 3545
false negatives: 7631     true positives: 69641

sensitivity (TP/P): 90.12 %
specificity (TN/N): 95.87 %
G-mean:  0.93

precision (TP/(TP+FP)): 95.16 %

MCC:  0.863311426951114

F1-measure:  0.92572
F0.5-measure (min false positives):  0.94105
F2-measure (min false negatives)  :  0.91088

accuracy: 93.15 %
working
