<a href="https://colab.research.google.com/github/CptK1ng/dmc2019/blob/master/notebooks/evaluate_automl_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# evaluate results from AutoML

In [0]:
import numpy as np
import pandas as pd
from sklearn import metrics
from IPython.display import display

In [0]:
!wget -nc -q --show-progress https://www.dropbox.com/s/6m8iq9ogpzmu7vx/train_new.csv?dl=1 -O train_new.csv
!wget -nc -q --show-progress https://www.dropbox.com/s/tjpkc45oqn3uv8s/val_new.csv?dl=1 -O val_new.csv
!wget -nc -q --show-progress https://www.dropbox.com/s/hbd6nzgwlnevu4x/test.csv?dl=1 -O test.csv
!wget -nc -q --show-progress https://www.dropbox.com/s/z4nowycqk0fgu0u/pred_test_1.csv?dl=1 -O pred_test_1.csv
!wget -nc -q --show-progress https://www.dropbox.com/s/1auugm1h0yzliqo/pred_val_1.csv?dl=1 -O pred_val_1.csv
!wget -nc -q --show-progress https://www.dropbox.com/s/t8fmybg7212yg1a/pred_val_2.csv?dl=1 -O pred_val_2.csv

In [66]:
# download datasets
train_df = pd.read_csv("train_new.csv", sep="|")
val_df = pd.read_csv("val_new.csv", sep="|")
test_df = pd.read_csv("test.csv", sep="|")

#download predictions
test_pred_df = pd.read_csv("pred_test_1.csv", sep=",")
test_pred_df["fraud_pred"] = np.where(test_pred_df["fraud_1_score"] > 0.5, 1, 0)
val_pred1_df = pd.read_csv("pred_val_1.csv", sep=",").rename(columns={'fraud':'fraud_pred'})
val_pred2_df = pd.read_csv("pred_val_2.csv", sep=",").rename(columns={'fraud':'fraud_pred'})

# calculate pred_thresh for classification based on 0.71 treshold
treshold = 0.71
test_pred_df["fraud_pred_thresh"] = np.where(test_pred_df["fraud_1_score"] > treshold, 1, 0)
val_pred1_df["fraud_pred_thresh"] = np.where(val_pred1_df["fraud_1_score"] > treshold, 1, 0)
val_pred2_df["fraud_pred_thresh"] = np.where(val_pred2_df["fraud_1_score"] > treshold, 1, 0)

# as we can see the following are having different X values / another sorting:
display(val_df.head(2))
display(val_pred1_df.head(2))

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,6,1504,89.37,8,5,3,0.018617,0.059422,0.285714,0
1,6,128,63.32,5,4,4,0.226562,0.494688,0.172414,0


Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud_pred,fraud_0_score,fraud_1_score,fraud_pred_thresh
0,2,932,80.12,2,1,3,0.003219,0.085966,0.666667,0,0.987175,0.012826,0
1,2,622,60.75,11,5,4,0.012862,0.097669,1.375,0,0.987175,0.012826,0


In [67]:
# predictions unfortunately are shuffled, use X columns (so without 'fraud' column) as key for classification
val_df_i = val_df.set_index(list(test_df.columns), verify_integrity=True)
test_df_i = test_df.set_index(list(test_df.columns), verify_integrity=True)

val_pred1_df_i = val_pred1_df.set_index(list(test_df.columns), verify_integrity=True)
val_pred2_df_i = val_pred2_df.set_index(list(test_df.columns), verify_integrity=True)
test_pred_df_i = test_pred_df.set_index(list(test_df.columns), verify_integrity=True)

# as we can see it is still another sorting but indices allow us to join the dataframes now.
display(val_df_i.head(3))
display(val_pred1_df_i.head(3))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,fraud
trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,Unnamed: 9_level_1
6,1504,89.37,8,5,3,0.018617,0.059422,0.285714,0
6,128,63.32,5,4,4,0.226562,0.494688,0.172414,0
3,1675,21.22,10,6,4,0.001791,0.012669,3.333333,0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,fraud_pred,fraud_0_score,fraud_1_score,fraud_pred_thresh
trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2,932,80.12,2,1,3,0.003219,0.085966,0.666667,0,0.987175,0.012826,0
2,622,60.75,11,5,4,0.012862,0.097669,1.375,0,0.987175,0.012826,0
1,1264,42.18,0,6,4,0.014241,0.03337,0.0,0,0.815823,0.184177,0


In [68]:
# join predictions to original val data
val_pred1_df_c = val_df_i.join(val_pred1_df_i)
val_pred2_df_c = val_df_i.join(val_pred2_df_i)
test_pred_df_c = test_df_i.join(test_pred_df_i)

val_pred1_df_c.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,fraud,fraud_pred,fraud_0_score,fraud_1_score,fraud_pred_thresh
trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6,1504,89.37,8,5,3,0.018617,0.059422,0.285714,0,0,0.991176,0.008824,0
6,128,63.32,5,4,4,0.226562,0.494688,0.172414,0,0,0.991176,0.008824,0
3,1675,21.22,10,6,4,0.001791,0.012669,3.333333,0,0,0.993208,0.006792,0
1,1753,14.41,5,1,1,0.006275,0.00822,0.454545,0,0,0.966544,0.033456,0
5,157,33.89,11,3,3,0.146497,0.21586,0.478261,0,0,0.993208,0.006792,0


In [72]:
# calculate scores
def score_function(y_true, y_pred):
  dmc = np.sum(metrics.confusion_matrix(y_true, y_pred)*np.array([[0, -25],[ -5, 5]])) #sklearn gives [[tn,fp],[fn,tp]]
  return (#0 if all(y_pred == 0) else metrics.fbeta_score(y_true, y_pred, beta=2),
          dmc, 
          dmc/len(y_pred), #comparable relative score, the higher the better.
          metrics.confusion_matrix(y_true, y_pred).tolist(),
          0 if all(y_pred == 0) else metrics.fbeta_score(y_true, y_pred, beta=0.5172))

print("val with groundtruth   ",score_function(val_pred1_df_c['fraud'].values, val_pred1_df_c["fraud"].values))
print("val 1 with 0.5   ",score_function(val_pred1_df_c['fraud'].values, val_pred1_df_c["fraud_pred"].values))
print("val 1 with thresh",score_function(val_pred1_df_c['fraud'].values, val_pred1_df_c["fraud_pred_thresh"].values))
print("val 2 with 0.5   ",score_function(val_pred2_df_c['fraud'].values, val_pred2_df_c["fraud_pred"].values))
print("val 2 with thresh",score_function(val_pred2_df_c['fraud'].values, val_pred2_df_c["fraud_pred_thresh"].values))
print("test             ","no ground truth available, TODO: Compare distributions")

val with groundtruth    (115, 0.3058510638297872, [[353, 0], [0, 23]], 1.0)
val 1 with 0.5    (115, 0.3058510638297872, [[353, 0], [0, 23]], 1.0)
val 1 with thresh (-35, -0.09308510638297872, [[353, 0], [15, 8]], 0.7164836794317928)
val 2 with 0.5    (115, 0.3058510638297872, [[353, 0], [0, 23]], 1.0)
val 2 with thresh (-35, -0.09308510638297872, [[353, 0], [15, 8]], 0.7164836794317928)
test              no ground truth available, TODO: Compare distributions


In [71]:
# compare test set:
test_pred_df_c.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,fraud_0_score,fraud_1_score,fraud_pred,fraud_pred_thresh
trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4,467,88.48,4,8,4,0.014989,0.189465,0.571429,1.0,1.228802e-15,0,0
3,1004,58.99,7,6,1,0.026892,0.058755,0.259259,1.0,4.509405e-10,0,0
1,162,14.0,4,5,4,0.006173,0.08642,4.0,1.0,0.0,0,0
5,532,84.79,9,3,4,0.026316,0.15938,0.642857,1.0,4.408285e-16,0,0
5,890,42.16,4,0,0,0.021348,0.047371,0.210526,1.0,1.02818e-14,0,0
5,1072,12.67,3,4,1,0.01959,0.011819,0.142857,1.0,6.953312e-12,0,0
3,259,93.75,0,7,0,0.100386,0.361969,0.0,1.0,6.456098e-14,0,0
2,1528,47.35,2,9,5,0.009817,0.030988,0.133333,0.971603,0.02839732,0,0
6,816,80.89,9,4,0,0.017157,0.09913,0.642857,1.0,1.067447e-15,0,0
4,16,31.91,7,7,4,1.3125,1.994375,0.333333,1.0,4.439841e-17,0,0
