<a href="https://colab.research.google.com/github/CptK1ng/dmc2019/blob/alexander_dev/notebooks/semisupervised_learning_hard_cut.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Semi Supervised Learning using the hard cut
by Alexander

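We assume that every entry of the unlabeled test set with trustLevel > 2 is not a fraud, label it accordingly (fraud = 0) and add it to the training data.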

In [0]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import time as time
%matplotlib inline

Download our custom Dataset splits and the unlabeled Test Set:

In [0]:
# Download the custom train/validation splits and the unlabeled test set from Dropbox into the working directory.
# -nc is wget's no-clobber flag (an already existing file is not overwritten); -O sets the local file name.
!wget -nc -q --show-progress https://www.dropbox.com/s/6m8iq9ogpzmu7vx/train_new.csv?dl=1 -O train_new.csv
!wget -nc -q --show-progress https://www.dropbox.com/s/tjpkc45oqn3uv8s/val_new.csv?dl=1 -O val_new.csv
!wget -nc -q --show-progress https://www.dropbox.com/s/hbd6nzgwlnevu4x/test.csv?dl=1 -O test.csv

Import Data:

In [35]:
# All three files are pipe-separated CSVs; read them in one pass and keep the
# untouched originals around for later feature engineering.
df_train_original, df_val_original, df_test_original = (
    pd.read_csv(csv_name, sep="|")
    for csv_name in ("train_new.csv", "val_new.csv", "test.csv")
)
df_train_original.head(2)

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,4,828,66.56,7,4,3,0.007246,0.080386,1.166667,0
1,1,1612,31.34,2,4,3,0.008685,0.019442,0.142857,0


## Feature Engineering

In [36]:
def prepareData(df):
  """Return a copy of ``df`` extended with the ``totalLineItems`` feature.

  ``totalLineItems`` is ``scannedLineItemsPerSecond * totalScanTimeInSeconds``,
  i.e. the number of scanned products. The frame passed in is not modified.
  """
  items_per_second = df['scannedLineItemsPerSecond']
  scan_seconds = df['totalScanTimeInSeconds']
  # rate * duration = number of scanned products
  return df.assign(totalLineItems=items_per_second * scan_seconds)

# Derive the engineered features for every split; the *_original frames stay untouched.
df_train, df_val, df_test = (
    prepareData(frame)
    for frame in (df_train_original, df_val_original, df_test_original)
)

df_train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud,totalLineItems
0,4,828,66.56,7,4,3,0.007246,0.080386,1.166667,0,6.0
1,1,1612,31.34,2,4,3,0.008685,0.019442,0.142857,0,14.0
2,3,848,52.37,2,4,0,0.022406,0.061757,0.105263,0,19.0
3,1,321,76.03,8,7,2,0.071651,0.236854,0.347826,0,23.0
4,1,660,6.06,3,7,1,0.027273,0.009182,0.166667,0,18.0


## Add rows to train and split X and Y

In [37]:
# Hard cut: unlabeled test rows whose trustLevel exceeds this value are treated as "no fraud".
HARD_CUT_TRUST_LEVEL = 2

# Pseudo-label the high-trust test entries as non-fraud (fraud = 0) and append
# them to the labeled training data. The mask is built from df_test itself, so
# the cell does not depend on any variable left over from an earlier session.
df_test_extract = df_test[df_test.trustLevel > HARD_CUT_TRUST_LEVEL].copy()
df_test_extract["fraud"] = 0
df_train_new = pd.concat([df_train, df_test_extract], sort=False)

# Separate the feature columns (X) from the fraud label (y) for each dataset:
# original training set, augmented training set and validation set.
df_train_X = df_train.drop('fraud', axis=1)
df_train_y = df_train['fraud']
df_train_new_X = df_train_new.drop('fraud', axis=1)
df_train_new_y = df_train_new['fraud']
df_val_X = df_val.drop('fraud', axis=1)
df_val_y = df_val['fraud']

# Plain numpy arrays for scikit-learn
X_train, y_train = df_train_X.values, df_train_y.values
X_train_new, y_train_new = df_train_new_X.values, df_train_new_y.values
X_val, y_val = df_val_X.values, df_val_y.values
print("Shapes",X_train.shape, X_val.shape, X_train_new.shape, y_train.shape, y_val.shape, y_train_new.shape)

Shapes (1503, 10) (376, 10) (333998, 10) (1503,) (376,) (333998,)


## Simple Classifier


In [0]:
def score_function(y_true, y_pred):
  """Score binary fraud predictions.

  Args:
    y_true: array-like of true labels (0 = no fraud, 1 = fraud).
    y_pred: array-like of predicted labels, same length as ``y_true``.

  Returns:
    Tuple ``(f2, dmc, dmc_relative, confusion)``:
      f2           -- F-beta score with beta=2, or 0 if nothing was predicted as fraud
      dmc          -- confusion matrix weighted by the cost matrix [[0, -25], [-5, 5]]
      dmc_relative -- ``dmc / len(y_pred)``; comparable across set sizes, higher is better
      confusion    -- confusion matrix as nested list ``[[tn, fp], [fn, tp]]``
  """
  y_pred = np.asarray(y_pred)  # lets the element-wise comparison below work for plain lists too
  # labels=[0, 1] pins the 2x2 [[tn, fp], [fn, tp]] layout even when one class is
  # absent; otherwise sklearn returns a 1x1 matrix that would silently broadcast
  # against the cost matrix and produce a wrong score.
  confusion = metrics.confusion_matrix(y_true, y_pred, labels=[0, 1])
  cost_matrix = np.array([[0, -25], [-5, 5]])
  dmc = np.sum(confusion * cost_matrix)
  # F-beta is not meaningful when no sample is predicted positive; report 0 instead
  f2 = 0 if np.all(y_pred == 0) else metrics.fbeta_score(y_true, y_pred, beta=2)
  return (f2,
          dmc,
          dmc/len(y_pred), #comparable relative score, the higher the better.
          confusion.tolist())

##  Classify labels for validation set on original training set

In [39]:
# Baseline: AdaBoost over depth-3 decision trees, trained on the original labeled training set only.
# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; positional works with both.
classifier_adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=500, algorithm='SAMME', random_state=1)

# Fit the model and print the score_function result for the validation set
classifier_adb.fit(X_train, y_train)
print("AdaBoost", "\t", score_function(y_val, classifier_adb.predict(X_val)) )

AdaBoost 	 (0.8771929824561403, 35, 0.09308510638297872, [[351, 2], [3, 20]])


##  Classify labels for validation set on new training set

In [40]:
# Same model configuration as the baseline, but trained on the training set augmented
# with the pseudo-labeled high-trust test entries.
# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; positional works with both.
classifier_adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=500, algorithm='SAMME', random_state=1)

# Fit the model and print the score_function result for the validation set
classifier_adb.fit(X_train_new, y_train_new)
print("AdaBoost", "\t", score_function(y_val, classifier_adb.predict(X_val)) )

AdaBoost 	 (0.8482142857142857, 50, 0.13297872340425532, [[352, 1], [4, 19]])


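As we can see, adding the pseudo-labeled high-trust test entries improved the relative DMC score on the validation set from about 0.093 to 0.133 per sample (absolute score 35 → 50), although the F2 score dropped slightly from 0.877 to 0.848.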