In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as metrics
from scipy.stats import uniform, randint
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split, cross_validate
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
def calc_scores(y_test, y_pred):
    """Compute accuracy, F2 score, DMC score and confusion matrix for binary predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels (0 / 1).
    y_pred : array-like
        Predicted labels (0 / 1).

    Returns
    -------
    tuple
        ``(accuracy, f2_score, dmc_score, confusion_matrix)``. The confusion
        matrix is always 2x2 with rows = true class and columns = predicted
        class, in label order ``[0, 1]``.
    """
    accuracy = metrics.accuracy_score(y_test, y_pred)
    # Pin the label order: without it the matrix collapses to 1x1 when only one
    # class occurs, and the 1x1 matrix would broadcast against the 2x2 cost
    # matrix below and yield a wrong score.
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    # With no positive prediction at all the F2 score is reported as 0 without
    # calling fbeta_score. np.asarray also makes plain Python lists work here.
    no_positive_predicted = np.all(np.asarray(y_pred) == 0)
    f2_score = 0 if no_positive_predicted else metrics.fbeta_score(y_test, y_pred, beta=2)
    # Cost matrix aligned with the confusion matrix:
    # true 0 / pred 0 -> 0, true 0 / pred 1 -> -25, true 1 / pred 0 -> -5, true 1 / pred 1 -> +5.
    dmc_score = np.sum(confusion_matrix * np.array([[0, -25], [-5, 5]]))

    return accuracy, f2_score, dmc_score, confusion_matrix

In [3]:
def own_scorer(estimator, X_val, ground_truth):
    """Scorer callable returning the total DMC score of ``estimator`` on ``X_val``.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict``.
    X_val : array-like
        Samples to predict.
    ground_truth : array-like
        True labels (0 / 1) for ``X_val``.

    Returns
    -------
    numeric
        Sum of the confusion-matrix counts weighted by the cost matrix.
    """
    prediction = estimator.predict(X_val)
    # labels=[0, 1] keeps the matrix 2x2 even if a fold contains a single class;
    # a 1x1 matrix would broadcast against the cost matrix and give a wrong score.
    confusion_matrix = metrics.confusion_matrix(ground_truth, prediction, labels=[0, 1])
    # Rows = true class, columns = predicted class:
    # true 0 / pred 1 costs -25, true 1 / pred 0 costs -5, true 1 / pred 1 earns +5.
    dmc_score = np.sum(confusion_matrix * np.array([[0, -25], [-5, 5]]))
    return dmc_score

In [4]:
def own_scorer_normalized(estimator, X_val, ground_truth):
    """Scorer callable returning the DMC score per sample.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict``.
    X_val : array-like
        Samples to predict.
    ground_truth : sized array-like
        True labels (0 / 1) for ``X_val``.

    Returns
    -------
    float
        Total DMC score divided by ``len(ground_truth)``.
    """
    prediction = estimator.predict(X_val)
    # labels=[0, 1] keeps the matrix 2x2 even if a fold contains a single class;
    # a 1x1 matrix would broadcast against the cost matrix and give a wrong score.
    confusion_matrix = metrics.confusion_matrix(ground_truth, prediction, labels=[0, 1])
    # Rows = true class, columns = predicted class:
    # true 0 / pred 1 costs -25, true 1 / pred 0 costs -5, true 1 / pred 1 earns +5.
    dmc_score = np.sum(confusion_matrix * np.array([[0, -25], [-5, 5]]))
    return dmc_score/len(ground_truth)

In [5]:
def own_f2_score(estimator, X_val, ground_truth):
    """Scorer callable returning the F-beta score (beta=2) of ``estimator``.

    When every prediction equals 0 the score is reported as 0 directly,
    without calling ``fbeta_score``.
    """
    predicted_labels = estimator.predict(X_val)
    if all(predicted_labels == 0):
        return 0
    return metrics.fbeta_score(ground_truth, predicted_labels, beta=2)

In [6]:
def report_best_scores(results, n_top=3):
    """Print mean/std validation score and parameters of the top-ranked candidates.

    ``results`` is indexed with the keys ``'rank_test_score'``,
    ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``. Ranks
    ``1 .. n_top`` are reported in order; every candidate sharing a rank is
    printed.
    """
    for rank in range(1, n_top + 1):
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for position in tied_positions:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][position]
            score_std = results['std_test_score'][position]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, score_std))
            print("Parameters: {0}".format(results['params'][position]))
            print("")

In [2]:
def own_cross_validation(classifier, data, cv=5):
    """Print the number of folds a ``KFold(n_splits=cv)`` splitter reports for ``data``.

    NOTE(review): ``classifier`` is accepted but not used; this looks like an
    unfinished stub - confirm the intended behaviour before building on it.
    """
    k = KFold(n_splits=cv)
    # get_n_splits returns a plain int, so it is printed directly
    # (an int has no ``.shape`` attribute).
    splits = k.get_n_splits(data)
    print(splits)

## Testing your classifier with multiple k-fold cross-validations and shuffles. Currently only sklearn-compatible classifiers are supported

In [1]:
def test_classification(classifier, df_train, df_val, predict_proba=True, ret_dataframe=True):
    """Fit ``classifier`` on a fixed train/validation split and cross-validate it.

    The classifier is fitted on ``df_train`` and scored on ``df_val`` with
    ``own_scorer`` / ``own_scorer_normalized``. Afterwards both frames are
    concatenated and a 5-fold ``cross_validate`` is run with the same two
    scorers. A summary of all four scores is printed.

    Parameters
    ----------
    classifier : object
        Estimator exposing ``fit`` and ``predict`` (and ``predict_proba`` when
        ``predict_proba`` is True).
    df_train, df_val : pandas.DataFrame
        Frames containing the features plus a ``fraud`` label column.
    predict_proba : bool, default True
        If True, store the highest class probability per validation sample.
    ret_dataframe : bool, default True
        If True, include the annotated copy of ``df_val`` in the result.

    Returns
    -------
    dict
        Keys ``dmc_score``, ``dmc_score_norm``, ``cv_dmc_score``,
        ``cv_dmc_score_norm`` and, if requested, ``dataframe``.
    """
    res_dataframe = df_val.copy()
    y_train = df_train.fraud
    x_train = df_train.drop(columns=['fraud'])
    y_val = df_val.fraud
    x_val = df_val.drop(columns=['fraud'])

    classifier.fit(x_train, y_train)

    # Fixed-split evaluation
    dmc_score = own_scorer(classifier, x_val, y_val)
    dmc_norm = own_scorer_normalized(classifier, x_val, y_val)
    res_dataframe['prediction'] = classifier.predict(x_val)

    if predict_proba:
        # Highest class probability per sample, rounded to 3 decimals.
        # The column name is kept exactly as spelled in case existing
        # consumers of the returned frame read it.
        class_probabilities = classifier.predict_proba(x_val)
        res_dataframe['probablity'] = [round(max(x), 3) for x in class_probabilities]

    # Cross validation on train + validation data combined
    scorings = {"DMC": own_scorer, "DMC_Norm": own_scorer_normalized}

    # pd.concat replaces DataFrame.append / Series.append, which no longer
    # exist in pandas >= 2.0.
    x_train_complete = pd.concat([x_train, x_val])
    y_train_complete = pd.concat([y_train, y_val])
    cross_validation = cross_validate(classifier, x_train_complete, y_train_complete, scoring=scorings, cv=5)
    cv_dmc_mean = sum(cross_validation['test_DMC'])/len(cross_validation['test_DMC'])
    cv_dmc_norm_mean = sum(cross_validation['test_DMC_Norm'])/len(cross_validation['test_DMC_Norm'])

    result = {"dmc_score" : dmc_score, "dmc_score_norm" : dmc_norm, "cv_dmc_score": cv_dmc_mean,"cv_dmc_score_norm":cv_dmc_norm_mean}
    if ret_dataframe:
        result['dataframe'] = res_dataframe
    print("Results Fix Split: \nDMC Score: {}  ---  Normalized DMC Score: {}, \n\nResults Cross Validation: \nDMC Score: {}  ---  Normalized DMC Score: {} ".format(dmc_score, dmc_norm, cv_dmc_mean,cv_dmc_norm_mean))
    return result
    

## Add additional features by calling this function on the specific DataFrame

In [8]:
def add_new_features(dataframe):
    """Add the derived column ``totalScannedItems`` to ``dataframe`` in place.

    The new column is the product of ``scannedLineItemsPerSecond`` and
    ``totalScanTimeInSeconds``. The same (mutated) frame is returned.
    """
    items_per_second = dataframe['scannedLineItemsPerSecond']
    scan_seconds = dataframe['totalScanTimeInSeconds']
    dataframe['totalScannedItems'] = items_per_second * scan_seconds
    return dataframe

## Scale a DataFrame with a scikit-learn scaler (please scale only the model input "train_x")

In [1]:
def scale_dataframe(df, scaler):
    """Return a scaled copy of ``df`` that keeps its index and column labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all passed to the scaler.
    scaler : object
        Object exposing ``transform``; it is only applied here, never fitted.

    Returns
    -------
    pandas.DataFrame
        New frame holding the transformed values under the same index and
        columns as ``df``.
    """
    scaled_values = scaler.transform(df)
    # Build the result directly so the original row index is preserved and the
    # scaled rows stay aligned with labels taken from the same source frame.
    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

## Find the nearest neighbour of a test sample

In [2]:
def find_nearest_neighbor(row_scaled, dataset_scaled):
    """Find the row of ``dataset_scaled`` closest to ``row_scaled[0]``.

    Parameters
    ----------
    row_scaled : sequence
        Only its first element (``row_scaled[0]``) is used as the query vector.
    dataset_scaled : iterable
        Iterated row by row; each row is compared with the query vector.

    Returns
    -------
    tuple
        ``(idx, squared_distance)`` - position of the closest row and its
        squared Euclidean distance to the query vector.
    """
    diffs = [np.sum((row_scaled[0] - ds_row)**2) for ds_row in dataset_scaled]
    # np.argmin on a 1-D sequence already returns a scalar position;
    # indexing that result with [0] raises IndexError.
    idx = np.argmin(diffs)
    return idx, diffs[idx]

In [None]:
def best_classifier_for_sample(idx, validation_set):
    """Name the classifier that handled the sample at position ``idx`` best.

    The row is read positionally (``iloc``) and must provide ``fraud``,
    ``lsvc_predict`` and ``xgb_predict``; ``lsvc_proba`` and ``xgb_proba`` are
    only read when both predictions are correct.

    Returns ``"lsvc"`` or ``"xgboost"``, or ``None`` when neither classifier
    predicted the label correctly.
    """
    sample = validation_set.iloc[idx]
    ground_truth = sample.fraud
    lsvc_correct = sample.lsvc_predict == ground_truth
    xgb_correct = sample.xgb_predict == ground_truth

    # Both correct: the strictly higher probability wins, ties go to xgboost
    if lsvc_correct and xgb_correct:
        return "lsvc" if sample.lsvc_proba > sample.xgb_proba else "xgboost"

    # Exactly one of the two was correct
    if lsvc_correct:
        return "lsvc"
    if xgb_correct:
        return "xgboost"

    # Neither classifier predicted the sample correctly
    return None
    
    

## Transformer Class

In [7]:
class DataTransformer:
    """
    Wraps a scaler together with the feature-engineering step so that both can
    be applied to a DataFrame in one call (see ``transform``).
    """

    def __init__(self, scaler):
        """Store the scaler object used by all scaling methods."""
        self.scaler = scaler

    def fit_scaler(self, df):
        """Fit the scaler on a float64 copy of ``df`` and return ``self``."""
        frame_as_float = df.copy().astype(np.float64)
        self.scaler.fit(frame_as_float)
        return self

    def apply_scaler(self, df):
        """Return a new frame with the scaler's ``transform`` applied; index and columns are kept."""
        source = df.copy()
        scaled_values = self.scaler.transform(source)
        return pd.DataFrame(scaled_values, index=source.index, columns=source.columns)

    def inverse_scale(self, df):
        """Return a new frame with the scaler's ``inverse_transform`` applied; index and columns are kept."""
        source = df.copy()
        restored_values = self.scaler.inverse_transform(source)
        return pd.DataFrame(restored_values, index=source.index, columns=source.columns)

    def add_features(self, df):
        """Return a copy of ``df`` extended by the ``totalScannedLineItems`` column."""
        # TODO: Choose relevant features
        enriched = df.copy()
        enriched['totalScannedLineItems'] = (
            enriched['scannedLineItemsPerSecond'] * enriched['totalScanTimeInSeconds']
        )
        # Further candidate features, currently disabled:
        #df['avgTimePerScan'] = 1/ df['scannedLineItemsPerSecond']
        #df['avgValuePerScan'] = df['avgTimePerScan'] * df['valuePerSecond']
        #df['withoutRegisPerPosition'] = df['scansWithoutRegistration'] / df['totalScannedLineItems'] #equivalent to lineItemVoidsPerPosition?
        #df['quantiModPerPosition'] = df['quantityModifications'] / df['totalScannedLineItems']
        #df['lineItemVoidsPerTotal'] = df['lineItemVoids'] / df['grandTotal']
        #df['withoutRegisPerTotal'] = df['scansWithoutRegistration'] / df['grandTotal']
        #df['quantiModPerTotal'] = df['quantityModifications'] / df['grandTotal']
        #df['lineItemVoidsPerTime'] = df['lineItemVoids'] / df['totalScanTimeInSeconds']
        #df['withoutRegisPerTime'] = df['scansWithoutRegistration'] / df['totalScanTimeInSeconds']
        #df['quantiModPerTime'] = df['quantityModifications'] / df['totalScanTimeInSeconds']
        #df['valuePerScannedLineItem'] = df['valuePerSecond'] / df['scannedLineItemsPerSecond']
        return enriched

    def transform(self, df):
        """
        Run the complete pipeline on a copy of ``df``:
            1.) add_features
            2.) apply_scaler
        """
        with_features = self.add_features(df.copy())
        return self.apply_scaler(with_features)
    


## KNN class

In [4]:
class KNNLookup():
    """Nearest-neighbour lookup over a reference data set.

    Two strategies are offered: a brute-force scan (``find_nearest_neighbor``)
    and a query against a ``KDTree`` built from the data passed to the
    constructor or to ``refit`` (``find_nearest_neighbor2``). ``KDTree`` has to
    be importable at module level.
    """

    def __init__(self, knn_data):
        """Build the tree over ``knn_data``."""
        self.refit(knn_data)

    def refit(self, knn_data):
        """Replace the stored tree with a new one built over ``knn_data``."""
        self.tree = KDTree(knn_data)

    def find_nearest_neighbor(self, row_scaled, dataset_scaled):
        """Brute-force search for the row of ``dataset_scaled`` closest to ``row_scaled``.

        Does not use the stored tree. Returns ``(position, squared_distance)``
        where ``position`` is the positional row number (not the index label)
        and the distance is the *squared* Euclidean distance.
        """
        # One vectorised pass over all rows instead of a Python-level loop.
        squared_diffs = np.asarray(((dataset_scaled - row_scaled) ** 2).sum(axis=1))
        idx = np.argmin(squared_diffs)
        return idx, squared_diffs[idx]

    def find_nearest_neighbor2(self, row_scaled, dataset_scaled):
        """Query the stored tree for the single nearest neighbour of ``row_scaled``.

        ``dataset_scaled`` is not used; the search runs against the data the
        tree was built from. Returns ``(index, distance)`` exactly as reported
        by the tree's ``query`` for ``k=1``.
        """
        dist, ind = self.tree.query([row_scaled.values], k=1)
        return np.ravel(ind)[0], np.ravel(dist)[0]

## Plot results

In [1]:
import matplotlib.pyplot as plt
def _precision_recall(conf_matrix):
    """Return ``(precision, recall)`` of the positive class for a 2x2 confusion matrix.

    NOTE(review): assumes rows = true class and columns = predicted class in
    label order [0, 1], i.e. [[TN, FP], [FN, TP]] - the layout the DMC cost
    matrix [[0, -25], [-5, 5]] in this notebook is applied to. Confirm that
    the stored 'conf_matrix' entries are produced that way.
    A ratio with a zero denominator is reported as NaN.
    """
    tp = conf_matrix[1][1]
    fp = conf_matrix[0][1]
    fn = conf_matrix[1][0]
    # precision = TP / (TP + FP), recall = TP / (TP + FN)
    precision = tp / (tp + fp) if (tp + fp) else float('nan')
    recall = tp / (tp + fn) if (tp + fn) else float('nan')
    return precision, recall


def _collect_ssl_history(result_dict):
    """Regroup per-iteration results into ``history[split][metric][classifier] -> list``."""
    history = {
        split: {'dmc_score': {}, 'precision': {}, 'recall': {}}
        for split in ('train', 'val')
    }
    for iteration_dict in result_dict:
        for classifier, results in iteration_dict.items():
            for set_name, result in results.items():
                precision, recall = _precision_recall(result['conf_matrix'])
                # Every set that is not named "val" is counted as training data.
                split = 'val' if set_name == "val" else 'train'
                metrics_of_split = history[split]
                metrics_of_split['dmc_score'].setdefault(classifier, []).append(result['dmc_score'])
                metrics_of_split['precision'].setdefault(classifier, []).append(precision)
                metrics_of_split['recall'].setdefault(classifier, []).append(recall)
    return history


def plot_results_ssl(result_dict):
    """Plot DMC score, precision and recall per iteration for every classifier.

    Parameters
    ----------
    result_dict : iterable of dict
        One entry per iteration. Each entry maps a classifier name to a dict
        that maps a set name (``"val"``; any other name is treated as the
        train set) to a dict with the keys ``'conf_matrix'`` and
        ``'dmc_score'``.

    Two figures are drawn (figure 1: train set, figure 2: val set), each with
    three stacked panels: DMC score, precision and recall. The classifiers
    ``lin_svc``, ``xgboost`` and ``own_classifier`` are drawn first in red,
    blue and green; any other classifier found in the data is drawn after them
    with matplotlib's default colours.
    """
    known_colors = {'lin_svc': 'r', 'xgboost': 'b', 'own_classifier': 'g'}
    # (metric key, title prefix, y-axis label) for the three stacked panels
    panels = (
        ('dmc_score', 'DMC Score', 'Score'),
        ('precision', 'Precision', 'Precision'),
        ('recall', 'Recall', 'Recall'),
    )

    # Read the results from the argument, not from a notebook-level variable.
    history = _collect_ssl_history(result_dict)

    for figure_number, split in enumerate(('train', 'val'), start=1):
        # clear=True avoids drawing on top of a figure left over from an earlier call
        fig, axes = plt.subplots(3, 1, num=figure_number, figsize=(7, 10.5), clear=True)
        for ax, (metric, title_prefix, y_label) in zip(axes, panels):
            series = history[split][metric]
            ordered_names = [name for name in known_colors if name in series]
            ordered_names += [name for name in series if name not in known_colors]
            for name in ordered_names:
                ax.plot(series[name], label=name, color=known_colors.get(name))
            ax.set_title('{} on the {} set'.format(title_prefix, split))
            ax.set_ylabel(y_label)
            ax.set_xlabel('Iteration')
            if ordered_names:
                ax.legend()
        fig.tight_layout()

    plt.show()