In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as metrics
from sklearn.preprocessing import StandardScaler
from scipy.stats import uniform, randint
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split, cross_validate

In [2]:
def calc_scores(y_test, y_pred):
    """Compute accuracy, F2 score, DMC score and the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True binary labels (0 or 1).
    y_pred : array-like
        Predicted binary labels (0 or 1). Plain lists are accepted.

    Returns
    -------
    tuple
        ``(accuracy, f2_score, dmc_score, confusion_matrix)``. The confusion
        matrix is always 2x2 with rows indexed by the true label and columns
        by the predicted label, both in the order ``[0, 1]``.
    """
    accuracy = metrics.accuracy_score(y_test, y_pred)

    # Pin the label order so the matrix is always 2x2. Without `labels`, input
    # containing a single class yields a 1x1 matrix that silently broadcasts
    # against the payoff matrix below and produces a wrong score.
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])

    # fbeta_score is skipped when no positive is predicted; the score is
    # defined as 0 here. np.asarray makes the check work for plain lists too.
    no_positive_predictions = np.all(np.asarray(y_pred) == 0)
    f2_score = 0 if no_positive_predictions else metrics.fbeta_score(y_test, y_pred, beta=2)

    # Payoff per confusion-matrix cell (row = true label, column = prediction):
    # true 0 / predicted 1 -> -25, true 1 / predicted 0 -> -5,
    # true 1 / predicted 1 -> +5, true 0 / predicted 0 -> 0.
    payoff = np.array([[0, -25], [-5, 5]])
    dmc_score = np.sum(confusion_matrix * payoff)

    return accuracy, f2_score, dmc_score, confusion_matrix

In [3]:
def own_scorer(estimator, X_val, ground_truth):
    """Scorer callable returning the DMC score of ``estimator`` on ``X_val``.

    The signature ``(estimator, X, y)`` matches what scikit-learn accepts as a
    callable ``scoring`` argument.

    Parameters
    ----------
    estimator : object
        Fitted classifier exposing ``predict``.
    X_val : array-like
        Features to predict on.
    ground_truth : array-like
        True binary labels (0 or 1) for ``X_val``.

    Returns
    -------
    numpy integer
        Sum of the confusion-matrix counts weighted by the payoff matrix.
    """
    prediction = estimator.predict(X_val)
    # Pin the label order so the matrix is always 2x2. Without `labels`, a fold
    # containing a single class yields a 1x1 matrix that silently broadcasts
    # against the payoff matrix and produces a wrong score.
    confusion_matrix = metrics.confusion_matrix(ground_truth, prediction, labels=[0, 1])
    # Payoff per cell (row = true label, column = prediction):
    # true 0 / predicted 1 -> -25, true 1 / predicted 0 -> -5,
    # true 1 / predicted 1 -> +5, true 0 / predicted 0 -> 0.
    payoff = np.array([[0, -25], [-5, 5]])
    return np.sum(confusion_matrix * payoff)

In [4]:
def own_scorer_normalized(estimator, X_val, ground_truth):
    """Scorer callable returning the DMC score per sample.

    Computes the same payoff-weighted confusion-matrix sum as the
    un-normalized DMC score and divides it by the number of samples, which
    makes scores comparable across data sets of different size.

    Parameters
    ----------
    estimator : object
        Fitted classifier exposing ``predict``.
    X_val : array-like
        Features to predict on.
    ground_truth : array-like
        True binary labels (0 or 1) for ``X_val``.

    Returns
    -------
    float
        DMC score divided by ``len(ground_truth)``.
    """
    prediction = estimator.predict(X_val)
    # Pin the label order so the matrix is always 2x2. Without `labels`, a fold
    # containing a single class yields a 1x1 matrix that silently broadcasts
    # against the payoff matrix and produces a wrong score.
    confusion_matrix = metrics.confusion_matrix(ground_truth, prediction, labels=[0, 1])
    # Payoff per cell (row = true label, column = prediction):
    # true 0 / predicted 1 -> -25, true 1 / predicted 0 -> -5,
    # true 1 / predicted 1 -> +5, true 0 / predicted 0 -> 0.
    payoff = np.array([[0, -25], [-5, 5]])
    dmc_score = np.sum(confusion_matrix * payoff)
    return dmc_score / len(ground_truth)

In [5]:
def own_f2_score(estimator, X_val, ground_truth):
    """Scorer callable returning the F-beta score with ``beta=2``.

    Returns 0 without calling ``fbeta_score`` when every prediction equals 0.
    """
    predicted = estimator.predict(X_val)
    # Guard clause: no positive predictions at all -> score is defined as 0.
    if all(predicted == 0):
        return 0
    return metrics.fbeta_score(ground_truth, predicted, beta=2)

In [6]:
def report_best_scores(results, n_top=3):
    """Print the candidates ranked 1..``n_top`` from a search-results mapping.

    ``results`` is indexed with the keys ``'rank_test_score'``,
    ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``. Candidates
    sharing a rank are all printed, in their original order.
    """
    for rank in range(1, n_top + 1):
        has_rank = results['rank_test_score'] == rank
        for idx in np.flatnonzero(has_rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

## Testing your classifier on a fixed validation split and with k-fold cross-validation. Currently only scikit-learn-compatible classifiers are supported.

In [9]:
def test_classification(classifier, df_train, df_val, predict_proba=True, ret_dataframe=True, cv=5):
    """Evaluate a fitted classifier on a fixed split and by cross-validation.

    The classifier must already be fitted: ``predict`` is called on it before
    any fitting happens here. The cross-validation part runs scikit-learn's
    ``cross_validate`` on the concatenation of the training and validation
    data.

    Parameters
    ----------
    classifier : object
        Fitted scikit-learn style classifier (``predict``; ``predict_proba``
        when ``predict_proba=True``).
    df_train, df_val : pandas.DataFrame
        Training and validation frames; each must contain a ``fraud`` column
        holding the target.
    predict_proba : bool, default True
        If True, add the highest class probability per row (rounded to three
        decimals) to the returned frame.
    ret_dataframe : bool, default True
        If True, include the annotated copy of ``df_val`` in the result under
        the key ``'dataframe'``.
    cv : int or cross-validation splitter, default 5
        Passed through to ``cross_validate``.

    Returns
    -------
    dict
        Keys ``dmc_score``, ``dmc_score_norm``, ``cv_dmc_score``,
        ``cv_dmc_score_norm`` and, optionally, ``dataframe``.
    """
    y_train = df_train.fraud
    x_train = df_train.drop(columns=['fraud'])
    y_val = df_val.fraud
    x_val = df_val.drop(columns=['fraud'])

    # Scores on the fixed train/validation split.
    dmc_score = own_scorer(classifier, x_val, y_val)
    dmc_norm = own_scorer_normalized(classifier, x_val, y_val)

    res_dataframe = df_val.copy()
    res_dataframe['prediction'] = classifier.predict(x_val)
    if predict_proba:
        # NOTE: the column name keeps its historical spelling ('probablity')
        # because downstream code may read it.
        res_dataframe['probablity'] = [round(max(row), 3) for row in classifier.predict_proba(x_val)]

    # Cross-validation over train + validation data combined.
    scorings = {"DMC": own_scorer, "DMC_Norm": own_scorer_normalized}
    # DataFrame.append / Series.append were removed in pandas 2.0;
    # pd.concat is the supported equivalent.
    x_train_complete = pd.concat([x_train, x_val])
    y_train_complete = pd.concat([y_train, y_val])
    cross_validation = cross_validate(classifier, x_train_complete, y_train_complete, scoring=scorings, cv=cv)
    cv_dmc_mean = np.mean(cross_validation['test_DMC'])
    cv_dmc_norm_mean = np.mean(cross_validation['test_DMC_Norm'])

    result = {"dmc_score" : dmc_score, "dmc_score_norm" : dmc_norm, "cv_dmc_score": cv_dmc_mean,"cv_dmc_score_norm":cv_dmc_norm_mean}
    if ret_dataframe:
        result['dataframe'] = res_dataframe
    print("Results Fix Split: \nDMC Score: {}  ---  Normalized DMC Score: {}, \n\nResults Cross Validation: \nDMC Score: {}  ---  Normalized DMC Score: {} ".format(dmc_score, dmc_norm, cv_dmc_mean,cv_dmc_norm_mean))
    return result
    

## Add additional features by calling this function on a DataFrame (the DataFrame is modified in place)

In [8]:
def add_new_features(dataframe):
    """Add the derived column ``totalScannedItems`` and return the frame.

    The new column is the product of ``scannedLineItemsPerSecond`` and
    ``totalScanTimeInSeconds``. The column is written onto the frame that was
    passed in, and that same object is returned.
    """
    items_per_second = dataframe['scannedLineItemsPerSecond']
    scan_seconds = dataframe['totalScanTimeInSeconds']
    dataframe['totalScannedItems'] = items_per_second * scan_seconds
    return dataframe

## Scale a DataFrame with a scikit-learn scaler (please scale only the model input `train_x`)

In [1]:
def scale_dataframe(df, scaler, fit=True):
    """Return a scaled copy of ``df`` with its column names and index kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all passed to the scaler.
    scaler : object
        scikit-learn style transformer exposing ``fit_transform`` (and
        ``transform`` when ``fit=False``).
    fit : bool, default True
        If True, fit the scaler on ``df`` before transforming (the original
        behaviour). Pass False to reuse an already fitted scaler, e.g. to
        scale validation data with the statistics of the training data.

    Returns
    -------
    pandas.DataFrame
        New frame holding the scaled values; ``df`` itself is not reassigned.
    """
    scaled_values = scaler.fit_transform(df) if fit else scaler.transform(df)
    # Keep the original index: building the result without it resets the row
    # labels to 0..n-1, so the scaled features would no longer align with a
    # label Series that still carries the original index (e.g. after a split).
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)