In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Warnings ignored to reduce output; warnings come from shap package that parameter was set automatically; remove to check for other warnings (e.g. package out of date)
import warnings
warnings.simplefilter('ignore')

# Set random seed 
RSEED = 100

In [2]:
# Load Training Data

train_data = pd.read_csv("../00_Data_Sets/1_mio_dataset_2010_2014.csv");
test_data = pd.read_csv("../00_Data_Sets/100_k_dataset_2015.csv");

In [4]:
# Load Input Data

passenger_outlier = pd.read_csv("../01_Synthetic Data/02_Passenger Count/passenger count_outlier_predited_02.csv");
passenger_drift = pd.read_csv("../01_Synthetic Data/02_Passenger Count/passenger count_drift_total_predicted_03.csv");
distance_drift = pd.read_csv("../01_Synthetic Data/03_Distance/distance_drift_total_predicted_02.csv");
location_drift = pd.read_csv("../01_Synthetic Data/01_Location/location_drift_total_predicted_02.csv");
fare_amount_drift = pd.read_csv("../01_Synthetic Data/04_Fare Amount/fare-amount_drift_total_02.csv");
incoming_data_location_outlier = pd.read_csv("../01_Synthetic Data/01_Location/location_outlier_predicted_01.csv");
incoming_data_distance_outlier = pd.read_csv("../01_Synthetic Data/03_Distance/distance_drift_total_predicted_02.csv");

In [5]:
def monitor_drift_main_metrics(training_data, incoming_batch, start_index, end_index, batch_info, column, thresholds):
    batch_thresholds = thresholds[batch_info['name'].lower() + '_batch_drift']
    threshold = batch_thresholds['one_dim_drift_metric']
            
    if abs(incoming_batch.iloc[start_index:end_index,column].mean()) > abs((training_data.iloc[:,column].mean() * (1 + threshold))):
        print('[DRIFT][{} Window]: Upwards Data Drift detected!. ({} AVG: {})\t(Index: {})'.format(batch_info['name'], training_data.columns[column], incoming_batch.iloc[start_index:end_index,column].mean(), end_index))
        
    if abs(incoming_batch.iloc[start_index:end_index,column].quantile(0.25)) > abs((training_data.iloc[:,column].quantile(0.25) * (1 + threshold))):
        print('[DRIFT][{} Window]: Upwards Data Drift detected! ({} 25%-Quantile: {})\t(Index: {})'.format(batch_info['name'], training_data.columns[column], incoming_batch.iloc[start_index:end_index,column].mean(), end_index))
        
    if abs(incoming_batch.iloc[start_index:end_index,column].quantile(0.75)) > abs((training_data.iloc[:,column].quantile(0.75) * (1 + threshold))):
        print('[DRIFT][{} Window]: Upwards Data Drift detected! ({} 75%-Quantile: {})\t(Index: {})'.format(batch_info['name'], training_data.columns[column], incoming_batch.iloc[start_index:end_index,column].mean(), end_index))
        
    if abs(incoming_batch.iloc[start_index:end_index,column].mean()) < abs((training_data.iloc[:,column].mean() * (1 - threshold))):
        print('[DRIFT][{} Window]: Downwards Data Drift detected! ({} AVG: {})\t(Index: {})'.format(batch_info['name'], training_data.columns[column], incoming_batch.iloc[start_index:end_index,column].mean(), end_index))
        
    if abs(incoming_batch.iloc[start_index:end_index,column].quantile(0.25)) < abs((training_data.iloc[:,column].quantile(0.25) * (1 - threshold))):
        print('[DRIFT][{} Window]: Downwards Data Drift detected! ({} 25%-Quantile: {})\t(Index: {})'.format(batch_info['name'], training_data.columns[column], incoming_batch.iloc[start_index:end_index,column].mean(), end_index))
        
    if abs(incoming_batch.iloc[start_index:end_index,column].quantile(0.75)) < abs((training_data.iloc[:,column].quantile(0.75) * (1 - threshold))):
        print('[DRIFT][{} Window]: Downwards Data Drift detected! ({} 75%-Quantile: {})\t(Index: {})'.format(batch_info['name'], training_data.columns[column], incoming_batch.iloc[start_index:end_index,column].mean(), end_index))
        
    return incoming_batch.iloc[start_index:end_index,column].mean();    

        
from math import log2

def kl_divergence(p, q):
    return sum(p[i] * log2(p[i]/q[i]) for i in range(len(p)))

        
def monitor_drift_kl_divergence(training_data, incoming_batch, start_index, end_index, batch_info, column, thresholds):
    batch_thresholds = thresholds[batch_info['name'].lower() + '_batch_drift']
    threshold = batch_thresholds['one_dim_drift_kl_divergence']
    
    # Filter out zero values in datasets
    batch_df = incoming_batch[incoming_batch.iloc[:,column] != 0]
    train_df = training_data[training_data.iloc[:,column] != 0]
    
    # Filter out negative values in dataset, if mean is positive (= remove negative outliers)
    if batch_df.iloc[:,column].mean() > 0:
        batch_df = batch_df[batch_df.iloc[:,column] > 0]
        
    batch_df = batch_df.reset_index()
    train_df = train_df.reset_index()
    batch_df = batch_df.drop('index', axis=1)
    train_df = train_df.drop('index', axis=1)
    batch = batch_df.iloc[start_index:end_index,column]
    batch = batch.reset_index()
    batch = batch.drop('index', axis=1)
    batch = batch.iloc[:,0]
    train = train_df.iloc[:,column]
    
    divergence_score = kl_divergence(batch, train);
    
    if divergence_score > threshold:
        print('[DRIFT][{} Window]: KL-Divergence detected! ({} Divergence: {})\t(Index: {})'.format(batch_info['name'], batch_df.columns[column], divergence_score, end_index))        
        
    return divergence_score;
        

from scipy.stats import wasserstein_distance

def monitor_drift_wasserstein_distance(training_data, incoming_batch, start_index, end_index, batch_info, column, thresholds):
    batch_thresholds = thresholds[batch_info['name'].lower() + '_batch_drift']
    threshold = batch_thresholds['one_dim_drift_wasserstein']
    
    batch = incoming_batch.iloc[start_index:end_index, column]
    train = training_data.iloc[:, column]
    wasserstein_dist = wasserstein_distance(batch, train);
    
    if wasserstein_dist > threshold:
        print('[DRIFT][{} Window]: Wasserstein Distance detected! ({} Distance: {})\t(Index: {})'.format(batch_info['name'], incoming_batch.columns[column], wasserstein_dist, end_index))
        
    return wasserstein_dist;
        

import statistics

def monitor_drift_pairwise_correlations(training_data, incoming_batch, start_index, end_index, batch_info, input_columns, train_pairwise_correlations, thresholds, dash_metrics):
    batch_thresholds = thresholds[batch_info['name'].lower() + '_batch_drift']
    threshold = batch_thresholds['one_dim_drift_correlations']
    is_first_batch = False;
    if(incoming_batch.shape[0] == batch_info['size']):
        is_first_batch = True;
    
    batch_pairwise_correlations = get_pairwise_correlations(incoming_batch.iloc[start_index:end_index,:], input_columns);
    
    for column in input_columns:
        temp_correlations = [];
        for key in train_pairwise_correlations:
            temp_columns = key.split(" <> ");
            for metric in range(0,1):
                distance = abs((train_pairwise_correlations[key][metric] - batch_pairwise_correlations[key][metric]))
                if distance > abs((train_pairwise_correlations[key][metric] * threshold)):
                    print('[DRIFT][{} Window]: Pairwise Correlation Drift detected: {}!\t(Index: {})'.format(batch_info['name'], key, end_index))
            if train_data.columns[column] in temp_columns:
                temp_correlations.append(batch_pairwise_correlations[key][0])
        dash_metrics[train_data.columns[column]]["pairwise correlation"].append(statistics.mean(temp_correlations))
        if is_first_batch:
            dash_metrics[train_data.columns[column]]["pairwise correlation"].append(statistics.mean(temp_correlations))
        
    return dash_metrics;
                    
    
def prepare_incoming_batch(data, start_index, end_index):
    # Remove fare-bin values with less occurences than 3; Avoid error in split data set
    df = data.iloc[start_index:end_index,:]
    df = df[df.groupby('fare-bin')['fare-bin'].transform(len) > 3]
    return df
    
def monitor_feature_importances(training_data, incoming_batch, start_index, end_index, batch_info, input_columns, feature_importances, thresholds, dash_metrics):
    batch_thresholds = thresholds[batch_info['name'].lower() + '_batch_drift']
    threshold = batch_thresholds['one_dim_drift_feature_importances']
    is_first_batch = False;
    if(incoming_batch.shape[0] == batch_info['size']):
        is_first_batch = True;
    
    incoming_batch = prepare_incoming_batch(incoming_batch, start_index, end_index);
  
    rf = get_rf(incoming_batch, input_columns);
    batch_feature_importance_list = rf.feature_importances_
    train_feature_importance_list = feature_importances
    feature_count = len(batch_feature_importance_list)
    
    for index in range(0, feature_count):
        if batch_feature_importance_list[index] > (train_feature_importance_list[index] * (1 + threshold)):
            print('[DRIFT][{} Window]: Feature Importance Drift detected for {}!\t(Index: {})'.format(batch_info['name'], incoming_batch.columns[input_columns[index]], end_index))
            
        if batch_feature_importance_list[index] < (train_feature_importance_list[index] / (1 + threshold)):
            print('[DRIFT][{} Window]: Feature Importance Drift detected for {}!\t(Index: {})'.format(batch_info['name'], incoming_batch.columns[input_columns[index]], end_index))
            
        dash_metrics[train_data.columns[input_columns[index]]]['feature importances'].append(batch_feature_importance_list[index]);
        if is_first_batch:
            dash_metrics[train_data.columns[input_columns[index]]]['feature importances'].append(batch_feature_importance_list[index]);
    
    return dash_metrics;
            
            

def monitor_drift_shap(train_data_infos, incoming_batch, start_index, end_index, batch_info, column, shap, shap_index, thresholds):
    batch_thresholds = thresholds[batch_info['name'].lower() + '_batch_drift']
    threshold = batch_thresholds['one_dim_drift_shap']
    shap_mean = train_data_infos['shap']['mean'][shap_index]

    # SHAP Mean
    if shap[1][shap_index] < (shap_mean - (abs(shap_mean * threshold))):
        print('[DRIFT][{} Window]: Downwards Shap Drift detected for {}!\t(Index: {})'.format(batch_info['name'], train_data.columns[column], end_index));    
        print('batch: {}, train: {}'.format(shap[1][shap_index], (shap_mean - (shap_mean * threshold))))
        
    # SHAP Mean
    if shap[1][shap_index] > (shap_mean + (abs(shap_mean * threshold))):
        print('[DRIFT][{} Window]: Upwards Shap Drift detected for {}!\t(Index: {})'.format(batch_info['name'], train_data.columns[column], end_index));            
        print('batch: {}, train: {}'.format(shap[1][shap_index], (shap_mean - (shap_mean * threshold))))
        
    return shap[1][shap_index];

def monitor_drift_one_dimension(training_data, train_data_infos, incoming_batch, start_index, end_index, batch_info, thresholds, dash_metrics):
    input_columns = train_data_infos['input_columns']
    output_columns = train_data_infos['output_columns']
    columns_to_use = np.concatenate((output_columns, input_columns), axis=0)
    train_pairwise_correlations = train_data_infos['correlation_pairs']
    shap = get_shap_feature_importances(incoming_batch.iloc[start_index:end_index,:], train_data_infos['rf'], train_data_infos['input_columns'], thresholds)
    shap_index = 0
    is_first_batch = False;
    if(incoming_batch.shape[0] == batch_info['size']):
        is_first_batch = True;
    
    batch_metrics = []
    
    for column in columns_to_use:
        dash_metric_mean = monitor_drift_main_metrics(training_data, incoming_batch, start_index, end_index, batch_info, column, thresholds);
        dash_metrics[train_data.columns[column]]['mean'].append(dash_metric_mean)
        dash_metric_kl_divergence = monitor_drift_kl_divergence(training_data, incoming_batch, start_index, end_index, batch_info, column, thresholds);
        dash_metrics[train_data.columns[column]]['kl_divergence'].append(dash_metric_kl_divergence)
        dash_metric_wasserstein = monitor_drift_wasserstein_distance(training_data, incoming_batch, start_index, end_index, batch_info, column, thresholds);
        dash_metrics[train_data.columns[column]]['wasserstein'].append(dash_metric_wasserstein)
        
        if is_first_batch:
            dash_metrics[train_data.columns[column]]['mean'].append(dash_metric_mean)
            dash_metrics[train_data.columns[column]]['kl_divergence'].append(dash_metric_kl_divergence)
            dash_metrics[train_data.columns[column]]['wasserstein'].append(dash_metric_wasserstein)
        
        if column in input_columns:
            dash_metric_shap = monitor_drift_shap(train_data_infos, incoming_batch, start_index, end_index, batch_info, column, shap, shap_index, thresholds);
            dash_metrics[train_data.columns[column]]['shap'].append(dash_metric_shap);
            if is_first_batch:
                dash_metrics[train_data.columns[column]]['shap'].append(dash_metric_shap);
            shap_index = shap_index + 1
        
           
    dash_metrics = monitor_drift_pairwise_correlations(training_data, incoming_batch, start_index, end_index, batch_info, input_columns, train_pairwise_correlations, thresholds, dash_metrics);
    dash_metrics = monitor_feature_importances(training_data, incoming_batch, start_index, end_index, batch_info, input_columns, train_data_infos['rf_feature_importances'], thresholds, dash_metrics);
    return dash_metrics;
    
def monitor_drift_multi_dimensions(training_data, train_data_infos, incoming_batch, start_index, end_index, batch_info, thresholds, dash_metrics):
    input_columns = train_data_infos['input_columns'];
    output_columns = train_data_infos['output_columns'];
    pca_model = train_data_infos['pca_model'];
    threshold = thresholds['multi_dim_outlier']
    is_first_batch = False;
    if(incoming_batch.shape[0] == batch_info['size']):
        is_first_batch = True;
    
    incoming_batch_filtered = get_data_filtered(incoming_batch, input_columns);
    incoming_batch_transformed = transform_pca(pca_model, incoming_batch_filtered);
    distances = calculate_euclidean_distances(incoming_batch_transformed, input_columns);
    distances_metrics = get_distance_metrics(distances, thresholds);
    
    if distances_metrics['mean'] > train_data_infos['distance_metrics']['mean'] * (1 + threshold):
        print('[DRIFT][{} Window]: Multi-Dimensional Upwards Mean Drift Detected! Distance: {}\t\t\t(Index: {})'.format(batch_info['name'], distances_metrics['maximum'], end_index));
        
    if distances_metrics['mean'] < train_data_infos['distance_metrics']['mean'] * (1 - threshold):
        print('[DRIFT][{} Window]: Multi-Dimensional Downwards Mean Drift Detected! Distance: {}\t\t\t(Index: {})'.format(batch_info['name'], distances_metrics['maximum'], end_index));
        
    if distances_metrics['first_quarter'] < train_data_infos['distance_metrics']['first_quarter'] * (1 - threshold):
        print('[DRIFT][{} Window]: Multi-Dimensional 1st Quarter Downwards Drift Detected! Distance: {}\t\t\t(Index: {})'.format(batch_info['name'], distances_metrics['first_quarter'], end_index));
        
    if distances_metrics['third_quarter'] > (train_data_infos['distance_metrics']['third_quarter'] * (1 + threshold)):
        print('batch: {}, train: {}, train+threshold: {}'.format(distances_metrics['third_quarter'], train_data_infos['distance_metrics']['third_quarter'], (train_data_infos['distance_metrics']['third_quarter'] * (1 - threshold))))
        print('[DRIFT][{} Window]: Multi-Dimensional 3rd Quarter Upwards Drift Detected! Distance: {}\t\t\t(Index: {})'.format(batch_info['name'], distances_metrics['third_quarter'], end_index));
        
    dash_metrics['multidimension euclidean distance'].append(distances_metrics['mean']);
    if is_first_batch:
        dash_metrics['multidimension euclidean distance'].append(distances_metrics['mean']);
    return dash_metrics;
        
        
def monitor_batch_drift(training_data, train_data_infos, incoming_batch, index, batch_info, thresholds, dash_metrics):
    batch_size = batch_info['size'];
    start_index = index - batch_size + 1;
    end_index = index;
    
    dash_metrics = monitor_drift_one_dimension(training_data, train_data_infos, incoming_batch, start_index, end_index, batch_info, thresholds, dash_metrics);
    return monitor_drift_multi_dimensions(training_data, train_data_infos, incoming_batch, start_index, end_index, batch_info, thresholds, dash_metrics);
    
def monitor_drift(training_data, train_data_infos, batch, step_sizes, index, row, batch_infos, thresholds, dash_metrics):
    # Add new sample to batch
    batch.loc[index] = row.values;
        
    if batch.shape[0] % step_sizes['small'] == 0 and batch.shape[0] > (step_sizes['small']):
        return monitor_batch_drift(training_data, train_data_infos, batch, index, batch_infos['small'], thresholds, dash_metrics);

    if batch.shape[0] % step_sizes['medium'] == 0:
        return monitor_batch_drift(training_data, train_data_infos, batch, index, batch_infos['medium'], thresholds, dash_metrics);

    if batch.shape[0] % step_sizes['large'] == 0:
        return monitor_batch_drift(training_data, train_data_infos, batch, index, batch_infos['large'], thresholds, dash_metrics);
    
    return -1;

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

def get_rf(data, columns):
    # Split data
    X_train, X_valid, y_train, y_valid = train_test_split(data, np.array(data['fare_amount']), 
                                                          stratify = data['fare-bin'], test_size=0.33,
                                                          random_state = RSEED)

    # Create the random forest        
    rf = RandomForestRegressor(n_estimators = 20, max_depth = 20, max_features = None, oob_score = True, 
                                bootstrap = True, verbose = 1, n_jobs = -1)

    # Train random forest
    column_list = []
    for column in columns:
        column_list.append(train_data.columns[column])
        
    rf.fit(X_train[column_list], y_train)
    
    return rf

In [7]:
# SAVE RF
'''
from sklearn.externals import joblib

input_columns = [3, 4, 5, 6, 7, 12, 13, 14]
rf = get_rf(train_data, input_columns)

filename = 'rf.sav'
joblib.dump(rf, filename);
'''

"\nfrom sklearn.externals import joblib\n\ninput_columns = [3, 4, 5, 6, 7, 12, 13, 14]\nrf = get_rf(train_data, input_columns)\n\nfilename = 'rf.sav'\njoblib.dump(rf, filename);\n"

In [27]:
def get_avg_samples_per_day(data):
    # TODO: More elegant solution possible?
    df = pd.DataFrame();
    df['pickup_datetime'] = train_data['pickup_datetime']
    df['count'] = 1
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    df.index = df['pickup_datetime'] 
    df = df.resample('D').sum()
    df = df[df['count'] != 0]
    return df['count'].mean()

def get_batch_infos(data):
    batch_size_day = get_avg_samples_per_day(data);
    batch_size_week = batch_size_day * 7;
    batch_size_month = batch_size_day * 30;
    
    batch_sizes = {
        'small': {'size': int(batch_size_day), 'name': 'Small'},
        'medium': {'size': int(batch_size_week), 'name': 'Medium'},
        'large': {'size': int(batch_size_month), 'name': 'Large'},        
    }
    
    return batch_sizes;

def get_step_sizes(batch_infos):
    step_sizes = {
        'small': (batch_infos['small']['size'] / 2),
        'medium': (batch_infos['medium']['size'] / 2),
        'large': (batch_infos['large']['size'] / 2) 
    }
    return step_sizes;

def get_data_filtered(data, columns_to_use):
    data_filtered = pd.DataFrame()
    i = 0

    # Filter Input Columns
    for column in columns_to_use:
        data_filtered.insert(i, data.columns[column], data.iloc[:,column])
        i = i + 1
        
    return data_filtered;

def apply_pca(train_data_filtered):
    pca_model = get_pca_model(train_data_filtered);
    return pca_model, transform_pca(pca_model, train_data_filtered);

def get_pca_model(train_data_filtered):
    from sklearn.decomposition import PCA
    
    pca = PCA(whiten=True)
    return pca.fit(train_data_filtered)

def transform_pca(pca_model, train_data_filtered):
    from sklearn.decomposition import PCA
    
    return pca_model.transform(train_data_filtered)

def get_zero_vector(columns_to_use):
    number_of_dimensions = len(columns_to_use)
    return np.zeros((number_of_dimensions)*1)

def calculate_euclidean_distance(sample, zero_vector):
    from scipy.spatial import distance
    
    return distance.euclidean(sample, zero_vector)

def calculate_euclidean_distances(transformed_data, columns_to_use):
    from scipy.spatial import distance
    
    zero_vector = get_zero_vector(columns_to_use);
    
    index = 0
    distances = np.ndarray(shape=(np.size(transformed_data,0),1))

    # Create distances to zero vector
    for sample in transformed_data:
        distances[index] = calculate_euclidean_distance(transformed_data[index], zero_vector);
        index = index + 1
        
    return distances;

def get_pairwise_correlations(data, input_columns):
    
    from scipy.stats.stats import pearsonr
    import itertools
    
    pairwise_correlations = {}

    for column_a, column_b in itertools.combinations(input_columns, 2):
        correlation = pearsonr(data.iloc[:,column_a], data.iloc[:,column_b])
        pairwise_correlations['{} <> {}'.format(data.columns[column_a], data.columns[column_b])] = correlation
        
    return pairwise_correlations

def get_shap_feature_importances(data, rf, input_columns, thresholds):
    import shap
    raw_threshold = thresholds['one_dim_outlier']
    threshold =     raw_threshold * 1000

    column_list = []
    for column in input_columns:
        column_list.append(train_data.columns[column])

    explainer = shap.TreeExplainer(rf, data=None);
    shap_values = explainer.shap_values(data[column_list]);

    shap_values_lower_quantile = []
    shap_values_mean = []
    shap_values_upper_quantile = []

    for column in range(0, len(input_columns)):    
        shap_values_lower_quantile.append(np.percentile(shap_values[:,column], threshold))
        shap_values_mean.append(shap_values[:,column].mean())
        shap_values_upper_quantile.append(np.percentile(shap_values[:,column], (100 - threshold)))
        
    shap = [shap_values_lower_quantile, shap_values_mean, shap_values_upper_quantile]
    return shap
    

def get_distance_metrics(distances, thresholds):
    
    raw_threshold = thresholds['multi_dim_outlier']
    threshold = raw_threshold * 100
    
    distance_metrics = {
        'minimum': distances.min(),
        'lower_threshold_percentile': np.percentile(distances, threshold),
        'first_quarter': np.percentile(distances, 25),
        'mean': distances.mean(),
        'third_quarter': np.percentile(distances, 75),
        'upper_threshold_percentile': np.percentile(distances, (100 - threshold)),
        'maximum': distances.max()
    }    
        
    return distance_metrics
    

def get_train_data_infos(distances, correlation_pairs, shap, rf, input_columns, output_columns, pca_model, thresholds):
    
    train_data_infos = {
        'input_columns': input_columns,
        'output_columns': output_columns,
        'pca_model': pca_model,
        'rf': rf,
        'rf_feature_importances': rf.feature_importances_,
        'shap': {
            'lower_threshold_percentile': shap[0],
            'mean': shap[1],
            'upper_threshold_percentile': shap[2]
        },
        'correlation_pairs': correlation_pairs,
        'distance_metrics': get_distance_metrics(distances, thresholds)
    }
    
    return train_data_infos;

def generate_train_data_infos(train_data, input_columns, output_columns, thresholds):
    # +++ REMOVE: Only for testing purposes +++    
    from sklearn.externals import joblib
    rf = joblib.load('rf.sav')
    test = {
        'input_columns': input_columns,
        'output_columns': output_columns,
        'pca_model': get_pca_model(get_data_filtered(train_data, input_columns)),
        'rf': rf,
        'rf_feature_importances': rf.feature_importances_,
        'shap': {
            'lower_threshold_percentile': [-2.528580313995101, -1.7864965686783245, -1.5736475393454952, -3.1326455478959168, -0.6049921796435059, -1.4670753181848482, -10.497608109368514, -1.170986733676531],
            'mean': [0.05859532547100909, 0.1150723753541944, 0.2375574423112044, -0.21863576089415065, 0.0049141129696212635, 0.18305498308684381, -0.6148760801903526, 0.16000324432733556],
            'upper_threshold_percentile': [7.925407442096883, 4.635143181597266, 19.453888955544727, 4.732747057367445, 0.4361906308296687, 6.485159131158373, 40.0529875893241, 6.800010400410237]
        },
        'correlation_pairs': get_pairwise_correlations(train_data, input_columns),
        'distance_metrics': {
            'minimum': 0.7249817600534729, 
            'lower_threshold_percentile': 0.7550153949050724,
            'first_quarter': 1.0764902427267036,
            'mean': 1.922411839793231,
            'third_quarter': 2.200820905397247,
            'upper_threshold_percentile': 10.6967040765229, 
            'maximum': 58.320576749831
        }
    }
    return test
    # +++ REMOVE: Only for testing purposes +++

    train_data_filtered = get_data_filtered(train_data, input_columns);
    pca_model, transformed_data = apply_pca(train_data_filtered);
    distances = calculate_euclidean_distances(transformed_data, input_columns);
    correlation_pairs = get_pairwise_correlations(train_data, input_columns);
    rf = get_rf(train_data, input_columns);
    shap = get_shap_feature_importances(train_data, rf, input_columns, thresholds);
    
    return get_train_data_infos(distances, correlation_pairs, shap, rf, input_columns, output_columns, pca_model, thresholds);  

In [28]:
def monitor_outliers_one_dimension(train_data, train_data_infos, incoming_sample, index, thresholds):
    
    threshold = thresholds['one_dim_outlier']
    input_columns = train_data_infos['input_columns']
    output_columns = train_data_infos['output_columns']
    columns_to_use = np.concatenate((output_columns, input_columns), axis=0) 
    shap = get_shap_feature_importances(pd.DataFrame(incoming_sample).transpose(), train_data_infos['rf'], input_columns, thresholds)
    shap_index = 0
    
    for column in columns_to_use:
            
            if incoming_sample[column] > train_data.iloc[:,column].max():
                print('[OUTLIER] MAX Outlier Detected! {}: {}\t\t\t(Index: {})'.format(train_data.columns[column], incoming_sample[column], index));

            elif incoming_sample[column] > train_data.iloc[:,column].quantile(1 - threshold):
                print('[POT. OUTLIER] Potential MAX Outlier Detected! {}: {}\t\t\t(Index: {})'.format(train_data.columns[column], incoming_sample[column], index));

            if incoming_sample[column] < train_data.iloc[:,column].min():
                print('[OUTLIER] MIN Outlier Detected! {}: {}\t\t\t(Index: {})'.format(train_data.columns[column], incoming_sample[column], index));
                
            elif incoming_sample[column] < train_data.iloc[:,column].quantile(threshold):
                print('[POT. OUTLIER] Potential MIN Outlier Detected! {}: {}\t\t\t(Index: {})'.format(train_data.columns[column], incoming_sample[column], index));
            
            
            if column in input_columns:
            
                # SHAP Lower Percentile
                if shap[0][shap_index] < train_data_infos['shap']['lower_threshold_percentile'][shap_index]:
                    print('[POT. MIN OUTLIER] Potential Shap Outlier Detected! {}\t\t\t(Index: {})'.format(train_data.columns[column], index));

                # SHAP Upper Percentile
                if shap[2][shap_index] > train_data_infos['shap']['upper_threshold_percentile'][shap_index]:
                    print('[POT. MAX OUTLIER] Potential Shap Outlier Detected! {}\t\t\t(Index: {})'.format(train_data.columns[column], index));

                shap_index = shap_index + 1
            

    
def monitor_outliers_multi_dimensions(train_data, train_data_infos, incoming_sample, index):
    incoming_sample_df = pd.DataFrame(incoming_sample).transpose();
    incoming_sample_df_filtered = get_data_filtered(incoming_sample_df, train_data_infos['input_columns']);
    incoming_sample_df_transformed = transform_pca(train_data_infos['pca_model'], incoming_sample_df_filtered);
    distance = calculate_euclidean_distance(incoming_sample_df_transformed, get_zero_vector(train_data_infos['input_columns']));

    if distance > train_data_infos['distance_metrics']['maximum']:
        print('[OUTLIER] Multi-Dimensional Outlier Detected! Distance: {}\t\t\t(Index: {})'.format(distance, index));
        
    elif distance > train_data_infos['distance_metrics']['upper_threshold_percentile']:
        print('[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: {}\t\t\t(Index: {})'.format(distance, index));

def monitor_outliers(train_data, train_data_infos, incoming_sample, index, thresholds):
    monitor_outliers_one_dimension(train_data, train_data_infos, incoming_sample, index, thresholds);
    monitor_outliers_multi_dimensions(train_data, train_data_infos, incoming_sample, index);

In [29]:
# Define desired thresholds

def get_thresholds():
    thresholds = {
        'one_dim_outlier': 0.001,
        'multi_dim_outlier': 0.5,
        'small_batch_drift': {
            'one_dim_drift_metric': 0.3,
            'one_dim_drift_kl_divergence': 6500,
            'one_dim_drift_wasserstein': 1.2,
            'one_dim_drift_correlations': 50,
            'one_dim_drift_feature_importances': 50,
            'one_dim_drift_shap': 5
        },
        'medium_batch_drift': {
            'one_dim_drift_metric': 0.1,
            'one_dim_drift_kl_divergence': 1000,
            'one_dim_drift_wasserstein': 0.25,
            'one_dim_drift_correlations': 20,
            'one_dim_drift_feature_importances': 20,
            'one_dim_drift_shap': 3          
        },
        'large_batch_drift': {
            'one_dim_drift_metric': 0.05,
            'one_dim_drift_kl_divergence': 500,
            'one_dim_drift_wasserstein': 0.2,
            'one_dim_drift_correlations': 15,
            'one_dim_drift_feature_importances': 15,
            'one_dim_drift_shap': 1           
        }
    }
    
    return thresholds;

In [30]:
# Needs to be configured individually

def prepare_data(data):
    prepared_data = data.copy()
    prepared_data['fare-bin'] = data.iloc[:,8].astype("category").cat.codes
    
    return prepared_data

In [31]:
def get_dict():
    return {
        "mean": [],
        "kl_divergence": [],
        "wasserstein": [],
        "shap": [],
        "feature importances": [],
        "pairwise correlation": []
    }

def get_dashboard_metrics(input_columns, output_columns):
    features_for_dashboard = {}
    all_columns = input_columns + output_columns
    
    for index in all_columns:
        features_for_dashboard[train_data.columns[index]] = get_dict()
        
    features_for_dashboard['multidimension euclidean distance'] = []
    
    return features_for_dashboard;

In [38]:
def monitor(train_data, incoming_data):
    # START: DEFINED BY USER
    input_columns = [3, 4, 5, 6, 7, 12, 13, 14]
    output_columns = [1, 8]
    dashboard_mode = False
    # END: DEFINED BY USER
    train_data = prepare_data(train_data);
    incoming_data = prepare_data(incoming_data);
    thresholds = get_thresholds();
    batch = pd.DataFrame(columns=incoming_data.columns);
    batch_infos = get_batch_infos(train_data);
    train_data_infos = generate_train_data_infos(train_data, input_columns, output_columns, thresholds);
    step_sizes = get_step_sizes(batch_infos);
    dash_metrics_raw = get_dashboard_metrics(input_columns, output_columns);
    dashboard_indices = [1]
    
    for index, row in incoming_data.iterrows():
        if index % 1000 == 0:
            print('------------------------ INDEX: {} ------------------------'.format(index))
        monitor_outliers(train_data, train_data_infos, row, index, thresholds);
        dash_metrics = monitor_drift(train_data, train_data_infos, batch, step_sizes, index, row, batch_infos, thresholds, dash_metrics_raw);
        
        if dashboard_mode and dash_metrics != -1:
            dashboard_indices.append(index)
            print_dashboard(dashboard_indices, dash_metrics, input_columns, output_columns);

In [39]:
from IPython.display import clear_output
import matplotlib
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'


def print_dashboard(dashboard_indices, dash_metrics, input_columns, output_columns):    
    total_columns = input_columns + output_columns
    clear_output(wait=True)
    

    for column in total_columns:
        fig = plt.figure(figsize=(8, 6))
        fig.suptitle(train_data.columns[column], fontsize=16)        
        plt.plot(dashboard_indices,dash_metrics[train_data.columns[column]]['mean'], label="mean")
        plt.plot(dashboard_indices,dash_metrics['pickup_longitude']['kl_divergence'], label="kl_divergence")
        plt.plot(dashboard_indices,dash_metrics['pickup_longitude']['wasserstein'], label="wasserstein")
        plt.plot(dashboard_indices,dash_metrics['pickup_longitude']['shap'], label="shap")
        plt.plot(dashboard_indices,dash_metrics['pickup_longitude']['feature importances'], label="feature importances")
        plt.plot(dashboard_indices,dash_metrics['pickup_longitude']['pairwise correlation'], label="pairwise correlation")
        plt.legend(loc='upper left')
        # if len(dashboard_indices) > 5:
        #     plt.savefig('Dashboard-Charts/chart_column_{}.png'.format(column), dpi=300, transparent=True)

    fig = plt.figure(figsize=(8, 6))
    fig.suptitle('multidimensional', fontsize=16)        
    plt.plot(dashboard_indices,dash_metrics['multidimension euclidean distance'], label="euclidean distance")    
    plt.legend(loc='upper left')
    
    # if len(dashboard_indices) > 5:
    #     plt.savefig('Dashboard-Charts/chart_multidim.png', dpi=300, transparent=True)
        
    plt.show()

In [40]:
monitor(train_data, location_drift);

------------------------ INDEX: 0 ------------------------
[POT. OUTLIER] Potential MAX Outlier Detected! dropoff_latitude: 40.887882232666016			(Index: 4)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_latitude			(Index: 115)
[POT. MAX OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 120)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 120)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! haversine			(Index: 305)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.36305999755859375			(Index: 343)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.2629007645951818			(Index: 343)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 27.40050231658701			(Index: 343)
[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: 12.049343616294971			(Index: 343)
[POT. MAX OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 344)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_l

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: 11.937218647778947			(Index: 621)
[POT. OUTLIER] Potential MAX Outlier Detected! dropoff_longitude: -73.62210845947266			(Index: 631)
[POT. OUTLIER] Potential MAX Outlier Detected! dropoff_latitude: 41.02389526367188			(Index: 631)
[POT. MAX OUTLIER] Potential Shap Outlier Detected! dropoff_latitude			(Index: 631)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.495525360107429			(Index: 631)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.3504990597107201			(Index: 631)
[POT. MAX OUTLIER] Potential Shap Outlier Detected! euclidean			(Index: 631)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 34.81891090248362			(Index: 631)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! haversine			(Index: 631)
[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: 16.13067721846217			(Index: 631)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_latitude			(Index: 68

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


[POT. OUTLIER] Potential MIN Outlier Detected! dropoff_longitude: -74.18255615234375			(Index: 965)
[POT. OUTLIER] Potential MIN Outlier Detected! dropoff_longitude: -74.18158721923827			(Index: 997)
------------------------ INDEX: 1000 ------------------------
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 1212)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


[POT. MAX OUTLIER] Potential Shap Outlier Detected! passenger_count			(Index: 1677)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.34459304809571023			(Index: 1706)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.25502846339118684			(Index: 1706)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 27.038184350321245			(Index: 1706)
[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: 12.91668895448385			(Index: 1706)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_latitude			(Index: 1722)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 1765)
[DRIFT][Small Window]: Pairwise Correlation Drift detected: pickup_longitude <> passenger_count!	(Index: 1823)
[DRIFT][Small Window]: Pairwise Correlation Drift detected: pickup_longitude <> passenger_count!	(Index: 1823)
[DRIFT][Small Window]: Pairwise Correlation Drift detected: pickup_longitude <> passenger_count!	(Index: 1823)
[DRIFT][Small Window]: Pai

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


[POT. MIN OUTLIER] Potential Shap Outlier Detected! pickup_latitude			(Index: 1929)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -73.71955108642578			(Index: 1938)
[POT. MAX OUTLIER] Potential Shap Outlier Detected! pickup_longitude			(Index: 1938)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! pickup_latitude			(Index: 1938)
[POT. OUTLIER] Potential MAX Outlier Detected! dropoff_longitude: -73.7195587158203			(Index: 1938)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_latitude			(Index: 1938)
[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: 11.670874228277823			(Index: 1938)
------------------------ INDEX: 2000 ------------------------
[POT. MAX OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2012)
[POT. MAX OUTLIER] Potential Shap Outlier Detected! euclidean			(Index: 2012)
[POT. OUTLIER] Potential MIN Outlier Detected! dropoff_longitude: -74.17822265625			(Index: 2039)
[POT. MAX OUTLIER] Potential Sha

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_latitude			(Index: 2258)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_latitude			(Index: 2261)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! haversine			(Index: 2276)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_latitude: 41.49712980324225			(Index: 2282)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.7928870969434172			(Index: 2282)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2282)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.7791178495648801			(Index: 2282)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 86.7231781569875			(Index: 2282)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 69.46614602542712			(Index: 2282)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -71.85166639941991			(Index: 2283)
[OUTLIER] MAX Outlier Detected! manhattan: 2.0238218818300884			(Index: 2283)
[POT. MIN OUTLIER] Potential Shap Outlier Dete

[OUTLIER] MAX Outlier Detected! pickup_longitude: -72.54687045595792			(Index: 2308)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2308)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 1.449139370702241			(Index: 2308)
[OUTLIER] MAX Outlier Detected! euclidean: 1.4314732627426998			(Index: 2308)
[OUTLIER] MAX Outlier Detected! haversine: 120.74725030745633			(Index: 2308)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 85.51790767107238			(Index: 2308)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -73.46215669246794			(Index: 2309)
[OUTLIER] MAX Outlier Detected! pickup_latitude: 42.507103077476685			(Index: 2309)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2309)
[OUTLIER] MAX Outlier Detected! manhattan: 2.2665040021962426			(Index: 2309)
[OUTLIER] MAX Outlier Detected! euclidean: 1.8242335598950297			(Index: 2309)
[OUTLIER] MAX Outlier Detected! haversine: 199.4340297167287			(I

[OUTLIER] MAX Outlier Detected! manhattan: 3.522059782447009			(Index: 2331)
[OUTLIER] MAX Outlier Detected! euclidean: 2.6463923134245295			(Index: 2331)
[OUTLIER] MAX Outlier Detected! haversine: 282.3660661672968			(Index: 2331)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 155.61682708867727			(Index: 2331)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -71.98608359466806			(Index: 2333)
[OUTLIER] MAX Outlier Detected! pickup_latitude: 42.478551877710345			(Index: 2333)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2333)
[OUTLIER] MAX Outlier Detected! manhattan: 3.721891805991497			(Index: 2333)
[OUTLIER] MAX Outlier Detected! euclidean: 2.6385962506642873			(Index: 2333)
[OUTLIER] MAX Outlier Detected! haversine: 253.98995614812983			(Index: 2333)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 125.22877091879417			(Index: 2333)
[OUTLIER] MAX Outlier Detected! pickup_latitude: 42.42183860925469			(Index: 2334)
[OUTLIER] MAX O

[OUTLIER] MAX Outlier Detected! manhattan: 2.3361934006818856			(Index: 2362)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2362)
[OUTLIER] MAX Outlier Detected! euclidean: 2.317981667730704			(Index: 2362)
[OUTLIER] MAX Outlier Detected! haversine: 195.4414502696886			(Index: 2362)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 140.25660447981153			(Index: 2362)
[OUTLIER] MAX Outlier Detected! pickup_latitude: 41.71944653462013			(Index: 2363)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 1.226480836377931			(Index: 2363)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 1.0285926999651325			(Index: 2363)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 113.3342835841465			(Index: 2363)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 72.43462530558114			(Index: 2363)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -72.83540532728959			(Index: 2364)
[POT. OUTLIER] Potential MAX Outlier Detected! pi

[POT. MIN OUTLIER] Potential Shap Outlier Detected! haversine			(Index: 2395)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 119.97086148862655			(Index: 2395)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -73.1314711599721			(Index: 2396)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2396)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.8697113961802287			(Index: 2396)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2396)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.8486105764115773			(Index: 2396)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 71.57294164250816			(Index: 2396)
[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: 49.21779943315274			(Index: 2396)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -72.42812421189201			(Index: 2399)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2399)
[POT. OUTLIER]

[OUTLIER] MAX Outlier Detected! pickup_longitude: -71.73194172579346			(Index: 2420)
[OUTLIER] MAX Outlier Detected! pickup_latitude: 42.543682257865186			(Index: 2420)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2420)
[OUTLIER] MAX Outlier Detected! manhattan: 3.9918133164955516			(Index: 2420)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2420)
[OUTLIER] MAX Outlier Detected! euclidean: 2.8404545049832297			(Index: 2420)
[OUTLIER] MAX Outlier Detected! haversine: 270.1338160947235			(Index: 2420)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 132.16054404434055			(Index: 2420)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -72.5969668861043			(Index: 2422)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2422)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 1.3813961510050774			(Index: 2422)
[OUTLIER] MAX Outlier Detected! euclidean: 1.375845564229073			(Index: 2422)
[PO

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


[DRIFT][Small Window]: Multi-Dimensional Upwards Mean Drift Detected! Distance: 188.13250274722634			(Index: 2431)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -73.1409982496997			(Index: 2432)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2432)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.8310629075268565			(Index: 2432)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2432)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.8289598690925567			(Index: 2432)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 69.9009155729013			(Index: 2432)
[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: 50.55777673450235			(Index: 2432)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -73.20185337620562			(Index: 2437)
[OUTLIER] MAX Outlier Detected! pickup_latitude: 43.007028692878436			(Index: 2437)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! drop

[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.6761088067844909			(Index: 2461)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2461)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.673075359720357			(Index: 2461)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 56.75632507943496			(Index: 2461)
[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: 40.83323572958281			(Index: 2461)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -72.61678477680688			(Index: 2463)
[OUTLIER] MAX Outlier Detected! pickup_latitude: 42.215011005751016			(Index: 2463)
[POT. OUTLIER] Potential MAX Outlier Detected! dropoff_longitude: -73.29459079250223			(Index: 2463)
[OUTLIER] MAX Outlier Detected! manhattan: 2.1678490649034003			(Index: 2463)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2463)
[OUTLIER] MAX Outlier Detected! euclidean: 1.6369634337412748			(Index: 2463)
[OUTLIER] MAX Outlier Detected! have

[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -73.29739501414026			(Index: 2485)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2485)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.6861788994339548			(Index: 2485)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2485)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.6692775464516315			(Index: 2485)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 56.44781635608521			(Index: 2485)
[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: 38.78476657321889			(Index: 2485)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -72.37607519720734			(Index: 2487)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2487)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 1.6144757976657047			(Index: 2487)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2487)
[OUTLIER] M

[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 1.2877726611976144			(Index: 2523)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2523)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 1.1826530218331734			(Index: 2523)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 99.93139277362951			(Index: 2523)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! haversine			(Index: 2523)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 59.87398660793027			(Index: 2523)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -73.60919388776041			(Index: 2525)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! pickup_longitude			(Index: 2525)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2525)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.3710726832845026			(Index: 2525)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.3697665571284507			(Index: 2525)
[POT. OUTLIER] Pote

[POT. OUTLIER] Potential MAX Outlier Detected! dropoff_longitude: -72.82656734057632			(Index: 2542)
[POT. MAX OUTLIER] Potential Shap Outlier Detected! passenger_count			(Index: 2542)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 1.1352475397947757			(Index: 2542)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2542)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 1.1279243620157935			(Index: 2542)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 95.07944001653769			(Index: 2542)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! haversine			(Index: 2542)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 69.0494775503204			(Index: 2542)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -71.65307766519024			(Index: 2543)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2543)
[OUTLIER] MAX Outlier Detected! manhattan: 2.332960542817581			(Index: 2543)
[OUTLIER] MAX Outlier Detected! euclidean: 2

[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -73.00975268711004			(Index: 2567)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2567)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.9799781478509004			(Index: 2567)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2567)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.9667038643114804			(Index: 2567)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 81.48118060880769			(Index: 2567)
[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: 57.76062640270188			(Index: 2567)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -72.6251750688337			(Index: 2568)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2568)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 1.397110392592083			(Index: 2568)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2568)
[OUTLIER] MAX

[OUTLIER] MAX Outlier Detected! manhattan: 2.948286447215665			(Index: 2597)
[OUTLIER] MAX Outlier Detected! euclidean: 2.487292304621835			(Index: 2597)
[OUTLIER] MAX Outlier Detected! haversine: 274.2140668588468			(Index: 2597)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 178.4243215244439			(Index: 2597)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -73.76509859380909			(Index: 2598)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! pickup_longitude			(Index: 2598)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2598)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -72.89206660766395			(Index: 2599)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2599)
[POT. MAX OUTLIER] Potential Shap Outlier Detected! passenger_count			(Index: 2599)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 1.087902416994261			(Index: 2599)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! ma

[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -73.3253827531568			(Index: 2623)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2623)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.6508898298510033			(Index: 2623)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2623)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.6492287522674947			(Index: 2623)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 54.76149680732109			(Index: 2623)
[POT. OUTLIER] Potential Multi-Dimensional Outlier Detected! Distance: 39.463643304561096			(Index: 2623)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -72.27592609241512			(Index: 2624)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_latitude: 41.51287069644639			(Index: 2624)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2624)
[OUTLIER] MAX Outlier Detected! manhattan: 2.507704186550804			(Index: 2624)
[OUTLIE

[POT. MIN OUTLIER] Potential Shap Outlier Detected! haversine			(Index: 2640)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 74.90150936641136			(Index: 2640)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -71.98553591187263			(Index: 2641)
[OUTLIER] MAX Outlier Detected! manhattan: 1.9967867809984623			(Index: 2641)
[OUTLIER] MAX Outlier Detected! euclidean: 1.9896584828971948			(Index: 2641)
[OUTLIER] MAX Outlier Detected! haversine: 167.75565050138243			(Index: 2641)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 121.53624784704607			(Index: 2641)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -72.90606863163502			(Index: 2642)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2642)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 1.0751630578181022			(Index: 2642)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2642)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 1.06339

[OUTLIER] MAX Outlier Detected! manhattan: 2.6640263799881225			(Index: 2661)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2661)
[OUTLIER] MAX Outlier Detected! euclidean: 2.053337193794523			(Index: 2661)
[OUTLIER] MAX Outlier Detected! haversine: 221.64263197118592			(Index: 2661)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 125.67248239573652			(Index: 2661)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! pickup_latitude			(Index: 2662)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_longitude: -73.24434966286756			(Index: 2663)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2663)
[POT. OUTLIER] Potential MAX Outlier Detected! manhattan: 0.7597358779039354			(Index: 2663)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2663)
[POT. OUTLIER] Potential MAX Outlier Detected! euclidean: 0.7464401675361146			(Index: 2663)
[POT. OUTLIER] Potential MAX Outlier Detected! haversine: 62.983

[OUTLIER] MAX Outlier Detected! manhattan: 4.6479569830713245			(Index: 2684)
[OUTLIER] MAX Outlier Detected! euclidean: 3.2926027839078564			(Index: 2684)
[OUTLIER] MAX Outlier Detected! haversine: 328.48238686762335			(Index: 2684)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 166.72814043898418			(Index: 2684)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -72.35266423021262			(Index: 2685)
[OUTLIER] MAX Outlier Detected! pickup_latitude: 42.725741844797966			(Index: 2685)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2685)
[OUTLIER] MAX Outlier Detected! manhattan: 3.635349495688857			(Index: 2685)
[OUTLIER] MAX Outlier Detected! euclidean: 2.5810432876950733			(Index: 2685)
[OUTLIER] MAX Outlier Detected! haversine: 259.86410245687506			(Index: 2685)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 132.6926771631945			(Index: 2685)
[POT. OUTLIER] Potential MAX Outlier Detected! pickup_latitude: 40.866146102937776			(Index: 2686

[OUTLIER] MAX Outlier Detected! manhattan: 3.669844131482485			(Index: 2707)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2707)
[OUTLIER] MAX Outlier Detected! euclidean: 2.620438081232053			(Index: 2707)
[OUTLIER] MAX Outlier Detected! haversine: 247.46130520731097			(Index: 2707)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 119.05534579989914			(Index: 2707)
[OUTLIER] MAX Outlier Detected! pickup_longitude: -72.15552887683631			(Index: 2708)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! dropoff_longitude			(Index: 2708)
[OUTLIER] MAX Outlier Detected! manhattan: 1.8035317921089769			(Index: 2708)
[POT. MIN OUTLIER] Potential Shap Outlier Detected! manhattan			(Index: 2708)
[OUTLIER] MAX Outlier Detected! euclidean: 1.8022047663881684			(Index: 2708)
[OUTLIER] MAX Outlier Detected! haversine: 151.940304904377			(Index: 2708)
[OUTLIER] Multi-Dimensional Outlier Detected! Distance: 110.77258727022327			(Index: 2708)
[OUTLIER] MAX Outlier Detec

KeyboardInterrupt: 