In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Read Input Data

In [None]:
# Placeholder: load the raw dataset into a pandas DataFrame here.
# NOTE(review): presumably this is the frame later handed to prepare_data_potassium — confirm.
df_batch = None

# Prepare Data

In [None]:
def prepare_data_potassium(df):
  """Build the modelling frame for potassium prediction.

  The input frame is never modified: all filtering happens on copies, so the
  function can be re-run on the same raw frame and always gives the same result.

  Steps:
    1. Drop rows whose ``potassium`` is missing or below 0.4.
    2. Derive a seasonal ``month`` feature from ``production_date``
       (March = 1, April = 2, ..., February = 12).
    3. Drop ``production_run`` / ``production_pallet`` and then every row that
       still has a missing value in any remaining column.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``potassium``, a datetime-typed ``production_date``
      (the ``.dt`` accessor is used on it), ``production_run``,
      ``production_pallet`` and the feature columns ``lactose``, ``protein``,
      ``KCl``, ``fat`` and ``milk_flow``.

  Returns
  -------
  pandas.DataFrame
      Columns ``lactose, month, protein, KCl, fat, milk_flow, potassium``
      on a fresh 0..n-1 index.
  """
  features = ["lactose", "month", "protein", "KCl", "fat", "milk_flow"]

  # Filter with a boolean mask (not label-based drop) so rows are selected by
  # their own value even if the incoming index contains duplicate labels.
  data = df.dropna(subset=["potassium"])
  data = data[~(data["potassium"] < 0.4)].copy()

  # Seasonal month: shift the calendar so that March = 1 ... February = 12.
  data["month"] = (data["production_date"].dt.month - 3) % 12 + 1

  # Clean up: remove bookkeeping columns, then any row with a remaining NaN.
  data = (
      data
      .drop(columns=["production_run", "production_pallet"])
      .dropna(how="any")
      .reset_index(drop=True)
  )

  return data[features + ["potassium"]]

In [None]:
# Distribution of the potassium target, shown as per-bin probability with a KDE overlay.
# NOTE(review): `pos_df` is not defined in any visible cell — presumably the
# output of prepare_data_potassium(...); confirm and define it before this cell.
fig, ax = plt.subplots(figsize=(14, 4))
sns.histplot(data=pos_df, x="potassium", kde=True, stat='probability', ax=ax)
plt.show()

# Model

In [None]:
def mround(x, base=5):
    """Round ``x`` to the nearest multiple of ``base``.

    Uses the built-in ``round``, so exact halves follow Python's
    round-half-to-even rule (e.g. 12.5 with base 5 gives 10).
    """
    nearest_multiple = round(x / base)
    return base * nearest_multiple

def accuracy(y_pred, y_true, acc_err):
    """Return the fraction of predictions whose absolute error exceeds ``acc_err``.

    Despite the name, the returned value is ``1 - hit_rate``: 0.0 means every
    prediction lies within the tolerance, 1.0 means none does.
    NOTE(review): presumably intended as a loss to minimise — confirm with callers.

    ``y_pred`` and ``y_true`` must support element-wise arithmetic
    (e.g. numpy arrays).
    """
    exceeds_tolerance = (abs(y_pred - y_true) > acc_err) * 1
    hit_rate = sum(1 - exceeds_tolerance) / len(y_true)
    return 1 - hit_rate

def accuracy_CI(y_pred, y_true, acc_err):
    """Fraction of predictions that fall strictly inside the 95% CI of the measurement.

    The interval around each true value ``t`` is ``t ± 1.96 * acc_err * t``,
    i.e. ``acc_err`` acts as a relative error of the measurement.

    Parameters
    ----------
    y_pred : iterable of float
        Predicted values.
    y_true : array-like supporting element-wise arithmetic
        Measured values the intervals are built around.
    acc_err : float
        Relative measurement error.
    """
    half_width = 1.96 * acc_err * y_true

    # Pair each prediction with its own interval; both bounds are exclusive.
    inside = [
        (pred > low) & (pred < high)
        for pred, low, high in zip(y_pred, y_true - half_width, y_true + half_width)
    ]
    return sum(inside) / len(y_true)

# custom metric to measure the accuracy only on extreme samples
def extreme_mse(y_true, y_pred, extreme_low = 0.55, extreme_high = 0.65):
    """Mean squared error restricted to samples with an extreme true value.

    A sample counts as extreme when ``y_true <= extreme_low`` or
    ``y_true >= extreme_high``. Returns 0 when no sample qualifies.
    Both inputs must support boolean-mask indexing (e.g. numpy arrays).
    """
    is_extreme = (y_true <= extreme_low) | (y_true >= extreme_high)
    squared_errors = (y_true[is_extreme] - y_pred[is_extreme]) ** 2
    # Guard the empty selection explicitly instead of averaging nothing.
    return squared_errors.mean() if squared_errors.size else 0

In [None]:
def _plot_true_vs_pred(ax, y_true, y_pred, title):
    """Draw true values, predictions and their respective means on ``ax``."""
    n_points = len(y_true)
    sns.lineplot(ax=ax, x=y_true.index, y=y_true.values, color='royalblue')
    sns.lineplot(ax=ax, x=y_true.index, y=y_pred, color='orange')
    # Dashed horizontal lines: mean of the true values and mean of the predictions
    # for the SAME data split that is shown on this axis.
    sns.lineplot(ax=ax, x=y_true.index, y=[y_true.mean()] * n_points, color='royalblue', linestyle='--')
    sns.lineplot(ax=ax, x=y_true.index, y=[y_pred.mean()] * n_points, color='orange', linestyle='--')
    ax.set_title(title)
    ax.legend(['True Potassium', 'Prediction', 'True Mean', 'Prediction mean'])


def compute_score(y_train, y_pred_train, y_test, y_pred_test, acc_err=0.04, plots=False):
    """Print train/test metrics for a fitted regressor and optionally plot the fit.

    Each metric is reported for the training split, the test split and a
    baseline ("Avg-Test") that predicts the training mean for every test sample.

    Parameters
    ----------
    y_train, y_test : pandas.Series
        True target values (``.values`` and ``.index`` are used).
    y_pred_train, y_pred_test : numpy.ndarray
        Model predictions aligned position-wise with ``y_train`` / ``y_test``.
    acc_err : float, default 0.04
        Relative measurement error forwarded to ``accuracy_CI``.
    plots : bool, default False
        When True, draw true vs. predicted curves for both splits.

    Returns
    -------
    None
        Results are printed (and plotted), not returned.
    """
    # Baseline: always predict the mean of the training targets.
    avg_pred_test = np.full(len(y_test), y_train.mean())

    # create accuracy metric
    train_acc = accuracy_CI(y_pred_train, y_train, acc_err)
    test_acc = accuracy_CI(y_pred_test, y_test, acc_err)
    avg_acc = accuracy_CI(avg_pred_test, y_test, acc_err)

    print(f"\033[1mMedian Accuracy     - Train    = {train_acc}\033[0m")
    print(f"\033[1m                    - Test     = {test_acc}\033[0m")
    print(f"\033[1m                    - Avg-Test = {avg_acc}\033[0m")

    print(f'\033[1mExtreme MSE         - Train    = {extreme_mse(y_train.values, y_pred_train)}\033[0m')
    print(f'\033[1m                    - Test     = {extreme_mse(y_test.values, y_pred_test)}\033[0m')
    print(f'\033[1m                    - Avg-Test = {extreme_mse(y_test.values, avg_pred_test)}\033[0m')

    print(f'Mean Squared Error  - Train    = {mean_squared_error(y_train.values, y_pred_train)}')
    print(f'                    - Test     = {mean_squared_error(y_test.values, y_pred_test)}')
    print(f'                    - Avg-Test = {mean_squared_error(y_test.values, avg_pred_test)}')

    print(f'Max Error           - Train    = {abs(y_train.values - y_pred_train).max()}')
    print(f'                    - Test     = {abs(y_test.values - y_pred_test).max()}')
    print(f'                    - Avg-Test = {abs(y_test.values - avg_pred_test).max()}')

    print(f'Mean Absolute Error - Train    = {mean_absolute_error(y_train.values, y_pred_train)}')
    print(f'                    - Test     = {mean_absolute_error(y_test.values, y_pred_test)}')
    print(f'                    - Avg-Test = {mean_absolute_error(y_test.values, avg_pred_test)}')

    if plots:
        fig, (ax0, ax1) = plt.subplots(nrows=2, ncols=1, figsize=(20,8))

        # The training panel's "True Mean" is the mean of the training targets
        # (previously the test mean was drawn here by mistake).
        _plot_true_vs_pred(ax0, y_train, y_pred_train, 'Training Data')
        _plot_true_vs_pred(ax1, y_test, y_pred_test, 'Test Data (NOT seen during Search)')

        plt.show()

In [None]:
# Separate the regression target from the feature matrix.
y = pos_df["potassium"]
x = pos_df.drop("potassium", axis=1)

In [None]:
# Booster settings shared by every XGBoost member of the bagging ensemble.
hyperparams = {'eta': 0.05,
               'max_depth': 4,
               'objective': 'reg:pseudohubererror',
               'eval_metric': 'rmse'}

# The base model is passed positionally because its keyword name differs across
# scikit-learn releases (`base_estimator` was deprecated in 1.2 in favour of
# `estimator` and later removed); the first positional slot works with both.
# A fixed random_state makes the bootstrap / feature sub-sampling reproducible
# under Restart & Run All.
regr = BaggingRegressor(
    xgb.XGBRegressor(**hyperparams),
    n_estimators=100,
    max_features=5,
    random_state=42,
)

In [None]:
# Manual blocked cross-validation: consecutive row blocks of the sizes listed in
# `kfolds` are held out one at a time; all remaining rows form the training set.
scores = {
    split: {'accuracy_CI': [], 'extreme_mse': []}
    for split in ('train', 'test')
}

# Fold sizes in row order. The trailing 0 is a historical sentinel and is not a fold.
# NOTE(review): the sizes sum to 505 — rows beyond that are never used as test
# data; confirm this matches the length of `x`.
kfolds = [103, 96, 107, 102, 97, 0]
test_start = 0
test_stop = 0

for i, fold_size in enumerate(kfolds[:-1]):
    print(f'Iteration {i}')
    test_stop = test_start + fold_size

    # Positional hold-out mask, independent of the index labels of x / y.
    is_test = np.zeros(len(x), dtype=bool)
    is_test[test_start:test_stop] = True

    x_train, y_train = x.loc[~is_test], y.loc[~is_test]
    x_test, y_test = x.loc[is_test], y.loc[is_test]

    regr.fit(x_train, y_train)
    y_pred_train = regr.predict(x_train)
    y_pred_test = regr.predict(x_test)

    # accuracy_CI expects (y_pred, y_true, acc_err): the interval must be built
    # around the measurement, not around the prediction.
    scores['train']['accuracy_CI'].append(accuracy_CI(y_pred_train, y_train, acc_err=0.04))
    scores['train']['extreme_mse'].append(extreme_mse(y_train, y_pred_train))
    scores['test']['accuracy_CI'].append(accuracy_CI(y_pred_test, y_test, acc_err=0.04))
    scores['test']['extreme_mse'].append(extreme_mse(y_test, y_pred_test))

    compute_score(y_train, y_pred_train, y_test, y_pred_test, acc_err=0.04, plots=True)

    # The next fold starts where this one ended.
    test_start = test_stop