# Homework 4

## Preparations

In [864]:
# Import libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

import BucketFactory
import EncoderFactory
from sklearn.pipeline import FeatureUnion, Pipeline
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

In [865]:
# Read in the event log
log = pd.read_csv(os.path.join('data', 'BPI_Challenge_2012_filtered.csv'), sep=',')
log

Unnamed: 0,case_id,AMOUNT_REQ,resource,activity,Start_Time,End_Time,REG_DATE
0,196554,6000,112.0,A_DECLINED,2011-12-29T21:43:20.708,2011-12-29T21:43:20.708,2011-12-29T21:42:45.706
1,196557,10000,112.0,A_SUBMITTED,2011-12-29T21:48:26.730,2011-12-29T21:48:26.730,2011-12-29T21:48:26.730
2,196557,10000,112.0,A_PARTLYSUBMITTED,2011-12-29T21:48:26.956,2011-12-29T21:48:26.956,2011-12-29T21:48:26.730
3,196557,10000,112.0,A_PREACCEPTED,2011-12-29T21:49:09.983,2011-12-29T21:49:09.983,2011-12-29T21:48:26.730
4,196557,10000,11003.0,W_Completeren aanvraag,2011-12-30T12:23:05.764,2011-12-30T12:37:06.319,2011-12-29T21:48:26.730
...,...,...,...,...,...,...,...
12474,202833,10000,112.0,A_SUBMITTED,2012-01-21T16:01:47.362,2012-01-21T16:01:47.362,2012-01-21T16:01:47.362
12475,202833,10000,112.0,A_PARTLYSUBMITTED,2012-01-21T16:01:47.906,2012-01-21T16:01:47.906,2012-01-21T16:01:47.362
12476,202833,10000,112.0,A_PREACCEPTED,2012-01-21T16:02:26.922,2012-01-21T16:02:26.922,2012-01-21T16:01:47.362
12477,202833,10000,10932.0,W_Completeren aanvraag,2012-01-21T16:33:53.726,2012-01-21T16:44:50.485,2012-01-21T16:01:47.362


In [866]:
# See data types
log.dtypes

case_id         int64
AMOUNT_REQ      int64
resource      float64
activity       object
Start_Time     object
End_Time       object
REG_DATE       object
dtype: object

In [867]:
# Change data types
log['resource'] = log['resource'].astype(int)
log['Start_Time'] = pd.to_datetime(log['Start_Time'], infer_datetime_format=True)
log['End_Time'] = pd.to_datetime(log['End_Time'], infer_datetime_format=True)
log['REG_DATE'] = pd.to_datetime(log['REG_DATE'], infer_datetime_format=True)
log.dtypes

case_id                int64
AMOUNT_REQ             int64
resource               int32
activity              object
Start_Time    datetime64[ns]
End_Time      datetime64[ns]
REG_DATE      datetime64[ns]
dtype: object

In [868]:
# Check what activities exist in the event log
log['activity'].unique()

array(['A_DECLINED', 'A_SUBMITTED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED',
       'W_Completeren aanvraag', 'A_CANCELLED', 'W_Afhandelen leads',
       'A_ACCEPTED', 'O_SELECTED', 'A_FINALIZED', 'O_CREATED', 'O_SENT',
       'W_Nabellen offertes', 'O_CANCELLED', 'O_SENT_BACK',
       'W_Valideren aanvraag', 'A_APPROVED', 'O_ACCEPTED', 'A_REGISTERED',
       'A_ACTIVATED', 'W_Nabellen incomplete dossiers', 'O_DECLINED',
       'W_Beoordelen fraude'], dtype=object)

In [869]:
# As we can see, there are no null values
log.isna().sum()

case_id       0
AMOUNT_REQ    0
resource      0
activity      0
Start_Time    0
End_Time      0
REG_DATE      0
dtype: int64

In [870]:
# See an example of one specific case
log[log['case_id']==202833]

Unnamed: 0,case_id,AMOUNT_REQ,resource,activity,Start_Time,End_Time,REG_DATE
12474,202833,10000,112,A_SUBMITTED,2012-01-21 16:01:47.362,2012-01-21 16:01:47.362,2012-01-21 16:01:47.362
12475,202833,10000,112,A_PARTLYSUBMITTED,2012-01-21 16:01:47.906,2012-01-21 16:01:47.906,2012-01-21 16:01:47.362
12476,202833,10000,112,A_PREACCEPTED,2012-01-21 16:02:26.922,2012-01-21 16:02:26.922,2012-01-21 16:01:47.362
12477,202833,10000,10932,W_Completeren aanvraag,2012-01-21 16:33:53.726,2012-01-21 16:44:50.485,2012-01-21 16:01:47.362
12478,202833,10000,10932,A_DECLINED,2012-01-21 16:44:47.078,2012-01-21 16:44:47.078,2012-01-21 16:01:47.362


## Task 1

As part of the log preprocessing, we need to calculate the remaining time for each case in the log. This can be done by finding the difference in seconds between the end timestamp of one event and the end timestamp of the last event in the case. To implement this, a new column called "remtime" should be created in the log, containing this new feature as an event attribute.

In [871]:
# Because there is no "Complete/end timestamp" for the case in the log, we need to add it.

log['complete_timestamp'] = log.groupby('case_id').End_Time.transform('max')

In [872]:
# Check that case complete timestamp was added
log.head()

Unnamed: 0,case_id,AMOUNT_REQ,resource,activity,Start_Time,End_Time,REG_DATE,complete_timestamp
0,196554,6000,112,A_DECLINED,2011-12-29 21:43:20.708,2011-12-29 21:43:20.708,2011-12-29 21:42:45.706,2011-12-29 21:43:20.708
1,196557,10000,112,A_SUBMITTED,2011-12-29 21:48:26.730,2011-12-29 21:48:26.730,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922
2,196557,10000,112,A_PARTLYSUBMITTED,2011-12-29 21:48:26.956,2011-12-29 21:48:26.956,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922
3,196557,10000,112,A_PREACCEPTED,2011-12-29 21:49:09.983,2011-12-29 21:49:09.983,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922
4,196557,10000,11003,W_Completeren aanvraag,2011-12-30 12:23:05.764,2011-12-30 12:37:06.319,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922


In [873]:
# See what the result looks like for one specific case
log[log['case_id']==202833]

Unnamed: 0,case_id,AMOUNT_REQ,resource,activity,Start_Time,End_Time,REG_DATE,complete_timestamp
12474,202833,10000,112,A_SUBMITTED,2012-01-21 16:01:47.362,2012-01-21 16:01:47.362,2012-01-21 16:01:47.362,2012-01-21 16:44:50.485
12475,202833,10000,112,A_PARTLYSUBMITTED,2012-01-21 16:01:47.906,2012-01-21 16:01:47.906,2012-01-21 16:01:47.362,2012-01-21 16:44:50.485
12476,202833,10000,112,A_PREACCEPTED,2012-01-21 16:02:26.922,2012-01-21 16:02:26.922,2012-01-21 16:01:47.362,2012-01-21 16:44:50.485
12477,202833,10000,10932,W_Completeren aanvraag,2012-01-21 16:33:53.726,2012-01-21 16:44:50.485,2012-01-21 16:01:47.362,2012-01-21 16:44:50.485
12478,202833,10000,10932,A_DECLINED,2012-01-21 16:44:47.078,2012-01-21 16:44:47.078,2012-01-21 16:01:47.362,2012-01-21 16:44:50.485


In [874]:
# Now we can add the remaining time
log['remtime'] = (log['complete_timestamp'] - log['End_Time']).dt.total_seconds()

In [875]:
log.head()

Unnamed: 0,case_id,AMOUNT_REQ,resource,activity,Start_Time,End_Time,REG_DATE,complete_timestamp,remtime
0,196554,6000,112,A_DECLINED,2011-12-29 21:43:20.708,2011-12-29 21:43:20.708,2011-12-29 21:42:45.706,2011-12-29 21:43:20.708,0.0
1,196557,10000,112,A_SUBMITTED,2011-12-29 21:48:26.730,2011-12-29 21:48:26.730,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,304503.192
2,196557,10000,112,A_PARTLYSUBMITTED,2011-12-29 21:48:26.956,2011-12-29 21:48:26.956,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,304502.966
3,196557,10000,112,A_PREACCEPTED,2011-12-29 21:49:09.983,2011-12-29 21:49:09.983,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,304459.939
4,196557,10000,11003,W_Completeren aanvraag,2011-12-30 12:23:05.764,2011-12-30 12:37:06.319,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,251183.603


## Task 2
One way to improve the model's precision is by extracting new time contextual features from the timestamps. These features can provide meaningful information to the model about possible seasonal influences on the process behavior. For example, from a start timestamp like "2023-04-18 13:00:00", we can extract the month of the year (e.g., 04), the day of the week (e.g., 2 for Tuesday, where Monday is 0 and Sunday is 6), and the relative time in seconds since midnight (e.g., 2808000 for 1:00 PM, as midnight is 0). To implement this, six new columns must be created in the log, containing these three contextual features for the start and complete timestamps. NB! Consider using the weekday() method from Python.

In [876]:
# Add day of the week
log['start_day'] = log['Start_Time'].dt.dayofweek
log['end_day'] = log['End_Time'].dt.dayofweek

# Add the month
log['start_month'] = log['Start_Time'].dt.month
log['end_month'] = log['End_Time'].dt.month

In [877]:
# Function for calculating the relative time (seconds since midnight)
# Source of this function: https://stackoverflow.com/questions/54787146/get-the-time-spent-since-midnight-in-dataframe
def secSinceMidnight(datTimStr):
    tt = pd.to_datetime(datTimStr).time()
    return tt.hour * 3600 + tt.minute * 60 + tt.second

In [878]:
# Add the relative time
log['start_since_midnight'] = log['Start_Time'].apply(secSinceMidnight)
log['end_since_midnight'] = log['End_Time'].apply(secSinceMidnight)

In [879]:
# Check the result
log.head()

Unnamed: 0,case_id,AMOUNT_REQ,resource,activity,Start_Time,End_Time,REG_DATE,complete_timestamp,remtime,start_day,end_day,start_month,end_month,start_since_midnight,end_since_midnight
0,196554,6000,112,A_DECLINED,2011-12-29 21:43:20.708,2011-12-29 21:43:20.708,2011-12-29 21:42:45.706,2011-12-29 21:43:20.708,0.0,3,3,12,12,78200,78200
1,196557,10000,112,A_SUBMITTED,2011-12-29 21:48:26.730,2011-12-29 21:48:26.730,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,304503.192,3,3,12,12,78506,78506
2,196557,10000,112,A_PARTLYSUBMITTED,2011-12-29 21:48:26.956,2011-12-29 21:48:26.956,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,304502.966,3,3,12,12,78506,78506
3,196557,10000,112,A_PREACCEPTED,2011-12-29 21:49:09.983,2011-12-29 21:49:09.983,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,304459.939,3,3,12,12,78549,78549
4,196557,10000,11003,W_Completeren aanvraag,2011-12-30 12:23:05.764,2011-12-30 12:37:06.319,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,251183.603,4,4,12,12,44585,45426


## Task 3

Given the “remtime” column in Task 1 and the contextual features columns in Task 2, train an XGBoost Regressor using single bucketing and aggregation encoding.

In [880]:
# Create the log schema
log_schema = {'case_id_col': 'case_id',
              'timestamp_col': 'complete_timestamp',
              'activity_col': 'activity',
              'label': 'remtime',
              'static_cat_cols': [],
              'static_num_cols': ["AMOUNT_REQ"],
              'dynamic_cat_cols': ["activity", "resource", "start_day", "end_day"],
              'dynamic_num_cols': ["start_month", "end_month", "start_since_midnight", "end_since_midnight"], 
             }

### Train-test split

In [881]:
# This function is from practice materials
# Split into training and test

def temporal_split(data, log_schema, train_ratio):  
    # Split into train and test using temporal split and discard events that overlap the periods
    data = data.sort_values([log_schema.get('timestamp_col'), log_schema.get('activity_col')], ascending=True, kind='mergesort')
    grouped = data.groupby(log_schema.get('case_id_col'))
    start_timestamps = grouped[log_schema.get('timestamp_col')].min().reset_index()
    start_timestamps = start_timestamps.sort_values(log_schema.get('timestamp_col'), ascending=True, kind='mergesort')
    train_ids = list(start_timestamps[log_schema.get('case_id_col')])[:int(train_ratio*len(start_timestamps))]
    train = data[data[log_schema.get('case_id_col')].isin(train_ids)].sort_values([log_schema.get('timestamp_col'), log_schema.get('activity_col')], ascending=True, kind='mergesort')
    test = data[~data[log_schema.get('case_id_col')].isin(train_ids)].sort_values([log_schema.get('timestamp_col'), log_schema.get('activity_col')], ascending=True, kind='mergesort')
    split_ts = test[log_schema.get('timestamp_col')].min()
    train = train[train[log_schema.get('timestamp_col')] < split_ts]
    return (train, test)

In [882]:
# Do the temporal split into train and test set
train, test = temporal_split(log, log_schema, 0.8)
len(train), len(test)

(8839, 3640)

### Prefix data

In [883]:
# Determine min and max (truncated) prefix lengths
# From practice materials
min_prefix_length = 1
def get_pos_case_length_quantile(data, log_schema, quantile=0.90):
    return int(np.ceil(data.groupby(log_schema.get('case_id_col')).size().quantile(quantile)))
max_prefix_length = min(40, get_pos_case_length_quantile(log, log_schema, 0.90))

In [884]:

# Function is from practice materials
def generate_prefix_data(data, min_length, max_length, log_schema, gap=1):
    # generate prefix data (each possible prefix becomes a trace)
    data['case_length'] = data.groupby(log_schema.get('case_id_col'))[log_schema.get('activity_col')].transform(len)

    dt_prefixes = data[data['case_length'] >= min_length].groupby(log_schema.get('case_id_col')).head(min_length)
    dt_prefixes["prefix_nr"] = 1
    dt_prefixes["orig_case_id"] = dt_prefixes[log_schema.get('case_id_col')]
    for nr_events in range(min_length+gap, max_length+1, gap):
        tmp = data[data['case_length'] >= nr_events].groupby(log_schema.get('case_id_col')).head(nr_events)
        tmp["orig_case_id"] = tmp[log_schema.get('case_id_col')]
        tmp[log_schema.get('case_id_col')] = tmp[log_schema.get('case_id_col')].apply(lambda x: "%s_%s"%(x, nr_events))
        tmp["prefix_nr"] = nr_events
        dt_prefixes = pd.concat([dt_prefixes, tmp], axis=0)

    dt_prefixes['case_length'] = dt_prefixes['case_length'].apply(lambda x: min(max_length, x))

    return dt_prefixes

In [885]:
dt_train_prefixes = generate_prefix_data(train, min_prefix_length, max_prefix_length, log_schema)
dt_test_prefixes = generate_prefix_data(test, min_prefix_length, max_prefix_length, log_schema)

In [886]:
len(dt_train_prefixes)

46247

In [887]:
print(dt_train_prefixes.prefix_nr.unique())

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


### Single Bucketing

In [888]:
random_state = 22
bucketer_args = {'case_id_col': log_schema.get('case_id_col'), 
                 'cat_cols':[log_schema.get('activity_col')], 
                 'num_cols':[], 
                 'random_state':random_state}

In [889]:
bucket_method = 'single'
bucketer = BucketFactory.get_bucketer(bucket_method, **bucketer_args)
bucket_assignments_train = bucketer.fit_predict(dt_train_prefixes)
bucket_assignments_test = bucketer.predict(dt_test_prefixes)

In [890]:
print('Train assignments:')
bucket_assignments_train
print(pd.DataFrame(bucket_assignments_train, columns=['bucket']).bucket.value_counts())
print('Test assignments:')
bucket_assignments_test
print(pd.DataFrame(bucket_assignments_test, columns=['bucket']).bucket.value_counts())

Train assignments:
1    8222
Name: bucket, dtype: int64
Test assignments:
1    3069
Name: bucket, dtype: int64


In [891]:
bucketer.n_states

1

In [892]:
def get_label_numeric(data):
    y = data.groupby(log_schema.get('case_id_col')).first()[log_schema.get('label')] # one row per case
    return y

In [893]:
bucket_indexes = dt_train_prefixes.groupby(log_schema.get('case_id_col')).first().index
bucket_indexes = bucket_indexes[bucket_assignments_train == 1]
print(bucket_indexes)

bucket_data = dt_train_prefixes[dt_train_prefixes[log_schema.get('case_id_col')].isin(bucket_indexes)]
train_y = get_label_numeric(bucket_data)

Index([    195455,     195458,     195461,     195464,     195467,     195470,
           195473,     195485,     195491,     195497,
       ...
       '201689_2', '201689_3', '201689_4', '201689_5', '201698_2', '201698_3',
       '201707_2', '201707_3', '201734_2', '201734_3'],
      dtype='object', name='case_id', length=8222)


In [894]:
train_y

case_id
195455      847904.751
195458      573988.243
195461           3.262
195464           0.000
195467           0.000
               ...    
201698_3         0.000
201707_2         0.000
201707_3         0.000
201734_2         0.000
201734_3         0.000
Name: remtime, Length: 8222, dtype: float64

In [895]:
bucket_indexes = dt_test_prefixes.groupby(log_schema.get('case_id_col')).first().index
bucket_indexes = bucket_indexes[bucket_assignments_test == 1]
print(bucket_indexes)
bucket_data_test = dt_test_prefixes[dt_test_prefixes[log_schema.get('case_id_col')].isin(bucket_indexes)]
test_y = get_label_numeric(bucket_data_test)

Index([    195539,     195606,     195678,     195743,     195941,     195988,
           196072,     196126,     196225,     196228,
       ...
       '202812_3', '202812_4', '202812_5', '202812_6', '202827_2', '202827_3',
       '202833_2', '202833_3', '202833_4', '202833_5'],
      dtype='object', name='case_id', length=3069)


In [896]:
test_y

case_id
195539      2147361.234
195606      1920013.526
195678      1920469.001
195743      2070985.062
195941      1809215.079
               ...     
202827_3          0.000
202833_2          3.407
202833_3          3.407
202833_4          3.407
202833_5          3.407
Name: remtime, Length: 3069, dtype: float64

### Regressor Pipeline: create encodings and train an XGBoost Regressor

In [897]:
methods = ['agg'] # Aggregation encoding
feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **log_schema)) for method in methods])
feature_combiner

In [898]:
model = xgb.XGBRegressor(n_estimators=400, learning_rate=0.3, n_jobs=2)

In [899]:
pipeline = Pipeline([('encoder', feature_combiner), ('xgb', model)])
pipeline.fit(bucket_data, train_y)

In [900]:
preds = pipeline.predict(bucket_data_test)
print(preds)

[1092920.9   1028228.8    968097.94  ...    8963.064    5510.502
  -20464.535]


In [901]:
score_task_3 = mean_absolute_error(test_y, preds)
score_task_3

271897.0546970529

## Task 4

Perform Task 3 again, but this time exclude the contextual features. Then, compare the accuracy of the resulting models with the previous step, and explain any differences in the results. Consider whether the use of contextual features impacted the accuracy of the models, and explain why such an effect may or may not have occurred.

In [902]:
# Remove the contextual information
log.drop(['start_day', 'end_day', 'start_month', 'end_month', 'start_since_midnight', 'end_since_midnight'], axis=1, inplace=True)

In [903]:
log.head()

Unnamed: 0,case_id,AMOUNT_REQ,resource,activity,Start_Time,End_Time,REG_DATE,complete_timestamp,remtime
0,196554,6000,112,A_DECLINED,2011-12-29 21:43:20.708,2011-12-29 21:43:20.708,2011-12-29 21:42:45.706,2011-12-29 21:43:20.708,0.0
1,196557,10000,112,A_SUBMITTED,2011-12-29 21:48:26.730,2011-12-29 21:48:26.730,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,304503.192
2,196557,10000,112,A_PARTLYSUBMITTED,2011-12-29 21:48:26.956,2011-12-29 21:48:26.956,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,304502.966
3,196557,10000,112,A_PREACCEPTED,2011-12-29 21:49:09.983,2011-12-29 21:49:09.983,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,304459.939
4,196557,10000,11003,W_Completeren aanvraag,2011-12-30 12:23:05.764,2011-12-30 12:37:06.319,2011-12-29 21:48:26.730,2012-01-02 10:23:29.922,251183.603


In [904]:
# Create the log schema without contextual data
log_schema = {'case_id_col': 'case_id',
              'timestamp_col': 'complete_timestamp',
              'activity_col': 'activity',
              'label': 'remtime',
              'static_cat_cols': [],
              'static_num_cols': ["AMOUNT_REQ"],
              'dynamic_cat_cols': ["activity", "resource"],
              'dynamic_num_cols': [], 
             }

In [905]:
# Do the temporal split into train and test set
train, test = temporal_split(log, log_schema, 0.8)
len(train), len(test)

(8839, 3640)

In [906]:
# Prefixes
max_prefix_length = min(40, get_pos_case_length_quantile(log, log_schema, 0.90))
dt_train_prefixes = generate_prefix_data(train, min_prefix_length, max_prefix_length, log_schema)
dt_test_prefixes = generate_prefix_data(test, min_prefix_length, max_prefix_length, log_schema)

In [907]:
# Single bucketing, args stay the same
bucket_method = 'single'
bucketer = BucketFactory.get_bucketer(bucket_method, **bucketer_args)
bucket_assignments_train = bucketer.fit_predict(dt_train_prefixes)
bucket_assignments_test = bucketer.predict(dt_test_prefixes)

In [908]:
print('Train assignments:')
bucket_assignments_train
print(pd.DataFrame(bucket_assignments_train, columns=['bucket']).bucket.value_counts())
print('Test assignments:')
bucket_assignments_test
print(pd.DataFrame(bucket_assignments_test, columns=['bucket']).bucket.value_counts())

Train assignments:
1    8222
Name: bucket, dtype: int64
Test assignments:
1    3069
Name: bucket, dtype: int64


In [909]:
bucket_indexes = dt_train_prefixes.groupby(log_schema.get('case_id_col')).first().index
bucket_indexes = bucket_indexes[bucket_assignments_train == 1]
#print(bucket_indexes)

bucket_data = dt_train_prefixes[dt_train_prefixes[log_schema.get('case_id_col')].isin(bucket_indexes)]
train_y = get_label_numeric(bucket_data)

In [910]:
bucket_indexes = dt_test_prefixes.groupby(log_schema.get('case_id_col')).first().index
bucket_indexes = bucket_indexes[bucket_assignments_test == 1]
#print(bucket_indexes)
bucket_data_test = dt_test_prefixes[dt_test_prefixes[log_schema.get('case_id_col')].isin(bucket_indexes)]
test_y = get_label_numeric(bucket_data_test)

In [911]:
methods = ['agg'] # Aggregation encoding
feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **log_schema)) for method in methods])
feature_combiner

In [912]:
model = xgb.XGBRegressor(n_estimators=400, learning_rate=0.3, n_jobs=2)

In [913]:
# Encoder and model are the same, data is different
pipeline = Pipeline([('encoder', feature_combiner), ('xgb', model)])

In [914]:
pipeline.fit(bucket_data, train_y)

In [915]:
preds = pipeline.predict(bucket_data_test)
print(preds)

[ 1.0696436e+06  8.1559325e+05  9.6083838e+05 ... -2.4689957e+02
 -6.2932275e+02  2.6021494e+02]


In [916]:
score_task_4 = mean_absolute_error(test_y, preds)
score_task_4

281025.1817774778

In [917]:
print('The MAE of task 3 (with contextual info) is:', score_task_3)
print('The MAE of task 4 (without contextual info) is:', score_task_4)
print('Therefore, the model with contextual information was better. The difference in MAE was:', (score_task_4 - score_task_3))

The MAE of task 3 (with contextual info) is: 271897.0546970529
The MAE of task 4 (without contextual info) is: 281025.1817774778
Therefore, the model with contextual information was better. The difference in MAE was: 9128.127080424863
