# Modelling

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Load Data

In [7]:
df = pd.read_parquet('../../data/processed/full_data.pq')
df.head()

Unnamed: 0,ProzessData_ActData_AB1_Analogs_DX1_MassPressure,ProzessData_ActData_AB1_Analogs_GY1_MassLevelTank,ProzessData_ActData_AB1_Current_DV1_Scraper,ProzessData_ActData_AB1_Current_DW1_RiserPumpFwd,ProzessData_ActData_AB1_Speed_DV1_Scraper,ProzessData_ActData_AB1_Speed_DW1_RiserPumpFwd,ProzessData_ActData_AB1_Temperature_DP1_MassHeatingStage,ProzessData_ActData_AB1_Temperature_DP1_WaterHeatingStage,ProzessData_ActData_AB1_Temperature_DQ1_MassCoolingStage,ProzessData_ActData_AB1_Temperature_DQ1_WaterCoolingStage,ProzessData_ActData_AB1_Temperature_DR1_WaterMixingStage,ProzessData_ActData_AB1_Temperature_DS1_WaterPipe,ProzessData_ActData_AB1_Temperature_DU1_WaterTank,ProzessData_ActData_AB1_Temperature_DX1_MassInfeed,hour,minute,day_time,night_time
2023-03-14 01:00:00+00:00,0.92,63.349998,0.0,0.0,0.0,0.0,41.299999,42.0,41.700001,42.0,42.0,41.849998,42.099998,41.900002,1,0,0,1
2023-03-14 01:00:01+00:00,0.93,63.34,0.0,0.0,0.0,0.0,41.299999,42.0,41.700001,42.0,42.0,41.849998,42.099998,41.900002,1,0,0,1
2023-03-14 01:00:02+00:00,0.93,63.34,0.0,0.0,0.0,0.0,41.299999,42.0,41.700001,42.0,42.0,41.849998,42.0,41.900002,1,0,0,1
2023-03-14 01:00:03+00:00,0.92,63.349998,0.0,0.0,0.0,0.0,41.299999,41.900002,41.700001,42.0,42.0,41.849998,42.0,41.900002,1,0,0,1
2023-03-14 01:00:04+00:00,0.92,63.34,0.0,0.0,0.0,0.0,41.299999,41.900002,41.700001,42.0,42.0,41.849998,42.0,41.900002,1,0,0,1


### Utility Functions

In [None]:
def evaluation(X_test, y_test, y_pred, pca=False):
    # Calculate the MAE, MSE and RMSE
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred, squared=True)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")

    if not pca:
        # Print some example predictions
        example_indices = [0, 1, 2, 3, 4]
        for i in example_indices:
            print(f"{X_test.index[i]}: | Actual y: {y_test.iloc[i, 0]} | Predicted y: {y_pred[i]}")

## Baseline

### Train-Test-Split (ADJUST TO CUSTOM ONE!!!)

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

evaluation(X_test, y_test, y_pred)

MAE: 0.06190399249997842
MSE: 0.010507321269666683
RMSE: 0.10250522557248817
2023-03-22 15:46:04+00:00: | Actual y: 42.0 | Predicted y: [41.97445781]
2023-03-21 15:53:06+00:00: | Actual y: 41.5 | Predicted y: [41.49825148]
2023-03-14 12:17:10+00:00: | Actual y: 29.600000381469727 | Predicted y: [29.45843337]
2023-03-15 16:36:12+00:00: | Actual y: 41.5 | Predicted y: [41.54789011]
2023-03-22 02:41:08+00:00: | Actual y: 41.5 | Predicted y: [41.52204897]


### Ridge Linear Regression

In [None]:
from sklearn.linear_model import Ridge

# Train the model
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

print(evaluation(X_test, y_test, y_pred))

MAE: 0.061897700829230726
MSE: 0.010507102895257027
RMSE: 0.10250416038023544
2023-03-22 15:46:04+00:00: | Actual y: 42.0 | Predicted y: [41.97440317]
2023-03-21 15:53:06+00:00: | Actual y: 41.5 | Predicted y: [41.49826343]
2023-03-14 12:17:10+00:00: | Actual y: 29.600000381469727 | Predicted y: [29.45866431]
2023-03-15 16:36:12+00:00: | Actual y: 41.5 | Predicted y: [41.54788437]
2023-03-22 02:41:08+00:00: | Actual y: 41.5 | Predicted y: [41.52207635]
None


### Lasso Linear Regression

In [None]:
from sklearn.linear_model import Lasso

# Train the model
model = Lasso(alpha=0.1)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

print(evaluation(X_test, y_test, y_pred))

MAE: 0.08795412453720379
MSE: 0.03756376450999514
RMSE: 0.1938137366390606
2023-03-22 15:46:04+00:00: | Actual y: 42.0 | Predicted y: 42.099302050522695
2023-03-21 15:53:06+00:00: | Actual y: 41.5 | Predicted y: 41.53367734480601
2023-03-14 12:17:10+00:00: | Actual y: 29.600000381469727 | Predicted y: 29.49632557757533
2023-03-15 16:36:12+00:00: | Actual y: 41.5 | Predicted y: 41.53367734480601
2023-03-22 02:41:08+00:00: | Actual y: 41.5 | Predicted y: 41.53367663151721
None


## XGBoost

In [None]:
import xgboost as xgb

# Convert the data into DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set the parameters for XGBoost
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
}

# Train the XGBoost model
model = xgb.train(params, dtrain)

# Make predictions on the testing data
y_pred = model.predict(dtest)

evaluation(X_test, y_test, y_pred)

MAE: 0.17041372978551872
MSE: 0.03125276789951157
RMSE: 0.17678452392534696
2023-03-22 15:46:04+00:00: | Actual y: 42.0 | Predicted y: 41.63616943359375
2023-03-21 15:53:06+00:00: | Actual y: 41.5 | Predicted y: 41.34490966796875
2023-03-14 12:17:10+00:00: | Actual y: 29.600000381469727 | Predicted y: 29.702049255371094
2023-03-15 16:36:12+00:00: | Actual y: 41.5 | Predicted y: 41.33098220825195
2023-03-22 02:41:08+00:00: | Actual y: 41.5 | Predicted y: 41.34490966796875


In [None]:
from sklearn.decomposition import PCA
import xgboost as xgb

# Perform PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X1)

# # Custom Split
# X_train = X_pca
# y_train = y1
# X_test = eval_3_x
# y_test = eval_3_y

# Split the data into training and testing sets
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(X_pca, y1, test_size=0.2, random_state=42)

# Convert the data into DMatrix format
dtrain = xgb.DMatrix(pca_X_train, label=pca_y_train)
dtest = xgb.DMatrix(pca_X_test, label=pca_y_test)

# Set the parameters for XGBoost
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
}

# Train the XGBoost model
model = xgb.train(params, dtrain)

# Make predictions on the testing data
y_pred = model.predict(dtest)

evaluation(pca_X_test, pca_y_test, y_pred, pca=True)

MAE: 0.21534708539246938
MSE: 0.1916308900449985
RMSE: 0.43775665619725135


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features using z-score
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# Perform PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X1)

# Split the data into training and testing sets
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(X_scaled, y_train, test_size=0.2, random_state=42)

# Convert the data into DMatrix format
dtrain = xgb.DMatrix(pca_X_train, label=pca_y_train)
dtest = xgb.DMatrix(pca_X_test, label=pca_y_test)

# Set the parameters for XGBoost
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mse'
}

# Train the XGBoost model
model = xgb.train(params, dtrain)

# Make predictions on the testing data
y_pred = model.predict(dtest)

evaluation(pca_X_test, pca_y_test, y_pred, pca=True)

XGBoostError: [09:03:58] /workspace/src/metric/metric.cc:49: Unknown metric function mse
Stack trace:
  [bt] (0) /home/codespace/.python/current/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x1ba24e) [0x7fc79473224e]
  [bt] (1) /home/codespace/.python/current/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x4f7e7e) [0x7fc794a6fe7e]
  [bt] (2) /home/codespace/.python/current/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x4ccdd3) [0x7fc794a44dd3]
  [bt] (3) /home/codespace/.python/current/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x4c5268) [0x7fc794a3d268]
  [bt] (4) /home/codespace/.python/current/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x70) [0x7fc7946d9ef0]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.7(+0x6ff5) [0x7fc86ef4fff5]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.7(+0x640a) [0x7fc86ef4f40a]
  [bt] (7) /home/codespace/.python/current/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x12f51) [0x7fc86f1f5f51]
  [bt] (8) /home/codespace/.python/current/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xca11) [0x7fc86f1efa11]

