In [3]:
import sensor_imputation_thesis.shared.load_data as load
from data_insight import setup_duckdb
from duckdb import DuckDBPyConnection as DuckDB
import pandas as pd
from duckdb import DuckDBPyRelation as Relation
from pathlib import Path
import hashlib

In [1]:
import sys

# Add the project checkout directory to the module search path of this kernel.
sys.path.append("/home/ec2-user/SageMaker/sensor-imputation-thesis")


In [4]:
# Open the DuckDB connection and preview up to 10 rows of the timeseries table for
# pid '4408337-3' between 2023-10-01 and 2024-10-01.
con = setup_duckdb()
f = con.sql("""
       SELECT *
       FROM timeseries
       WHERE
       time BETWEEN '2023-10-01' AND '2024-10-01'
       AND pid = '4408337-3'
       LIMIT 10
          """)
print(f)

>>> con.sql("SHOW TABLES;")
┌────────────┐
│    name    │
│  varchar   │
├────────────┤
│ shipinfo   │
│ timeseries │
└────────────┘


>>> con.sql("DESCRIBE shipinfo;")
┌──────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────┬─────────┬─────────┬─────────┬─────────┐
│                 column_name                  │                                  column_type                                  │  null   │   key   │ default │  extra  │
│                   varchar                    │                                    varchar                                    │ varchar │ varchar │ varchar │ varchar │
├──────────────────────────────────────────────┼───────────────────────────────────────────────────────────────────────────────┼─────────┼─────────┼─────────┼─────────┤
│ information_validity                         │ VARCHAR                                                                       │ YES     │ NULL    │ NULL  

In [None]:
# List up to 50 distinct product ids (pid) found in the timeseries table.
e = con.sql("""
          SELECT distinct pid
          FROM timeseries
          LIMIT 50
          """)
print(e)

Get the available tags (column names) from the `timeseries` table

In [None]:
# Ask DuckDB for the schema of the timeseries table; the column name is the second
# field of every returned row.
query = "PRAGMA table_info('timeseries')"
result = con.execute(query).fetchall()

tags = []
for row in result:
    tags.append(row[1])

print("Available tags in the timeseries table:")
for tag in tags:
    print(tag)


In [None]:
pd.set_option("display.max_columns", None)


def load_engine_data(
    con: DuckDB, product_id: str, start: pd.Timestamp, stop: pd.Timestamp, tags: list[str]
) -> Relation:
    """Select the given columns of one product from the ``timeseries`` table.

    Args:
        con: Open DuckDB connection that exposes the ``timeseries`` table.
        product_id: Value matched against the ``pid`` column.
        start: Lower bound of the ``time BETWEEN`` filter.
        stop: Upper bound of the ``time BETWEEN`` filter.
        tags: Column names to select, in output order.

    Returns:
        The DuckDB relation produced by ``con.sql`` for the query.

    Note:
        All arguments are interpolated directly into the SQL text, so they must
        come from trusted code, not from user input.
    """
    columns = ",".join(tags)
    query = f"""
    SELECT {columns}
    FROM timeseries
    WHERE
        time BETWEEN '{start}' AND '{stop}'
        AND pid = '{product_id}'
    """
    return con.sql(query)


def get_tags_hash(tags):
    """Return the hex MD5 digest of the comma-joined tag names.

    The digest depends on the order of ``tags``. It is used as a cache-file key,
    not for anything security related.
    """
    digest = hashlib.md5()
    digest.update(",".join(tags).encode())
    return digest.hexdigest()


# Time window, columns and engine to load.
start, stop = pd.Timestamp("2023-10-01"), pd.Timestamp("2024-10-01")

tags = [
    "time",
    "fr_eng",
    "pr_air_start",
    "pr_air_control",
    "te_fuel_eng_in",
    "pr_air_scav",
    "re_exh_incr",
    "pr_pmax_ordered",
    "fr_eng_setpoint",
    "re_eng_load_estimate_ecs",
    "in_engine_running_mode",
    "re_fuel_sulpher",
    "bo_melub_state_prelube",
    "cv_fuel",
    "re_total_fuel_quality_offset",
    "pr_hydr_out",
    "te_lub_oil_in_hps",
    "vi_fuel_eng_in",
    "ti_fuel_prim_inj_gov__8",
]
product_id = "89ccb7a888d53f8792f0580801cede9a"

# The cache key covers everything that determines the query result (engine, time window
# and selected tags). Keying on the tags alone would silently serve another engine's or
# another period's data after product_id, start or stop is changed.
cache_key = get_tags_hash([product_id, start.isoformat(), stop.isoformat(), *tags])
cache = Path(f"/tmp/data_{cache_key}.parquet")
if cache.exists():
    df = pd.read_parquet(cache)
else:
    # Only hit DuckDB when no cached copy of exactly this query exists.
    con = setup_duckdb()
    df = load_engine_data(con, product_id, start, stop, tags).df()
    df.to_parquet(cache)

print(df.head(10))

In [4]:
# Persist the loaded frame as CSV (without the index); the target file is named "dataframe".
path = "/home/ec2-user/SageMaker/sensor-imputation-thesis/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/dataframe"
df.to_csv(path, index=False)


In [None]:
# The frame was saved to CSV above, so it can be read back from that file here
# instead of being loaded again every time.
df = pd.read_csv(
    "/home/ec2-user/SageMaker/sensor-imputation-thesis/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/dataframe"
)

In [None]:
# Drop only the columns in which every value is NaN (how="all");
# columns that are merely partially missing are kept.
df1 = df.dropna(axis=1, how="all")
print(df1)

In [None]:
# Encode the categorical column "bo_melub_state_prelube" as integer labels.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# Guarded so that re-running the cell is a no-op once the raw column has been replaced;
# an unconditional drop raises KeyError on the second run.
if "bo_melub_state_prelube" in df1.columns:
    # assign()/drop() build a new frame instead of writing into df1 in place. df1 is
    # derived from df via dropna(), and in-place writes into such a frame can trigger
    # pandas' SettingWithCopyWarning.
    df1 = df1.assign(
        bo_melub_state_prelube_encoded=label_encoder.fit_transform(df1["bo_melub_state_prelube"])
    ).drop(columns="bo_melub_state_prelube")
print(df1)


In [None]:
# Show the columns of the cleaned frame.
df1.columns

CREATE A HEATMAP

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Work on the first 10,000 rows only, restricted to the numeric columns
df2 = df1[:10000]
numeric_df = df2.select_dtypes(include="number")

# Pairwise correlation between the numeric columns
corr_matrix = numeric_df.corr()
# Annotated heatmap of the correlation matrix
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu")

# Render the figure
plt.show()


## ML Set up

In [23]:
# Imported here because this is the first cell that uses mlflow; without it a fresh
# "Restart & Run All" fails with NameError on the next line.
import mlflow

# Send all runs to the tracking server at localhost:5000.
mlflow.set_tracking_uri("http://localhost:5000")

Linear Attempt

In [6]:
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.compose import make_column_selector as selector
import mlflow
import mlflow.sklearn
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor


Simple Multi-variable Attempts to Impute pr_hydr_out based on heatmap

In [None]:
# Work on the first 10,000 rows. The explicit copy makes df2 an independent frame, so the
# column added to df2 later in this notebook does not write into a slice of df1.
df2 = df1[:10000].copy()
df2.columns

In [17]:
# Target pr_hydr_out and its six candidate predictors, taken from the 10,000-row subset df2.
y = df2["pr_hydr_out"]
X = df2[["fr_eng", "pr_air_control", "te_fuel_eng_in", "pr_air_scav", "fr_eng_setpoint", "re_fuel_sulpher"]]

# Chronological 80/20 split: the first 80% of the rows train, the rest tests (no shuffling).
train_size = int(len(df2) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

In [27]:
# Numeric columns are KNN-imputed (2 neighbours, uniform weights) and then standardised;
# every other column is dropped.
numeric_transformer = Pipeline(
    [("imputer", KNNImputer(n_neighbors=2, weights="uniform")), ("scaler", StandardScaler())]
)
Preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, selector(dtype_include="number"))],
    remainder="drop",
)

# Preprocessing followed by ordinary least squares.
Pipeline1 = Pipeline([("preprocessor", Preprocessor), ("Linear Regression", LinearRegression())])

In [None]:
# Fit the linear pipeline on the training split.
Pipeline1.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split.
y_pred = Pipeline1.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (("mse:", mse), ("mae:", mae), ("r2:", r2)):
    print(label, value)


# Linear with the re_eng_load_estimate_ecs as y 


In [16]:
# Target re_eng_load_estimate_ecs and its six predictors, taken from the 10,000-row subset df2.
y = df2["re_eng_load_estimate_ecs"]
X = df2[["fr_eng", "pr_air_control", "te_fuel_eng_in", "pr_air_scav", "fr_eng_setpoint", "pr_hydr_out"]]

# Chronological 80/20 split: earliest 80% of the rows for training, the remainder for testing.
train_size = int(len(df2) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

In [None]:
# KNN imputation (2 neighbours, uniform weights) followed by standardisation for the numeric columns.
impute_and_scale = Pipeline(
    steps=[("imputer", KNNImputer(n_neighbors=2, weights="uniform")), ("scaler", StandardScaler())]
)
Preprocessor = ColumnTransformer(
    transformers=[("num", impute_and_scale, selector(dtype_include="number"))],
    remainder="drop",  # non-numeric columns do not reach the regressor
)

# Build the preprocessing + linear regression pipeline and fit it on the training split.
Pipeline1 = Pipeline(steps=[("preprocessor", Preprocessor), ("Linear Regression", LinearRegression())])
Pipeline1.fit(X_train, y_train)

In [None]:
# Score the linear pipeline on the held-out split.
y_pred = Pipeline1.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("mse:", mse)
print("mae:", mae)
print("r2:", r2)

# One-row sample that is stored with the model as its input example.
input_example = X_test.iloc[:1]
# The `with` header needs its trailing colon; without it the whole cell is a SyntaxError.
with mlflow.start_run(run_name="model_with_re_eng_load_estimate_ecs_as_y"):
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(
        Pipeline1, artifact_path="model_with_re_eng_load_estimate_ecs_as_y", input_example=input_example
    )

## Random Forest with KNN Imputer Deleted 

In [None]:
# Features and target from the 10,000-row subset df2 (the same six features as the
# linear re_eng_load_estimate_ecs model above).
X = df2[["fr_eng", "pr_air_control", "te_fuel_eng_in", "pr_air_scav", "fr_eng_setpoint", "pr_hydr_out"]]
y = df2["re_eng_load_estimate_ecs"]

# Chronological 80/20 split; .iloc keeps the slicing explicitly positional for X and y alike.
train_size = int(len(df2) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Scaling only, no KNN imputation. random_state pins the forest's random sampling so that
# re-running the notebook reproduces the same metrics (42 matches the other models here).
Pipeline1 = Pipeline([("Scaler", StandardScaler()), ("RandomForest", RandomForestRegressor(random_state=42))])

Pipeline1.fit(X_train, y_train)

In [None]:
# Held-out predictions of the random forest pipeline and their error scores.
y_pred = Pipeline1.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in {"mse:": mse, "mae:": mae, "r2:": r2}.items():
    print(label, value)

In [38]:
# First test row, passed to mlflow.sklearn.log_model as the input example.
input_example = X_test.iloc[:1]

In [None]:
# Log the random forest pipeline together with its held-out scores.
with mlflow.start_run():
    # mse is an evaluation result, so it is logged as a metric like mae and r2
    # (params are meant for run inputs such as hyperparameters).
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(Pipeline1, artifact_path="randomforest_re_engsy", input_example=input_example)

## Grid Search for Random Forest, y as re_eng_load_estimate_ecs

In [None]:
# Numeric columns: KNN-impute (2 neighbours), then standardise; everything else is dropped.
Preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline([("imputer", KNNImputer(n_neighbors=2, weights="uniform")), ("scaler", StandardScaler())]),
            selector(dtype_include="number"),
        )
    ],
    remainder="drop",
)

# random_state keeps every candidate forest reproducible, so the grid compares
# hyperparameters rather than random seeds.
Pipeline2 = Pipeline([("preprocessor", Preprocessor), ("Random Forest", RandomForestRegressor(random_state=42))])
# Hyperparameter tuning
param_grid = {"Random Forest__n_estimators": [50, 100, 200], "Random Forest__max_depth": [None, 10, 20, 30]}
# The rows are in chronological order, so validation uses forward-chaining folds: each fold
# trains only on rows that precede its validation rows. Plain cv=5 (unshuffled K-fold) would
# also train on rows that come after the validation block.
grid_search = GridSearchCV(Pipeline2, param_grid, cv=TimeSeriesSplit(n_splits=5), scoring="neg_mean_squared_error")
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
# The scorer is negated MSE, so flip the sign to report a plain MSE.
print("Best score: ", -grid_search.best_score_)

In [None]:
# Refit the random forest pipeline with fixed hyperparameters.
# NOTE(review): n_estimators=50 is hard-coded rather than read from grid_search.best_params_ —
# confirm it still matches the search result.
Pipeline2 = Pipeline(
    [("preprocessor", Preprocessor), ("Random Forest", RandomForestRegressor(n_estimators=50, random_state=42))]
)
Pipeline2.fit(X_train, y_train)
# Make predictions on the held-out split
y_pred = Pipeline2.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Log the run in mlflow. mse is an evaluation result, so it is logged as a metric like
# mae and r2 (params are meant for run inputs such as hyperparameters).
with mlflow.start_run(run_name="randommodel_with_re_eng_as_y"):
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(Pipeline2, artifact_path="randommodel_with_re_eng_as_y", input_example=input_example)

print("mse:", mse)
print("mae:", mae)
print("r2:", r2)


### Gradient Boosting Model for re_eng_load_estimate_ecs as y

In [53]:
# Imported here because this is the first cell that uses the class; the notebook's only
# other import of it sits in a later cell, so a fresh "Restart & Run All" would otherwise
# fail with NameError on the next statement.
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
# Fit the model on the training split
model.fit(X_train, y_train)
# Predict the held-out test split
y_pred = model.predict(X_test)

In [None]:
# Error scores of the gradient boosting predictions on the held-out split.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (("mse:", mse), ("mae:", mae), ("r2:", r2)):
    print(label, value)


Grid Search for Gradient Boosting 

In [None]:
model = GradientBoostingRegressor(random_state=42)

# Parameter grid for hyperparameter tuning (3^5 = 243 combinations)
param_grid = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 4, 5],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# The rows are in chronological order, so validation uses forward-chaining folds: each fold
# trains only on rows that precede its validation rows. Plain cv=5 (unshuffled K-fold) would
# also train on rows that come after the validation block.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=TimeSeriesSplit(n_splits=5),
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameters and best score; the scorer is negated MSE, so flip the sign back.
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print("Best parameters found: ", best_params)
print("Best score: ", best_score)

In [None]:
# Gradient boosting with fixed hyperparameters.
# NOTE(review): the values are hard-coded, presumably copied from the grid search above —
# confirm they still match grid_search.best_params_.
model = GradientBoostingRegressor(
    n_estimators=50, learning_rate=0.2, max_depth=4, min_samples_leaf=1, min_samples_split=10, random_state=42
)
# Fit the model on the training split
model.fit(X_train, y_train)
# Predict the held-out test split
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("mse:", mse)
print("mae:", mae)
print("r2:", r2)


Feature Importance Engineering from Gradient Boosting based on previous model 

In [None]:
import matplotlib.pyplot as plt

# Pair the fitted model's importances with the feature names and rank them, largest first.
importances = model.feature_importances_
importances_df = pd.DataFrame({"Feature": X.columns, "Importance": importances}).sort_values(
    by="Importance", ascending=False
)

# Horizontal bar chart; the y axis is inverted so the most important feature is on top.
plt.figure(figsize=(10, 6))
plt.barh(importances_df["Feature"], importances_df["Importance"])
plt.xlabel("Importance")
plt.title("Feature Importances")
plt.gca().invert_yaxis()
plt.show()


Optimize GradientBoostingModel with important Xs

In [69]:
# Interaction term between the two speed-related columns. assign() returns a new frame, so
# the column is added without writing into df2 while df2 is still a slice of df1 (that
# in-place write triggers pandas' SettingWithCopyWarning). It also keeps the cell re-runnable.
df2 = df2.assign(**{"fr_eng*fr_eng_setpoint": df2["fr_eng"] * df2["fr_eng_setpoint"]})
# Reassign Xs
X = df2[["fr_eng", "pr_air_scav", "fr_eng_setpoint", "pr_hydr_out", "fr_eng*fr_eng_setpoint"]]
y = df2["re_eng_load_estimate_ecs"]

# Chronological 80/20 split; .iloc keeps the slicing explicitly positional for X and y alike.
train_size = int(len(df2) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

In [None]:
# Same fixed hyperparameters as the earlier tuned model, now fitted on whatever X_train
# currently holds (the feature set defined in the preceding cell).
model = GradientBoostingRegressor(
    n_estimators=50, learning_rate=0.2, max_depth=4, min_samples_leaf=1, min_samples_split=10, random_state=42
)
# Fit the model on the training split
model.fit(X_train, y_train)
# Predict the held-out test split
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("mse:", mse)
print("mae:", mae)
print("r2:", r2)


In [None]:
model = GradientBoostingRegressor(random_state=42)

# Parameter grid for hyperparameter tuning (3^5 = 243 combinations)
param_grid = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 4, 5],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# The rows are in chronological order, so validation uses forward-chaining folds: each fold
# trains only on rows that precede its validation rows. Plain cv=5 (unshuffled K-fold) would
# also train on rows that come after the validation block.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=TimeSeriesSplit(n_splits=5),
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameters and best score; the scorer is negated MSE, so flip the sign back.
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print("Best parameters found: ", best_params)
print("Best score: ", best_score)

In [None]:
# Gradient boosting with fixed hyperparameters.
# NOTE(review): the values are hard-coded, presumably copied from the preceding grid search —
# confirm they still match grid_search.best_params_.
model = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.2, max_depth=4, min_samples_leaf=2, min_samples_split=5, random_state=42
)
# Fit the model on the training split
model.fit(X_train, y_train)
# Predict the held-out test split
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("mse:", mse)
print("mae:", mae)
print("r2:", r2)

In [None]:
# Ablation: the same gradient boosting settings, but without the fr_eng*fr_eng_setpoint
# interaction column, to see whether dropping it improves the scores.
y = df2["re_eng_load_estimate_ecs"]
X = df2[["fr_eng", "pr_air_scav", "fr_eng_setpoint", "pr_hydr_out"]]

# Chronological 80/20 split.
train_size = int(len(df2) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

model = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.2, max_depth=4, min_samples_leaf=2, min_samples_split=5, random_state=42
)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (("mse:", mse), ("mae:", mae), ("r2:", r2)):
    print(label, value)

In [None]:
import sensor_imputation_thesis.shared.mlflow as _mlflow

# One-row sample stored with the model as its input example.
input_example = X_test.iloc[:1]
with mlflow.start_run():
    # mse is an evaluation result, so it is logged as a metric like mae and r2.
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    # Log `model`: it is the estimator that produced the metrics above and that was trained
    # on the columns of input_example. Pipeline1 was fitted on a different feature set.
    # NOTE(review): the artifact path mentions pr_hydro_out, but y in the preceding cells is
    # re_eng_load_estimate_ecs — confirm the intended artifact name.
    mlflow.sklearn.log_model(model, artifact_path="model_with_pr_hydro_out_as_y", input_example=input_example)

Random forest attempt for pr_hydr_out, since the earlier linear model performed poorly

In [None]:
# This section targets pr_hydr_out. X and y are rebuilt here (with the feature set of the
# linear pr_hydr_out attempt) because the preceding cells leave them pointing at the
# re_eng_load_estimate_ecs experiment; without this the search would tune the wrong target.
X = df2[["fr_eng", "pr_air_control", "te_fuel_eng_in", "pr_air_scav", "fr_eng_setpoint", "re_fuel_sulpher"]]
y = df2["pr_hydr_out"]

# Chronological 80/20 split.
train_size = int(len(df2) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]
# Keep the logged input example in step with the features used in this section.
input_example = X_test.iloc[:1]

# Updated pipeline: KNN-impute and standardise every feature column.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(steps=[("imputer", KNNImputer(n_neighbors=2, weights="uniform")), ("scaler", StandardScaler())]),
            slice(0, X.shape[1]),
        )
    ]
)
# Use the preprocessor defined above; searching over the Pipeline2 left behind by an
# earlier cell would ignore it entirely.
Pipeline2 = Pipeline([("preprocessor", preprocessor), ("Random Forest", RandomForestRegressor(random_state=42))])
# Hyperparameter tuning
param_grid = {"Random Forest__n_estimators": [50, 100, 200], "Random Forest__max_depth": [None, 10, 20, 30]}
# Forward-chaining folds, because the rows are in chronological order.
grid_search = GridSearchCV(Pipeline2, param_grid, cv=TimeSeriesSplit(n_splits=5), scoring="neg_mean_squared_error")
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
# The scorer is negated MSE, so flip the sign to report a plain MSE.
print("Best score: ", -grid_search.best_score_)


In [None]:
# Fit a random forest with fixed hyperparameters on the current training split.
# NOTE(review): n_estimators=50 is hard-coded rather than read from grid_search.best_params_ —
# confirm it still matches the search result.
model = RandomForestRegressor(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Log the run in mlflow. mse is an evaluation result, so it is logged as a metric like
# mae and r2 (params are meant for run inputs such as hyperparameters).
with mlflow.start_run():
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(model, artifact_path="randommodel_with_pr_hydro_out_as_y", input_example=input_example)

print("mse:", mse)
print("mae:", mae)
print("r2:", r2)


Gradient Boosting Regressor with the same y (pr_hydr_out)

In [27]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with 100 trees, learning rate 0.1, depth 3 and a fixed seed
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
# Fit the model on the training split
model.fit(X_train, y_train)
# Predict the held-out test split
y_pred = model.predict(X_test)


In [None]:
# Calculate the held-out metrics and log them, with the model, in mlflow.
input_example = X_test.iloc[:1]
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("mse:", mse)
print("mae:", mae)
print("r2:", r2)

with mlflow.start_run():
    # mse is an evaluation result, so it is logged as a metric like mae and r2
    # (params are meant for run inputs such as hyperparameters).
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(model, artifact_path="gdboostmodel_with_pr_hydro_out_as_y", input_example=input_example)

In [None]:
# Reduced feature set for pr_hydr_out, tried because the previous model performed very poorly.
y = df2["pr_hydr_out"]
X = df2[["pr_air_control", "te_fuel_eng_in", "pr_air_scav", "fr_eng_setpoint"]]

# Chronological 80/20 split: earliest 80% of the rows for training, the remainder for testing.
train_size = int(len(df2) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

In [26]:
# Gradient boosting with 100 trees, learning rate 0.1, depth 3 and a fixed seed
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
# Fit the model on the training split
model.fit(X_train, y_train)
# Predict the held-out test split
y_pred = model.predict(X_test)

In [None]:
# Calculate the held-out metrics and log them, with the model, in mlflow.
input_example = X_test.iloc[:1]
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("mse:", mse)
print("mae:", mae)
print("r2:", r2)

with mlflow.start_run():
    # mse is an evaluation result, so it is logged as a metric like mae and r2
    # (params are meant for run inputs such as hyperparameters).
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(model, artifact_path="gdboostmodel", input_example=input_example)

Create plot to investigate the data distribution

In [None]:
import matplotlib.pyplot as plt

# Re-read the saved frame and index it by timestamp so every column is plotted against time.
data = pd.read_csv(
    "/home/ec2-user/SageMaker/sensor-imputation-thesis/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/dataframe",
    low_memory=False,
)
data["time"] = pd.to_datetime(data["time"])
data = data.set_index("time")

# Three panels per row; the row count is rounded up so that every column gets a panel.
num_columns = len(data.columns)
num_rows = (num_columns + 2) // 3
fig, axes = plt.subplots(num_rows, 3, figsize=(25, num_rows * 4))
axes = axes.flatten()

# One line plot per column.
for ax, column in zip(axes, data.columns):
    ax.plot(data[column], label=f"{column}")
    ax.set_title(f"Time Series Data ({column})")
    ax.set_xlabel("Date")
    ax.set_ylabel(f"{column}")
    ax.legend()

# Remove the unused panels at the end of the grid.
for ax in axes[num_columns:]:
    fig.delaxes(ax)


X as "fr_eng","pr_hydr_in" and y as "pr_air_control"

In [None]:
# KNN-impute (2 neighbours, uniform weights) and standardise the two named feature columns;
# every other column is dropped.
# NOTE(review): "pr_hydr_in" is not in the tag list loaded earlier in this notebook —
# confirm that the frame used for fitting actually contains it.
Preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline([("imputer", KNNImputer(n_neighbors=2, weights="uniform")), ("scaler", StandardScaler())]),
            ["fr_eng", "pr_hydr_in"],
        )
    ],
    remainder="drop",
)
# Preprocessing followed by ordinary least squares.
Pipeline1 = Pipeline([("preprocessor", Preprocessor), ("Linear Regression", LinearRegression())])

In [None]:
# Show the pipeline built in the previous cell; `Pipeline` is the sklearn class itself,
# so printing it shows only the class name.
print(Pipeline1)

In [75]:
# First 3,000 rows of the full frame, keeping only rows where the target pr_air_control is present.
df_sampled = df[:3000].dropna(subset=["pr_air_control"])
y = df_sampled["pr_air_control"]
X = df_sampled[["fr_eng", "pr_hydr_in"]]

# Chronological 80/20 split of the remaining rows.
train_size = int(len(df_sampled) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

In [None]:
# Fit the linear pipeline on the training split.
Pipeline1.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split.
y_pred = Pipeline1.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (("mse:", mse), ("mae:", mae), ("r2:", r2)):
    print(label, value)


X as "pr_hydr_out","pr_hydr_in","de_fuel15c" and y as "re_total_fuel_quality_offset"

In [63]:
# Target re_total_fuel_quality_offset and its three predictors, taken from df_sampled.
# NOTE(review): "pr_hydr_in" and "de_fuel15c" are not in the tag list loaded earlier in this
# notebook — confirm that df_sampled contains them.
y = df_sampled["re_total_fuel_quality_offset"]
X = df_sampled[["pr_hydr_out", "pr_hydr_in", "de_fuel15c"]]

# Chronological 80/20 split.
train_size = int(len(df_sampled) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

In [None]:
# Show the columns available in the sampled frame.
df_sampled.columns

In [66]:
# Imported under an alias so that this cell can be re-run: the last statement rebinds the
# name `Pipeline` to a pipeline instance, after which `Pipeline([...])` no longer refers
# to the sklearn class and raises "object is not callable".
from sklearn.pipeline import Pipeline as SkPipeline

# KNN-impute (2 neighbours, uniform weights) and standardise the three feature columns.
Preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            SkPipeline([("imputer", KNNImputer(n_neighbors=2, weights="uniform")), ("scaler", StandardScaler())]),
            ["pr_hydr_out", "pr_hydr_in", "de_fuel15c"],
        )
    ],
    remainder="drop",
)
# NOTE(review): the result stays bound to `Pipeline` because the following cells call
# Pipeline.fit / Pipeline.predict. Renaming it, together with those calls, would stop it
# shadowing the sklearn class for the rest of the session.
Pipeline = SkPipeline([("preprocessor", Preprocessor), ("Linear Regression", LinearRegression())])


In [None]:
# `Pipeline` is the pipeline instance bound in the previous cell, not the sklearn class.
Pipeline.fit(X_train, y_train)

In [None]:
# `Pipeline` is the fitted pipeline instance from the preceding cells, not the sklearn class.
y_pred = Pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# No accuracy is computed: `accuracy` is not defined anywhere (the call raised NameError),
# and a classification accuracy does not apply to this continuous regression target.
print("mse:", mse)
print("mae", mae)
print("r2", r2)

In [None]:
# Summary statistics of the target column, to put the error scores above into context.
df_sampled["re_total_fuel_quality_offset"].describe()