In [7]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Outlier removal function

def remove_outliers(df):
    R = df["gasResistance"]

    df = df[(R > 300) & (R < 5_000_000)]
    R = df["gasResistance"]

    Q1 = R.quantile(0.25)
    Q3 = R.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 2 * IQR
    upper = Q3 + 2 * IQR
    df = df[(df["gasResistance"] > lower) & (df["gasResistance"] < upper)]
    R = df["gasResistance"]

    df["prev"] = R.shift(1)
    df["delta"] = (R - df["prev"]).abs()
    median_R = R.median()
    df = df[df["delta"] < median_R * 3]

    return df.drop(columns=["prev", "delta"])

# Directories for model training

base_dirs = [
    r"/Users/artempavlov/Documents/Smart Brewery/Measurements/Measurements 11.11.2025",
    r"/Users/artempavlov/Documents/Smart Brewery/Measurements/Measurements 14.11.2025",
    r"/Users/artempavlov/Documents/Smart Brewery/Measurements/Measurements 21.11.2025",
    r"/Users/artempavlov/Documents/Smart Brewery/Measurements/Measurements 28.11.2025"
]

dfs = []
required_cols = {"temp", "gasResistance"}

for directory in base_dirs:
    if not os.path.exists(directory):
        continue

    for file in os.listdir(directory):
        if file.lower().endswith(".csv"):
            df = pd.read_csv(os.path.join(directory, file))
            if required_cols.issubset(df.columns):
                df = remove_outliers(df)
                dfs.append(df)

df_train = pd.concat(dfs, ignore_index=True)
print("Training rows:", len(df_train))

control_model = Pipeline([
    ("poly", PolynomialFeatures(degree=2)),
    ("linreg", LinearRegression())
])

control_model.fit(df_train[["temp"]], df_train["gasResistance"])
joblib.dump(control_model, "control_model.pkl")

print("\nSaved: control_model.pkl\n")

# List of boiling files

base_dir_14 = r"/Users/artempavlov/Documents/Smart Brewery/Measurements/Measurements 14.11.2025"
base_dir_21 = r"/Users/artempavlov/Documents/Smart Brewery/Measurements/Measurements 21.11.2025"
base_dir_28 = r"/Users/artempavlov/Documents/Smart Brewery/Measurements/Measurements 28.11.2025"

files = {
    0:   os.path.join(base_dir_21, "Wort end 0ppB DMS boiling .csv"),
    50:  os.path.join(base_dir_14, "14-11-25-Wort end 50ppb DMS boiling.csv"),
    70:  os.path.join(base_dir_28, "Wort end 70 ppB DMS boiling.csv"),
    90:  os.path.join(base_dir_28, "Wort end 90 ppB DMS boiling.csv"),
    100: os.path.join(base_dir_14, "14-11-25-Wort end 100ppb DMS boiling.csv"),
    110: os.path.join(base_dir_28, "Wort end 110ppB DMS Boiling.csv"),
    150: os.path.join(base_dir_21, "Wort end 150ppB DMS boiling.csv"),
}


# Apply offset to these files

LAMBDA = 0.5
output_dir = r"/Users/artempavlov/Documents/Smart Brewery/Boiling_Measurements_clean"
os.makedirs(output_dir, exist_ok=True)

control_model = joblib.load("control_model.pkl")

def apply_control_model(df):
    df = remove_outliers(df)
    baseline = control_model.predict(df[["temp"]])
    df["gasResistance"] = df["gasResistance"] * (1 - LAMBDA) + baseline * LAMBDA
    return df

for conc, file_path in files.items():
    if not os.path.exists(file_path):
        print("Missing:", file_path)
        continue

    df = pd.read_csv(file_path)
    df_clean = apply_control_model(df)

    out_path = os.path.join(output_dir, f"{conc}ppb_corrected.csv")
    df_clean.to_csv(out_path, index=False)

    print("Saved:", out_path)

print("\nAll boiling files processed successfully!")


Training rows: 33767

Saved: control_model.pkl

Saved: /Users/artempavlov/Documents/Smart Brewery/Boiling_Measurements_clean/0ppb_corrected.csv
Saved: /Users/artempavlov/Documents/Smart Brewery/Boiling_Measurements_clean/50ppb_corrected.csv
Saved: /Users/artempavlov/Documents/Smart Brewery/Boiling_Measurements_clean/70ppb_corrected.csv
Saved: /Users/artempavlov/Documents/Smart Brewery/Boiling_Measurements_clean/90ppb_corrected.csv
Saved: /Users/artempavlov/Documents/Smart Brewery/Boiling_Measurements_clean/100ppb_corrected.csv
Saved: /Users/artempavlov/Documents/Smart Brewery/Boiling_Measurements_clean/110ppb_corrected.csv
Saved: /Users/artempavlov/Documents/Smart Brewery/Boiling_Measurements_clean/150ppb_corrected.csv

All boiling files processed successfully!
