In [None]:
# !pip install pandas scikit-learn

In [2]:
import os
import pandas as pd

# Load every CSV / Parquet file found anywhere under the dataset directory.
dfs = []
for root, _, names in os.walk(RUNWAY_DATA_PATH):
    for name in names:
        path = os.path.join(root, name)
        if name.endswith(".csv"):
            dfs.append(pd.read_csv(path))
            continue
        if name.endswith(".parquet"):
            dfs.append(pd.read_parquet(path))
            continue
        # Any other extension aborts the load instead of being skipped.
        raise ValueError("Not valid file type")
# Stack the per-file frames row-wise into one DataFrame.
df = pd.concat(dfs)

In [3]:
# Use "datetime" as the index, drop the "id" column, and keep only the
# last 1000 rows.
proc_df = (
    df.set_index("datetime")
    .drop(columns=["id"])
    .tail(1000)
)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
train, valid = train_test_split(
    proc_df,
    test_size=0.2,
    random_state=2024,
)

In [1]:
import mlflow
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


class PCADetector(mlflow.pyfunc.PythonModel):
    """PCA reconstruction-error anomaly detector wrapped as an MLflow pyfunc model.

    Inputs are standardized with a ``StandardScaler``, projected onto the
    principal components and mapped back. The mean squared difference
    between the standardized input and its reconstruction is reported as
    the anomaly score.
    """

    def __init__(self, n_components):
        """Create an unfitted detector.

        Args:
            n_components: Forwarded to ``sklearn.decomposition.PCA``.
        """
        # Filled in by ``fit``; ``None`` marks the detector as not yet fitted.
        self._use_columns = None
        self._scaler = StandardScaler()
        self._pca = PCA(n_components=n_components)

    def fit(self, X):
        """Fit the scaler and the PCA on ``X`` and remember its columns.

        Args:
            X: DataFrame of training features; its columns define the
                features used by ``predict`` and ``reconstruct``.
        """
        self._use_columns = X.columns
        X_scaled = self._scaler.fit_transform(X)
        self._pca.fit(X_scaled)

    def predict(self, context, X):
        """Compute a per-row anomaly score.

        Args:
            context: MLflow pyfunc context; not used by this model.
            X: DataFrame containing at least the columns seen in ``fit``.

        Returns:
            DataFrame indexed like ``X`` with a single ``"anomaly_score"``
            column holding the mean squared reconstruction error of each row
            in standardized space.

        Raises:
            RuntimeError: If the detector has not been fitted.
        """
        X = self._select_columns(X)
        X_scaled = self._scaler.transform(X)
        recon = self._recon(X_scaled)
        # Average the squared error across features (axis 1) -> one score per row.
        recon_err = ((X_scaled - recon) ** 2).mean(axis=1)
        recon_err_df = pd.DataFrame(recon_err, columns=["anomaly_score"], index=X.index)
        return recon_err_df

    def _select_columns(self, X):
        """Return ``X`` restricted to the fitted columns, in fitted order."""
        if self._use_columns is None:
            raise RuntimeError("PCADetector is not fitted yet; call fit() first.")
        return X[self._use_columns]

    def _recon(self, X):
        """Project already-scaled data onto the components and back."""
        z = self._pca.transform(X)
        recon = self._pca.inverse_transform(z)
        return recon

    def reconstruct(self, X):
        """Reconstruct ``X`` through the PCA bottleneck in original units.

        Args:
            X: DataFrame containing at least the columns seen in ``fit``.

        Returns:
            DataFrame with the same index as ``X`` and the fitted columns,
            holding the reconstructed values after undoing the scaling.

        Raises:
            RuntimeError: If the detector has not been fitted.
        """
        # Select the fitted columns first, exactly as ``predict`` does, so that
        # extra or reordered input columns cannot break or skew the scaler.
        X = self._select_columns(X)
        X_scaled = self._scaler.transform(X)
        recon_scaled = self._recon(X_scaled)
        recon = self._scaler.inverse_transform(recon_scaled)
        recon_df = pd.DataFrame(recon, index=X.index, columns=X.columns)
        return recon_df

In [5]:
# NOTE(review): N_COMPONENTS is not defined in the visible cells; presumably
# it is supplied by the execution environment - confirm before a fresh run.
parameters = {"n_components": N_COMPONENTS}
detector = PCADetector(n_components=parameters["n_components"])
detector.fit(train)

# Score both splits; each result is a one-column ("anomaly_score") DataFrame.
train_pred = detector.predict(None, train)
valid_pred = detector.predict(None, valid)

# Reduce the score column to a plain Python float. DataFrame.mean() would
# return a one-element Series instead of a scalar, forcing metric loggers to
# coerce it with float(series), which recent pandas deprecates.
mean_train_recon_err = float(train_pred["anomaly_score"].mean())
mean_valid_recon_err = float(valid_pred["anomaly_score"].mean())

In [6]:
import mlflow
import runway

# Record the hyper-parameters, the reconstruction-error metrics and the
# fitted model within a single MLflow run.
with mlflow.start_run():
    mlflow.log_params(parameters)

    mlflow.log_metric("mean_train_recon_err", mean_train_recon_err)
    mlflow.log_metric("mean_valid_recon_err", mean_valid_recon_err)

    # `runway.log_model` is the intended call; the previous spelling
    # `log_modbel` does not exist and raised AttributeError.
    runway.log_model(
        model=detector,
        # One sampled row is passed as the example input for "predict".
        input_samples={"predict": proc_df.sample(1)},
        model_name="pca-model",
    )

  return float(x)
 - mlflow (current: 2.4.1, required: mlflow==2.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/usr/local/lib/python3.8/dist-packages', '/usr/lib/python3/dist-packages']
