In [1]:
# !pip install -r requirements.txt

In [1]:
import snowflake.snowpark.functions as F
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import StructType, StructField, FloatType
from snowflake.snowpark import Session
import os
import json
import pandas as pd
import numpy as np

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import xgboost as xgb

In [3]:
connection_parameters = {
    "account": os.getenv("SNOWFLAKE_ACCOUNT"),
    "user": os.getenv("SNOWFLAKE_USER"),
    "password": os.getenv("SNOWFLAKE_PASSWORD"),
    "schema": "SENSOR",
    "database": "THINGSBOARD",
    "role": os.getenv("SNOWFLAKE_ROLE"),
    "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
}

session = Session.builder.configs(connection_parameters).create()

In [6]:
environmental_df = session.table("co_table_predication")

In [7]:
environmental_df.show(5)

--------------------------------------------
|"TS"                 |"CO"                |
--------------------------------------------
|2024-01-01 12:00:00  |4.379166666666667   |
|2024-01-02 12:00:00  |4.270833333333335   |
|2024-01-03 12:00:00  |4.179166666666668   |
|2024-01-04 12:00:00  |4.2250000000000005  |
|2024-01-05 12:00:00  |4.175               |
--------------------------------------------



In [8]:
df =environmental_df.toPandas()

In [9]:
df.head()

Unnamed: 0,TS,CO
0,2024-01-01 12:00:00,4.379167
1,2024-01-02 12:00:00,4.270833
2,2024-01-03 12:00:00,4.179167
3,2024-01-04 12:00:00,4.225
4,2024-01-05 12:00:00,4.175


In [10]:
df = df.sort_values('TS').reset_index(drop=True)

In [11]:

for lag in range(1, 4):  # Lag 1, 2, 3
    df[f'CO_lag_{lag}'] = df['CO'].shift(lag)

In [12]:
df['CO_roll_mean_3'] = df['CO'].rolling(window=3).mean()
df['CO_roll_std_3'] = df['CO'].rolling(window=3).std()

In [13]:
# Extract hour and minute
df['hour'] = df['TS'].dt.hour
df['minute'] = df['TS'].dt.minute

# Cyclical encoding
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
df['minute_cos'] = np.cos(2 * np.pi * df['minute'] / 60)


In [14]:
df = df.dropna().reset_index(drop=True)

In [15]:
df.head()

Unnamed: 0,TS,CO,CO_lag_1,CO_lag_2,CO_lag_3,CO_roll_mean_3,CO_roll_std_3,hour,minute,hour_sin,hour_cos,minute_sin,minute_cos
0,2024-01-04 12:00:00,4.225,4.179167,4.270833,4.379167,4.225,0.045833,12,0,1.224647e-16,-1.0,0.0,1.0
1,2024-01-05 12:00:00,4.175,4.225,4.179167,4.270833,4.193056,0.027743,12,0,1.224647e-16,-1.0,0.0,1.0
2,2024-01-06 12:00:00,4.166667,4.175,4.225,4.179167,4.188889,0.031549,12,0,1.224647e-16,-1.0,0.0,1.0
3,2024-01-07 12:00:00,4.283333,4.166667,4.175,4.225,4.208333,0.065085,12,0,1.224647e-16,-1.0,0.0,1.0
4,2024-01-08 12:00:00,4.183333,4.283333,4.166667,4.175,4.211111,0.063099,12,0,1.224647e-16,-1.0,0.0,1.0


In [16]:
# Define feature columns
feature_cols = [
    'CO_lag_1', 'CO_lag_2', 'CO_lag_3',
    'CO_roll_mean_3', 'CO_roll_std_3',
    'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos'
]

X = df[feature_cols]
y = df['CO']



In [17]:
import mlflow

mlflow.set_tracking_uri("https://fuzzy-goldfish-r4rr46q4jp4jf5gv6-5000.app.github.dev/")


In [18]:
mlflow.set_experiment("Time_Series_CO_Predict")


2024/11/27 13:10:52 INFO mlflow.tracking.fluent: Experiment with name 'Time_Series_CO_Predict' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/artifacts/2', creation_time=1732713052738, experiment_id='2', last_update_time=1732713052738, lifecycle_stage='active', name='Time_Series_CO_Predict', tags={}>

In [19]:
tscv = TimeSeriesSplit(n_splits=3)

# Start MLflow Run
with mlflow.start_run(run_name="Time_Series_CO_Predict") as run:
    # Define and Train the Model
    rf = RandomForestRegressor(n_estimators=200, max_depth=None, min_samples_split=2, random_state=42)
    rf.fit(X, y)

    # Predictions (Using the entire dataset for simplicity; consider using a separate test set)
    predictions = rf.predict(X)

    # Calculate Metrics
    mae = mean_absolute_error(y, predictions)
    rmse = np.sqrt(mean_squared_error(y, predictions))

    # Log Parameters
    mlflow.log_param("model", "Time_Series_CO_Predict")
    mlflow.log_param("n_estimators", 200)
    mlflow.log_param("max_depth", None)
    mlflow.log_param("min_samples_split", 2)
    mlflow.log_param("random_state", 42)

    # Log Metrics
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)

    # Log the Model
    mlflow.sklearn.log_model(rf, "model")

    print(f"Run ID: {run.info.run_id}")
    print(f"Logged MAE: {mae:.2f}, RMSE: {rmse:.2f}")

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Run ID: b937619ec8b743cdb435fa1259fe3046
Logged MAE: 0.02, RMSE: 0.02
🏃 View run Time_Series_CO_Predict at: https://fuzzy-goldfish-r4rr46q4jp4jf5gv6-5000.app.github.dev/#/experiments/2/runs/b937619ec8b743cdb435fa1259fe3046
🧪 View experiment at: https://fuzzy-goldfish-r4rr46q4jp4jf5gv6-5000.app.github.dev/#/experiments/2


In [20]:
# Get the Run ID from the previous step
run_id = run.info.run_id

# Register the model
model_name = "Time_Series_CO_Predict"
result = mlflow.register_model(
    f"runs:/{run_id}/model",
    model_name
)

print(f"Registered Model: {result.name}, Version: {result.version}")


Successfully registered model 'Time_Series_CO_Predict'.
2024/11/27 13:11:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Time_Series_CO_Predict, version 1


Registered Model: Time_Series_CO_Predict, Version: 1


Created version '1' of model 'Time_Series_CO_Predict'.


In [21]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Transition the model to 'Staging'
client.transition_model_version_stage(
    name=model_name,
    version=result.version,
    stage="Staging",
    archive_existing_versions=True
)

print(f"Model {model_name} version {result.version} transitioned to Staging.")


Model Time_Series_CO_Predict version 1 transitioned to Staging.


  client.transition_model_version_stage(
