In [1]:
import pandas as pd
import time
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient

In [2]:
# Set up MLflow tracking and registry URIs
mlflow.set_tracking_uri("http://193.166.180.240:5000")
mlflow.set_registry_uri("http://193.166.180.240:5000")

In [3]:
# Custom Transformer for Dropping Unnecessary Features
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(self.columns_to_drop, axis=1)

# Custom Transformer for Selecting Specific Features
class SelectColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_select):
        self.columns_to_select = columns_to_select

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.loc[:, self.columns_to_select]

In [4]:
# Custom Transformer for Dropping Unnecessary Features
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(self.columns_to_drop, axis=1)

# Custom Transformer for Selecting Specific Features
class SelectColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_select):
        self.columns_to_select = columns_to_select

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.loc[:, self.columns_to_select]

In [5]:
def create_svc_pipeline(model_name, run_params):
    # Define run name and tags
    run_name = "SVC_Model_Training_Run"
    tags = {
        "project": "SVC Pipeline",
        "environment": "production",
        "developer": "John Wickström",
        "model_type": "Support Vector Classifier"
    }

    with mlflow.start_run(run_name=run_name) as run:
        # Set run tags
        mlflow.set_tags(tags)

        # Step 1: Data Ingestion
        print("Step 1: Ingesting Data - Loading and preprocessing data...")
        data_file_path = run_params['data_file_path']
        data = pd.read_csv(data_file_path)
        data["Datetime"] = pd.to_datetime(data["Datetime"])
        data = data.set_index("Datetime")
        data["Price_Change"] = (data["Close"].diff() > 0).astype(int)
        data = data.dropna() 

        # Step 2: Splitting Data
        print("Step 2: Splitting Data...")
        X = data.drop(columns=["Price_Change"])
        y = data["Price_Change"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, **run_params['data_segmentation'])

        # Save X_test and y_test for inference
        X_test.to_csv("X_test_data_svc.csv")
        y_test.to_csv("y_test_data_svc.csv")

        # Step 3: Data Transformation
        print("Step 3: Transforming Data...")
        columns_to_drop = ["Timestamp", "Gmtoffset", "Close"]
        columns_to_select = X.columns.difference(columns_to_drop + ["Price_Change"])

        drop_columns_transformer = DropColumnsTransformer(columns_to_drop=columns_to_drop)
        select_columns_transformer = SelectColumnsTransformer(columns_to_select=columns_to_select)

        # Create a FeatureUnion pipeline for preprocessing
        preprocessing_pipeline = FeatureUnion([
            ('drop_columns', drop_columns_transformer),
            ('select_columns', select_columns_transformer),
        ])

        # Step 4: Build Pipeline with SVC
        print("Step 4: Building Pipeline...")
        pipeline = Pipeline(steps=[
            ('preprocessing', preprocessing_pipeline),
            ('scaler', StandardScaler()),
            ('svc', SVC(probability=True))
        ])

        # Step 5: Train the model using GridSearchCV
        print("Step 5: Training Model - Using GridSearchCV to find best parameters...")
        hyperparameter_grid = run_params['model_training']['grid_search_params']

        # Using StratifiedKFold to ensure balanced splits for imbalanced datasets
        skf = StratifiedKFold(n_splits=5)

        grid_search = GridSearchCV(pipeline, hyperparameter_grid, cv=skf, scoring="accuracy", n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        print(f"Best hyperparameters: {grid_search.best_params_}")
        print(f"Best cross-validation score: {grid_search.best_score_}")

        # Log hyperparameters and metrics to MLflow
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_params(run_params)  # Log all run parameters used in training
        mlflow.log_metric('best_cv_score', grid_search.best_score_)

        # Log the model with MLflow, capturing the input/output signature
        signature = infer_signature(X_train, best_model.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path="svc_model",
            signature=signature,
            registered_model_name=model_name
        )

        # Step 6: Evaluate the model
        print("Step 6: Evaluating Model...")
        y_pred = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])

        print(f"Accuracy on the test set: {accuracy}")
        print(f"F1 Score on the test set: {f1}")
        print(f"AUC Score on the test set: {auc}")

        # Log evaluation metrics to MLflow
        mlflow.log_metric('test_accuracy', accuracy)
        mlflow.log_metric('test_f1_score', f1)
        mlflow.log_metric('test_auc_score', auc)

        # Register the model if accuracy meets the threshold
        if accuracy >= 0.80:  # Example threshold for registration
            client = MlflowClient()

            # Register the model
            model_uri = f"runs:/{run.info.run_id}/svc_model"
            model_version = client.create_model_version(
                name=model_name,
                source=model_uri,
                run_id=run.info.run_id
            )

            time.sleep(10)

            # Add a description to the registered model version
            client.update_model_version(
                name=model_name,
                version=model_version.version,
                description="SVC model trained using GridSearchCV with different C and kernel hyperparameters."
            )

            # Add aliases for the model version
            client.set_registered_model_alias(
                name=model_name, 
                alias="Production", 
                version=model_version.version
            )

            # Add tags to the registered model version
            client.set_model_version_tag(
                name=model_name,
                version=model_version.version,
                key="training_data",
                value="5m_intraday_data.csv"
            )
            client.set_model_version_tag(
                name=model_name,
                version=model_version.version,
                key="model_type",
                value="SVC"
            )

            print(f"Model registered in MLflow Model Registry under name: {model_name}, version: {model_version.version}")

        return best_model


In [7]:
# Calling the function with run_params
create_svc_pipeline(
    model_name='svc-classification-model',
    run_params={
        'data_file_path': '5m_intraday_data.csv',
        'data_segmentation': {
            'test_size': 0.2,
            'random_state': 42
        },
        'model_training': {
            'grid_search_params': {
                'svc__C': [1, 10],
                'svc__kernel': ['rbf', 'linear']
            }
        }
    }
)

Step 1: Ingesting Data - Loading and preprocessing data...
Step 2: Splitting Data...
Step 3: Transforming Data...
Step 4: Building Pipeline...
Step 5: Training Model - Using GridSearchCV to find best parameters...
Best hyperparameters: {'svc__C': 10, 'svc__kernel': 'linear'}
Best cross-validation score: 0.8381223617098259


Registered model 'svc-classification-model' already exists. Creating a new version of this model...
2024/10/08 10:21:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svc-classification-model, version 17
Created version '17' of model 'svc-classification-model'.


Step 6: Evaluating Model...


2024/10/08 10:21:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svc-classification-model, version 18


Accuracy on the test set: 0.83947532792005
F1 Score on the test set: 0.8486454652532391
AUC Score on the test set: 0.9101320008694229


2024/10/08 10:21:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVC_Model_Training_Run at: http://193.166.180.240:5000/#/experiments/0/runs/bd4c7b60bd464dbda6c438089fa2ef43.
2024/10/08 10:21:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://193.166.180.240:5000/#/experiments/0.


Model registered in MLflow Model Registry under name: svc-classification-model, version: 18
