In [13]:
import os
import pandas as pd
import numpy as np
import wandb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from copy import deepcopy
from joblib import dump, load
import params
from utils import *

## Reading the data from the Excel file and logging it as a W&B artifact

In [14]:
# Relative location of the raw Excel workbook that the upload step reads.
prefix = "raw_data"
file_name = "data.xlsx"
data_location = "/".join((prefix, file_name))

In [15]:
def get_raw_data_step(project: str, entity: str, data_location: str):
    """Read the raw Excel workbook and log it to W&B as ``raw_data_artifact``.

    The workbook is loaded with pandas, handed to the project's
    ``create_table`` helper together with a new artifact of type ``raw_data``
    and the table name ``raw_table``, and the artifact is then logged from a
    run whose job type is ``upload``.

    Args:
        project: W&B project the run is created in.
        entity: W&B entity passed to ``wandb.init``.
        data_location: Path of the Excel file to read.

    Returns:
        None.
    """
    # Read before opening a run, so an unreadable file never creates one.
    raw_frame = pd.read_excel(data_location)

    upload_run = wandb.init(project=project, entity=entity, job_type="upload")
    try:
        artifact = wandb.Artifact('raw_data_artifact', type="raw_data")
        create_table(artifact, raw_frame, list(raw_frame.columns), "raw_table")
        upload_run.log_artifact(artifact)
    finally:
        # Always close the run, including when building or logging the artifact fails.
        upload_run.finish()

In [16]:
# Upload the raw workbook configured above as the first pipeline artifact.
get_raw_data_step(
    project=params.PROJECT_NAME,
    entity=params.ENTITY,
    data_location=data_location,
)

VBox(children=(Label(value='0.001 MB of 0.009 MB uploaded\r'), FloatProgress(value=0.1143264723863868, max=1.0…

# Preprocessing

In [17]:
def preprocessing_step(project_name, entity, quality_features_columns, target_column, test_size):
    """Clean the raw table, split it, and log the result as ``preprocessed_data_artifact``.

    The ``raw_table`` of ``raw_data_artifact:latest`` is preprocessed, split
    with the project's ``split_data`` helper, and three tables are attached to
    a new artifact: the full preprocessed frame, the train split and the test
    split (each split is stored as features followed by the target).

    Args:
        project_name: W&B project the run is created in.
        entity: W&B entity passed to ``wandb.init``.
        quality_features_columns: Columns to one-hot encode.
        target_column: Name of the target column, forwarded to ``split_data``.
        test_size: Test-split size, forwarded to ``split_data``.

    Returns:
        None.
    """
    def preprocess(data: pd.DataFrame, quality_features_columns: List[str]):
        """Drop column ``N``, apply the null-handling helpers, then one-hot encode."""
        without_id = data.drop(['N'], axis=1)
        cleaned = update_columns_with_nulls(drop_records_with_many_nulls(without_id))
        # drop_first=True keeps k-1 indicator columns per encoded feature.
        return pd.get_dummies(cleaned, columns=list(quality_features_columns), drop_first=True)

    def join_target(features, target):
        """Put the target next to its features so a split is stored as one table."""
        return pd.concat([features, target], axis=1)

    run = wandb.init(project=project_name, entity=entity, job_type="train_data_preparation")
    try:
        source_artifact = run.use_artifact("raw_data_artifact:latest")
        raw_frame = source_artifact.get("raw_table").get_dataframe()

        prepared = preprocess(raw_frame, quality_features_columns)
        x_train, x_test, y_train, y_test = split_data(prepared, target_column, test_size)
        train_frame = join_target(x_train, y_train)
        test_frame = join_target(x_test, y_test)

        artifact = wandb.Artifact(
            "preprocessed_data_artifact",
            type="preprocessed_data",
            metadata={
                "train_data_row_count": len(train_frame),
                "test_data_row_count": len(test_frame)
            },
        )
        tables = (
            ("preprocessed_data_table", prepared),
            ("train_table", train_frame),
            ("test_table", test_frame),
        )
        for table_name, frame in tables:
            create_table(artifact, frame, list(frame.columns), table_name)

        run.log_artifact(artifact)
    finally:
        # Always close the run, including when any step above fails.
        run.finish()

In [18]:
# Build the preprocessed/train/test tables from the latest raw data artifact.
preprocessing_step(
    project_name=params.PROJECT_NAME,
    entity=params.ENTITY,
    quality_features_columns=params.QUALITY_FEATURES_COLUMNS,
    target_column=params.TARGET_COLUMN,
    test_size=params.TEST_SIZE,
)

[34m[1mwandb[0m:   1 of 1 files downloaded.  


VBox(children=(Label(value='0.376 MB of 0.376 MB uploaded (0.176 MB deduped)\r'), FloatProgress(value=1.0, max…

# Training the model

In [19]:
def training_step(project_name, entity, target_column, model_name):
    """Train an L1-regularised logistic regression, reduce its features, and log the results to W&B.

    The train/test tables of ``preprocessed_data_artifact:latest`` are loaded,
    a regularisation strength is chosen from a fixed grid, a backward
    sequential feature selection then looks for a smaller feature set, and two
    artifacts are logged: the serialized model (with its settings and
    accuracies as metadata) and the reduced train/test tables.

    Args:
        project_name: W&B project the run is created in.
        entity: W&B entity passed to ``wandb.init``.
        target_column: Name of the target column, forwarded to ``separate_x_from_y``.
        model_name: Base name of the model file; the model is written to
            ``models/<model_name>.joblib`` and stored in the artifact as ``<model_name>``.

    Returns:
        None.
    """
    def regularisation_grid():
        """Return the candidate ``C`` values: 0.01..1.00 in steps of 0.01, then 2..100 in steps of 1."""
        # Built from integers so the grid is exact and always finite. Stepping a
        # float by 0.01 and stopping on ``!= 101`` only terminates when the
        # accumulated rounding error happens to land exactly on 101.
        fine = [step / 100 for step in range(1, 101)]
        coarse = [float(whole) for whole in range(2, 101)]
        return fine + coarse

    def training(train_x, train_y, test_x, test_y):
        """Pick the best ``C``, then the best feature subset, and return the fitted model.

        Returns:
            A tuple ``(model, feature_names, train_frame, test_frame, C, accuracies)``
            where the frames hold only the selected features and ``accuracies``
            has the keys ``"test_accuracy"`` and ``"train_accuracy"``.
        """
        # NOTE(review): both searches below choose the winner by accuracy on
        # the test split, so the reported test accuracy is optimistic. A
        # separate validation split would be needed for an unbiased figure.
        best_accuracy = None
        best_model = None
        best_regularisation_parameter = None
        for candidate_c in regularisation_grid():
            model = LogisticRegression(solver='liblinear', penalty='l1', C=candidate_c).fit(train_x, train_y)
            accuracy = model.score(test_x, test_y)
            # Strict '>' keeps the smallest C among equally accurate candidates.
            if best_accuracy is None or accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = model
                best_regularisation_parameter = candidate_c

        best_accuracy = None
        best_model_with_lesser_features = None
        best_features = None
        best_train_x = None
        best_test_x = None
        # SequentialFeatureSelector needs n_features_to_select < n_features, so
        # the upper bound is capped by the number of available columns.
        feature_count = train_x.shape[1]
        for n in range(2, min(51, feature_count)):
            feature_selector = SequentialFeatureSelector(
                best_model,
                n_features_to_select=n,
                direction='backward',
                scoring='accuracy'
            ).fit(train_x, train_y)
            new_train_x = feature_selector.transform(train_x)
            new_test_x = feature_selector.transform(test_x)
            # Fit a copy so the full-feature winner is never refitted in place.
            candidate = deepcopy(best_model).fit(new_train_x, train_y)
            accuracy = candidate.score(new_test_x, test_y)
            if best_accuracy is None or accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model_with_lesser_features = candidate
                best_features = list(feature_selector.get_feature_names_out())
                best_train_x = new_train_x
                best_test_x = new_test_x

        if best_model_with_lesser_features is None:
            # Fewer than three features: there is no subset to search, so keep
            # the full-feature model. Assumes train_x is a DataFrame.
            best_model_with_lesser_features = best_model
            best_features = list(train_x.columns)
            best_train_x = train_x
            best_test_x = test_x

        return (
            best_model_with_lesser_features,
            best_features,
            pd.DataFrame(best_train_x, columns=best_features),
            pd.DataFrame(best_test_x, columns=best_features),
            best_regularisation_parameter,
            {
                "test_accuracy": best_model_with_lesser_features.score(best_test_x, test_y),
                "train_accuracy": best_model_with_lesser_features.score(best_train_x, train_y)
            }
        )

    def with_target(features_frame, target):
        """Join features and target by row position rather than by index label."""
        return pd.concat(
            [features_frame.reset_index(drop=True), pd.Series(target).reset_index(drop=True)],
            axis=1,
        )

    run = wandb.init(project=project_name, entity=entity, job_type="model_training")
    try:
        preprocessed_data_artifact = run.use_artifact("preprocessed_data_artifact:latest")
        train_data = preprocessed_data_artifact.get("train_table").get_dataframe()
        test_data = preprocessed_data_artifact.get("test_table").get_dataframe()

        train_x, train_y = separate_x_from_y(train_data, target_column)
        test_x, test_y = separate_x_from_y(test_data, target_column)

        new_model, features, new_train_x, new_test_x, regularisation_parameter, accuracies = training(
            train_x, train_y, test_x, test_y
        )

        model_artifact = wandb.Artifact(
            "medical_logistic_regression_model_artifact",
            type="model",
            metadata={
                "regularisation_parameter": regularisation_parameter,
                "features": features,
                "train_accuracy": accuracies["train_accuracy"],
                "test_accuracy": accuracies["test_accuracy"]
            }
        )
        # joblib does not create missing directories.
        os.makedirs("models", exist_ok=True)
        model_path = f"models/{model_name}.joblib"
        dump(new_model, model_path)
        model_artifact.add_file(model_path, f"{model_name}")

        run.log_artifact(model_artifact, aliases=["latest"])
        # NOTE(review): the link target hard-codes an account name instead of
        # using `entity`; confirm the intended target path before changing it.
        run.link_artifact(model_artifact, f"george-sokolovsky2001/{project_name}", aliases=["latest"])

        model_data_artifact = wandb.Artifact(
            "medical_logistic_regression_model_data_artifact",
            type="model_data",
            metadata={
                "train_row_count": len(new_train_x),
                "test_row_count": len(new_test_x),
            }
        )

        train_data = with_target(new_train_x, train_y)
        create_table(model_data_artifact, train_data, list(train_data.columns), "train_table")

        test_data = with_target(new_test_x, test_y)
        create_table(model_data_artifact, test_data, list(test_data.columns), "test_table")
        run.log_artifact(model_data_artifact)
    finally:
        # Always close the run, including when training or logging fails.
        run.finish()

In [20]:
# Train on the latest preprocessed data and log the model and its data tables.
training_step(
    project_name=params.PROJECT_NAME,
    entity=params.ENTITY,
    target_column=params.TARGET_COLUMN,
    model_name=params.MODEL_NAME,
)

[34m[1mwandb[0m:   3 of 3 files downloaded.  
[34m[1mwandb[0m:   3 of 3 files downloaded.  
wandb: Network error (ConnectionError), entering retry loop.


VBox(children=(Label(value='0.031 MB of 0.031 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [21]:
def monitoring_step(project_name, entity, target_column, model_name):
    """Log diagnostic scikit-learn plots for the latest trained model to W&B.

    Downloads ``medical_logistic_regression_model_artifact:latest``, loads the
    model file named ``model_name`` from it, reads the train/test tables of
    ``medical_logistic_regression_model_data_artifact:latest`` and logs the
    learning curve, summary metrics, class proportions, ROC curve,
    precision-recall curve and feature importances.

    Args:
        project_name: W&B project the run is created in.
        entity: W&B entity passed to ``wandb.init``.
        target_column: Name of the target column, forwarded to ``separate_x_from_y``.
        model_name: Name of the model file inside the downloaded artifact.

    Returns:
        None.
    """
    run = wandb.init(project=project_name, entity=entity, job_type="monitoring")
    try:
        model_artifact = run.use_artifact("medical_logistic_regression_model_artifact:latest")
        model_dir = model_artifact.download()
        # NOTE(review): joblib.load unpickles the file and can therefore run
        # arbitrary code; only load artifacts from a trusted project.
        model = load(f"{model_dir}/{model_name}")

        model_data_artifact = run.use_artifact("medical_logistic_regression_model_data_artifact:latest")
        train_data = model_data_artifact.get("train_table").get_dataframe()
        test_data = model_data_artifact.get("test_table").get_dataframe()

        train_x, train_y = separate_x_from_y(train_data, target_column)
        test_x, test_y = separate_x_from_y(test_data, target_column)

        y_probas = model.predict_proba(test_x)
        features = model_artifact.metadata["features"]

        wandb.sklearn.plot_learning_curve(model, train_x, train_y)
        wandb.termlog('Logged learning curve.')
        wandb.sklearn.plot_summary_metrics(model, X=train_x, y=train_y, X_test=test_x, y_test=test_y)
        wandb.termlog('Logged summary metrics.')
        # The optional `labels` argument of the next three plots names the
        # target classes. Feature names must not be passed there, otherwise the
        # classes are captioned with feature names; the default uses the class
        # values themselves.
        wandb.sklearn.plot_class_proportions(train_y, test_y)
        wandb.termlog('Logged class proportions.')
        wandb.sklearn.plot_roc(test_y, y_probas)
        wandb.termlog('Logged roc curve.')
        wandb.sklearn.plot_precision_recall(test_y, y_probas)
        wandb.termlog('Logged precision recall curve.')
        # Feature names belong here: they label the model's coefficients.
        wandb.sklearn.plot_feature_importances(model, features)
        wandb.termlog('Logged feature importances.')

    finally:
        # Always close the run, including when loading or plotting fails.
        run.finish()

In [22]:
# Log the diagnostic plots for the most recently trained model.
monitoring_step(
    project_name=params.PROJECT_NAME,
    entity=params.ENTITY,
    target_column=params.TARGET_COLUMN,
    model_name=params.MODEL_NAME,
)

[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   2 of 2 files downloaded.  
[34m[1mwandb[0m:   2 of 2 files downloaded.  
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision recall curve.
[34m[1mwandb[0m: Logged feature importances.


VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [23]:
def deploy_to_prod(project_name, entity, model_name):
    """Give the latest model the ``production`` alias if it beats the current production model.

    The ``test_accuracy`` metadata of the ``:latest`` and ``:production``
    versions of ``medical_logistic_regression_model_artifact`` are compared.
    When the latest one is strictly better, the ``production`` alias is added
    to the latest model and model-data artifacts and removed from the previous
    production ones. When no production version can be looked up, the latest
    artifacts are promoted unconditionally.

    Args:
        project_name: W&B project the run is created in.
        entity: W&B entity passed to ``wandb.init``.
        model_name: Unused; kept so existing callers keep working.

    Returns:
        None.

    Raises:
        KeyError: If a compared artifact has no ``test_accuracy`` metadata.
        Exception: Any error raised while loading the latest artifacts or
            saving alias changes is propagated after the run is finished.
    """
    model_artifact_name = "medical_logistic_regression_model_artifact"
    data_artifact_name = "medical_logistic_regression_model_data_artifact"

    def promote(artifact):
        """Add the ``production`` alias (once) and persist the change."""
        if "production" not in artifact.aliases:
            artifact.aliases.append("production")
        artifact.save()

    def demote(artifact):
        """Remove the ``production`` alias, if present, and persist the change."""
        if "production" in artifact.aliases:
            artifact.aliases.remove("production")
            artifact.save()

    run = wandb.init(project=project_name, entity=entity, job_type="deploy")
    try:
        latest_model_artifact = run.use_artifact(f"{model_artifact_name}:latest")
        latest_model_test_accuracy = latest_model_artifact.metadata["test_accuracy"]
        latest_model_data_artifact = run.use_artifact(f"{data_artifact_name}:latest")

        # Only the lookup is guarded. A failure while comparing or saving must
        # surface instead of being mistaken for "no production model yet" and
        # silently promoting the latest model.
        try:
            production_model_artifact = run.use_artifact(f"{model_artifact_name}:production")
            production_model_data_artifact = run.use_artifact(f"{data_artifact_name}:production")
        except Exception:
            # NOTE(review): kept broad because the exception type raised for a
            # missing alias is not visible here; narrow it once confirmed.
            production_model_artifact = None
            production_model_data_artifact = None

        if production_model_artifact is None:
            # No production model yet: the latest model becomes production.
            promote(latest_model_artifact)
            promote(latest_model_data_artifact)
            return

        production_model_test_accuracy = production_model_artifact.metadata["test_accuracy"]
        # Promote only on a strict improvement on the test data.
        if latest_model_test_accuracy > production_model_test_accuracy:
            promote(latest_model_artifact)
            promote(latest_model_data_artifact)
            demote(production_model_artifact)
            demote(production_model_data_artifact)
    finally:
        # Always close the run, including when a lookup or save fails.
        run.finish()


In [24]:
# Promote the latest model to production if it beats the current one.
deploy_to_prod(
    project_name=params.PROJECT_NAME,
    entity=params.ENTITY,
    model_name=params.MODEL_NAME,
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168353699354662, max=1.0…

VBox(children=(Label(value='0.001 MB of 0.009 MB uploaded\r'), FloatProgress(value=0.11433812289819627, max=1.…