## Training Notebook for a Custom PyTorch Lightning Model 

Replace the train_models.ipynb notebook in your pipeline with this notebook to set up an MLOps workflow for a custom-built PyTorch Lightning model.

### The following cell is a way to get the utility script required for this notebook. 
Since IBM CPD SaaS doesn't have a filesystem, this is the only reliable way to get scripts on the cloud environment.

```
!rm -rf MLOps-CPD && git clone --quiet -b master https://github.com/IBM/MLOps-CPD.git
```

⚠️ Run the following cells only if you are executing on IBM CPD SaaS.

In [None]:
#!rm -rf MLOps-CPD && git clone --quiet -b master https://github.com/iiias/MLOps-CPD.git

In [None]:
#!mv MLOps-CPD MLOps_CPD

In [None]:
from botocore.client import Config
from ibm_botocore.client import Config
from sklearn.model_selection import train_test_split,cross_val_score, KFold
from sklearn.metrics import roc_auc_score,confusion_matrix,plot_confusion_matrix,plot_roc_curve,f1_score,auc,roc_curve,accuracy_score
from sklearn.preprocessing import StandardScaler
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from ibm_aigov_facts_client import AIGovFactsClient
from ibm_watson_studio_pipelines import WSPipelines
from ibm_watson_machine_learning import APIClient
import warnings
import os, types
import pandas as pd
import numpy as np
import ibm_boto3
import seaborn as sns
import json
import pickle
import matplotlib.pyplot as plt

from os.path import exists
# Prefer a local copy of the helper scripts when both files are present under ./utils.
if exists("utils/fs_utils.py") and exists("utils/catalog_utils.py"):
    from utils import fs_utils,catalog_utils
else:
    # If utils/fs_utils.py and utils/catalog_utils.py do NOT exist locally, we assume that you are
    # running on CPD SaaS and therefore import the scripts from the cloned repository
    # (the commented-out cells above clone it and rename the folder to MLOps_CPD).
    from MLOps_CPD.utils import fs_utils, catalog_utils

# Suppress every Python warning for the remainder of the notebook.
warnings.filterwarnings("ignore")

## Succeeding cell contains the credentials for MLOps COS
```
## PROJECT COS 
AUTH_ENDPOINT = "https://iam.cloud.ibm.com/oidc/token"
ENDPOINT_URL = "https://s3.private.us.cloud-object-storage.appdomain.cloud"
API_KEY_COS = "xxx"
BUCKET_PROJECT_COS = "mlops-donotdelete-pr-qxxcecxi1dtw94"

##MLOPS COS
ENDPOINT_URL_MLOPS = "https://s3.jp-tok.cloud-object-storage.appdomain.cloud"
API_KEY_MLOPS = "xxx"
CRN_MLOPS = "xxx"
BUCKET_MLOPS  = "mlops-asset"

##CATALOG
CATALOG_NAME = "MLOps-ns"
```

In [None]:
## Retrieve cos credentials from global pipeline parameters
import json


def _load_json_env(name):
    """Parse the JSON document held in environment variable ``name``.

    Returns the decoded Python object (the code below indexes it like a dict).
    Raises KeyError with the parameter name when the variable is unset, instead
    of the opaque ``TypeError`` that ``json.loads(None)`` would produce.
    """
    raw = os.getenv(name)
    if raw is None:
        raise KeyError(f"Pipeline parameter '{name}' is not set in the environment")
    return json.loads(raw)


# Get the JSON strings from the environment and decode them
project_cos_credentials = _load_json_env('project_cos_credentials')
mlops_cos_credentials = _load_json_env('mlops_cos_credentials')

## PROJECT COS
AUTH_ENDPOINT = project_cos_credentials['AUTH_ENDPOINT']
ENDPOINT_URL = project_cos_credentials['ENDPOINT_URL']
API_KEY_COS = project_cos_credentials['API_KEY']
BUCKET_PROJECT_COS = project_cos_credentials['BUCKET']

## MLOPS COS
ENDPOINT_URL_MLOPS = mlops_cos_credentials['ENDPOINT_URL']
API_KEY_MLOPS = mlops_cos_credentials['API_KEY']
CRN_MLOPS = mlops_cos_credentials['CRN']
BUCKET_MLOPS  = mlops_cos_credentials['BUCKET']

## Pipeline Params

In [None]:
# Values injected by the Watson Studio pipeline as environment variables
CLOUD_API_KEY = os.getenv("cloud_api_key")

MODEL_NAME = os.getenv("model_name")
DEPLOYMENT_NAME = os.getenv("deployment_name")

project_id = os.environ['PROJECT_ID']
# The deployment space id has its own pipeline parameter; the previous code read
# "model_name" here, so the model's name was used as the space id.
space_id = os.getenv("space_id")
CATALOG_NAME = "mlops-ns"

In [None]:
# Model type string under which the ONNX archive is stored.
MODEL_TYPE = 'pytorch-onnx_1.10'

# Base filename of the serialized model.
# DO NOT include a file extension in the value: the notebook appends ".pt", ".onnx"
# and ".tar.gz" to this same base name when it saves, exports and archives the model.
MODEL_SERIALIZED = 'gcr-torch_lightning'

## Utility Functions

In [None]:
def download_file_cos(local_file_name,key):
    """Download object ``key`` from the MLOps COS bucket into ``local_file_name``.

    Best effort: a failure is reported on stdout and not raised, so callers
    must not assume the local file exists afterwards. Returns None.
    """
    cos = ibm_boto3.client(service_name='s3',
                           ibm_api_key_id=API_KEY_MLOPS,
                           ibm_service_instance_id=CRN_MLOPS,
                           ibm_auth_endpoint=AUTH_ENDPOINT,
                           config=Config(signature_version='oauth'),
                           endpoint_url=ENDPOINT_URL_MLOPS)
    try:
        cos.download_file(Bucket=BUCKET_MLOPS,Key=key,Filename=local_file_name)
    except Exception as e:
        # Report the concrete exception type; printing the `Exception` class itself
        # (as before) only ever showed "<class 'Exception'>".
        print(f"Download of '{key}' failed: {type(e).__name__}: {e}")
    else:
        print('File Downloaded')
        
def read_data_from_mlops_cos(key):
    """Fetch the CSV object ``key`` from the MLOps COS bucket and return it as a DataFrame."""
    def _iter_stub(self): return 0

    client_options = dict(
        service_name='s3',
        ibm_api_key_id=API_KEY_MLOPS,
        ibm_service_instance_id=CRN_MLOPS,
        ibm_auth_endpoint=AUTH_ENDPOINT,
        config=Config(signature_version='oauth'),
        endpoint_url=ENDPOINT_URL_MLOPS,
    )
    cos_client = ibm_boto3.client(**client_options)

    response = cos_client.get_object(Bucket=BUCKET_MLOPS, Key=key)
    stream = response['Body']
    # Attach an __iter__ attribute when the stream lacks one, so pandas accepts it as file-like
    if not hasattr(stream, "__iter__"):
        stream.__iter__ = types.MethodType(_iter_stub, stream)

    return pd.read_csv(stream)

def load_model(key, filename):
    """Download object ``key`` from the MLOps COS bucket to ``filename`` and unpickle it.

    Returns the unpickled object. Raises whatever ``open``/``pickle.load`` raise
    when the download did not produce a readable file (download_file_cos only
    prints its own failures).
    """
    # download_file_cos expects (local_file_name, key); the previous positional call
    # passed them swapped, so the file opened below was not the one downloaded.
    download_file_cos(local_file_name=filename, key=key)
    # NOTE: pickle.load can execute arbitrary code; only load objects from a trusted bucket.
    with open(filename, "rb") as f:
        return pickle.load(f)

def check_if_file_exists(filename):
    """Return True when an object whose key equals ``filename`` exists in the MLOps COS bucket."""
    mlops_client = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=API_KEY_MLOPS,
        ibm_service_instance_id=CRN_MLOPS,
        ibm_auth_endpoint=AUTH_ENDPOINT,
        config=Config(signature_version='oauth'),
        endpoint_url=ENDPOINT_URL_MLOPS)

    # Restrict the listing to keys starting with the requested name so the match is
    # not lost when the bucket holds more objects than one listing page returns.
    listing = mlops_client.list_objects(Bucket=BUCKET_MLOPS, Prefix=filename)
    # 'Contents' is absent from the response when nothing matches (e.g. empty bucket).
    return any(entry['Key'] == filename for entry in listing.get('Contents', []))

def save_data_in_cos(dictionary,filename,key):
    """
    Save Data in IBM Cloud Object Storage

    Pickles ``dictionary`` to the local file ``<filename>.pkl`` and uploads that
    file to the MLOps bucket under the object key ``<key>.pkl``.
    Best effort: any error is printed, not raised. Returns None.
    """
    try:
        with open(filename+'.pkl', 'wb') as f:
            pickle.dump(dictionary, f)
        mlops_res = ibm_boto3.resource(
            service_name='s3',
            ibm_api_key_id=API_KEY_MLOPS,
            ibm_service_instance_id=CRN_MLOPS,
            ibm_auth_endpoint=AUTH_ENDPOINT,
            config=Config(signature_version='oauth'),
            endpoint_url=ENDPOINT_URL_MLOPS)

        mlops_res.Bucket(BUCKET_MLOPS).upload_file(filename+'.pkl',key+'.pkl')
        print(f"File {filename} uploaded successfully")
    except Exception as e:
        print(e)
        # The f-prefix was missing, so the literal text "{filename}" was printed.
        print(f"File upload for {filename} failed")


In [None]:
# The code was removed by Watson Studio for sharing.

## Instantiate FactSheets Client

In [None]:
# FactSheets client bound to this project; "CreditRiskModel" becomes the current experiment
facts_client = AIGovFactsClient(
    api_key=CLOUD_API_KEY,
    experiment_name="CreditRiskModel",
    container_type="project",
    container_id=project_id,
    set_as_current_experiment=True,
)

## Load Train Data and Test Data 

In [None]:
# Read both CSV objects from the MLOps bucket (train first, then test)
train_data, test_data = (
    read_data_from_mlops_cos(csv_key) for csv_key in ('train_tfr.csv', 'test_tfr.csv')
)
train_data.head()

## Split features and target for the train and test sets

In [None]:
# 'Risk' is the target column; every other column is a feature
X_train, y_train = train_data.drop(columns='Risk'), train_data['Risk']
X_test, y_test = test_data.drop(columns='Risk'), test_data['Risk']

## Make validation set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

## Define the Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
import pandas as pd

class CreditRiskClassifier(pl.LightningModule):
    """Binary classifier with one hidden layer; ``forward`` returns raw logits.

    Args:
        input_dim: number of input features.
        hidden_dim: width of the hidden layer.
        output_dim: number of logits per sample (1 for the binary case).
    """

    def __init__(self, input_dim, hidden_dim=10, output_dim=1):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits with the trailing singleton dimension removed."""
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        # Squeeze only the last axis. A bare squeeze() also removes the batch axis when
        # a batch holds a single sample, which makes the logits 0-d and mismatches the
        # (1,)-shaped targets in the loss.
        return x.squeeze(-1)

    def _bce_loss(self, batch):
        """Binary cross-entropy (on logits) for one ``(features, targets)`` batch."""
        x, y = batch
        logits = self(x.float())
        return F.binary_cross_entropy_with_logits(logits, y.float())

    def training_step(self, batch, batch_idx):
        """Compute the training loss and log it under a key that embeds the epoch index."""
        loss = self._bce_loss(batch)
        # The epoch number is part of the metric name so that each epoch keeps its own
        # entry in trainer.callback_metrics, which the notebook reads after training.
        self.log(f'train_loss_epoch_{self.trainer.current_epoch}', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and log it under a key that embeds the epoch index."""
        loss = self._bce_loss(batch)
        self.log(f'val_loss_epoch_{self.trainer.current_epoch}', loss)
        return loss

    def configure_optimizers(self):
        """Adam over all parameters with a learning rate of 0.001."""
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return optimizer


# Dataset
class CreditRiskDataset(Dataset):
    """Map-style dataset yielding ``(features, label)`` pairs as float32 tensors.

    Args:
        X: array-like of shape (n_samples, n_features).
        y: array-like of shape (n_samples,).

    Raises:
        ValueError: if ``X`` and ``y`` do not have the same number of rows.
    """

    def __init__(self, X, y):
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        # Convert once up front instead of building new tensors on every __getitem__.
        # np.asarray makes indexing positional even when a pandas object is passed;
        # torch.tensor copies, so later changes to the source arrays do not leak in.
        self.X = torch.tensor(np.asarray(X), dtype=torch.float32)
        self.y = torch.tensor(np.asarray(y), dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


## Instantiate and train the Model 

In [None]:
# Encode and scale the train / validation splits
train_encoded = pd.get_dummies(X_tr)
X_train = train_encoded.values
y_train = np.asarray(y_tr)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Give the validation frame exactly the training columns (a category missing from the
# validation split becomes an all-zero column), then reuse the statistics fitted on the
# training data. Re-fitting the scaler here, as before, scaled the two splits differently.
val_encoded = pd.get_dummies(X_val).reindex(columns=train_encoded.columns, fill_value=0)
X_val1 = scaler.transform(val_encoded.values)
# np.asarray keeps this cell re-runnable: y_val may already be an ndarray from a previous run
y_val = np.asarray(y_val)

input_dim = X_train.shape[1]


In [None]:
n_epochs = 100

# Seed Python, NumPy and torch so weight initialisation and batch shuffling are
# repeatable across Restart & Run All (the data split above is already seeded).
pl.seed_everything(42)

# Create datasets and dataloaders
train_dataset = CreditRiskDataset(X_train, y_train)
val_dataset = CreditRiskDataset(X_val1, y_val)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

# Initialize model and trainer
model = CreditRiskClassifier(input_dim=input_dim)
# NOTE(review): `progress_bar_refresh_rate` was deprecated in PyTorch Lightning 1.5 and
# removed in 1.7; when upgrading, pass callbacks=[TQDMProgressBar(refresh_rate=20)] instead.
trainer = pl.Trainer(max_epochs=n_epochs, progress_bar_refresh_rate=20)

# Train the model; the per-epoch losses are read from trainer.callback_metrics afterwards
trainer.fit(model, train_loader, val_loader)

In [None]:
# Collect historic loss (one value per epoch) as plain Python floats, so the lists can be
# plotted and pickled without carrying torch tensors around
train_loss = [float(trainer.callback_metrics[f'train_loss_epoch_{epoch_idx}']) for epoch_idx in range(n_epochs)]
val_loss = [float(trainer.callback_metrics[f'val_loss_epoch_{epoch_idx}']) for epoch_idx in range(n_epochs)]

In [None]:
# Draw the train and validation loss histories on one set of axes
epochs = range(n_epochs)
for curve, curve_label in ((train_loss, "train_loss"), (val_loss, "val_loss")):
    plt.plot(epochs, curve, label=curve_label)
plt.legend()
plt.show()

## Save train and val loss to COS

In [None]:
# Upload each loss history as <name>.pkl (validation first, then training)
for loss_name, loss_values in (('val_loss', val_loss), ('train_loss', train_loss)):
    save_data_in_cos(loss_values, loss_name, loss_name)

## Check if the files are copied in COS


In [None]:
# Both uploads must be present; the previous check looked for val_loss.pkl twice
# and never verified train_loss.pkl.
files_copied_in_cos = all(
    check_if_file_exists(pkl_name) for pkl_name in ('val_loss.pkl', 'train_loss.pkl')
)
files_copied_in_cos

## Baseline Results of the Model

In [None]:
# Get predictions for the validation split (X_val1); the held-out test set is not scored here
inputs = torch.from_numpy(X_val1).float()
# Inference only: skip building the autograd graph
with torch.no_grad():
    outputs = model(inputs)
# Hard 0/1 labels at a 0.5 probability threshold
pred = torch.round(torch.sigmoid(outputs))
pred = pd.DataFrame(pred.detach().numpy())

In [None]:
# Calculate score for area under ROC.
# Score with the predicted probabilities: ROC AUC is a ranking metric, and feeding it the
# rounded 0/1 labels in `pred` evaluates a single threshold only.
print(roc_auc_score(y_val, torch.sigmoid(outputs).detach().numpy()))

In [None]:
# Confusion matrix on the validation predictions, annotated with name, count and share per cell
df_cm = confusion_matrix(y_val, pred)
cell_counts = df_cm.flatten()
cell_shares = cell_counts / np.sum(df_cm)

group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = [f"{count:0.0f}" for count in cell_counts]
group_percentages = [f"{share:.2%}" for share in cell_shares]
labels = np.asarray(
    ["\n".join(parts) for parts in zip(group_names, group_counts, group_percentages)]
).reshape(2, 2)
sns.heatmap(df_cm, annot=labels, fmt='', cmap='Blues')


In [None]:
# Build the ROC curve from predicted probabilities. With the rounded 0/1 labels in `pred`
# the curve has a single operating point, and the AUC derived from it (stored later in
# the pipeline results) is not a true area under the ROC curve.
val_scores = torch.sigmoid(outputs).detach().numpy()
fpr, tpr, thresholds = roc_curve(y_val, val_scores)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Chance-level diagonal for reference
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12

plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)

print("\n")
print ("Area Under Curve: %.2f" %auc(fpr, tpr))
print("\n")

## Serialize Model to ONNX and compress

In [None]:
# Persist the trained weights, rebuild a fresh model from them, and export that copy to ONNX
weights_path = f'{MODEL_SERIALIZED}.pt'
n_features = X_train.shape[1]

torch.save(model.state_dict(), weights_path)

loaded_model = CreditRiskClassifier(n_features)
loaded_model.load_state_dict(torch.load(weights_path))

# A single random row fixes the input shape used for tracing
dummy_input = torch.randn(1, n_features)
torch.onnx.export(loaded_model, dummy_input, f'{MODEL_SERIALIZED}.onnx', export_params=True, verbose=True)

In [None]:
import tarfile

def onnx_to_tar(onnx_path, tar_file_path):
    """Write a gzip-compressed tar at ``tar_file_path`` holding the file at ``onnx_path``.

    The file is stored under its base name only, without any directory prefix.
    """
    member_name = os.path.basename(onnx_path)
    with tarfile.open(name=tar_file_path, mode="w:gz") as archive:
        archive.add(onnx_path, arcname=member_name)

# Archive <MODEL_SERIALIZED>.onnx as <MODEL_SERIALIZED>.tar.gz
tar_file_path = MODEL_SERIALIZED + ".tar.gz"
onnx_path = MODEL_SERIALIZED + ".onnx"
onnx_to_tar(onnx_path=onnx_path, tar_file_path=tar_file_path)

In [None]:
# Connection settings for the Watson Machine Learning client (us-south endpoint)
WML_CREDENTIALS = dict(
    url="https://us-south.ml.cloud.ibm.com",
    apikey=CLOUD_API_KEY,
)

In [None]:
# Create the Watson Machine Learning API client from the WML_CREDENTIALS dictionary
wml_client = APIClient(WML_CREDENTIALS)

## Save and Log Models in AI Factsheets.

In [None]:
# X_train / y_train were overwritten with NumPy arrays in the training section; rebind them
# to the columns of train_data, which is what save_log_facts() hands to fsutils.save_model.
y_train = train_data['Risk']
X_train = train_data.drop('Risk',axis=1)

def save_log_facts():
    """Store the archived model through FSUtils and export the run facts to AI Factsheets.

    Steps: resolve the catalog id for CATALOG_NAME, build the training-data
    reference, save the tar.gz model archive, tag the most recent run with the
    notebook details, and export the facts payload for that run and for the
    current run.

    Returns:
        The value returned by ``fsutils.save_model`` (used as the model id).
    """
    ctutils = catalog_utils.CatalogUtils(
        service_url="https://api.dataplatform.cloud.ibm.com",
        api_key=CLOUD_API_KEY,
        project_id=project_id,
        auth_url="https://iam.cloud.ibm.com/identity/token")
    catalog_id = ctutils.get_catalog_id_map()[CATALOG_NAME]
    # NOTE(review): bss_account_id is hard-coded; confirm it matches the target account.
    fsutils = fs_utils.FSUtils(wml_client=wml_client,catalog_id=catalog_id,project_id=project_id,bss_account_id='27ff418fedd6aedffb8dc6ae4164a1d2',space_id=space_id,facts_client=facts_client)
    train_ref  = fsutils.prepare_training_reference(apikey=CLOUD_API_KEY,crn=CRN_MLOPS,bucket_name=BUCKET_MLOPS,endpoint=ENDPOINT_URL_MLOPS,training_file_name="german_credit_risk.csv")
    # Use the notebook-level MODEL_TYPE constant instead of repeating its value here.
    model_id = fsutils.save_model(model=tar_file_path,model_name=MODEL_NAME,model_entry_name="MLOps",model_entry_description="MLOps Model Entry",target="Risk",X=X_train,y=y_train,train_data_ref=train_ref, model_type=MODEL_TYPE)

    nb_name = "train_models"
    nb_asset_id = "b8d38cab-e373-4303-bd09-12e1086c9132"
    CPD_URL ="https://dataplatform.cloud.ibm.com"

    # CPD_URL already includes the scheme; prepending another "https://" (as before)
    # produced an invalid "https://https://..." link.
    nb_asset_url = CPD_URL + "/analytics/notebooks/v2/" + nb_asset_id + "?projectid=" + project_id + "&context=cpdaas"

    # Most recently started run of experiment '1'
    latestRunId = facts_client.runs.list_runs_by_experiment('1').sort_values('start_time').iloc[-1]['run_id']
    facts_client.runs.set_tags(latestRunId, {"Notebook name": nb_name, "Notebook id": nb_asset_id, "Notebook URL" : nb_asset_url})
    facts_client.export_facts.export_payload(latestRunId)

    RUN_ID=facts_client.runs.get_current_run_id()
    facts_client.export_facts.export_payload(RUN_ID)

    return model_id


In [None]:
# Save the model and export its facts; the returned id is stored in the pipeline results below
model_id = save_log_facts()

## Save Params in WS Pipeline

In [None]:
# Results handed back to the Watson Studio pipeline for the downstream steps
train_params = {
    'auc_roc': float(auc(fpr, tpr)),
    'training_done': True,
    'model_name': MODEL_NAME,
    'deployment_name': DEPLOYMENT_NAME,
    'model_id': model_id,
    'project_id': project_id,
}

pipelines_client = WSPipelines.from_apikey(apikey=CLOUD_API_KEY)
pipelines_client.store_results(train_params)