In [87]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, QuantileTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import json
import os
import joblib
from itertools import product
import warnings


# Exercise mlflow

Toy problem → Binary classification (LendingClub Dataset), the model's task is to determine whether an individual will default on a loan based on data collected during the initiation of the loan.

In [88]:
data = pd.read_csv('./train_dataset.csv')
data_test = pd.read_csv('./test_dataset.csv')

In [89]:
data.head()

Unnamed: 0,loan_amount,payments_term,monthly_payment,grade,working_years,home,annual_income,verification,purpose,debt_to_income,delinquency,inquiries,open_credit_lines,derogatory_records,revolving_balance,revolving_rate,total_accounts,bankruptcies,fico_average,loan_risk
0,3000,36 months,90.48,1,10,Rent,48000.0,Source Verified,debt_consolidation,14.15,0,0,5,0,1896,39.5,12,0,777,Paid
1,12650,60 months,316.23,4,2,Rent,61500.0,Source Verified,credit_card,12.86,0,1,9,0,11200,83.0,27,0,697,Paid
2,5000,60 months,106.24,2,10,Mortgage,90000.0,Not Verified,car,11.85,0,0,16,0,31007,80.7,35,0,737,Paid
3,16000,36 months,540.88,3,3,Mortgage,35004.0,Verified,debt_consolidation,20.81,0,0,5,0,14383,92.2,23,0,722,Paid
4,17600,60 months,409.43,3,10,Mortgage,95000.0,Verified,debt_consolidation,8.07,0,3,11,0,3858,14.8,17,0,732,Charged off


In [90]:
data['loan_risk'].value_counts()

Paid           25224
Charged off     3978
Name: loan_risk, dtype: int64

In [91]:
# transform column 'loan_risk' to numerical values 0 and 1
# REMEMBER: 0 means 'Charged off' and 1 means 'Paid'
data['loan_risk'] = data['loan_risk'].map({'Charged off': 0, 'Paid': 1})
data_test['loan_risk'] = data_test['loan_risk'].map({'Charged off': 0, 'Paid': 1})

In [92]:
data.head()

Unnamed: 0,loan_amount,payments_term,monthly_payment,grade,working_years,home,annual_income,verification,purpose,debt_to_income,delinquency,inquiries,open_credit_lines,derogatory_records,revolving_balance,revolving_rate,total_accounts,bankruptcies,fico_average,loan_risk
0,3000,36 months,90.48,1,10,Rent,48000.0,Source Verified,debt_consolidation,14.15,0,0,5,0,1896,39.5,12,0,777,1
1,12650,60 months,316.23,4,2,Rent,61500.0,Source Verified,credit_card,12.86,0,1,9,0,11200,83.0,27,0,697,1
2,5000,60 months,106.24,2,10,Mortgage,90000.0,Not Verified,car,11.85,0,0,16,0,31007,80.7,35,0,737,1
3,16000,36 months,540.88,3,3,Mortgage,35004.0,Verified,debt_consolidation,20.81,0,0,5,0,14383,92.2,23,0,722,1
4,17600,60 months,409.43,3,10,Mortgage,95000.0,Verified,debt_consolidation,8.07,0,3,11,0,3858,14.8,17,0,732,0


In [93]:
data_train, data_val = train_test_split(data, test_size=0.2)

# Definition of a classification pipeline

Within the pipeline, data is prepared, processed and fed to the model. Function below trains the pipeline on data supplied as an argument using parameters specified in the parameters dictionary

In [94]:
def train_xgboost_pipeline(data_x, data_y, parameters=None):
    # This function trains an xgboost classifier using the hyperparameters defined in parameters

    if parameters is None:
        parameters = dict(n_estimators=100, 
                          max_depth=4, 
                          scale_pos_weight=1,
                          learning_rate=0.1)
    
    # Preprocessing numerical and categorical features

    # Numerical features are imputed with the median
    numerical_features = data_x.select_dtypes(include="number").columns
    # Categorical features are imputed with the most frequent value and one-hot encoded
    categorical_features = data_x.select_dtypes(include="object").columns

    
    numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])

    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(transformers=[('ord', numerical_transformer, numerical_features),
                                                ('cat', categorical_transformer, categorical_features)])

    # Creation of a preprocessor + XGBoost pipeline
    
    xgb_clf = XGBClassifier(n_estimators=parameters['n_estimators'],
                           max_depth=parameters['max_depth'],
                           scale_pos_weight=parameters['scale_pos_weight'],
                           learning_rate=parameters['learning_rate'],
                           random_state=42,
                           n_jobs=4)

    xgb_pipeline = Pipeline(steps=[("preprocessing", preprocessor), 
                                   ("xgb_model", xgb_clf)])

    # Pipeline fit

    xgb_pipeline.fit(data_x, data_y)

    return xgb_pipeline

In [95]:
clf = train_xgboost_pipeline(data_train.drop(['loan_risk'],axis=1),data_train['loan_risk'])

In [96]:
clf

Analisi metriche di classificazione modello, classification_report genera un dizionario contenente i punteggi piu\' importanti

In [97]:
from sklearn.metrics import classification_report

# Classification report on the validation set

report_val = classification_report(data_val['loan_risk'],
                                   clf.predict(data_val.drop(['loan_risk'],axis=1)), 
                                   output_dict=True)

report_test = classification_report(data_test['loan_risk'],
                                    clf.predict(data_test.drop(['loan_risk'],axis=1)), 
                                    output_dict=True)

In [98]:
report_val

{'0': {'precision': 0.4,
  'recall': 0.008086253369272238,
  'f1-score': 0.01585204755614267,
  'support': 742},
 '1': {'precision': 0.8736697562650189,
  'recall': 0.9982349480290253,
  'f1-score': 0.9318077803203662,
  'support': 5099},
 'accuracy': 0.8724533470296182,
 'macro avg': {'precision': 0.6368348781325095,
  'recall': 0.5031606006991488,
  'f1-score': 0.4738299139382544,
  'support': 5841},
 'weighted avg': {'precision': 0.8134980460871993,
  'recall': 0.8724533470296182,
  'f1-score': 0.8154511369868522,
  'support': 5841}}

# Setup of a grid search

Let's look for the combination of parameters that performs best on the validation set. By doing this we log each experiment using mlflow


In [99]:
import mlflow

# grid search
max_depth = [3, 6]
scale_pos_weight = [0.1, 1, 10]
learning_rate = [0.01, 0.001]
n_estimators = [100, 50]

name_experiment = 'First grid-search'

parameters = product(max_depth, scale_pos_weight, learning_rate, n_estimators)
parameters_list = list(parameters)

print('Number of experiments:', len(parameters_list))

# Hyperparameter search
results = []
best_param = None
best_f1 = 0.0
warnings.filterwarnings('ignore')

for i, param in enumerate(parameters_list):
    print('Running experiment number ', i)
    with mlflow.start_run(run_name=name_experiment):
        # Tell mlflow to log the following parameters for the experiments dashboard
        mlflow.log_param('max_depth', param[0])
        mlflow.log_param('scale_pos_weight', param[1])
        mlflow.log_param('learning_rate', param[2])
        mlflow.log_param('n_estimators', param[3])
        mlflow.log_param('version', os.system('git describe --all --long'))

        try:
            parameters = dict(n_estimators=param[3], 
                              max_depth=param[0], 
                              scale_pos_weight=param[1],
                              learning_rate=param[2])

            clf = train_xgboost_pipeline(data_train.drop(['loan_risk'],axis=1),
                                         data_train['loan_risk'], 
                                         parameters=parameters)
            
            
            report_val = classification_report(data_val['loan_risk'],
                                               clf.predict(data_val.drop(['loan_risk'],axis=1)), 
                                               output_dict=True)

            report_test = classification_report(data_test['loan_risk'],
                                                clf.predict(data_test.drop(['loan_risk'],axis=1)), 
                                                output_dict=True)

            # Tell mlflow to log the following metrics
            mlflow.log_metric("recall", report_val['0']['recall'])            
            mlflow.log_metric("precision", report_val['0']['precision'])
            mlflow.log_metric("F1", report_val['0']['f1-score'])

            # Store this artifact for each run
            json.dump(report_test, open("metrics.json", "w"))
            mlflow.log_artifact('./metrics.json')

            # save the best experiment yet (in terms of precision)
            if report_val['0']['f1-score'] > best_f1:
                best_param = parameters
                best_f1 = report_val['0']['f1-score']
            
                                            
            results.append([param, report_val['0']['f1-score']])

        except ValueError:
            print('bad parameter combination:', param)
            continue

mlflow.end_run()
print('Best F1 was:', best_f1)
print('Using the following parameters')
print(best_param)

Number of experiments: 24
Running experiment number  0
heads/master-0-gfd8cd74
Running experiment number  1
heads/master-0-gfd8cd74
Running experiment number  2
heads/master-0-gfd8cd74
Running experiment number  3
heads/master-0-gfd8cd74
Running experiment number  4
heads/master-0-gfd8cd74
Running experiment number  5
heads/master-0-gfd8cd74
Running experiment number  6
heads/master-0-gfd8cd74
Running experiment number  7
heads/master-0-gfd8cd74
Running experiment number  8
heads/master-0-gfd8cd74
Running experiment number  9
heads/master-0-gfd8cd74
Running experiment number  10
heads/master-0-gfd8cd74
Running experiment number  11
heads/master-0-gfd8cd74
Running experiment number  12
heads/master-0-gfd8cd74
Running experiment number  13
heads/master-0-gfd8cd74
Running experiment number  14
heads/master-0-gfd8cd74
Running experiment number  15
heads/master-0-gfd8cd74
Running experiment number  16
heads/master-0-gfd8cd74
Running experiment number  17
heads/master-0-gfd8cd74
Running expe

### More performing parameter combination

In [100]:
best_param

{'n_estimators': 100,
 'max_depth': 6,
 'scale_pos_weight': 0.1,
 'learning_rate': 0.01}

# Saving of the model

We save the chosen template in our registry

In [102]:
from mlflow.models.signature import infer_signature

best_clf = train_xgboost_pipeline(data_train.drop(['loan_risk'], axis=1),
                                     data_train['loan_risk'], 
                                     parameters=best_param)

signature = infer_signature(data_val.drop(['loan_risk'], axis=1),
                            best_clf.predict(data_val.drop(['loan_risk'], axis=1)))


input_example = {}
for i in data_val.drop(['loan_risk'],axis=1).columns:
    input_example[i] = data_val[i].iloc[0]

    
mlflow.sklearn.save_model(best_clf, path='./model3/',signature=signature, input_example=input_example)


# to load one of the registry templates
# saved_model_path="."
# reloaded_model = mlflow.sklearn.load_model(model_uri=saved_model_path)

# Creating an API packaged in a Docker container

mlflow offers the possibility to put one of the models in our registry into production by packaging it in a docker container that exposes an API to query the model.


To create the docker image we can use the following command line
```
mlflow models build-docker -m "./best_model" -n "xgboost_loan"
```


Once the image has been created we can start the container with
```
docker run -p 5001:8080 xgboost_loan
```                             
```                                        

# Model queries

Let's take an example of saved input and create a payload for the API

In [None]:
import requests
import json

input_example = {'columns': ['loan_amount', 'payments_term', 'monthly_payment', 'grade', 'working_years', 'home', 'annual_income', 'verification', 'purpose', 'debt_to_income', 'delinquency', 'inquiries', 'open_credit_lines', 'derogatory_records', 'revolving_balance', 'revolving_rate', 'total_accounts', 'bankruptcies', 'fico_average'], 'data': [[100000000000, '60 months', 214.87, 1, 8, 'Mortgage', 50000.0, 'Not Verified', 'debt_consolidation', 18.36, 0, 0, 7, 0, 9075, 44.3, 25, 0, 752]]}
# convert input_example to json in a json file named input_example.json
with open('./model2/input_example.json', 'w') as f:
    json.dump(input_example, f)

with open('./model2/input_example.json') as f:
    sample_input = json.load(f)

In [None]:
# after the docker container is running we can make a request to the model with the following code
response = requests.post(
              url='http://127.0.0.1:5001/invocations', data=json.dumps(sample_input),
              headers={"Content-type": "application/json"})
response_json = json.loads(response.text)
print('Predicted loan outcome: '+response_json[0])