# Machine Learning Engineering for SAS Model Manager

The goal of this notebook is to engineer the code needed to deploy a LightGBM model on Viya. 

The steps are:

- Create a Preprocessor

- Create the Model Package with required files

    - score.py
    - model.pkl
    - inputVar.json
    - outputVar.json
    

- Add the optional files to the package

    - train.py
    - fileMetadata.json
    - ModelProperties.json
    - dmcas_fitstat.json
    - dmcas_roc
    - dmcas_lift

We could use sasctl to register the model, but it is no longer maintained.

So I use the pzmm library from SAS instead. 

## Libraries and Environment

### Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, OneHotEncoder, KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import precision_score, accuracy_score, recall_score, precision_recall_curve, roc_auc_score, f1_score
# from sklearn.metrics import plot_confusion_matrix
from sklearn.utils import estimator_html_repr
from sklearn import set_config

from lightgbm import LGBMClassifier

from pzmm import *

#Utils
import os
import shutil
import glob
import pickle
import unittest
import zipfile

#Settings

# Notebook display settings
# Raise pandas' display limits so long/wide frames are not truncated
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Show scikit-learn estimators as diagrams when they are displayed
set_config(display='diagram')

### Environment

In [2]:
# Project-relative locations of inputs and outputs
processed = '../data/processed/data_final.csv'
plots = '../reports/figures/'
models = '../models/'
src = '../src/'

# Load the processed dataset
data = pd.read_csv(processed)

# Name the label column once; every other column is treated as a feature
target = 'fraud_reported'
features = data.columns.difference([target])

# Feature frame and single-column target frame
X = data[features]
y = data[[target]]

## Data 

In [3]:
# Column names, non-null counts and dtypes of the loaded dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 39 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   months_as_customer           1000 non-null   int64  
 1   age                          1000 non-null   int64  
 2   policy_number                1000 non-null   int64  
 3   policy_bind_date             1000 non-null   object 
 4   policy_state                 1000 non-null   object 
 5   policy_csl                   1000 non-null   object 
 6   policy_deductable            1000 non-null   int64  
 7   policy_annual_premium        1000 non-null   float64
 8   umbrella_limit               1000 non-null   int64  
 9   insured_zip                  1000 non-null   int64  
 10  insured_sex                  1000 non-null   object 
 11  insured_education_level      1000 non-null   object 
 12  insured_occupation           1000 non-null   object 
 13  insured_hobbies    

In [4]:
# Preview the first rows of the dataset
data.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,capital_gains,capital_loss,incident_date,incident_type,collision_type,incident_severity,authorities_contacted,incident_state,incident_city,incident_location,incident_hour_of_the_day,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,MALE,MD,craft-repair,sleeping,husband,53300,0,2015-01-25,Single Vehicle Collision,Side Collision,Major Damage,Police,SC,Columbus,9935 4th Drive,5,1,YES,1,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,MALE,MD,machine-op-inspct,reading,other-relative,0,0,2015-01-21,Vehicle Theft,?,Minor Damage,Police,VA,Riverwood,6608 MLK Hwy,8,1,?,0,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,FEMALE,PhD,sales,board-games,own-child,35100,0,2015-02-22,Multi-vehicle Collision,Rear Collision,Minor Damage,Police,NY,Columbus,7121 Francis Lane,7,3,NO,2,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,FEMALE,PhD,armed-forces,board-games,unmarried,48900,-62400,2015-01-10,Single Vehicle Collision,Front Collision,Major Damage,Police,OH,Arlington,6956 Maple Drive,5,1,?,1,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,MALE,Associate,sales,board-games,unmarried,66000,-46000,2015-02-17,Vehicle Theft,?,Minor Damage,,NY,Arlington,3041 3rd Ave,20,1,NO,0,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


## Create a preprocessor

### Set variables

In [5]:
# Columns excluded from the modelling frame
drop_columns = [
    "policy_bind_date",
    "policy_number",
    "incident_date",
    "incident_type",
    "collision_type",
    "policy_state",
    "incident_location",
    "auto_model",
    "auto_year",
    "property_damage",
    "police_report_available",
    "insured_zip",
    "total_claim_amount",
    "policy_deductable",
    "umbrella_limit",
    "capital_gains",
    "capital_loss",
]
data_pp = data.drop(columns=drop_columns)

In [6]:
# Number of distinct values of every object-typed column, kept in column order
_object_cardinality = {
    col: data_pp[col].nunique()
    for col in data_pp.columns
    if data_pp[col].dtype == 'object'
}

# Exactly two levels -> binary; any other number of levels -> multinomial
binary_features = [
    col for col, n_levels in _object_cardinality.items() if n_levels == 2
]
multinominal_features = [
    col for col, n_levels in _object_cardinality.items() if n_levels != 2
]

### Code Preprocessor

In [7]:
# Preprocessing definition: which columns are dropped, encoded or binned

drop_columns = [
    "policy_bind_date", "policy_number", "incident_date", "incident_type",
    "collision_type", "policy_state", "incident_location", "auto_model",
    "auto_year", "property_damage", "police_report_available", "insured_zip",
    "total_claim_amount", "policy_deductable", "umbrella_limit",
    "capital_gains", "capital_loss",
]

binary_features = ['insured_sex']

multinominal_features = [
    'authorities_contacted', 'auto_make', 'incident_city', 'incident_severity',
    'incident_state', 'insured_education_level', 'insured_hobbies',
    'insured_occupation', 'insured_relationship', 'policy_csl',
]

numeric_features = ['age']

# NOTE: the step names below say "imputer", but each step is an encoder or a
# discretizer; the names are kept because they are part of the fitted object.

# Preprocessing binary variables: categories -> integer codes
binary_transformer = Pipeline([
    ('binary_imputer', OrdinalEncoder(dtype=np.int64)),
])

# Preprocessing multinomial variables: one-hot encoding, with categories
# unseen during fit ignored instead of raising
multinominal_trasformer = Pipeline([
    ('multinomial_imputer', OneHotEncoder(handle_unknown='ignore')),
])

# Preprocessing numerical variable: 6 bins ('uniform' strategy), one-hot encoded
numerical_trasformer = Pipeline([
    ('numerical_imputer', KBinsDiscretizer(n_bins=6, encode='onehot', strategy='uniform')),
])

# Route each column group to its transformer; columns not listed in any group
# are passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('drop_columns', 'drop', drop_columns),
        ('binaries', binary_transformer, binary_features),
        ('multinomial', multinominal_trasformer, multinominal_features),
        ('numerical', numerical_trasformer, numeric_features),
    ],
    remainder='passthrough',
)

# Single-step pipeline wrapping the column transformer
preprocess_pipe = Pipeline([
    ('preprocessor', preprocessor),
])

### Visualize Pipeline

In [8]:
# Display the pipeline (shown as a diagram because of set_config(display='diagram'))
preprocess_pipe

### Test Preprocessor

In [9]:
# Smoke test: fit on the full feature frame, then transform only the first row
fit = preprocess_pipe.fit(X)
testcase = X.iloc[:1]
test = fit.transform(testcase)

# Densify the transformed row so the encoded values are visible
test.toarray()

array([[1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00

**Comment**: The pipeline reproduces the pre-processing steps correctly. 

Let's dump it

In [10]:
# Fit on all rows; note that `preprocessor` now refers to the fitted pipeline
preprocessor = preprocess_pipe.fit(X)
print('Dump preprocessor with pickle...')

# Persist the fitted pipeline next to the other model artifacts
pickle_path = models + 'preprocessor.pkl'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(preprocessor, pickle_file)

Dump preprocessor with pickle...


## Create the model package

In [11]:
# Folder that collects every file of the champion model package
chmp_dir = src + 'champion'

# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is safe to
# re-run, has no check-then-create race, and also creates a missing parent folder
os.makedirs(chmp_dir, exist_ok=True)

In [12]:
# Misclassification threshold
# Encode the target labels with an OrdinalEncoder; `y` is overwritten with
# the encoded array
oe = OrdinalEncoder()
y = oe.fit(y).transform(y)

# The threshold is the mean of the encoded target
prob_threshold = np.mean(y)
prob_threshold

0.247

### Score Function

#### score

In [13]:
def score(age, authorities_contacted, auto_make, auto_model, auto_year, bodily_injuries, capital_gains, capital_loss, collision_type, incident_city, incident_date, incident_hour_of_the_day, incident_location, incident_severity, incident_state, incident_type, injury_claim, insured_education_level, insured_hobbies, insured_occupation, insured_relationship, insured_sex, insured_zip, months_as_customer, number_of_vehicles_involved, police_report_available, policy_annual_premium, policy_bind_date, policy_csl, policy_deductable, policy_number, policy_state, property_claim, property_damage, total_claim_amount, umbrella_limit, vehicle_claim, witnesses):
    "Output: EVENT_PROBABILITY, CLASSIFICATION_LABEL"
    # The docstring above is kept verbatim: it names the two returned values.
    # NOTE(review): presumably required by the SAS score-code convention
    # referenced in this notebook - confirm before rewording it.
    #
    # Scores a single record. Each parameter is one raw input column, and the
    # parameters are listed in the same order as the DataFrame columns built
    # below. Returns a tuple (EVENT_PROBABILITY as float, CLASSIFICATION_LABEL
    # as '1' or '0').

    # The fitted objects are cached in module-level names so the pickles are
    # read on the first call only. Without this declaration the names would be
    # function locals and both files would be reloaded for every scored row.
    # Gotcha: after re-dumping a pickle, the cached object stays in use until
    # these names are deleted or the kernel is restarted.
    global _ModelPreprocessingFit, _ModelFit

    # Define the misclassification threshold
    prob_threshold = 0.247

    # Initiate model
    try:
        _ModelPreprocessingFit
        _ModelFit

    except NameError:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open("../models/preprocessor.pkl", "rb") as _pModelPreprocessingFile:
            _ModelPreprocessingFit = pickle.load(_pModelPreprocessingFile)

        with open("../models/model.pkl", "rb") as _pModelFile:
            _ModelFit = pickle.load(_pModelFile)

    # Construct the input array for scoring
    input_array = pd.DataFrame([[age, authorities_contacted, auto_make, auto_model, auto_year, bodily_injuries,
                                     capital_gains, capital_loss, collision_type, incident_city, incident_date,
                                     incident_hour_of_the_day, incident_location, incident_severity, incident_state,
                                     incident_type, injury_claim, insured_education_level, insured_hobbies,
                                     insured_occupation, insured_relationship, insured_sex, insured_zip,
                                     months_as_customer, number_of_vehicles_involved, police_report_available,
                                     policy_annual_premium, policy_bind_date, policy_csl, policy_deductable,
                                     policy_number, policy_state, property_claim, property_damage, total_claim_amount,
                                     umbrella_limit, vehicle_claim, witnesses]],

                                   columns=['age', 'authorities_contacted', 'auto_make', 'auto_model', 'auto_year', 'bodily_injuries',
                                            'capital_gains', 'capital_loss', 'collision_type', 'incident_city', 'incident_date',
                                            'incident_hour_of_the_day', 'incident_location', 'incident_severity', 'incident_state',
                                            'incident_type', 'injury_claim', 'insured_education_level', 'insured_hobbies', 'insured_occupation',
                                            'insured_relationship', 'insured_sex', 'insured_zip', 'months_as_customer',
                                            'number_of_vehicles_involved', 'police_report_available', 'policy_annual_premium', 'policy_bind_date',
                                            'policy_csl', 'policy_deductable', 'policy_number', 'policy_state', 'property_claim',
                                            'property_damage', 'total_claim_amount', 'umbrella_limit', 'vehicle_claim', 'witnesses'])

    # Transform inputs
    _transformed_inputs = _ModelPreprocessingFit.transform(input_array)

    # Calculate the model output for the single input row
    # NOTE(review): this calls predict(), not predict_proba(). Whether the value
    # is a probability or a class label depends on the object stored in
    # model.pkl - confirm it really is the event probability.
    _pred_proba = _ModelFit.predict(_transformed_inputs)

    # Retrieve the event probability. Index the first element explicitly:
    # float() on a whole array is deprecated in recent NumPy releases.
    EVENT_PROBABILITY = float(_pred_proba[0])

    # Determine the predicted target category
    if EVENT_PROBABILITY >= prob_threshold:
        CLASSIFICATION_LABEL = '1'
    else:
        CLASSIFICATION_LABEL = '0'

    return (EVENT_PROBABILITY, CLASSIFICATION_LABEL)

#### Unit test the score code

In [14]:
# Score the first row of the dataset; values are passed positionally, in the
# order of the score() signature
testcase = score(48, 'Police', 'Saab', '92x', 2004, 1, 53300, 0, 'Side Collision',
                 'Columbus', '2015-01-25', 5, '9935 4th Drive', 'Major Damage',
                 'SC', 'Single Vehicle Collision', 6510, 'MD', 'sleeping',
                 'craft-repair', 'husband', 'MALE', 466132, 328, 1, 'YES',
                 1406.91, '2014-10-17', '250/500', 1000, 521585, 'OH', 13020,
                 'YES', 71610, 0, 52080, 2.0)

# One-row frame of the outputs; reused later to describe the output variables
scoreOut = pd.DataFrame([testcase], columns=["EVENT_PROBABILITY", "CLASSIFICATION_LABEL"])

# Inline checks so this cell actually fails when the score code returns a
# malformed result
assert len(testcase) == 2, "score() must return (EVENT_PROBABILITY, CLASSIFICATION_LABEL)"
assert isinstance(testcase[0], float), "EVENT_PROBABILITY must be a float"
assert 0.0 <= testcase[0] <= 1.0, "EVENT_PROBABILITY must lie in [0, 1]"
assert testcase[1] in ('0', '1'), "CLASSIFICATION_LABEL must be '0' or '1'"

# class Compiler(unittest.TestCase):

#     def BasicTest(self):

#         self.assertTrue(bool(score(48, 'Police', 'Saab', '92x', 2004, 1, 53300, 0, 'Side Collision',
#                                    'Columbus', '2015-01-25', 5, '9935 4th Drive', 'Major Damage',
#                                    'SC', 'Single Vehicle Collision', 6510, 'MD', 'sleeping',
#                                    'craft-repair', 'husband', 'MALE', 466132, 328, 1, 'YES',
#                                    1406.91, '2014-10-17', '250/500', 1000, 521585, 'OH', 13020,
#                                    'YES', 71610, 0, 52080, 2)))


# unittest.main(argv=['first-arg-is-ignored'], exit=False)

#### Write the score.py

Based on the documentation :

https://go.documentation.sas.com/?docsetId=mdlmgrug&docsetTarget=n04i7s6bdu7ilgn1e350am3byuxx.htm&docsetVersion=15.3&locale=en, 

we write the function to score the model on SAS Viya Platform

In [15]:
%%writefile ../src/champion/score.py

import pandas as pd
import lightgbm
import pickle

def score(age, authorities_contacted, auto_make, auto_model, auto_year, bodily_injuries, capital_gains, capital_loss, collision_type, incident_city, incident_date, incident_hour_of_the_day, incident_location, incident_severity, incident_state, incident_type, injury_claim, insured_education_level, insured_hobbies, insured_occupation, insured_relationship, insured_sex, insured_zip, months_as_customer, number_of_vehicles_involved, police_report_available, policy_annual_premium, policy_bind_date, policy_csl, policy_deductable, policy_number, policy_state, property_claim, property_damage, total_claim_amount, umbrella_limit, vehicle_claim, witnesses):
    "Output: EVENT_PROBABILITY, CLASSIFICATION_LABEL"
    # The docstring above is kept verbatim: it names the two returned values.
    # NOTE(review): presumably required by the SAS score-code convention -
    # confirm before rewording it.
    #
    # Scores a single record. Each parameter is one raw input column, and the
    # parameters are listed in the same order as the DataFrame columns built
    # below. Returns a tuple (EVENT_PROBABILITY as float, CLASSIFICATION_LABEL
    # as '1' or '0').

    # The fitted objects are cached in module-level names so the pickles are
    # read on the first call only. Without this declaration the names would be
    # function locals and both files would be reloaded for every scored row.
    global _ModelPreprocessingFit, _ModelFit

    # Define the misclassification threshold
    prob_threshold = 0.247

    # Initiate model
    try:
        _ModelPreprocessingFit
        _ModelFit

    except NameError:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open("/models/resources/viya/<Model-UUID>/preprocessor.pkl", "rb") as _pModelPreprocessingFile:
            _ModelPreprocessingFit = pickle.load(_pModelPreprocessingFile)

        with open("/models/resources/viya/<Model-UUID>/model.pkl", "rb") as _pModelFile:
            _ModelFit = pickle.load(_pModelFile)

    # Construct the input array for scoring
    input_array = pd.DataFrame([[age, authorities_contacted, auto_make, auto_model, auto_year, bodily_injuries,
                                     capital_gains, capital_loss, collision_type, incident_city, incident_date,
                                     incident_hour_of_the_day, incident_location, incident_severity, incident_state,
                                     incident_type, injury_claim, insured_education_level, insured_hobbies,
                                     insured_occupation, insured_relationship, insured_sex, insured_zip,
                                     months_as_customer, number_of_vehicles_involved, police_report_available,
                                     policy_annual_premium, policy_bind_date, policy_csl, policy_deductable,
                                     policy_number, policy_state, property_claim, property_damage, total_claim_amount,
                                     umbrella_limit, vehicle_claim, witnesses]],

                                   columns=['age', 'authorities_contacted', 'auto_make', 'auto_model', 'auto_year', 'bodily_injuries',
                                            'capital_gains', 'capital_loss', 'collision_type', 'incident_city', 'incident_date',
                                            'incident_hour_of_the_day', 'incident_location', 'incident_severity', 'incident_state',
                                            'incident_type', 'injury_claim', 'insured_education_level', 'insured_hobbies', 'insured_occupation',
                                            'insured_relationship', 'insured_sex', 'insured_zip', 'months_as_customer',
                                            'number_of_vehicles_involved', 'police_report_available', 'policy_annual_premium', 'policy_bind_date',
                                            'policy_csl', 'policy_deductable', 'policy_number', 'policy_state', 'property_claim',
                                            'property_damage', 'total_claim_amount', 'umbrella_limit', 'vehicle_claim', 'witnesses'])

    # Transform inputs
    _transformed_inputs = _ModelPreprocessingFit.transform(input_array)

    # Calculate the model output for the single input row
    # NOTE(review): this calls predict(), not predict_proba(). Whether the value
    # is a probability or a class label depends on the object stored in
    # model.pkl - confirm it really is the event probability.
    _pred_proba = _ModelFit.predict(_transformed_inputs)

    # Retrieve the event probability. Index the first element explicitly:
    # float() on a whole array is deprecated in recent NumPy releases.
    EVENT_PROBABILITY = float(_pred_proba[0])

    # Determine the predicted target category
    if EVENT_PROBABILITY >= prob_threshold:
        CLASSIFICATION_LABEL = '1'
    else:
        CLASSIFICATION_LABEL = '0'

    return (EVENT_PROBABILITY, CLASSIFICATION_LABEL)

Overwriting ../src/champion/score.py


### Model.pkl file

In [17]:
# Everything matched directly under the models folder goes into the package folder
model_artifacts = glob.glob(models + '*')
for artifact_path in model_artifacts:
    shutil.copy(artifact_path, chmp_dir)

# Show what the package folder contains now
os.listdir(chmp_dir)

['fileMetadata.json',
 'inputVar.json',
 'LightGBM.zip',
 'model.pkl',
 'ModelProperties.json',
 'outputVar.json',
 'preprocessor.pkl',
 'score.py']

### Model Metadata 

In [18]:
# The cells below call methods on an instance stored under the name `JSONFiles`,
# which presumably starts out as the class brought in by `from pzmm import *`.
# Instantiate only while the name still refers to a class, so re-running this
# cell does not try to call the instance (TypeError).
if isinstance(JSONFiles, type):
    JSONFiles = JSONFiles()

#### inputVar.json

In [19]:
# Describe the input variables from the feature frame X and write the JSON
# file into the package folder
JSONFiles.writeVarJSON(X, isInput=True, jPath=chmp_dir)

#### outputVar.json

In [20]:
# Describe the output variables from the one-row score result
# (EVENT_PROBABILITY, CLASSIFICATION_LABEL) and write the JSON file into the
# package folder
JSONFiles.writeVarJSON(scoreOut, isInput=False, jPath=chmp_dir)

#### ModelProperties.json

In [21]:
modelName = 'LightGBM'

# Arguments for the model properties file, collected in one place
model_properties = dict(
    modelName=modelName,
    modelDesc='',
    targetVariable=target,
    modelType='tree',
    modelPredictors=features,
    targetEvent='1',
    numTargetCategories=2,
    eventProbVar='EVENT_PROBABILITY',
    jPath=chmp_dir,
    modeler='sasdemo',
)
JSONFiles.writeModelPropertiesJSON(**model_properties)

# File metadata is written with an empty string as first argument
# NOTE(review): if that argument is a file-name prefix, confirm the generated
# metadata matches the actual file names in the package (score.py, model.pkl).
JSONFiles.writeFileMetadataJSON('', jPath=chmp_dir)

### Zip the package

In [22]:
# Zip the contents of the package folder, using 'LightGBM' as the model prefix
ZipModel.zipFiles(fileDir=chmp_dir, modelPrefix='LightGBM')