In [1]:
import os, pandas as pd
# Add asset from file system
df = pd.read_csv('/project_data/data_asset/TON_PREV_NEW.csv')
df.head()

Unnamed: 0,BASKET_SIZE,EXTN_COMPOSITION,CARRIER_SERVICE_CODE_OL,CATEGORY,COUNTRY_OF_ORIGIN_OI,DAY_OF_MONTH,DAY_OF_WEEK,DAY_OF_YEAR,EXTN_BRAND,EXTN_DISCOUNT_ID,...,OTHER_CHARGES,OTHER_CHARGES_OL,REQ_DELIVERY_DATE,TOTAL_AMOUNT_USD,WEEKEND,ZIP_CODE,MTS_CTS,HOUR_OF_DAY,LOCKID,RETURN_FLAG
0,1,,STANDARD,Slip,CN,14,Saturday,287,XYZAA,,...,0.0,0.0,0,0.0,1,Zipcode_261,1,16,26,0
1,1,,STANDARD,Slip,CN,17,Tuesday,290,XYZAA,,...,0.0,0.0,0,0.0,0,Zipcode_165,2,16,36,0
2,1,"85% Polyamide, 15% Elastane",PREMIER_EVENING,Slip,CN,19,Thursday,292,XYZAA,,...,25.0,0.0,0,40.0,0,Zipcode_599,11,17,215,1
3,1,"54% Polyamide, 46% Polyester",STANDARD,Slip,CN,24,Tuesday,297,XYZAA,,...,0.0,0.0,0,0.0,0,Zipcode_261,1,15,25,0
4,2,"93% Cotton, 7% Elastane",STANDARD,Maniche Lunghe,PT,30,Monday,303,XYZAB,,...,13.0,0.0,0,251.192578,0,Zipcode_228,12,13,179,0


## Let's fill all NA(s) and empty values with 0

In [2]:
df=df.fillna(0)

In [3]:
df.dtypes

BASKET_SIZE                  int64
EXTN_COMPOSITION            object
CARRIER_SERVICE_CODE_OL     object
CATEGORY                    object
COUNTRY_OF_ORIGIN_OI        object
DAY_OF_MONTH                 int64
DAY_OF_WEEK                 object
DAY_OF_YEAR                  int64
EXTN_BRAND                  object
EXTN_DISCOUNT_ID            object
EXTN_IS_GIFT                object
EXTN_IS_PREORDER            object
EXTN_SHIP_TO_CITY           object
EXTN_SHIP_TO_COUNTRY        object
EXTN_SEASON                 object
LIST_PRICE                   int64
MONTH_OF_YEAR                int64
OTHER_CHARGES              float64
OTHER_CHARGES_OL           float64
REQ_DELIVERY_DATE            int64
TOTAL_AMOUNT_USD           float64
WEEKEND                      int64
ZIP_CODE                    object
MTS_CTS                      int64
HOUR_OF_DAY                  int64
LOCKID                       int64
RETURN_FLAG                  int64
dtype: object

## We can see that we have object columns which need to be converted to category codes to be fed into the model

In [4]:
qual = list( df.loc[:,df.dtypes == 'object'].columns.values )
for col in qual:
     df[col] = df[col].astype('category')
quant = list( df.loc[:,df.dtypes != 'category'].columns.values )
print(qual,quant)

['EXTN_COMPOSITION', 'CARRIER_SERVICE_CODE_OL', 'CATEGORY', 'COUNTRY_OF_ORIGIN_OI', 'DAY_OF_WEEK', 'EXTN_BRAND', 'EXTN_DISCOUNT_ID', 'EXTN_IS_GIFT', 'EXTN_IS_PREORDER', 'EXTN_SHIP_TO_CITY', 'EXTN_SHIP_TO_COUNTRY', 'EXTN_SEASON', 'ZIP_CODE'] ['BASKET_SIZE', 'DAY_OF_MONTH', 'DAY_OF_YEAR', 'LIST_PRICE', 'MONTH_OF_YEAR', 'OTHER_CHARGES', 'OTHER_CHARGES_OL', 'REQ_DELIVERY_DATE', 'TOTAL_AMOUNT_USD', 'WEEKEND', 'MTS_CTS', 'HOUR_OF_DAY', 'LOCKID', 'RETURN_FLAG']


In [5]:
cats = list( df.loc[:,df.dtypes == 'category'].columns.values)
categories={}
for col in cats:
    categories[col]= dict(enumerate(df[col].cat.categories))

In [6]:
df.dtypes

BASKET_SIZE                   int64
EXTN_COMPOSITION           category
CARRIER_SERVICE_CODE_OL    category
CATEGORY                   category
COUNTRY_OF_ORIGIN_OI       category
DAY_OF_MONTH                  int64
DAY_OF_WEEK                category
DAY_OF_YEAR                   int64
EXTN_BRAND                 category
EXTN_DISCOUNT_ID           category
EXTN_IS_GIFT               category
EXTN_IS_PREORDER           category
EXTN_SHIP_TO_CITY          category
EXTN_SHIP_TO_COUNTRY       category
EXTN_SEASON                category
LIST_PRICE                    int64
MONTH_OF_YEAR                 int64
OTHER_CHARGES               float64
OTHER_CHARGES_OL            float64
REQ_DELIVERY_DATE             int64
TOTAL_AMOUNT_USD            float64
WEEKEND                       int64
ZIP_CODE                   category
MTS_CTS                       int64
HOUR_OF_DAY                   int64
LOCKID                        int64
RETURN_FLAG                   int64
dtype: object

In [7]:
df["RETURN_FLAG"].value_counts()

0    128487
1     24287
Name: RETURN_FLAG, dtype: int64

## Here we can see that there are 24K orders that have been returned and 128K orders that haven't been returned

In [8]:
from sklearn.model_selection import train_test_split
X=(df.drop(["RETURN_FLAG"], axis=1))
y=df['RETURN_FLAG']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=42)

##  Install Custom Modules for the Pipeline Transformations

In [9]:
!pip install --upgrade /project_data/data_asset/CustTrans-0.2.zip

Processing /project_data/data_asset/CustTrans-0.2.zip
Building wheels for collected packages: CustTrans
  Building wheel for CustTrans (setup.py) ... [?25ldone
[?25h  Created wheel for CustTrans: filename=CustTrans-0.1-cp36-none-any.whl size=1801 sha256=cfb7d37a8a81d7cc395711ff5eeff929b464770dcc08313ea1f36f96921e764c
  Stored in directory: /home/wsuser/.cache/pip/wheels/d8/13/54/c87b5cac3899188ef9b3013bce4976e8726028e908d07643c6
Successfully built CustTrans
Installing collected packages: CustTrans
  Found existing installation: CustTrans 0.1
    Uninstalling CustTrans-0.1:
      Successfully uninstalled CustTrans-0.1
Successfully installed CustTrans-0.1


In [10]:
from CustomTransformer.CustTrans import TypeSelector,StringIndexer,ConvToCategorical

In [11]:
!pip install sklearn-pandas



In [12]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn_pandas import DataFrameMapper


transformer = Pipeline([
   ('features', FeatureUnion(n_jobs=1, transformer_list=[
       # Part 1
       ('boolean', Pipeline([
           ('selector', TypeSelector('bool')),
       ])),  # booleans close

       ('numericals', Pipeline([
           ('selector', TypeSelector(np.number)),
           ('scaler', StandardScaler()),
       ])),
       # Part 2
       ('categoricals', Pipeline([
           ('convertor', ConvToCategorical()),
           ('selector', TypeSelector('category')),
           ('labeler', StringIndexer()),
           ('encoder', OneHotEncoder(handle_unknown='ignore')),
       ]))
       # categoricals close
   ])),  # features close
   ('clf' , RandomForestClassifier(n_estimators=30,criterion="entropy")),
    
])

## Let's now pass the data through the transformer(fit)

In [13]:
import timeit
start_time = timeit.default_timer()
transformer.fit(X_train, y_train)
print("Time for model training",timeit.default_timer() - start_time)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Time for model training 106.08439615397947


## Let's now evaluate the accuracy of the model using our hold-out test data

In [14]:
scores= transformer.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, scores)
accuracy

  Xt = transform.transform(Xt)


0.8882277859597447

## Let's now save and deploy the model to WML

In [15]:
from watson_machine_learning_client import WatsonMachineLearningAPIClient

In [16]:
wml_credentials = {
                    "url": "https://zen-cpd-zen.apps.marksturpak4.ibmcodetest.us",
                    "username": "admin",
                    "password": "password",
                    "instance_id": "wml_local",
                    "version" : "2.5.0"
 }

## Before we deploy the model, let's create a custom python runtime with our custom transformer library installed

In [17]:
client = WatsonMachineLearningAPIClient(wml_credentials)
# Create a deployment space
space_details = client.spaces.store(meta_props={client.spaces.ConfigurationMetaNames.NAME: "ReturnPropensity_Space"})
space_id = client.spaces.get_uid(space_details)
print(space_id)
# Set default space
client.set.default_space(space_id)
print(client.deployments.list())

ffedc642-e2db-405c-b9fe-cf90806f9e06
----  ----  -----  -------  -------------
GUID  NAME  STATE  CREATED  ARTIFACT_TYPE
----  ----  -----  -------  -------------
None


In [18]:
lib_meta = {
        client.runtimes.LibraryMetaNames.NAME: "CustomTransformers_v0.1",
        client.runtimes.LibraryMetaNames.DESCRIPTION: "CustomTransformers_v0.1",
        client.runtimes.LibraryMetaNames.FILEPATH: "/project_data/data_asset/CustTrans-0.2.zip",
        client.runtimes.LibraryMetaNames.VERSION: "1.0",
        client.runtimes.LibraryMetaNames.PLATFORM: {"name": "python", "versions": ["3.6"]}
    }
custom_library_details = client.runtimes.store_library(lib_meta)
custom_library_uid = client.runtimes.get_library_uid(custom_library_details)
print("Custom Library UID: " + custom_library_uid)

Custom Library UID: 52d8b6f6-2de5-49eb-b564-d50fe92827de


In [19]:
runtimes_meta = {
    client.runtimes.ConfigurationMetaNames.NAME: "CustomTransformers_v0.1", 
    client.runtimes.ConfigurationMetaNames.DESCRIPTION: "CustomTransformers_v0.1", 
    client.runtimes.ConfigurationMetaNames.PLATFORM: { "name": "python", "version": "3.6" }, 
    client.runtimes.ConfigurationMetaNames.LIBRARIES_UIDS: [custom_library_uid]
}

In [20]:
runtime_details = client.runtimes.store(runtimes_meta)
runtime_details

{'metadata': {'id': '18d00ea1-9715-4de3-99d4-6356aa3882ea',
  'guid': '18d00ea1-9715-4de3-99d4-6356aa3882ea',
  'href': '/v4/runtimes/18d00ea1-9715-4de3-99d4-6356aa3882ea',
  'created_at': '2020-02-12T00:39:00.051Z'},
 'entity': {'services': ['Training', 'Scoring'],
  'name': 'CustomTransformers_v0.1',
  'description': 'CustomTransformers_v0.1',
  'custom_libraries': [{'href': '/v4/libraries/52d8b6f6-2de5-49eb-b564-d50fe92827de'}],
  'space': {'href': '/v4/spaces/ffedc642-e2db-405c-b9fe-cf90806f9e06'},
  'system_defined': False,
  'platform': {'name': 'python', 'version': '3.6'}}}

In [21]:
runtime_uid = client.runtimes.get_uid(runtime_details)
print("Runtime UID: " + runtime_uid)

Runtime UID: 18d00ea1-9715-4de3-99d4-6356aa3882ea


In [22]:
model_props = {client.repository.ModelMetaNames.NAME: "ReturnRiskPandas_v0.1",
               client.repository.ModelMetaNames.RUNTIME_UID: runtime_uid,
               client.repository.ModelMetaNames.TYPE: "scikit-learn_0.20"  #TODO confirm
              }

In [23]:
published_model = client.repository.store_model(model=transformer, meta_props=model_props,training_data=X_train, training_target=y_train)
published_model_uid = client.repository.get_model_uid(published_model)
model_details = client.repository.get_details(published_model_uid)

In [24]:
import json
print(json.dumps(model_details, indent=2))

{
  "metadata": {
    "guid": "83b41e82-819c-446b-b2f2-cafccd47e589",
    "id": "83b41e82-819c-446b-b2f2-cafccd47e589",
    "modified_at": "2020-02-12T00:39:55.002Z",
    "created_at": "2020-02-12T00:39:00.002Z",
    "owner": "1000330999",
    "href": "/v4/models/83b41e82-819c-446b-b2f2-cafccd47e589?space_id=ffedc642-e2db-405c-b9fe-cf90806f9e06"
  },
  "entity": {
    "name": "ReturnRiskPandas_v0.1",
    "training_data_references": [
      {
        "location": {
          "bucket": "not_applicable"
        },
        "type": "fs",
        "connection": {
          "access_key_id": "not_applicable",
          "secret_access_key": "not_applicable",
          "endpoint_url": "not_applicable"
        },
        "schema": {
          "id": "1",
          "type": "DataFrame",
          "fields": [
            {
              "name": "BASKET_SIZE",
              "type": "int64"
            },
            {
              "name": "EXTN_COMPOSITION",
              "type": "category"
           

In [25]:
metaProps = {
client.deployments.ConfigurationMetaNames.NAME: "ReturnRiskPandas_CustomTransformers_v0.2",
client.deployments.ConfigurationMetaNames.ONLINE: {}
}

In [26]:
created_deployment = client.deployments.create(published_model_uid, metaProps)



#######################################################################################

Synchronous deployment creation for uid: '83b41e82-819c-446b-b2f2-cafccd47e589' started

#######################################################################################


initializing........
ready


------------------------------------------------------------------------------------------------
Successfully finished deployment creation, deployment_uid='2141fcd5-cdb3-46cd-8664-e165a37c8446'
------------------------------------------------------------------------------------------------




## Now that we have deployed the model, let's get the deployment's UID using which we can score an input payload

In [27]:
deployment_id = client.deployments.get_uid(created_deployment)
print(deployment_id)

2141fcd5-cdb3-46cd-8664-e165a37c8446


## Here we have a sample payload. Let's try scoring this using our model

In [28]:
scoring_payload={client.deployments.ScoringMetaNames.INPUT_DATA: [{"fields":["BASKET_SIZE","EXTN_COMPOSITION","CARRIER_SERVICE_CODE_OL","CATEGORY","COUNTRY_OF_ORIGIN_OI","DAY_OF_MONTH","DAY_OF_WEEK","DAY_OF_YEAR","EXTN_BRAND","EXTN_DISCOUNT_ID","EXTN_IS_GIFT","EXTN_IS_PREORDER","EXTN_SHIP_TO_CITY","EXTN_SHIP_TO_COUNTRY","EXTN_SEASON","LIST_PRICE","MONTH_OF_YEAR","OTHER_CHARGES","OTHER_CHARGES_OL","REQ_DELIVERY_DATE","TOTAL_AMOUNT_USD","WEEKEND","ZIP_CODE","MTS_CTS","HOUR_OF_DAY","LOCKID"],"values":[[3, '91% Nylon, 9% Elastercell', 'STANDARD', 'Bikini', 'US', 18, 'Saturday', 322, 'XYZAI', 'None', 'N', 'N', 'Los Angeles', 'US', 'FW17', 75, 11, 0.0, 0.0, 0, 165.35, 1, 'Zipcode_401', 24, 19, 277]]}]}

In [29]:
prediction = client.deployments.score(deployment_id, scoring_payload)

In [30]:
prediction

{'predictions': [{'fields': ['prediction', 'probability'],
   'values': [[0, [0.6333333333333333, 0.36666666666666664]]]}]}