## Training a model

In [1]:
import json
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier

# Load the raw churn dataset and drop the columns that are not used as model features.
data = pd.read_csv("./churn.txt")
model_data = data.drop(
    ["State", "Phone", "Area Code", "Day Charge", "Eve Charge", "Night Charge", "Intl Charge", "Int'l Plan", "VMail Plan"],
    axis=1,
)
# One-hot encode the remaining categorical columns. The dtype is pinned to uint8
# because pandas >= 2.0 defaults to bool, which would turn the target into
# True/False instead of the 0/1 labels the rest of the notebook expects.
model_data = pd.get_dummies(model_data, dtype=np.uint8)
# Move the target ("Churn?_True.") to the first column and drop the redundant
# "Churn?_False." indicator.
model_data = pd.concat(
    [model_data["Churn?_True."], model_data.drop(["Churn?_False.", "Churn?_True."], axis=1)], axis=1
)

In [2]:
# Feature names: every column after the leading target column.
labels = model_data.iloc[:, 1:].columns

In [3]:
# Shuffle once with a fixed seed, then cut into 70% train / 20% validation / 10% test.
# Positional slicing replaces np.split(DataFrame, ...), which relies on the deprecated
# DataFrame.swapaxes and emits a FutureWarning on pandas >= 2.1.
shuffled = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(model_data))
validation_end = int(0.9 * len(model_data))
train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# The target is the first column; everything after it is a feature.
X_train, y_train = train_data.iloc[:, 1:], train_data.iloc[:, 0]
X_train.shape, y_train.shape

((3500, 11), (3500,))

In [4]:
# Depth-limited random forest with a fixed seed. fit() returns the estimator itself,
# and the bare name on the last line displays the fitted model.
forest_params = {"max_depth": 5, "random_state": 0}
clf = RandomForestClassifier(**forest_params).fit(X_train, y_train)
clf

RandomForestClassifier(max_depth=5, random_state=0)

In [5]:
# Persist the fitted classifier to disk; joblib.dump returns the list of files written.
model_file = "sklearn_model.joblib"
joblib.dump(clf, model_file)

['sklearn_model.joblib']

In [6]:
# Write the held-out rows as bare values: no header row and no index column.
test_data.to_csv("test.csv", index=False, header=False)

## Creating an application and serving it

The whylogs configuration file looks like this:

```YAML
project: example-project
pipeline: example-pipeline
verbose: false
writers:
- data_collection_consent: true
  formats: ['protobuf']
  type: whylabs
```

The `.env` file with the relevant environment variables looks like this:

```bash
# Flask
FLASK_DEBUG=1

# WhyLabs
WHYLABS_CONFIG=.whylabs.yaml
WHYLABS_API_KEY=
WHYLABS_DEFAULT_ORG_ID=org-XXXX
WHYLABS_DEFAULT_DATASET_ID=model-1
WHYLABS_API_ENDPOINT=https://api.whylabsapp.com
```

## Setting up the baseline (training data)

In [None]:
# Baseline frame for profiling: the training rows without the target column.
train = train_data.drop(columns=["Churn?_True."])

In [22]:
import datetime
from whylogs import get_or_create_session
from dotenv import load_dotenv
# Read the variables from the local .env file into the process environment.
load_dotenv()

# Stamp the baseline profile one day before the current UTC time.
baseline_timestamp = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=1)

session = get_or_create_session(".whylogs.yaml")
logger = session.logger(
    dataset_name="my_deployed_model",
    dataset_timestamp=baseline_timestamp,
)
# Profile the training features inside the logger's context manager.
with logger:
    logger.log_dataframe(train)

Using API key ID: 5ISuDTANMy


In [7]:
%%writefile app.py
import datetime
import json
import numpy as np
from flask import request, Flask, Response, jsonify
from joblib import load
from whylogs import get_or_create_session
from dotenv import load_dotenv

app = Flask(__name__)

# Deserialize the trained classifier once, when the module is imported.
model_path = "sklearn_model.joblib"
model = load(model_path)

# Read the variables from the local .env file into the process environment.
load_dotenv()

# A single whylogs logger for the whole process, stamped with the start-up time (UTC).
startup_time = datetime.datetime.now(datetime.timezone.utc)
session = get_or_create_session(".whylogs.yaml")
logger = session.logger(dataset_name="my_deployed_model", dataset_timestamp=startup_time)

@app.route("/ping", methods=["GET"])
def ping():
    """Health-check endpoint.

    The model is loaded when this module is imported, so a running app that can
    answer this request has already loaded it successfully.

    Returns:
        A 200 response whose body is the JSON document {"state": "healthy"}.
    """
    # Serialize explicitly: a dict passed as the Response body is treated as an
    # iterable of its keys, which would send the text "state" instead of JSON.
    body = json.dumps({"state": "healthy"})
    return Response(response=body, status=200, mimetype="application/json")

@app.route("/invocations", methods=["POST"])
def predict():
    """Score one JSON-encoded feature record and log it with whylogs.

    The request body must be a JSON object mapping feature names to numeric
    values (numbers or numeric strings).

    Returns:
        JSON {"prediction": "<predicted class>"} on success, or
        JSON {"error": "..."} with status 400 when the payload cannot be parsed
        into a numeric feature vector.
    """
    try:
        data = json.loads(request.data.decode("utf-8"))
        # Prefer the feature order recorded at fit time (scikit-learn >= 1.0 sets
        # ``feature_names_in_`` when the model was fitted on a DataFrame) so the
        # prediction does not depend on the key order chosen by the client.
        # Fall back to the payload order when the attribute is absent.
        feature_names = getattr(model, "feature_names_in_", None)
        if feature_names is None:
            vector = [float(value) for value in data.values()]
        else:
            vector = [float(data[name]) for name in feature_names]
    except (UnicodeDecodeError, ValueError, TypeError, KeyError, AttributeError) as exc:
        # Malformed JSON, a non-object payload, a missing feature or a non-numeric
        # value is a client error, not a server failure.
        return jsonify({"error": f"invalid payload: {exc!r}"}), 400

    # The model expects a 2-D array with one row per sample.
    vector = np.array(vector).reshape(1, -1)
    pred = model.predict(vector)[0]

    # Log to the WhyLabs platform: first the input record, then the predicted class.
    logger.log(data)
    logger.log({"churn": pred})
    return jsonify({"prediction": str(pred)})

Overwriting app.py


## Sending requests 

First, run `flask run` in a terminal to start the dev server with our app.

Check endpoint health.

In [23]:
import requests

# Base address used for all requests below; the bare last expression displays
# the health-check response.
url = "http://127.0.0.1:5000"
requests.get(url + "/ping")

<Response [200]>

In [24]:
# Send the first few rows of test.csv to the endpoint; keep counting so that
# test_dataset_size ends up as the total number of rows in the file.
max_requests = 10

test_dataset_size = 0
with open("test.csv", "r") as f:
    for row in f:
        if test_dataset_size < max_requests:
            # The first CSV field is the target. Drop it by field rather than by
            # character offset so labels wider than one character (for example
            # "True"/"False") do not corrupt the first feature.
            feature_values = row.rstrip("\n").split(",")[1:]
            payload = json.dumps(dict(zip(labels, feature_values)))
            print(payload)
            print(requests.post(f"{url}/invocations", data=payload).json())
            print()
        test_dataset_size += 1

{"Account Length": "62", "VMail Message": "0", "Day Mins": "5.072152061281491", "Day Calls": "5", "Eve Mins": "6.600411338234018", "Eve Calls": "2", "Night Mins": "3.533501078944156", "Night Calls": "300", "Intl Mins": "4.3952998987652565", "Intl Calls": "7", "CustServ Calls": "6"}
{'prediction': '0'}

{"Account Length": "5", "VMail Message": "0", "Day Mins": "12.524227291335832", "Day Calls": "2", "Eve Mins": "5.639471129269479", "Eve Calls": "5", "Night Mins": "4.937219265876419", "Night Calls": "150", "Intl Mins": "5.881787271425926", "Intl Calls": "5", "CustServ Calls": "3"}
{'prediction': '1'}

{"Account Length": "21", "VMail Message": "0", "Day Mins": "0.8438585898741922", "Day Calls": "3", "Eve Mins": "1.3477250198014177", "Eve Calls": "0", "Night Mins": "4.749775942599953", "Night Calls": "350", "Intl Mins": "4.384146052764612", "Intl Calls": "8", "CustServ Calls": "7"}
{'prediction': '0'}

{"Account Length": "140", "VMail Message": "0", "Day Mins": "1.5811272910168768", "Day C