<H1>Deploying a Machine Learning Model Pipeline Test</H1>

<H3>1. Import Packages</H3>

In [1]:
# Import Packages

import os
import pickle

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier


<H3>2. Get Census Data and Read it with Pandas</H3>

In [2]:
# Build the path to the census data

# Assumes the notebook is run from a direct sub-directory of the project root.
# Taking the parent of the working directory replaces the previous fixed-width
# slice of the path string, which silently produced a wrong path whenever the
# notebook folder name had a different length.
project_path = os.path.dirname(os.getcwd())
data_path = os.path.join(project_path, "data", "census.csv")

print(data_path)


/home/k-irw/Udacity/DevOps/Project_2/Deploying-a-Scalable-ML-Pipeline-with-FastAPI/data/census.csv


In [3]:
# Load the census data into a DataFrame and show its (rows, columns) shape

data = pd.read_csv(filepath_or_buffer=data_path)
data.shape

(32561, 15)

<H3>3. Process Data</H3>

In [4]:

def process_data(
    X, categorical_features=None, label=None, training=True, encoder=None, lb=None
):
    """Convert a raw DataFrame into the arrays consumed by the model.

    The categorical columns are one-hot encoded and concatenated after the
    remaining columns; the label column, when given, is split off and
    binarized.

    Parameters
    ----------
    X : pandas.DataFrame
        Input rows, including the label column when ``label`` is given.
    categorical_features : list[str] | None
        Names of the columns to one-hot encode. ``None`` means no columns.
    label : str | None
        Name of the label column. When ``None`` an empty array is returned
        for ``y``.
    training : bool
        When ``True`` a new encoder and label binarizer are fitted on ``X``.
        Otherwise the supplied ``encoder`` / ``lb`` are applied as-is.
    encoder : object | None
        Fitted encoder exposing ``transform``; used only when not training.
    lb : object | None
        Fitted label binarizer exposing ``transform``; used only when not
        training. When ``None`` the labels are returned untransformed.

    Returns
    -------
    X : numpy.ndarray
        Non-categorical columns followed by the encoded categorical columns.
    y : numpy.ndarray | pandas.Series
        Binarized labels, an empty array when ``label`` is ``None``, or the
        raw label column when not training and ``lb`` is ``None``.
    encoder, lb
        The fitted (training) or passed-through (inference) transformers.

    Raises
    ------
    ValueError
        If ``training`` is ``True`` but no ``label`` was supplied, because the
        label binarizer cannot be fitted without labels.
    """
    # None stands in for "no categorical columns" so that no mutable object
    # is shared between calls through the default argument.
    if categorical_features is None:
        categorical_features = []

    # Fail early with a clear message instead of an AttributeError raised
    # later when the empty placeholder array is asked for ``.values``.
    if training is True and label is None:
        raise ValueError("process_data: 'label' is required when training=True")

    if label is not None:
        y = X[label]
        X = X.drop([label], axis=1)
    else:
        y = np.array([])

    X_categorical = X[categorical_features].values
    X_continuous = X.drop(columns=categorical_features)

    if training is True:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        lb = LabelBinarizer()
        X_categorical = encoder.fit_transform(X_categorical)
        y = lb.fit_transform(y.values).ravel()
    else:
        X_categorical = encoder.transform(X_categorical)
        # Binarize only when labels were extracted and a binarizer exists.
        # An explicit check (rather than catching AttributeError) keeps the
        # label-free inference path working without hiding genuine errors
        # raised inside ``lb.transform``.
        if label is not None and lb is not None:
            y = lb.transform(y.values).ravel()

    X = np.concatenate([X_continuous, X_categorical], axis=1)
    return X, y, encoder, lb
    

<H3>4. Build and Train ML Model</H3>

In [5]:
#  Train Machine learning Model

def train_model(X_train, y_train, random_state=None):
    """Fit a random forest classifier on the training data and return it.

    Parameters
    ----------
    X_train : array-like
        Training features, passed straight to ``fit``.
    y_train : array-like
        Training labels, passed straight to ``fit``.
    random_state : int | None
        Forwarded to ``RandomForestClassifier``. Pass an integer to make the
        fitted model repeatable between runs; the default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    The fitted ``RandomForestClassifier`` instance.
    """
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    return model


In [6]:

# Split Data
# A fixed seed makes the train/test partition identical on every
# Restart & Run All, so the metrics computed from it can be compared
# between runs.
SPLIT_SEED = 42
train, test = train_test_split(data, test_size=0.20, random_state=SPLIT_SEED)

cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# Fit the encoder and label binarizer on the training split only.
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=cat_features, label="salary", training=True
)

# Re-use the fitted encoder and binarizer on the test split so both splits
# share the same feature layout and label encoding.
X_test, y_test, _, _ = process_data(
    test,
    categorical_features=cat_features,
    label="salary",
    training=False,
    encoder=encoder,
    lb=lb,
)

# Use the train_model function to train the model on the training dataset
model = train_model(X_train, y_train)


<H3>5. Calculate ML Model Metrics</H3>

In [7]:
# Run model inferences and return the predictions

def inference(model, X):
    """Return whatever ``model.predict`` produces for the inputs ``X``."""
    return model.predict(X)


In [8]:
# Predict labels for the held-out test features with the trained model.

preds = inference(model=model, X=X_test)


In [9]:
# Validates the trained machine learning model using precision, recall, and F1.

def compute_model_metrics(y, preds):
    """Score predictions against the known labels.

    Parameters
    ----------
    y : array-like
        Known labels.
    preds : array-like
        Predicted labels.

    Returns
    -------
    tuple
        ``(precision, recall, fbeta)`` where ``fbeta`` is computed with
        ``beta=1``. Every metric is called with ``zero_division=1``.
    """
    # The same zero-division policy is applied to all three metrics.
    shared = {"zero_division": 1}
    precision = precision_score(y, preds, **shared)
    recall = recall_score(y, preds, **shared)
    fbeta = fbeta_score(y, preds, beta=1, **shared)
    return precision, recall, fbeta
    

In [10]:
# Score the test-set predictions and report the three metrics

p, r, fb = compute_model_metrics(y=y_test, preds=preds)
print(f"Precision: {p:.4f} | Recall: {r:.4f} | F1: {fb:.4f}")


Precision: 0.7164 | Recall: 0.6352 | F1: 0.6733


<H3>6. Compute Performance on Model Slices</H3>

In [11]:
# Compute Model Metrics on Data Slice

def performance_on_categorical_slice(
    data, column_name, slice_value, categorical_features, label, encoder, lb, model
):
    """Compute precision, recall and F-beta on one slice of ``data``.

    The slice is every row whose ``column_name`` equals ``slice_value``.
    Those rows are run through ``process_data`` in inference mode (re-using
    the given ``encoder`` and ``lb``), scored with ``model`` via
    ``inference`` and evaluated with ``compute_model_metrics``.

    Returns
    -------
    tuple
        ``(precision, recall, fbeta)`` for the selected rows.
    """
    in_slice = data[column_name] == slice_value
    slice_frame = data.loc[in_slice]

    # training=False: apply the already fitted transformers, never refit
    # them on the slice.
    features, labels, _, _ = process_data(
        slice_frame,
        categorical_features=categorical_features,
        label=label,
        training=False,
        encoder=encoder,
        lb=lb,
    )

    slice_preds = inference(model, features)
    precision, recall, fbeta = compute_model_metrics(labels, slice_preds)
    return precision, recall, fbeta
    

In [12]:
# Report metrics for every value of every categorical feature

for col in cat_features:
    # Each distinct value of the column in the test split defines one slice;
    # sorting keeps the report order stable.
    for slicevalue in sorted(test[col].unique()):
        count = len(test[test[col] == slicevalue])

        p, r, fb = performance_on_categorical_slice(
            test, col, slicevalue, cat_features, "salary", encoder, lb, model
        )

        print(f"{col}: {slicevalue}, Count: {count}")
        print(f"Precision: {p:.4f} | Recall: {r:.4f} | F1: {fb:.4f}")
        

workclass: ?, Count: 385
Precision: 0.6897 | Recall: 0.5128 | F1: 0.5882
workclass: Federal-gov, Count: 197
Precision: 0.6944 | Recall: 0.7246 | F1: 0.7092
workclass: Local-gov, Count: 388
Precision: 0.7379 | Recall: 0.6609 | F1: 0.6972
workclass: Never-worked, Count: 1
Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
workclass: Private, Count: 4580
Precision: 0.6945 | Recall: 0.6108 | F1: 0.6500
workclass: Self-emp-inc, Count: 230
Precision: 0.7630 | Recall: 0.8374 | F1: 0.7984
workclass: Self-emp-not-inc, Count: 476
Precision: 0.7684 | Recall: 0.5328 | F1: 0.6293
workclass: State-gov, Count: 255
Precision: 0.8333 | Recall: 0.7534 | F1: 0.7914
workclass: Without-pay, Count: 1
Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000
education: 10th, Count: 179
Precision: 0.6667 | Recall: 0.2667 | F1: 0.3810
education: 11th, Count: 224
Precision: 0.8000 | Recall: 0.3636 | F1: 0.5000
education: 12th, Count: 101
Precision: 0.5000 | Recall: 0.1429 | F1: 0.2222
education: 1st-4th, Count: 29
Precision