https://towardsdatascience.com/from-ml-model-to-ml-pipeline-9f95c32c6512

https://github.com/Mykrass/ml_pipeline

In [1]:
!git clone https://github.com/Mykrass/ml_pipeline

Cloning into 'ml_pipeline'...
remote: Enumerating objects: 74, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 74 (delta 39), reused 33 (delta 11), pack-reused 0[K
Receiving objects: 100% (74/74), 76.12 KiB | 2.17 MiB/s, done.
Resolving deltas: 100% (39/39), done.


In [2]:
!ls -la ./ml_pipeline/

total 48
drwxr-xr-x 3 root root 4096 Nov  3 12:45 .
drwxr-xr-x 1 root root 4096 Nov  3 12:45 ..
-rw-r--r-- 1 root root  322 Nov  3 12:45 functions.py
drwxr-xr-x 8 root root 4096 Nov  3 12:45 .git
-rw-r--r-- 1 root root   39 Nov  3 12:45 .gitignore
-rw-r--r-- 1 root root 8856 Nov  3 12:45 ml_pipeline.ipynb
-rw-r--r-- 1 root root 1413 Nov  3 12:45 README.md
-rw-r--r-- 1 root root 2662 Nov  3 12:45 transformers_chur.py
-rw-r--r-- 1 root root 2811 Nov  3 12:45 transformers.py
-rw-r--r-- 1 root root 2591 Nov  3 12:45 transformers_wine.py


In [3]:
# Custom functions
!python3 ./ml_pipeline/functions.py --help
!cat ./ml_pipeline/functions.py

from sklearn.metrics import roc_auc_score

def calculate_roc_auc(model_pipe, X, y):
    """Score a model's predicted probabilities with ROC AUC.

    Parameters:
    model_pipe: sklearn model or pipeline (must expose predict_proba)
    X: features
    y: true target

    Returns the value of roc_auc_score computed from y and the second
    column of model_pipe.predict_proba(X).
    """
    # Column index 1 of predict_proba is used as the score for each row.
    second_column_scores = model_pipe.predict_proba(X)[:, 1]
    return roc_auc_score(y, second_column_scores)

In [4]:
# Custom transformers
!python3 ./ml_pipeline/transformers_wine.py --help
!cat ./ml_pipeline/transformers_wine.py

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Placeholder feature-extraction step that passes the data through.

    No features are derived yet; the class exists so a pipeline has a slot
    where extraction logic can be added later.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self unchanged."""
        return self

    def transform(self, X):
        """Return a copy of X.

        The copy (rather than X itself) is returned so that later steps
        which modify their input cannot alter the caller's object.
        """
        X_transformed = X.copy()
        # Derived features would be added to X_transformed here.
        return X_transformed

class Imputer(BaseEstimator, TransformerMixin):
    """Fill missing values in the selected columns.

    Parameters:
    features: column label(s) to impute
    method: 'mean' makes fit() compute the column means; any other value
        leaves the fill value as given
    value: fill value used when method is not 'mean'

    Note: with method='mean', fit() overwrites the `value` attribute with
    the computed means, so the constructor argument is lost after fitting.
    """
    def __init__(self, features, method='constant', value='missing'):
        self.features = features
        self.method = method
        self.value = value

    def fit(self, X, y=None):
        """Compute the fill value from X when method is 'mean'; returns self."""
        use_mean = self.method == 'mean'
        if use_mean:
            self.value = X[self.features].mean()
        return self

    def transform(self, X):
        """Return a copy of X with nulls in `features` replaced by `value`."""
        filled_columns = X[self.features].fillna(self.value)
        result = X.copy()
        result[self.features] = filled_columns
        return result

class CardinalityReducer(BaseEstimator, TransformerMixin):
    def __init__(self, features, threshold=.1):
        s

# 🔧 Load libraries and data
We will assume that the features included in `columns` are not available in the raw data. This gives us an opportunity to build a custom `FeatureExtractor` transformer that extracts useful features inside the ML pipeline.

In [5]:
# PyCaret
!git clone https://github.com/Mykrass/pycaret.git
!ls -GFlash --color ./pycaret/datasets

Cloning into 'pycaret'...
remote: Enumerating objects: 4020, done.[K
remote: Total 4020 (delta 0), reused 0 (delta 0), pack-reused 4020[K
Receiving objects: 100% (4020/4020), 57.68 MiB | 20.95 MiB/s, done.
Resolving deltas: 100% (2659/2659), done.
Updating files: 100% (278/278), done.
total 49M
 4.0K drwxr-xr-x  2 root  4.0K Nov  3 12:46 [0m[01;34m.[0m/
 4.0K drwxr-xr-x 12 root  4.0K Nov  3 12:46 [01;34m..[0m/
 3.5M -rw-r--r--  1 root  3.5M Nov  3 12:46 amazon.csv
 120K -rw-r--r--  1 root  117K Nov  3 12:46 anomaly.csv
 8.0K -rw-r--r--  1 root  7.6K Nov  3 12:46 asia_gdp.csv
  28K -rw-r--r--  1 root   25K Nov  3 12:46 automobile.csv
 3.6M -rw-r--r--  1 root  3.6M Nov  3 12:46 bank.csv
1008K -rw-r--r--  1 root 1008K Nov  3 12:46 bike.csv
  12K -rw-r--r--  1 root   12K Nov  3 12:46 blood.csv
  36K -rw-r--r--  1 root   35K Nov  3 12:46 boston.csv
  16K -rw-r--r--  1 root   15K Nov  3 12:46 cancer.csv
  60K -rw-r--r--  1 root   58K Nov  3 12:46 concrete.csv
  12K -rw-r--r--  1 root 

In [6]:
!head ./pycaret/datasets/wine.csv

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
7.4,0.7,0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5,red
7.8,0.88,0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5,red
7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5,red
11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6,red
7.4,0.7,0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5,red
7.4,0.66,0,1.8,0.075,13,40,0.9978,3.51,0.56,9.4,5,red
7.9,0.6,0.06,1.6,0.069,15,59,0.9964,3.3,0.46,9.4,5,red
7.3,0.65,0,1.2,0.065,15,21,0.9946,3.39,0.47,10,7,red
7.8,0.58,0.02,2,0.073,9,18,0.9968,3.36,0.57,9.5,7,red


In [8]:
import pandas as pd
# Encode the wine type as an integer label: red -> 0, white -> 1.
# Series.map turns any label missing from the dict into NaN.
wine_dict = {'red': 0, 'white': 1}
df_raw = (
    pd.read_csv('./pycaret/datasets/wine.csv')
    .assign(type=lambda frame: frame['type'].map(wine_dict))
)
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  type                  6497 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 660.0 KB


In [9]:
from collections import Counter
Counter(df_raw['type'])

Counter({0: 1599, 1: 4898})

In [10]:
df_raw.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'type'],
      dtype='object')

In [11]:
# Make the cloned ml_pipeline modules importable
import sys
sys.path.append('/content/ml_pipeline/')

# Data manipulation
from seaborn import load_dataset
import numpy as np
import pandas as pd
from functions import calculate_roc_auc
pd.options.display.precision = 4
pd.options.mode.chained_assignment = None

# Machine learning pipeline
from sklearn.model_selection import train_test_split
from transformers_wine import FeatureExtractor, Imputer, CardinalityReducer, Encoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Load data
select = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'type']

# Cast the columns at positions 1-3 to object dtype so the dtype-based split
# further down classifies them as categorical.
# Assigning through `df.iloc[:, 1:4] = ...` is not used here: on pandas >= 2.0
# that form writes into the existing float64 columns and the dtype stays
# float64. `astype` with a column mapping replaces the dtype on every version.
as_object = {column: 'object' for column in select[1:4]}
df = df_raw[select].astype(as_object)
print(df.shape)
df.head()

(6497, 13)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


Rather than defining feature groups manually, we will divide variables into two categories depending on their type.

In [13]:
SEED = 42
TARGET = 'type'
FEATURES = df.columns.drop(TARGET)

# Numeric-dtype columns form one group; every remaining feature forms the other.
NUMERICAL = df[FEATURES].select_dtypes('number').columns
print(f"Numerical features: {', '.join(NUMERICAL)}")

# np.setdiff1d returns the leftover names sorted and de-duplicated.
CATEGORICAL = pd.Index(np.setdiff1d(FEATURES, NUMERICAL))
print(f"Categorical features: {', '.join(CATEGORICAL)}\n")

# 80/20 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=TARGET),
    df[TARGET],
    test_size=.2,
    random_state=SEED,
    stratify=df[TARGET],
)

print(f"Training features shape: {X_train.shape}")
print(f"Test features shape: {X_test.shape}")

Numerical features: fixed acidity, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality
Categorical features: citric acid, residual sugar, volatile acidity

Training features shape: (5197, 12)
Test features shape: (1300, 12)


In [14]:
# NUMERICAL
df[FEATURES].select_dtypes('number')[:5]

Unnamed: 0,fixed acidity,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [15]:
# OBJECT
df[FEATURES].select_dtypes('O')[:5]

Unnamed: 0,volatile acidity,citric acid,residual sugar
0,0.7,0.0,1.9
1,0.88,0.0,2.6
2,0.76,0.04,2.3
3,0.28,0.56,1.9
4,0.7,0.0,1.9


In [16]:
X_train[:5]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
4110,6.2,0.3,0.49,11.2,0.058,68.0,215.0,0.9966,3.19,0.6,9.4,6
3062,8.3,0.19,0.49,1.2,0.051,11.0,137.0,0.9918,3.06,0.46,11.0,6
5236,8.7,0.3,0.34,4.8,0.018,23.0,127.0,0.9947,3.12,0.49,11.2,7
497,7.2,0.34,0.32,2.5,0.09,43.0,113.0,0.9966,3.32,0.79,11.1,5
826,7.5,0.27,0.34,2.3,0.05,4.0,8.0,0.9951,3.4,0.64,11.0,7


In [17]:
y_train[:5]

4110    1
3062    1
5236    1
497     0
826     0
Name: type, dtype: int64

# 💻 Build ML Pipeline
This pipeline is here to illustrate how custom features can be useful; it is not currently optimised for speed.

In [18]:
# Steps run in order: the custom transformers prepare the frame, RFE keeps
# 8 features, and a logistic regression makes the final prediction.
# NOTE(review): the recorded output shows the solver stopping at max_iter;
# scaling the inputs or raising max_iter (as the warning suggests) may help.
pipe = Pipeline(steps=[
    ('feature_extractor', FeatureExtractor()),
    # Imputer defaults fill categorical nulls with the constant 'missing'.
    ('cat_imputer', Imputer(CATEGORICAL)),
    ('cardinality_reducer', CardinalityReducer(CATEGORICAL, threshold=0.1)),
    ('encoder', Encoder(CATEGORICAL)),
    ('num_imputer', Imputer(NUMERICAL, method='mean')),
    ('feature_selector', RFE(
        LogisticRegression(random_state=SEED, max_iter=500),
        n_features_to_select=8,
    )),
    ('model', LogisticRegression(random_state=SEED, max_iter=500)),
])

pipe.fit(X_train, y_train)

# Report ROC-AUC on the data the model was fitted on, then on the held-out part.
for split_name, X_split, y_split in (("Train", X_train, y_train),
                                     ("Test", X_test, y_test)):
    print(f"{split_name} ROC-AUC: {calculate_roc_auc(pipe, X_split, y_split):.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train ROC-AUC: 0.9902
Test ROC-AUC: 0.9914


In [19]:
pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# 🔎 Explore pipeline
We will check the top features as an example.

In [20]:
# support_ is the selector's boolean mask over the columns it was fitted on;
# indexing feature_names_in_ with it yields the names of the kept features.
selector = pipe['feature_selector']
top_features = selector.feature_names_in_[selector.support_]
print(f"Top {len(top_features)} features: {', '.join(top_features)}")

Top 8 features: fixed acidity, chlorides, total sulfur dioxide, density, pH, sulphates, alcohol, quality


# 🔎 Explore plot
We will check the top features with plots.

In [21]:
# Standard
import pandas as pd
import numpy as np
import os
# Plots
from plotly.offline import iplot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
# Sklearn tools
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *
# Extras
from datetime import date
import warnings
warnings.filterwarnings("ignore")

# Helper functions for structured data
## Get info about the dataset
def dataset_info(dataset, dataset_name: str):
    """Print a plain-text summary of a DataFrame.

    Printed in order: a header with the name and the row/column counts, the
    dtype of every column, the null count of each column that has nulls (or
    a message when there are none), and the total memory usage.

    Parameters:
    dataset: pandas DataFrame to describe
    dataset_name: label shown in the header line
    """
    separator = 30*"="
    print(f"Dataset Name: {dataset_name} \
        | Number of Samples: {dataset.shape[0]} \
        | Number of Columns: {dataset.shape[1]}")
    print(separator)
    print("Column             Data Type")
    print(dataset.dtypes)
    print(separator)
    nulls_per_column = dataset.isnull().sum()
    has_missing = nulls_per_column.sum() > 0
    if has_missing:
        # Show only the columns that actually contain nulls.
        print(nulls_per_column[nulls_per_column > 0])
    else:
        print("No Missing Data on this Dataset!")
    print(separator)
    # memory_usage() reports bytes; dividing by 1e6 expresses it in MB.
    megabytes = np.round(dataset.memory_usage(index=True).sum() / 1e6, 3)
    print("Memory Usage: {} MB".format(megabytes))
## Dataset Sampling
def data_sampling(dataset, frac: float, random_seed: int):
    """Split a DataFrame into a random sample and the rows left over.

    Parameters:
    dataset: pandas DataFrame to split
    frac: fraction of rows to draw into the first part
    random_seed: seed passed to DataFrame.sample for reproducibility

    Returns a tuple (sampled, remainder); both have a fresh 0..n-1 index.
    """
    sampled = dataset.sample(frac=frac, random_state=random_seed)
    # Rows are removed by index label, so the remainder is everything whose
    # label was not drawn into the sample.
    remainder = dataset.drop(sampled.index)
    return sampled.reset_index(drop=True), remainder.reset_index(drop=True)
## Bar Plot
def bar_plot(data, plot_title: str, x_axis: str, y_axis: str):
    """Render a bar chart of a pandas Series, highlighting the first bar.

    Parameters:
    data: Series whose index supplies the x positions and whose values
        supply the bar heights and the text labels
    plot_title: title text, centred above the plot
    x_axis: x-axis title
    y_axis: y-axis title

    The figure is displayed with iplot; nothing is returned.
    """
    colors = ["#0080ff"] * len(data)
    # Highlight the first bar. The guard keeps an empty Series from raising
    # IndexError, so an empty chart is drawn instead.
    if colors:
        colors[0] = "#ff8000"
    trace = go.Bar(y=data.values, x=data.index, text=data.values,
                    marker_color=colors)
    layout = go.Layout(autosize=False, height=600,
                    title={"text" : plot_title,
                       "y" : 0.9,
                       "x" : 0.5,
                       "xanchor" : "center",
                       "yanchor" : "top"},
                    xaxis={"title" : x_axis},
                    yaxis={"title" : y_axis},)
    fig = go.Figure(data=trace, layout=layout)
    fig.update_layout(template="simple_white")
    fig.update_traces(textposition="outside",
                    textfont_size=14,
                    marker=dict(line=dict(color="#000000", width=2)))
    fig.update_yaxes(automargin=True)
    iplot(fig)
## Plot Pie Chart
def pie_plot(data, plot_title: str):
    """Render a pie chart of a pandas Series.

    Parameters:
    data: Series whose index supplies the slice labels and whose values
        supply the slice sizes
    plot_title: title text, centred above the plot

    The figure is displayed with iplot; nothing is returned.
    """
    centered_title = {
        "text" : plot_title,
        "y" : 0.9,
        "x" : 0.5,
        "xanchor" : "center",
        "yanchor" : "top",
    }
    slice_outline = dict(line=dict(color="#000000", width=2))
    fig = go.Figure(
        data=go.Pie(labels=data.index, values=data.values),
        layout=go.Layout(autosize=False, title=centered_title),
    )
    fig.update_traces(textfont_size=14, marker=slice_outline)
    fig.update_yaxes(automargin=True)
    iplot(fig)
## Histogram
def histogram_plot(data, plot_title: str, y_axis: str):
    """Render a histogram of the given values.

    Parameters:
    data: values passed to the histogram as its x data
    plot_title: title text, centred above the plot
    y_axis: y-axis title

    The figure is displayed with iplot; nothing is returned.
    """
    centered_title = {
        "text" : plot_title,
        "y" : 0.9,
        "x" : 0.5,
        "xanchor" : "center",
        "yanchor" : "top",
    }
    fig = go.Figure(
        data=go.Histogram(x=data),
        layout=go.Layout(autosize=False,
                         title=centered_title,
                         yaxis={"title" : y_axis}),
    )
    bar_outline = dict(line=dict(color="#000000", width=2))
    fig.update_traces(marker=bar_outline)
    fig.update_layout(template="simple_white")
    fig.update_yaxes(automargin=True)
    iplot(fig)
# Particular case: Histogram subplot (1, 2)
def histogram_subplot(dataset_a, dataset_b, feature_a: str,
                        feature_b: str, title: str, title_a: str, title_b: str):
    """Render two histograms side by side in a 1x2 grid.

    Parameters:
    dataset_a, dataset_b: data for the left and right panel
    feature_a, feature_b: key looked up in dataset_a / dataset_b
    title: overall figure title, centred above the plot
    title_a, title_b: titles of the left and right panel

    The figure is displayed with iplot; nothing is returned.
    """
    fig = make_subplots(rows=1, cols=2, subplot_titles=(title_a, title_b))

    # Left panel is column 1, right panel is column 2.
    panels = ((dataset_a, feature_a), (dataset_b, feature_b))
    for column_number, (frame, feature) in enumerate(panels, start=1):
        fig.add_trace(go.Histogram(x=frame[feature], showlegend=False),
                      row=1, col=column_number)

    centered_title = {
        "text" : title,
        "y" : 0.9,
        "x" : 0.5,
        "xanchor" : "center",
        "yanchor" : "top",
    }
    fig.update_layout(template="simple_white")
    fig.update_layout(autosize=False,
                      title=centered_title,
                      yaxis={"title" : "<i>Frequency</i>"})
    fig.update_traces(marker=dict(line=dict(color="#000000", width=2)))
    fig.update_yaxes(automargin=True)
    iplot(fig)
# Calculate scores with Test/Unseen labeled data
def test_score_report(data_unseen, predict_unseen):
    le = LabelEncoder()
    data_unseen["Label"] = le.fit_transform(data_unseen.Churn.values)
    data_unseen["Label"] = data_unseen["Label"].astype(int)
    accuracy = accuracy_score(data_unseen["Label"], predict_unseen["Label"])
    roc_auc = roc_auc_score(data_unseen["Label"], predict_unseen["Label"])
    precision = precision_score(data_unseen["Label"], predict_unseen["Label"])
    recall = recall_score(data_unseen["Label"], predict_unseen["Label"])
    f1 = f1_score(data_unseen["Label"], predict_unseen["Label"])

In [22]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'type'],
      dtype='object')

In [33]:
# Share of each wine type (0 = red, 1 = white); the bold tag is now closed.
pie_plot(df["type"].value_counts(),
         plot_title="<b>Wine Type Distribution</b>")

In [47]:
# Share of each quality score; the bold tag is now closed.
pie_plot(df["quality"].value_counts(),
         plot_title="<b>Wine Quality Distribution</b>")

In [60]:
# Top 8 features: fixed acidity, chlorides, total sulfur dioxide, density, pH, sulphates, alcohol, quality
# Count of each alcohol value among red wines (type == 0) rated quality 6.
df_aux = (
    df.query('(quality==6) and (type == 0)')["alcohol"]
    .value_counts()
)
bar_plot(
    df_aux,
    "<b>Quality and Alcohol Red Wine Distribution</b>",
    "<i>Alcohol</i>",
    "<i>Count</i>",
)

In [61]:
# Top 8 features: fixed acidity, chlorides, total sulfur dioxide, density, pH, sulphates, alcohol, quality
# Count of each alcohol value among white wines (type == 1) rated quality 6.
df_aux = df.query('(quality==6) and (type == 1)')
df_aux = df_aux["alcohol"].value_counts()
bar_plot(df_aux, "<b>Quality and Alcohol White Wine Distribution</b>",
         "<i>Alcohol</i>", "<i>Count</i>")

In [41]:
# Top 8 features: fixed acidity, chlorides, total sulfur dioxide, density, pH, sulphates, alcohol, quality
# Red wines (type == 0) with alcohol above 8; both panels use this subset.
df_aux = df_raw[(df_raw["alcohol"] > 8) & (df_raw["type"] == 0)]
histogram_subplot(
    df_aux, df_aux,
    "fixed acidity", "chlorides",
    "<b>Distribution for Red Wine</b>",
    "(a) Fixed Acidity Distribution",
    "(b) Chlorides Distribution",
)

In [39]:
# Top 8 features: fixed acidity, chlorides, total sulfur dioxide, density, pH, sulphates, alcohol, quality
# White wines (type == 1) with alcohol above 3; both panels use this subset.
df_aux = df_raw[(df_raw["alcohol"] > 3) & (df_raw["type"] == 1)]
histogram_subplot(
    df_aux, df_aux,
    "fixed acidity", "chlorides",
    "<b>Distribution for White Wine</b>",
    "(a) Fixed Acidity Distribution",
    "(b) Chlorides Distribution",
)

THE END