In [1]:
import pandas as pd
from pycaret.regression import (
    setup,
    compare_models,
    evaluate_model,
    predict_model,
    finalize_model,
    save_model,
    load_model,
    create_api,
    create_docker,
)

In [2]:
# location of the wine-quality CSV, relative to this notebook
path = "../data/wine-quality.csv"

In [3]:
# load the CSV at `path` into a DataFrame
df = pd.read_csv(filepath_or_buffer=path)

In [4]:
# preview the first five rows of the data
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
# setup the dataset
# must be called before executing any other function
# can configure many types of transformation operations
# by default Missing Value Imputation, One-Hot Encoding and Train-Test Split operations will be performed
# press enter to continue
# session_id pins the random seed so that the train/test split and every
# downstream result are reproducible on a fresh kernel; 3161 is the seed
# reported by the recorded run of this cell
# the target is taken as the last column of the frame ('quality' in the recorded run)
grid = setup(data=df, target=df.columns[-1], session_id=3161)

Unnamed: 0,Description,Value
0,session_id,3161
1,Target,quality
2,Original Data,"(1599, 12)"
3,Missing Values,False
4,Numeric Features,11
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(1119, 11)"


In [6]:
# train and compare all supported models
# uses cross-validation
# the call renders a comparison table (MAE, MSE, RMSE, R2, RMSLE, MAPE, training time)
# and its return value is kept as `best_model` for the following cells
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.4082,0.3602,0.5973,0.446,0.0933,0.0761,0.153
rf,Random Forest Regressor,0.4435,0.3781,0.6119,0.4188,0.0954,0.0824,0.198
gbr,Gradient Boosting Regressor,0.4858,0.4065,0.6347,0.3746,0.098,0.0892,0.067
lightgbm,Light Gradient Boosting Machine,0.4645,0.4105,0.6373,0.367,0.0989,0.086,0.132
ada,AdaBoost Regressor,0.5163,0.4196,0.6462,0.3557,0.1002,0.0953,0.051
ridge,Ridge Regression,0.5054,0.4342,0.6573,0.3311,0.1015,0.0929,0.007
br,Bayesian Ridge,0.5055,0.4344,0.6575,0.3308,0.1016,0.0929,0.006
lr,Linear Regression,0.5052,0.4343,0.6576,0.3302,0.1015,0.0929,0.339
huber,Huber Regressor,0.5062,0.4462,0.6661,0.3124,0.1031,0.0935,0.019
lar,Least Angle Regression,0.5213,0.4623,0.675,0.2856,0.1041,0.0956,0.007


In [7]:
# report the best model
# prints the estimator together with its hyperparameters
print(best_model)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=3161, verbose=0, warm_start=False)


In [8]:
# evaluate the model using a number of different plots
# click on the different plot types to explore
# some plots may not work depending on the data and the model
# NOTE: the output is an interactive widget, so it only renders in a live kernel
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [9]:
# make predictions on new data
# `new_data` is a placeholder: supply a DataFrame of features without the label column,
# then uncomment the line below
# predict_model(best_model, new_data)

In [10]:
# finalize model
# trains the model on the entire dataset including the hold-out set
# does not change any parameter of the model
# the result is stored separately as `final_model`; `best_model` is left as it was
final_model = finalize_model(best_model)

In [11]:
# save model as pickle file
# the transformation pipeline is saved together with the model under the name 'regression_model'
# NOTE: as the last expression of the cell, the call's return value is echoed as cell output
save_model(final_model, 'regression_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='quality',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strategy...
                  ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0,
                                      criterion='mse', max_depth=None,
                                      max_features='auto', max_leaf_nodes=None,
                                      max_samples=None,
                                    

In [12]:
# load saved model for use
# reads back what was saved under the name 'regression_model' (pipeline and model)
# NOTE(review): the original comments describe this as a pickle file - only load files you trust
model = load_model('regression_model')

Transformation Pipeline and Model Successfully Loaded


In [13]:
# create the API script that the Docker image will run
# create_docker expects the name of an API script generated by create_api
# (<name>.py in the working directory), not the name of a saved model file;
# without this step the Dockerfile would point at a script that does not exist
# the finalized estimator is passed (not the loaded pipeline) so that it is
# saved the same way as in the save_model cell
create_api(final_model, 'regression_model')

# create Dockerfile for the API
# also creates a requirements.txt file for dependencies
create_docker('regression_model')

Writing requirements.txt
Writing Dockerfile
Dockerfile and requirements.txt successfully created.
To build image you have to run --> !docker image build -f "Dockerfile" -t IMAGE_NAME:IMAGE_TAG .
        
