In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import setup, compare_models, create_api, create_model, tune_model, evaluate_model, create_app
from pycaret.datasets import get_data

In [2]:
datasets=get_data('index')

Unnamed: 0,Dataset,Data Types,Default Task,Target Variable 1,Target Variable 2,# Instances,# Attributes,Missing Values
0,anomaly,Multivariate,Anomaly Detection,,,1000,10,N
1,france,Multivariate,Association Rule Mining,InvoiceNo,Description,8557,8,N
2,germany,Multivariate,Association Rule Mining,InvoiceNo,Description,9495,8,N
3,bank,Multivariate,Classification (Binary),deposit,,45211,17,N
4,blood,Multivariate,Classification (Binary),Class,,748,5,N
5,cancer,Multivariate,Classification (Binary),Class,,683,10,N
6,credit,Multivariate,Classification (Binary),default,,24000,24,N
7,diabetes,Multivariate,Classification (Binary),Class variable,,768,9,N
8,electrical_grid,Multivariate,Classification (Binary),stabf,,10000,14,N
9,employee,Multivariate,Classification (Binary),left,,14999,10,N


In [3]:
df=get_data('income', save_copy=True)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income >50K
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   education       32561 non-null  object
 3   education-num   32561 non-null  int64 
 4   marital-status  32561 non-null  object
 5   occupation      30718 non-null  object
 6   relationship    32561 non-null  object
 7   race            32561 non-null  object
 8   sex             32561 non-null  object
 9   capital-gain    32561 non-null  int64 
 10  capital-loss    32561 non-null  int64 
 11  hours-per-week  32561 non-null  int64 
 12  native-country  31978 non-null  object
 13  income >50K     32561 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 3.5+ MB


In [5]:
cat=df.select_dtypes(include='object').columns.tolist()
cat

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [6]:
for col in cat:
    print(df[col].value_counts(dropna=False))
    print('======================================')

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
NaN                  1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64
Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: marital-status, dtype: int64
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         377

In [7]:
def clean_marital_status (status):
    married_list=['Married-civ-spouse', 'Married-AF-spouse']
    if status in married_list:
        return 'Married'
    else:
        return 'Not_married'

In [8]:
df['marital-status']=df['marital-status'].apply(lambda x: clean_marital_status(x))

In [9]:
df.rename(columns={'income >50K':'income'}, inplace=True)

In [10]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Not_married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Not_married,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


## Training Process

In [11]:
s=setup(data=df, target='income', fix_imbalance=True, fix_imbalance_method='SMOTEENN', 
        log_experiment=True,log_plots=True, remove_outliers=True, experiment_name='income_experiment', 
        numeric_imputation='median', normalize=True, normalize_method='minmax')

Unnamed: 0,Description,Value
0,Session id,356
1,Target,income
2,Target type,Binary
3,Original data shape,"(32561, 14)"
4,Transformed data shape,"(34772, 58)"
5,Transformed train set shape,"(25017, 58)"
6,Transformed test set shape,"(9769, 58)"
7,Ordinal features,2
8,Numeric features,5
9,Categorical features,8


In [15]:
transformed_data=s.dataset_transformed
transformed_data

Unnamed: 0,age,workclass_Self-emp-not-inc,workclass_Private,workclass_Federal-gov,workclass_Local-gov,workclass_Self-emp-inc,workclass_State-gov,workclass_Without-pay,workclass_Never-worked,education_Bachelors,...,race_White,race_Black,race_Amer-Indian-Eskimo,race_Other,sex,capital-gain,capital-loss,hours-per-week,native-country,income
12430,0.082192,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.397959,0.560445,0
12834,0.013699,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.295918,0.560445,0
23369,0.150685,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.500000,0.560445,0
24702,0.068493,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.346939,0.560445,0
27203,0.438356,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.316327,0.560445,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9268,0.232877,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.397959,0.560445,1
11227,0.561644,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.704082,0.560445,0
9702,0.123288,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.560445,0
19290,0.054795,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.346939,0.560445,0


In [16]:
top_models=compare_models(n_select=3, sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8337,0.9225,0.8508,0.6112,0.7113,0.5989,0.6151,3.19
gbc,Gradient Boosting Classifier,0.8173,0.911,0.8605,0.5818,0.6941,0.5707,0.593,4.306
rf,Random Forest Classifier,0.8231,0.8902,0.8184,0.597,0.6902,0.5707,0.5847,3.661
ada,Ada Boost Classifier,0.8093,0.9085,0.8583,0.5693,0.6844,0.5557,0.5799,3.204
et,Extra Trees Classifier,0.8123,0.874,0.7918,0.581,0.6701,0.5432,0.556,3.72
lr,Logistic Regression,0.7778,0.8967,0.8922,0.5228,0.6592,0.5105,0.5508,3.823
svm,SVM - Linear Kernel,0.7754,0.0,0.8867,0.5213,0.6558,0.5058,0.5458,2.791
dt,Decision Tree Classifier,0.7994,0.7966,0.7914,0.5592,0.6552,0.5196,0.5353,2.931
knn,K Neighbors Classifier,0.7688,0.8388,0.8185,0.5126,0.6303,0.4748,0.5024,3.647
ridge,Ridge Classifier,0.7468,0.0,0.8931,0.4861,0.6295,0.4615,0.511,2.791


In [17]:
top_models

[LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.1, max_depth=-1,
                min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
                random_state=356, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
                subsample_for_bin=200000, subsample_freq=0),
 GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='log_loss', max_depth=3,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_samples_leaf=1,
                            min_samples_split=2, min_weight_fraction_leaf=0.0,
                            n_estimators=100, n_iter_no_change=None,
                            random_state=356, subsample=1.0, tol=0.0001,
                           

In [18]:
tuned_models=[tune_model(models, fold=5) for models in top_models]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8381,0.9185,0.8497,0.6195,0.7166,0.6072,0.622
1,0.8263,0.9151,0.8443,0.5988,0.7007,0.5832,0.6003
2,0.8372,0.9233,0.8341,0.6203,0.7115,0.6015,0.6143
3,0.8306,0.918,0.8434,0.6068,0.7058,0.5913,0.6071
4,0.8302,0.9151,0.826,0.6087,0.7009,0.5861,0.5994
Mean,0.8325,0.918,0.8395,0.6108,0.7071,0.5939,0.6086
Std,0.0045,0.003,0.0084,0.0081,0.0062,0.0091,0.0086


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.839,0.914,0.826,0.6255,0.7119,0.6031,0.6144
1,0.8247,0.9089,0.8315,0.5979,0.6956,0.5771,0.5926
2,0.833,0.9197,0.8368,0.612,0.707,0.5941,0.6083
3,0.8333,0.917,0.8379,0.6125,0.7077,0.595,0.6092
4,0.8322,0.9117,0.8279,0.6121,0.7038,0.5904,0.6035
Mean,0.8324,0.9142,0.832,0.612,0.7052,0.5919,0.6056
Std,0.0045,0.0038,0.0047,0.0087,0.0054,0.0085,0.0074


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7741,0.8903,0.8661,0.5185,0.6487,0.4972,0.5328
1,0.7752,0.8823,0.8534,0.5203,0.6464,0.4954,0.5281
2,0.7821,0.8913,0.8569,0.5293,0.6544,0.508,0.5394
3,0.7799,0.8936,0.8461,0.5269,0.6494,0.5014,0.5313
4,0.7799,0.8861,0.8415,0.5271,0.6482,0.5001,0.5291
Mean,0.7783,0.8887,0.8528,0.5244,0.6494,0.5004,0.5321
Std,0.0031,0.004,0.0086,0.0042,0.0027,0.0043,0.004


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [25]:
from pycaret.classification import finalize_model, automl, evaluate_model,predict_model

In [21]:
best_model=automl(optimize='F1')
best_model

In [24]:
evaluate_model(light_gbm)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [26]:
predict_model(light_gbm)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8256,0.9221,0.8682,0.5943,0.7056,0.5878,0.6091


Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,prediction_label,prediction_score
16442,25,Private,HS-grad,9,Not_married,Handlers-cleaners,Not-in-family,White,Male,0,0,60,England,0,0,0.9742
3982,50,Private,Bachelors,13,Not_married,Prof-specialty,Unmarried,White,Female,0,0,36,Ireland,0,0,0.8446
13702,18,Private,12th,8,Not_married,Farming-fishing,Own-child,White,Male,0,0,55,United-States,0,0,0.9988
24261,38,Private,Some-college,10,Married,Exec-managerial,Husband,White,Male,0,0,40,United-States,0,1,0.9558
11532,79,,Prof-school,15,Married,,Husband,White,Male,0,0,10,United-States,0,1,0.6034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9268,34,Local-gov,Some-college,10,Married,Exec-managerial,Husband,White,Male,0,0,40,United-States,1,1,0.8856
11227,58,Self-emp-not-inc,HS-grad,9,Not_married,Exec-managerial,Not-in-family,White,Male,0,0,70,United-States,0,0,0.7447
9702,26,State-gov,Assoc-acdm,12,Not_married,Adm-clerical,Own-child,Black,Female,0,0,8,United-States,0,0,0.9995
19290,21,Private,HS-grad,9,Not_married,Handlers-cleaners,Own-child,White,Female,0,0,35,United-States,0,0,0.9997


In [22]:
light_gbm=finalize_model(best_model)
light_gbm

In [19]:
!mlflow ui

^C


## Serve the model

In [18]:
#set the environment variable foor tracking URL where the Model Registry resides
!set MLFLOW_TRACKING_URI=http://localhost:5000

In [41]:
#serve the model
!mlflow models serve -m 'models:/light_gbm/Production'

* 'schema_extra' has been renamed to 'json_schema_extra'

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "C:\Users\USER\anaconda3\envs\mlops\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\USER\anaconda3\envs\mlops\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\USER\anaconda3\envs\mlops\Scripts\mlflow.exe\__main__.py", line 7, in <module>
    sys.exit(cli())
  File "C:\Users\USER\anaconda3\envs\mlops\lib\site-packages\click\core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
  File "C:\Users\USER\anaconda3\envs\mlops\lib\site-packages\click\core.py", line 1078, in main
    rv = self.invoke(ctx)
  File "C:\Users\USER\anaconda3\envs\mlops\lib\site-packages\click\core.py", line 1688, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  Fil

In [27]:
light_gbm=create_model(best_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8329,0.9191,0.847,0.6102,0.7094,0.5964,0.6122
1,0.839,0.9279,0.8634,0.6188,0.7209,0.6121,0.6289
2,0.8293,0.9206,0.8504,0.6028,0.7055,0.5902,0.6075
3,0.8333,0.9172,0.8397,0.6122,0.7081,0.5954,0.61
4,0.8376,0.9303,0.8689,0.6155,0.7205,0.6108,0.6288
5,0.8385,0.9264,0.8561,0.6192,0.7187,0.6095,0.6252
6,0.8319,0.9208,0.8525,0.6078,0.7096,0.596,0.6129
7,0.8337,0.9242,0.8597,0.6098,0.7135,0.6011,0.6187
8,0.8434,0.9248,0.8707,0.6257,0.7281,0.6222,0.6389
9,0.8324,0.9163,0.8306,0.6121,0.7048,0.5915,0.6049


## Pycaret API

In [28]:
create_api(light_gbm, 'light_gbm_API')

API successfully created. This function only creates a POST API, it doesn't run it automatically. To run your API, please run this command --> !python light_gbm_API.py


In [42]:
!python light_gbm_API.py

Transformation Pipeline and Model Successfully Loaded


Traceback (most recent call last):
  File "light_gbm_API2.py", line 24, in <module>
    input_model = create_model("light_gbm_API2_input", **{'age': 50, 'workclass': 'Private', 'education': 'Some-college', 'education-num': 10, 'marital-status': 'Married', 'occupation': 'Craft-repair', 'relationship': 'Husband', 'race': 'White', 'sex': 'Male', 'capital-gain': 0, 'capital-loss': 0, 'hours-per-week': 40, 'native-country': 'United-States'})
  File "C:\Users\USER\anaconda3\envs\mlops\lib\site-packages\pydantic\main.py", line 1490, in create_model
    return meta(
  File "C:\Users\USER\anaconda3\envs\mlops\lib\site-packages\pydantic\_internal\_model_construction.py", line 92, in __new__
    private_attributes = inspect_namespace(
  File "C:\Users\USER\anaconda3\envs\mlops\lib\site-packages\pydantic\_internal\_model_construction.py", line 384, in inspect_namespace
    raise PydanticUserError(
pydantic.errors.PydanticUserError: A non-annotated attribute was detected: `age = 50`. All model fiel

In [21]:
create_app('light_gbm')

AttributeError: module 'gradio' has no attribute 'inputs'

In [22]:
pip show gradio

Name: gradio
Version: 4.19.1
Summary: Python library for easily interacting with trained machine learning models
Home-page: 
Author: 
Author-email: Abubakar Abid <gradio-team@huggingface.co>, Ali Abid <gradio-team@huggingface.co>, Ali Abdalla <gradio-team@huggingface.co>, Dawood Khan <gradio-team@huggingface.co>, Ahsen Khaliq <gradio-team@huggingface.co>, Pete Allen <gradio-team@huggingface.co>, Ömer Faruk Özdemir <gradio-team@huggingface.co>, Freddy A Boulton <gradio-team@huggingface.co>, Hannah Blair <gradio-team@huggingface.co>
License: 
Location: c:\users\user\anaconda3\envs\mlops\lib\site-packages
Requires: aiofiles, altair, fastapi, ffmpy, gradio-client, httpx, huggingface-hub, importlib-resources, jinja2, markupsafe, matplotlib, numpy, orjson, packaging, pandas, pillow, pydantic, pydub, python-multipart, pyyaml, ruff, semantic-version, tomlkit, typer, typing-extensions, uvicorn
Required-by: 
Note: you may need to restart the kernel to use updated packages.
