# Tutorial 6: Custom pipeline tutorial

## Preparing

### Step 1. Install LightAutoML

Run the cell below if you have not cloned the repository with git (e.g. on Colab or Kaggle).

In [None]:
! pip install -U lightautoml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightautoml
  Downloading LightAutoML-0.3.7.3-py3-none-any.whl (319 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.6/319.6 KB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting catboost>=0.26.1
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting poetry-core<2.0.0,>=1.0.0
  Downloading poetry_core-1.5.1-py3-none-any.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.2/465.2 KB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
Collecting cmaes
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting torch<1.9
  Downloading torch-1.8.1-cp38-cp38-manylinux1_x86_64.whl (804.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m804.1/804.1 MB[0m [31m2.0 MB/s[

### Step 2. Import necessary libraries 

In [None]:
# Standard python libraries
import os
import time
import requests


# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch

# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender
#from lightautoml.transformers.categorical import LabelEncoder

import pickle

### Step 3. Parameters 

In [None]:
# Global settings used by the cells below.

# Thread count handed to torch and to every LightGBM model
N_THREADS = 8
# Number of cross-validation folds passed to the reader
N_FOLDS = 5
# Shared seed for numpy, the reader and the train/test split
RANDOM_STATE = 42
# Fraction of rows held out as the test part
TEST_SIZE = 0.2
# Name of the column holding the target
TARGET_NAME = 'func_class'

### Step 4. Fix torch number of threads and numpy seed 

In [None]:
# Limit the number of threads torch may use, then seed numpy's global random generator.
torch.set_num_threads(N_THREADS)
np.random.seed(RANDOM_STATE)

### Step 5. Example data load 

Download the dataset from the repository if you have not cloned the repository with git.

In [None]:
# Local location of the example dataset.
DATASET_NAME = 'train_feature_graph_values.csv'
DATASET_DIR = './data/'
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)
# The download source is left disabled here; the download cell reads DATASET_URL
# when the file above does not exist yet.
#DATASET_URL = r'.../train_feature_graph_values.csv'

In [None]:
%%time

# Fetch the dataset only when it is not already available locally.
if not os.path.exists(DATASET_FULLNAME):
    # The DATASET_URL assignment is commented out in the configuration cell, so look it
    # up defensively and fail with an actionable message instead of a NameError.
    dataset_url = globals().get('DATASET_URL')
    if not dataset_url:
        raise FileNotFoundError(
            '{} was not found and DATASET_URL is not set: put the CSV file there manually '
            'or define DATASET_URL before running this cell.'.format(DATASET_FULLNAME)
        )

    os.makedirs(DATASET_DIR, exist_ok=True)

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(dataset_url, timeout=60)
    # Do not store an HTTP error page as if it were the dataset.
    response.raise_for_status()

    # Write the raw bytes so the file is saved exactly as served
    # (no re-encoding or newline translation).
    with open(DATASET_FULLNAME, 'wb') as output:
        output.write(response.content)

CPU times: user 927 µs, sys: 3 µs, total: 930 µs
Wall time: 2.04 ms


In [None]:
%%time
# Read the example CSV with pandas' default parsing options and preview the first rows.
data = pd.read_csv(filepath_or_buffer=DATASET_FULLNAME)
data.head(n=5)

CPU times: user 224 ms, sys: 6.88 ms, total: 231 ms
Wall time: 250 ms


Unnamed: 0,compiller,version,compiler_option,name_hash,instructions_count,func_body,vcount,diameter,girth,radius,average_path_length,transitivity_avglocal_undirected,func_class
0,gcc,10.2.1,Og,09e29e012476cbc7fbc8f4aea11dfc95,823,"['push r15', 'mov r15, rdx', 'push r14', 'mov ...",182,34,3.0,0,15.027724,0.151361,System
1,gcc,10.2.1,Og,7a0bf6fdc429813ac22c6214adfd4169,102,"['cmp rdx, ADDR', 'jbe FCN', 'add rdx, 1', 'mo...",22,8,4.0,0,3.386598,0.0,'String and text processing'
2,gcc,9.3.0,Ofast,671f56991dda1de967d83ddb26e000ef,87,"['push r15', 'push r14', 'mov r14, rcx', 'push...",19,10,3.0,0,3.710526,0.157407,System
3,gcc,9.3.0,Ofast,cdbc9d3aace74424320af209eac9c0bf,45,"['push r12', 'lea rax, [ADDR]', 'push rbp', 'l...",8,4,3.0,2,2.236842,0.208333,Another
4,gcc,9.3.0,Ofast,0dbef1318f69d668de203321b5c0f6b6,20,"['push rbp', 'push rbx', 'sub rsp, 8', 'mov rb...",4,2,4.0,0,1.333333,0.0,Programming


### Step 6. (Optional) Some user feature preparation 

The cell below shows some user feature preparation that makes the task more difficult (this block can be omitted if you don't want to change the initial data):

In [None]:
%%time

# Remove the columns that are not used as features in this tutorial
# (the drop happens in place, on the loaded table), then preview the result.
dropped_columns = ['compiller', 'version', 'compiler_option', 'name_hash']
data.drop(columns=dropped_columns, inplace=True)
data.head()

CPU times: user 3.35 ms, sys: 82 µs, total: 3.43 ms
Wall time: 3.24 ms


Unnamed: 0,instructions_count,func_body,vcount,diameter,girth,radius,average_path_length,transitivity_avglocal_undirected,func_class
0,823,"['push r15', 'mov r15, rdx', 'push r14', 'mov ...",182,34,3.0,0,15.027724,0.151361,System
1,102,"['cmp rdx, ADDR', 'jbe FCN', 'add rdx, 1', 'mo...",22,8,4.0,0,3.386598,0.0,'String and text processing'
2,87,"['push r15', 'push r14', 'mov r14, rcx', 'push...",19,10,3.0,0,3.710526,0.157407,System
3,45,"['push r12', 'lea rax, [ADDR]', 'push rbp', 'l...",8,4,3.0,2,2.236842,0.208333,Another
4,20,"['push rbp', 'push rbx', 'sub rsp, 8', 'mov rb...",4,2,4.0,0,1.333333,0.0,Programming


In [None]:
# (rows, columns) of the table at this point of the notebook
data.shape

(14437, 9)

Class encoding

In [None]:
!pip install --upgrade pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandas
  Downloading pandas-1.5.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.3.5
    Uninstalling pandas-1.3.5:
      Successfully uninstalled pandas-1.3.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lightautoml 0.3.7.3 requires pandas<=1.4.3; python_version >= "3.8", but you have pandas 1.5.3 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-1.5.3


In [None]:
# Replace the textual class names in the target column with integer codes.
# The fitted encoder is kept in `le`.
le = LabelEncoder()
data[TARGET_NAME] = le.fit_transform(data[TARGET_NAME])

Add TF-IDF features in table format. LAMA should be able to do this by itself, so this step can be skipped in the future:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the textual `func_body` column into one TF-IDF column per token.
# NOTE(review): the vectorizer is fitted on all rows, i.e. before the train/test split
# performed later in the notebook — confirm this is intended.
v = TfidfVectorizer()
x = v.fit_transform(data['func_body'])

# Reuse the index of `data`: the later pd.concat(axis=1) aligns rows by index label,
# so a fresh 0..n-1 index would only line up while `data` still has its default index.
sub_data_TFIDF = pd.DataFrame(
    x.toarray(),
    columns=v.get_feature_names_out(),
    index=data.index,
)
sub_data_TFIDF.head()

Unnamed: 0,abcdefghijklmnopqrstuvwxyz,adc,add,addr,addsd,addss,afcn,ah,al,alarm,...,ymm1,ymm2,ymm3,ymm4,ymm5,ymm6,ymm7,ymm8,ymm9,ymmword
0,0.0,0.0,0.150963,0.242271,0.0,0.0,0.0,0.0,0.139285,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.060192,0.223383,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.048827,0.411385,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.03113,0.474605,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.065743,0.316517,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Concat**. Skip this cell if you skipped the TF-IDF step above.

In [None]:

# Swap the raw text column for its TF-IDF representation:
# drop `func_body` in place, then append the TF-IDF columns side by side.
data.drop(columns=['func_body'], inplace=True)
data = pd.concat(objs=[data, sub_data_TFIDF], axis='columns')


### Step 7. (Optional) Data splitting for train-test 

Block below can be omitted if you are going to train model only or you have specific train and test files:

In [None]:
%%time

# Stratified hold-out split: class proportions of the target are kept in both parts.
train_data, test_data = train_test_split(
    data,
    test_size=TEST_SIZE,
    stratify=data[TARGET_NAME],
    random_state=RANDOM_STATE,
)

# Report the resulting shapes.
split_report = 'Data splitted. Parts sizes: train_data = {}, test_data = {}'
print(split_report.format(train_data.shape, test_data.shape))

Data splitted. Parts sizes: train_data = (11549, 691), test_data = (2888, 691)
CPU times: user 71.7 ms, sys: 147 ms, total: 219 ms
Wall time: 220 ms


In [None]:
# Preview the first rows of the training part
train_data.head()

Unnamed: 0,instructions_count,vcount,diameter,girth,radius,average_path_length,transitivity_avglocal_undirected,func_class,abcdefghijklmnopqrstuvwxyz,adc,...,ymm1,ymm2,ymm3,ymm4,ymm5,ymm6,ymm7,ymm8,ymm9,ymmword
6177,56,10,4,4.0,0,1.933333,0.0,4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3433,38,7,4,3.0,3,2.064516,0.519048,7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3501,24,4,2,4.0,0,1.333333,0.0,5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7928,105,24,9,3.0,0,3.855799,0.384058,5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14302,0,0,0,0.0,0,0.0,0.0,3,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## AutoML creation

![AutoML pipeline for this task](https://github.com/sberbank-ai-lab/LightAutoML/blob/master/imgs/tutorial_1_pipeline.png?raw=1)

### Step 1. Create Task and PandasReader

In [None]:
%%time

# Multiclass task with the 'f1' loss and the 'auc_mu' metric.
# The log recorded for this cell reports that sklearn and cb do not support f1
# in the general case and will not be used.
task = Task(
    name='multiclass',
    loss='f1',
    metric='auc_mu',
)

# Reader configured with N_FOLDS folds and the shared random state.
reader = PandasToPandasReader(
    task,
    cv=N_FOLDS,
    random_state=RANDOM_STATE,
)

INFO2:lightautoml.tasks.base:sklearn doesn't support in general case f1 and will not be used.
INFO2:lightautoml.tasks.base:cb doesn't support in general case f1 and will not be used.


CPU times: user 29 ms, sys: 0 ns, total: 29 ms
Wall time: 56.4 ms


### Step 2. Create feature selector (if necessary) 

In [None]:
%%time

# Building blocks of the importance-based feature selector:
# a features pipeline, a LightGBM model and a model-based importance estimator.
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
model0 = BoostLGBM(
    default_params=dict(
        learning_rate=0.05,
        num_leaves=64,
        seed=42,
        num_threads=N_THREADS,
    ),
)

# NOTE(review): cutoff=0 presumably keeps features whose importance exceeds zero —
# confirm against the LightAutoML documentation.
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

CPU times: user 654 µs, sys: 0 ns, total: 654 µs
Wall time: 669 µs


### Step 3.1. Create 1st level ML pipeline for AutoML 

Our first level ML pipeline:
- Simple features for gradient boosting built on selected features (using step 2) 
- 2 different models:
    * LightGBM with params tuning (using OptunaTuner)
    * LightGBM with heuristic params


In [None]:
%%time 

# Tuner for the first model: stop after 20 iterations or after 30 seconds.
params_tuner1 = OptunaTuner(n_trials=20, timeout=30)

# Two LightGBM models with different default parameters and seeds.
model1 = BoostLGBM(
    default_params=dict(learning_rate=0.05, num_leaves=128, seed=1, num_threads=N_THREADS),
)
model2 = BoostLGBM(
    default_params=dict(learning_rate=0.025, num_leaves=64, seed=2, num_threads=N_THREADS),
)

# Features pipeline shared by both first-level models.
pipe = LGBSimpleFeatures()

# First level: model1 is paired with the tuner, model2 is used as configured;
# the selector from step 2 is applied as pre-selection.
pipeline_lvl1 = MLPipeline(
    [(model1, params_tuner1), model2],
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

CPU times: user 137 µs, sys: 34 µs, total: 171 µs
Wall time: 177 µs


### Step 3.2. Create 2nd level ML pipeline for AutoML 

Our second level ML pipeline:
- Using simple features as well, but now it will be Out-Of-Fold (OOF) predictions of algos from 1st level
- Only one LGBM model without params tuning
- Without feature selection on this stage because we want to use all OOFs here

In [None]:
%%time

# Second level: a single LightGBM model with frozen default parameters,
# no tuner and no feature selection.
model = BoostLGBM(
    default_params=dict(
        learning_rate=0.05,
        num_leaves=64,
        max_bin=1024,
        seed=3,
        num_threads=N_THREADS,
    ),
    freeze_defaults=True,
)
pipe1 = LGBSimpleFeatures()

pipeline_lvl2 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=pipe1,
    post_selection=None,
)

CPU times: user 146 µs, sys: 0 ns, total: 146 µs
Wall time: 154 µs


### Step 4. Create AutoML pipeline 

The AutoML pipeline consists of:
- Reader for data preparation
- First level ML pipeline (as built in step 3.1)
- Second level ML pipeline (as built in step 3.2)
- `skip_conn=False`, which here means "do not use the initial features in the second level pipeline"

In [None]:
%%time 

# Two-level AutoML: one ML pipeline per level, built on top of the reader.
automl = AutoML(
    reader,
    [[pipeline_lvl1], [pipeline_lvl2]],
    skip_conn=False,
)

CPU times: user 81 µs, sys: 0 ns, total: 81 µs
Wall time: 85.8 µs


### Step 5. Train AutoML on loaded data 

In the cell below we train AutoML with the target column `TARGET_NAME` to obtain the fitted model and the OOF predictions:

In [None]:
%%time 

# Fit the whole AutoML on the training part; the returned value is stored as the
# out-of-fold predictions and printed together with its shape.
target_roles = {'target': TARGET_NAME}
oof_pred = automl.fit_predict(train_data, roles=target_roles)

oof_report = 'oof_pred:\n{}\nShape = {}'
print(oof_report.format(oof_pred, oof_pred.shape))

INFO:lightautoml.reader.base:[1mTrain data shape: (11549, 691)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 9999999960.68 secs
INFO2:lightautoml.ml_algo.utils:Model LightGBM failed during ml_algo.fit_predict call.

axis 1 is out of bounds for array of dimension 1
INFO3:lightautoml.ml_algo.utils:Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/lightautoml/ml_algo/utils.py", line 66, in tune_and_fit_predict
    preds = ml_algo.fit_predict(train_valid)
  File "/usr/local/lib/python3.8/dist-packages/lightautoml/ml_algo/base.py", line 273, in fit_predict
    model, pred = self.fit_predict_single_fold(train, valid)
  File "/usr/local/lib/python3.8/dist-packages/lightautoml/ml_algo/boost_lgbm.py", line 255, in fit_predict_single_fold
    model = lgb.train(
  File "/usr/local/lib/python3.8/dist-packages/lightgbm/engine.py", line 256, in train
 

AssertionError: ignored

### Step 6. Analyze fitted model  

Below we analyze feature importances of different algos:

In [None]:
# (report template, zero-argument getter) pairs. The getters are called lazily, one per
# report, so every lookup happens right before its own section is printed.
# The output recorded for this cell stops with an IndexError after the selector report.
importance_reports = [
    ('Feature importances of selector:\n{}',
     lambda: selector.get_features_score()),
    ('Feature importances of top level algorithm:\n{}',
     lambda: automl.levels[-1][0].ml_algos[0].get_features_score()),
    ('Feature importances of lowest level algorithm - model 0:\n{}',
     lambda: automl.levels[0][0].ml_algos[0].get_features_score()),
    ('Feature importances of lowest level algorithm - model 1:\n{}',
     lambda: automl.levels[0][0].ml_algos[1].get_features_score()),
]

for report_template, fetch_scores in importance_reports:
    print(report_template.format(fetch_scores()))
    print('=' * 70)

Feature importances of selector:
None


IndexError: ignored

### Step 7. Predict on test data and check scores

In [None]:
#oof_pred.data[:, 0].sum()


def get_best_class(predict_labels):
    """Return the position of the first entry equal to the maximum of ``predict_labels``.

    ``predict_labels`` must provide ``.max()`` and element-wise ``==``
    (e.g. a numpy array or a pandas Series of per-class scores).
    When several entries share the maximum, the lowest position wins.
    """
    top_score = predict_labels.max()
    is_top = predict_labels == top_score
    top_positions = np.where(is_top)[0]
    return top_positions[0]

# One predicted class per row: apply get_best_class to every row of the OOF prediction table.
y_train_predict = pd.DataFrame(oof_pred.data).apply(get_best_class, axis=1)


In [None]:
%%time

# Prediction on the test part is currently switched off:
#test_pred = automl.predict(test_data)
# print('Prediction for test data:\n{}\nShape = {}'
#               .format(test_pred, test_pred.shape))

print('Check scores...')

# ROC AUC checks (disabled):
#print('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))
#print('TEST score: {}'.format(roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])))

# F1 of the out-of-fold class predictions against the training labels.
train_labels = train_data[TARGET_NAME]
train_f1_macro = f1_score(train_labels, y_train_predict, average='macro')
print('TRAIN F1 (macro) score: {}'.format(train_f1_macro))
train_f1_micro = f1_score(train_labels, y_train_predict, average='micro')
print('TRAIN F1 (micro) score: {}'.format(train_f1_micro))

# F1 on the test part (disabled together with the test prediction above):
# print('TEST F1 (macro) score: {}'.format(f1_score(test_data[TARGET_NAME].values, test_pred.data[:, 0], average = 'macro')))
# print('TEST F1 (micro) score: {}'.format(f1_score(test_data[TARGET_NAME].values, test_pred.data[:, 0], average = 'micro')))



Check scores...
TRAIN F1 (macro) score: 0.0932700528769926
TRAIN F1 (micro) score: 0.011342973417611913
CPU times: user 22.5 ms, sys: 1.09 ms, total: 23.6 ms
Wall time: 48.7 ms


Fast (fast) - this method uses feature importances from the feature selector LGBM model inside LightAutoML. It works extremely fast and almost always (the exceptions are situations when feature selection is turned off or the selector was removed from the final models together with all GBM models). It does not need new labelled data.

In [None]:
%%time

# Fast feature importances calculation
# NOTE(review): `automl_rd` is not defined in any visible cell of this notebook
# (the model built and trained above is named `automl`) — confirm where it should
# come from before running this cell.
fast_fi = automl_rd.model.get_feature_scores('fast')
# Bar chart of the 'Importance' column, one bar per 'Feature'
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 10), grid = True)

Accurate (accurate) - this method calculates feature permutation importances for the whole LightAutoML model based on new labelled data. It always works but can take a long time to finish (depending on the model structure, the size of the new labelled dataset, etc.).

In [None]:
%%time

# Accurate feature importances calculation (Permutation importances) -  can take long time to calculate
# NOTE(review): neither `automl_rd` nor `te_data` is defined in any visible cell of this
# notebook (the model is named `automl`, the hold-out part `test_data`) — confirm the
# intended objects before running this cell.
accurate_fi = automl_rd.model.get_feature_scores('accurate', te_data, silent = False)

## Save model

In [None]:
# Serialize the AutoML object to a pickle file in the working directory.
with open('apperance_model.pkl', mode='wb') as model_file:
    pickle.dump(automl, model_file)

In [None]:
# Restore the AutoML object from the pickle file and set its verbosity level to 2.
# Unpickling can execute arbitrary code: only load files you created yourself.
with open('apperance_model.pkl', mode='rb') as model_file:
    automl = pickle.load(model_file)

automl.set_verbosity_level(2)