In [1]:
from item_clustering.item_clustering import ItemClustering
from mlflow_model import conda_env
import mlflow_model.item_clustering
import pandas as pd
from utils.read_files import (
    get_items
)
import mlflow

# Train model

In [2]:
# Instantiate the clustering model, overriding only the `artifacts_path` setting.
model = ItemClustering(
    config={'artifacts_path': '../data/output/test/'},
)

In [3]:
# Sanity check: the path passed to the constructor is the one the config reports.
model.config.artifacts_path

'../data/output/test/'

In [4]:
# Show the full effective configuration as a dict. Only `artifacts_path` was
# passed to the constructor; the remaining keys presumably come from the
# project's defaults (confirm in the config class).
model.config.get_config_dict()

{'word_embeddings_path': '../data/embeddings/fasttext/skip_s100.txt',
 'algorithm': 'hdbscan',
 'categories': ['unidades_medida', 'numeros'],
 'tags': ['N', 'MED'],
 'operation': 'concatenate',
 'n_process': 4,
 'artifacts_path': '../data/output/test/'}

In [5]:
# Load the sample items from the zipped CSV using the project's `get_items` helper.
file = '../data/items_sample.csv.zip'
items = get_items(file)

In [6]:
# Number of records loaded from the sample file.
len(items)

1000

In [7]:
# Peek at the first five records; each record is a list whose first field is
# the item description text.
items[:5]

[['591 FUCSINA BASICA 0,5% 1000',
  3741539,
  107368,
  39.11,
  'FRASCO',
  2014,
  2,
  '2013-12-20',
  'IMBE DE MINAS',
  'PREFEITURA MUNICIPAL DE IMBE DE MINAS'],
 ['RECEPTACULO E27',
  2214949,
  67774,
  2.5,
  'UNIDADES',
  2017,
  2,
  '2017-01-02',
  'COROACI',
  'PREFEITURA MUNICIPAL DE COROACI'],
 ['ROLO DE ESPUMA 23 CM',
  6555568,
  195354,
  7.0,
  'UNIDADE',
  2014,
  7,
  '2014-05-14',
  'OURO FINO',
  'PREFEITURA MUNICIPAL DE OURO FINO'],
 ['JANELA DE ACO DE CORRER COM GRADE IMOLA E COM BASCULANTE 120A X 150L X 120RQ - 1 LINHA',
  1182966,
  37446,
  282.2,
  'UN',
  2015,
  11,
  '2015-11-11',
  'CALDAS',
  'PREFEITURA MUNICIPAL DE CALDAS'],
 ['PECA DE MADEIRA PARAJU 8X8',
  14212303,
  443387,
  10.4,
  'UN',
  2018,
  9,
  '2018-08-17',
  'SAO JOAO DEL REI',
  'PREFEITURA MUNICIPAL DE SAO JOAO DEL REI']]

## Test

# Save model

In [8]:
from mlflow_model import wrapper

In [9]:
# Make 'test' the active MLflow experiment so the run started below is recorded under it.
mlflow.set_experiment(experiment_name='test')

In [10]:
# Name shared by the `log_model` call (as the run-relative artifact path) and by
# the `load_model` call at the end of the notebook.
mlflow_pyfunc_model_path = "item_clustering_mlflow_pyfunc"

In [11]:
# Inspect the conda environment spec that is passed to `log_model` below.
conda_env.conda_env

{'channels': ['defaults'],
 'dependencies': ['python=3.7.6',
  'pip',
  {'pip': ['mlflow', 'hdbscan', 'umap-learn']}],
 'name': 'item_clustering_env'}

In [12]:
# Train inside a single MLflow run so that the parameters, metrics and the
# packaged pyfunc model are all recorded together.
with mlflow.start_run(run_name='test'):
    # Record the effective configuration as run parameters before fitting.
    run_params = model.config.get_config_dict()
    mlflow.log_params(run_params)

    # Fit on the loaded items, then let the model save itself.
    model.fit(items)
    model.save_model()

    # NOTE(review): these metric values are hard-coded placeholders, not
    # computed from the fitted model.
    # TODO: replace them with the result of `model.evaluate()` once it is usable.
    placeholder_metrics = {'avg_calinski': 5000.0, 'avg_davies': 0.20}
    mlflow.log_metrics(placeholder_metrics)

    # Log the model in pyfunc format, handing the wrapper the configured
    # artifacts path and the conda environment spec.
    artifacts = {"artifacts_path": model.config.artifacts_path}
    mlflow.pyfunc.log_model(
        artifact_path=mlflow_pyfunc_model_path,
        python_model=wrapper.ItemClustering(),
        artifacts=artifacts,
        conda_env=conda_env.conda_env,
    )

Read ranges
([0, 101, 202, 302, 403, 503, 604, 704, 805, 905], [100, 201, 301, 402, 502, 603, 703, 804, 904, 999])
Fri Jun 11 13:53:31 2021 Loading word embeddings
Read ranges
([0, 132, 264, 395], [131, 263, 394, 525])
0

12
3


# Load model

In [None]:
# Create a fresh instance (no config override) to demonstrate loading saved
# artifacts. `ItemClustering` is imported at the top as the class itself, so it
# is called directly; `ItemClustering.ItemClustering()` would look up a
# non-existent attribute on the class and raise AttributeError.
model = ItemClustering()

In [None]:
# Check which artifacts path the newly created instance is configured with.
model.config.artifacts_path

In [None]:
# Load previously saved artifacts into the instance from a local directory.
# NOTE(review): nothing visible in this notebook writes this relative directory
# (`mlflow.pyfunc.log_model` stores its files in the run's artifact store) --
# confirm the directory exists before running this cell.
model.load_model('./item_clustering_mlflow_pyfunc/artifacts/')

In [None]:
# Load the model in `python_function` format
# NOTE(review): the model above was stored with `mlflow.pyfunc.log_model`, where
# this name is an artifact path relative to the run. Passing it here as a bare
# relative path only resolves if a local directory with that name exists; a
# `runs:/<run_id>/item_clustering_mlflow_pyfunc` URI may be what is intended --
# confirm.
loaded_model = mlflow.pyfunc.load_model(mlflow_pyfunc_model_path)