<a href="https://colab.research.google.com/github/KondrashovIgor/ML/blob/main/Best_fit_TFDF_for_CDEK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import os
!pip install tensorflow tensorflow_decision_forests
import tensorflow_decision_forests as tfdf
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split



In [None]:
import pandas as pd

# CSV export of the consolidation records, resolved relative to the working directory.
path_to_file = 'Консолидации СПБ СЦ Софийская TFDF.xlsx - document_document_consolidation (1).csv'

# The export writes fractional weights with a decimal comma ("6,5", "3,456").
# With the default decimal point pandas keeps such a column as strings, so it
# must be parsed with decimal=',' to become a numeric feature.
consoles = pd.read_csv(path_to_file, decimal=',')

# Preview only the first 10 rows instead of dumping the whole frame.
print(consoles.head(10))

# Replace the original Russian headers with short ASCII names (same column order).
consoles.columns = [
    'Cage_type',
    'length',
    'tall',
    'width',
    'Weight',
    'Destination'
]

# Fail early if the weight column still contains values that are not numbers.
assert pd.api.types.is_numeric_dtype(consoles['Weight']), (
    "Weight column was not parsed as numeric; check the decimal separator in the CSV"
)

print(consoles.columns)

      Тип тары  Длина, см  Высота, см  Ширина, см Факт. вес, кг Направление
0       Паллет         33          30          23           6,5  Магистраль
1       Паллет         40          15          40           6,4  Магистраль
2       Паллет         30          35          30             7        АВИА
3       Паллет         64           8           8           1,4        АВИА
4       Паллет         32          20          31         3,456         ОБЛ
...        ...        ...         ...         ...           ...         ...
14993   Телега         33          15          26          0,45         ПВЗ
14994   Телега         41          11          26           2,6         ОБЛ
14995   Телега         24          47          38          5,68         ОБЛ
14996   Паллет         29          14          22             1         ОБЛ
14997   Телега         41          77          49            18         ПВЗ

[14998 rows x 6 columns] 10
Index(['Cage_type', 'length', 'tall', 'width', 'Weight', 'D

In [None]:
def split_dataset(dataset, test_ratio=0.30, seed=1234):
  """
  Randomly split a pandas DataFrame into a (train, test) pair.

  Each row is assigned to the test part independently with probability
  `test_ratio`, so the part sizes are only approximately proportional.

  Args:
    dataset: DataFrame to split.
    test_ratio: Probability of a row landing in the test part.
    seed: Seed of the private random generator; the same seed and the same
      number of rows always give the same split.

  Returns:
    Tuple `(train, test)` of DataFrames selected from `dataset` by a boolean mask.
  """
  # A private legacy generator draws the same numbers as
  # np.random.seed(seed) + np.random.rand(...), but leaves the process-wide
  # NumPy random state untouched for the rest of the notebook.
  rng = np.random.RandomState(seed)
  test_indices = rng.rand(len(dataset)) < test_ratio
  return dataset[~test_indices], dataset[test_indices]

In [None]:
Cage_label = "Cage_type"   # Name of the classification target label

# Encode the class names as integer ids (position of the name in `classes`).
# The encoding is applied only while the column is not yet integer-typed, so
# re-running this cell does not overwrite `classes` with the integer ids.
if not pd.api.types.is_integer_dtype(consoles[Cage_label]):
  classes = list(consoles[Cage_label].unique())
  consoles[Cage_label] = consoles[Cage_label].map(classes.index)

print(f"Target '{Cage_label}' classes: {classes}")
print(Cage_label)

# Last expression of the cell, so the encoded preview is actually displayed.
consoles.head(3)

Target 'Cage_type'' classes: ['Паллет', 'Телега', 'Короб Монолит', 'Короб Обл', 'Короб ПВЗ', 'Мешок']
Cage_type


# Train/test split and model training

In [None]:
# Split the encoded frame into pandas train/test parts (default ratio and seed).
pandas_splits = split_dataset(consoles)
train_ds_pd, test_ds_pd = pandas_splits
print(f"{len(train_ds_pd)} examples in training, {len(test_ds_pd)} examples for testing.")

# Wrap both parts as TensorFlow datasets, with the cage type column as the label.
to_tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset
train_ds = to_tf_dataset(train_ds_pd, label=Cage_label)
test_ds = to_tf_dataset(test_ds_pd, label=Cage_label)

10486 examples in training, 4512 examples for testing.


In [None]:
# Baseline: a random forest with otherwise default settings and a fixed seed.
rf_options = dict(verbose=0, random_seed=1234)
cmodel = tfdf.keras.RandomForestModel(**rf_options)
cmodel.fit(train_ds)

<keras.src.callbacks.History at 0x78e53528c2e0>

In [None]:
# Report the evaluation stored by the trained random forest itself.
# NOTE(review): this is the inspector's self-evaluation, not a score on test_ds.
rf_inspector = cmodel.make_inspector()
self_evaluation = rf_inspector.evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

Accuracy: 0.8520884989509823 Loss:0.8331160896039231


In [None]:
# Every column except the label is used as a model input.
all_columns = list(consoles.columns)
label_position = all_columns.index("Cage_type")  # ValueError if the label column is missing
input_features = all_columns[:label_position] + all_columns[label_position + 1:]

print(input_features)

['length', 'tall', 'width', 'Weight', 'Destination']


In [None]:
# Gradient-boosted trees restricted to the explicitly listed input features.
# Options tried earlier and left disabled: max_depth=4,
# num_candidate_attributes_ratio=0.2, validation_ratio=0.0.
gbt_features = [
    tfdf.keras.FeatureUsage(name=feature_name) for feature_name in input_features
]

# Learner hyperparameters, grouped so they can be read and edited in one place.
gbt_options = dict(
    min_examples=1,
    categorical_algorithm="RANDOM",
    shrinkage=0.05,
    split_axis="SPARSE_OBLIQUE",
    sparse_oblique_normalization="MIN_MAX",
    sparse_oblique_num_projections_exponent=2.0,
    num_trees=2000,
    random_seed=1234,
)

model = tfdf.keras.GradientBoostedTreesModel(
    verbose=0,  # Very few logs
    features=gbt_features,
    exclude_non_specified_features=True,
    **gbt_options,
)
model.fit(train_ds)

<keras.src.callbacks.History at 0x78e534899180>

In [None]:
# Report the evaluation stored by the trained gradient-boosted model itself.
# NOTE(review): this is the inspector's self-evaluation, not a score on test_ds.
gbt_inspector = model.make_inspector()
self_evaluation = gbt_inspector.evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

Accuracy: 0.8541666865348816 Loss:0.3780648112297058


In [None]:
# Print the gradient-boosted model's summary (input features and variable importances).
model.summary()

Model: "gradient_boosted_trees_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (5):
	Destination
	Weight
	length
	tall
	width

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1. "Destination"  0.651331 ################
    2.      "length"  0.372843 ######
    3.        "tall"  0.227463 #
    4.      "Weight"  0.226663 #
    5.       "width"  0.170651 

Variable Importance: NUM_AS_ROOT:
    1. "Destination" 407.000000 ################
    2.      "length" 47.000000 #
    3.      "Weight" 17.000000 
    4.        "tall" 15.000000 

Variable Importance: NUM_NODES:
    1.      "length" 6891.000000 ################
    2.      "Weight" 3375.000000 #

In [None]:
# Random-search hyperparameter tuning of a gradient-boosted trees model.
# NOTE(review): num_trials=1000 presumably makes this the slowest cell of the
# notebook — confirm the runtime is acceptable before a full re-run.
tuner = tfdf.tuner.RandomSearch(num_trials=1000)
# Candidate values declared directly on the tuner (not nested under another choice).
tuner.choice("min_examples", [2, 5, 7, 10])
tuner.choice("categorical_algorithm", ["CART", "RANDOM"])

# max_depth candidates are declared under the "LOCAL" growing strategy.
local_search_space = tuner.choice("growing_strategy", ["LOCAL"])
local_search_space.choice("max_depth", [3, 4, 5, 6, 8])

# max_num_nodes candidates are declared under "BEST_FIRST_GLOBAL".
# merge=True: presumably adds this value to the "growing_strategy" choice declared
# above instead of redefining it — confirm against the TF-DF tuner documentation.
global_search_space = tuner.choice("growing_strategy", ["BEST_FIRST_GLOBAL"], merge=True)
global_search_space.choice("max_num_nodes", [16, 32, 64, 128, 256])

#tuner.choice("use_hessian_gain", [True, False])
tuner.choice("shrinkage", [0.02, 0.05, 0.10, 0.15])
tuner.choice("num_candidate_attributes_ratio", [0.2, 0.5, 0.9, 1.0])


# Split type: axis-aligned, or sparse oblique with its own nested options.
tuner.choice("split_axis", ["AXIS_ALIGNED"])
oblique_space = tuner.choice("split_axis", ["SPARSE_OBLIQUE"], merge=True)
oblique_space.choice("sparse_oblique_normalization",
                     ["NONE", "STANDARD_DEVIATION", "MIN_MAX"])
oblique_space.choice("sparse_oblique_weights", ["BINARY", "CONTINUOUS"])
oblique_space.choice("sparse_oblique_num_projections_exponent", [1.0, 1.5])
# Hand the tuner to the model and fit on the training dataset.
tuned_model = tfdf.keras.GradientBoostedTreesModel(tuner=tuner)
tuned_model.fit(train_ds, verbose=0)

# Self-evaluation stored in the tuned model; test_ds is not used here.
tuned_self_evaluation = tuned_model.make_inspector().evaluation()
print(f"Accuracy: {tuned_self_evaluation.accuracy} Loss:{tuned_self_evaluation.loss}")

Use /tmp/tmp0ji1zpxz as temporary training directory
Accuracy: 0.8795656561851501 Loss:0.31780943274497986


In [None]:
# Re-print the tuned model's self-evaluation without re-running the tuning cell.
print(f"Accuracy: {tuned_self_evaluation.accuracy} Loss:{tuned_self_evaluation.loss}")

Accuracy: 0.8795656561851501 Loss:0.31780943274497986
