In [1]:
import os
import sys

#Import config file. Update config.py according to your environment
import config

import pandas as pd
import numpy as np

import tensorflow as tf

from Rakuten_preprocessing import Rakuten_img_path

from src.text.classifiers import TFbertClassifier

from src.utils.batch import fit_save_all
from src.utils.plot import plot_training_history

import datetime


2024-03-11 21:11:32.848725: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-11 21:11:32.950880: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-11 21:11:32.950959: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-11 21:11:32.955046: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-11 21:11:32.984329: I tensorflow/core/platform/cpu_feature_guar

In [2]:
data_train = pd.read_csv(os.path.join(config.path_to_data, 'df_train_index.csv'))
data_train['testset'] = False
data_test = pd.read_csv(os.path.join(config.path_to_data, 'df_test_index.csv'))
data_test['testset'] = True
data = pd.concat([data_train, data_test], axis=0)

#merging text into token column
colnames = ['designation_translated', 'description_translated'] #['designation', 'description']#
data['tokens'] = data[colnames].apply(lambda row: ' '.join(s.lower() for s in row if isinstance(s, str)), axis=1)

#path to images into img_path column
data['img_path'] = Rakuten_img_path(img_folder=config.path_to_images,
                             imageid=data['imageid'], productid=data['productid'], suffix='_resized')


In [3]:
#labels of encoded classes
class_labels = data.groupby('prdtypedesignation')['prdtypeindex'].first().reset_index()
class_labels.index = class_labels['prdtypeindex']
class_labels = class_labels.drop(columns='prdtypeindex').sort_index()

## Creating train and test sets

In [4]:
Img_train = data.loc[~data['testset'], 'img_path']
Img_test = data.loc[data['testset'], 'img_path']

Txt_train = data.loc[~data['testset'], 'tokens']
Txt_test = data.loc[data['testset'], 'tokens']

y_train = data.loc[~data['testset'],'prdtypeindex']
y_test = data.loc[data['testset'],'prdtypeindex']

#To be fed into any of our sklearn classifiers, X_train and X_test
#should be dataframes with columns tokens and img_path
X_train = pd.DataFrame({'tokens': Txt_train, 'img_path': Img_train})
X_test = pd.DataFrame({'tokens': Txt_test, 'img_path': Img_test})

#All data for cross-validated scores
X = pd.concat([X_train, X_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)

#Number of classes
num_classes = len(np.unique(data['prdtypeindex']))

## Bert benchmarks

In [5]:
#Name of the summary csv file to save results to
result_file_name = 'results_benchmark_bert.csv'

#type of modality
modality = 'text'

#Type of classifier
class_type = 'TFbertClassifier'

#training parameters (or list of parameters for gridsearchCV)
num_class = 27
max_length = 256
n_epochs = 1
batch_size = 32
drop_rate = 0.2
lr0 = 5e-5
lr_min=1e-6
lr_decay_rate = 0.8

callbacks = []
#adding earlystopping callback
callbacks.append(('EarlyStopping', {'monitor': 'val_accuracy', 'min_delta': 0, 'mode': 'max', 'patience': 2, 'restore_best_weights': True, 'verbose': 1}))
#Adding tensorboard callback as the last one
callbacks.append(('TensorBoard', {'log_dir': np.nan, 'histogram_freq': 1, 'update_freq': 'epoch'}))

#grid search number of folds
nfolds_grid = 0

#cross-validation of f1-score
nfolds_cv = 0

#network to test
base_name_list = ['camembert-base', 'camembert-base-ccnet', 'flaubert_base_uncased']

#Initializing the list of parameters to batch over
params_list = []

for base_name in base_name_list:
  #Adjusting tensorboard log directory
  log_dir = os.path.join(config.path_to_tflogs, base_name, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
  callbacks[-1][1]['log_dir'] = log_dir
  print(callbacks[-1][1]['log_dir'])
  #adding the set of parameters to the list
  params_list.append({'modality': modality,
                      'class': class_type,
                      'base_name': base_name,
                      'param_grid': {'max_length': max_length, 'num_class': num_class, 'drop_rate': drop_rate, 
                                    'epochs': n_epochs, 'batch_size': batch_size, 
                                    'learning_rate': lr0, 'lr_decay_rate': lr_decay_rate, 'lr_min': lr_min,
                                    'validation_data': (X_test[:1000], y_test[:1000]), 'callbacks': [callbacks]},
                      'nfolds_grid': nfolds_grid, 'nfolds_cv': nfolds_cv
                    })
  
#Running the batch over params_list
results = fit_save_all(params_list, X_train=X_train[:1000], y_train=y_train[:1000], X_test=X_test[:1000], y_test=y_test[:1000], result_file_name = result_file_name)

/mnt/g/My Drive/DST/DST-Rakuten Project/tf_logs/camembert-base/20240311-104452
/mnt/g/My Drive/DST/DST-Rakuten Project/tf_logs/camembert-base-ccnet/20240311-104452
Fitting:  camembert-base nan


2024-03-11 10:44:53.116908: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:17:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-11 10:44:53.117313: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:73:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-11 10:44:53.181921: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:17:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-11 10:44:53.182010: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:73:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-11 10:44:53.182058: I external/local_xla/xla/stream_executor

loading from Local


All model checkpoint layers were used when initializing TFCamembertModel.

All the layers of TFCamembertModel were initialized from the model checkpoint at /mnt/c/Users/Julien Fournier/Documents/DST/RakutenProject/models/base_models/camembert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCamembertModel for predictions without further training.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2024-03-11 10:45:14.197724: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory




2024-03-11 10:45:43.491978: I external/local_xla/xla/service/service.cc:168] XLA service 0x70876560 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-03-11 10:45:43.492095: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA RTX A5000, Compute Capability 8.6
2024-03-11 10:45:43.492118: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (1): NVIDIA RTX A5000, Compute Capability 8.6
2024-03-11 10:45:43.509338: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-03-11 10:45:44.182681: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1710150344.367869 3633497 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Test set, f1score:  0.29346212684999823
Fitting:  camembert-base-ccnet nan
loading from Local


All model checkpoint layers were used when initializing TFCamembertModel.

All the layers of TFCamembertModel were initialized from the model checkpoint at /mnt/c/Users/Julien Fournier/Documents/DST/RakutenProject/models/base_models/camembert-base-ccnet.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCamembertModel for predictions without further training.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Test set, f1score:  0.4585894602199284


## Visualize tensorflow logs in tensorboard

In [15]:
log_path = os.path.join(config.path_to_tflogs, 'camembert-base-ccnet')

# Ensure the log_path is quoted to handle spaces
quoted_log_path = f'"{log_path}"'

%reload_ext tensorboard
%tensorboard --logdir {quoted_log_path}

Reusing TensorBoard on port 6008 (pid 3180493), started 0:00:03 ago. (Use '!kill 3180493' to kill it.)

## Example usage

In [6]:
#defining callbacks
callbacks = []
callbacks.append(('EarlyStopping', {'monitor': 'val_accuracy', 'min_delta': 0, 'mode': 'max', 'patience': 2, 'restore_best_weights': True, 'verbose': 1}))

clf = TFbertClassifier(base_name='flaubert_large_cased', from_trained=None, max_length=256, num_class=27, epochs=8, batch_size=32, drop_rate=0.2, learning_rate=5e-5, lr_decay_rate=0.6, lr_min=1e-5, validation_data=(X_test, y_test), callbacks=callbacks)

clf.fit(X_train, y_train)
# clf.classification_score(X_test, y_test)

loading from Huggingface


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFFlaubertModel: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing TFFlaubertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFFlaubertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFFlaubertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFFlaubertModel for predictions without further training.


Epoch 1/8


2024-03-11 21:01:16.847634: W external/local_tsl/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 128.00MiB (rounded to 134217728)requested by op model_1/txt_base_layers/transformer/ffns_._20/lin1/Tensordot/MatMul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-03-11 21:01:16.848015: I external/local_tsl/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2024-03-11 21:01:16.848106: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 210, Chunks in use: 210. 52.5KiB allocated for chunks. 52.5KiB in use in bin. 2.4KiB client-requested in use in bin.
2024-03-11 21:01:16.848253: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 1, Chunks in use: 0. 768B allocated for chunks. 0B in use in bin. 0B client

ResourceExhaustedError: Graph execution error:

Detected at node model_1/txt_base_layers/transformer/ffns_._20/lin1/Tensordot/MatMul defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 701, in start

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 195, in start

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/asyncio/base_events.py", line 607, in run_forever

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/asyncio/events.py", line 80, in _run

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 534, in dispatch_queue

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 523, in process_one

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 429, in dispatch_shell

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 767, in execute_request

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 429, in do_execute

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3106, in _run_cell

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3311, in run_cell_async

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3493, in run_ast_nodes

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "/tmp/ipykernel_3842158/2114056205.py", line 7, in <module>

  File "/mnt/c/Users/Julien Fournier/Documents/GitHub/RakutenTeam/src/text/classifiers.py", line 388, in fit

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/training.py", line 1807, in fit

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/training.py", line 1150, in train_step

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/training.py", line 590, in __call__

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/functional.py", line 515, in call

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/functional.py", line 672, in _run_internal_graph

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/training.py", line 590, in __call__

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/transformers/modeling_tf_utils.py", line 270, in run_call_with_unpacked_inputs

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py", line 276, in call

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/transformers/modeling_tf_utils.py", line 270, in run_call_with_unpacked_inputs

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py", line 660, in call

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py", line 660, in call

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py", line 716, in call

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py", line 721, in call

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py", line 427, in call

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/jul/anaconda3/envs/Rakuten/lib/python3.11/site-packages/keras/src/layers/core/dense.py", line 244, in call

OOM when allocating tensor with shape[8192,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model_1/txt_base_layers/transformer/ffns_._20/lin1/Tensordot/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_134454]