In [2]:
import os
import gc
import tensorflow as tf

# Hide every GPU from TensorFlow before any device is initialised.
tf.config.set_visible_devices([], 'GPU')

# Report whether TensorFlow can still name a GPU device after the call above.
# NOTE(review): the recorded output of this cell says "GPU found", which is
# surprising with all GPUs hidden — possibly stale output; confirm on a fresh kernel.
print('GPU found' if tf.test.gpu_device_name() else "No GPU found")

from typing import Union

import numpy as np
import pandas as pd

import transformers
from sklearn.preprocessing import LabelEncoder

from modelling.models import TextProductMatch
from sklearn.model_selection import KFold
from transformers import BertTokenizer, TFBertModel

import os

# Notebook-wide settings. In the cells shown here MAX_LEN sizes the tokenizer
# output and the Keras inputs, MODEL_NAME selects the pretrained checkpoint and
# EPOCHS bounds the training loop; the remaining keys are not read in the
# visible cells.
params = dict(
    N_CLASSES=11014,
    MAX_LEN=70,
    MODEL_NAME='bert-base-multilingual-uncased',
    POOLING="global_avg_1d",
    EPOCHS=5,
    BATCH_SIZE=32,
    METRIC="adacos",
    LAST_HIDDEN_STATES=3,
)

# Directory for saved artefacts; created up front, no error if it already exists.
PATH_NAME = 'saved/arcface/v1'
os.makedirs(PATH_NAME, exist_ok=True)

GPU found


In [3]:
from modelling.metrics import *
from modelling.pooling import *

In [4]:
def encoder(titles: Union[list, tuple, np.ndarray]):
    """Tokenize a collection of titles into fixed-length BERT input arrays.

    Uses the notebook-level ``tokenizer`` and ``params["MAX_LEN"]``. All titles
    are tokenized in a single batched call, padded/truncated to ``MAX_LEN``.

    Args:
        titles: Indexable collection of title strings (list, tuple or 1-D
            NumPy array).

    Returns:
        Tuple ``(input_ids, att_masks, token_type_ids)`` of ``int32`` NumPy
        arrays, each of shape ``(len(titles), MAX_LEN)``.
    """
    max_len = params["MAX_LEN"]
    ct = len(titles)

    # Nothing to tokenize: return correctly shaped empty arrays instead of
    # calling the tokenizer with an empty batch.
    if ct == 0:
        return (np.zeros((0, max_len), dtype='int32'),
                np.zeros((0, max_len), dtype='int32'),
                np.zeros((0, max_len), dtype='int32'))

    # The tokenizer expects plain Python strings; ``tolist`` also unwraps
    # NumPy string scalars.
    if isinstance(titles, np.ndarray):
        batch = titles.tolist()
    else:
        batch = list(titles)

    # One batched call returning NumPy arrays replaces the per-title
    # ``encode_plus`` loop that built a TF tensor for every single row.
    enc = tokenizer(batch,
                    padding="max_length",
                    max_length=max_len,
                    truncation=True,
                    add_special_tokens=True,
                    return_tensors='np',
                    return_attention_mask=True,
                    return_token_type_ids=True)

    input_ids = np.asarray(enc["input_ids"], dtype='int32')
    att_masks = np.asarray(enc["attention_mask"], dtype='int32')
    token_type_ids = np.asarray(enc["token_type_ids"], dtype='int32')

    return input_ids, att_masks, token_type_ids

In [5]:
import pandas as pd
# Load the training table from train.csv (path is relative to the working directory).
df = pd.read_csv("train.csv")

In [6]:
from sklearn.preprocessing import LabelEncoder
# Encode each distinct ``label_group`` value as an integer class id in a new ``label`` column.
df["label"] = LabelEncoder().fit_transform(df["label_group"].tolist())

In [7]:
# Preview the first rows of the training table.
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,label
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,666
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,7572
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,6172
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,10509
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,9425


In [8]:
from dataloader.semi_loader import RandomTextSemiLoader
from features.pool import BertLastHiddenState

# Define model

In [9]:
# Load the checkpoint configuration and request that the encoder also returns
# its hidden states.
config = transformers.BertConfig.from_pretrained(
    params["MODEL_NAME"],
    output_hidden_states=True,
)

# Encoder and tokenizer come from the same checkpoint name.
word_model = transformers.TFAutoModel.from_pretrained(
    params["MODEL_NAME"],
    config=config,
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    params["MODEL_NAME"],
)

Some layers from the model checkpoint at bert-base-multilingual-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [10]:
# Three int32 inputs of length MAX_LEN, created in this order: token ids,
# attention mask, token-type ids.
ids, att, tok = (
    tf.keras.layers.Input((params["MAX_LEN"],), dtype=tf.int32)
    for _ in range(3)
)

# Keep only the last element of the encoder output.
# NOTE(review): presumably the tuple of per-layer hidden states, since the
# config enables output_hidden_states — confirm.
x1 = word_model(
    ids,
    attention_mask=att,
    token_type_ids=tok,
)[-1]

# Reduce the encoder output to one embedding and wrap everything as a Keras model.
embedding = BertLastHiddenState(multi_sample_dropout=True)(x1)
model = tf.keras.Model(inputs=[ids, att, tok], outputs=[embedding])

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


In [11]:
# Print the layer-by-layer summary of the embedding model.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 70)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 70)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 70)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 167356416   input_1[0][0]                    
                                                                 input_2[0][0]         

## Training

In [12]:
# Loader over the title strings and their integer labels; the training loop
# reads it by index and takes its length.
# NOTE(review): the meaning of the third positional argument (8) is defined by
# RandomTextSemiLoader, which is not shown here — confirm before changing it.
a = RandomTextSemiLoader(df["title"].to_numpy(), df["label"].to_numpy(),8)

In [13]:
import time
from modelling.loss import contrastive
from modelling.dist import pairwise_dist

In [14]:
# Titles as a NumPy array so they can be selected with the index arrays
# returned by the loader.
X_title = df["title"].to_numpy()

In [16]:
# Optimizer for the gradient step that is currently commented out below.
opt = tf.keras.optimizers.Adam()

for epoch in range(params["EPOCHS"]):
    print("Start epoch {}/{}".format(epoch + 1, params["EPOCHS"]))
    epochStart = time.time()

    # Loader smoke test: fetch every batch in turn and show the shape of its
    # index array.
    # NOTE(review): the recorded run stops with a ValueError while fetching
    # batch 4 — presumably raised inside the loader; confirm there.
    for i in range(len(a)):
        print(i)
        X_idx, y = a[i]
        print(X_idx.shape)

        # Training step, disabled for now:
        #
        # X_1 = encoder(X_title[X_idx[:,0]])
        # X_2 = encoder(X_title[X_idx[:,1]])
        #
        # with tf.GradientTape() as tape:
        #     X_emb1 = model(X_1)
        #     X_emb2 = model(X_2)
        #
        #     y_pred = pairwise_dist(X_emb1, X_emb2)
        #     print("Compute prediction")
        #     loss = contrastive(y_true=y, y_pred=y_pred)
        #
        #     del X_1, X_2, X_emb1, X_emb2, X_idx, y
        #     gc.collect()
        #
        # print("Update loss")
        # grads = tape.gradient(loss, model.trainable_variables)
        # opt.apply_gradients(zip(grads, model.trainable_variables))
        #
        # print("Loss: {}".format(loss), end=" ")

Start epoch 1/5
0
(192, 2)
1
(192, 2)
2
(192, 2)
3
(192, 2)
4


ValueError: 'a' cannot be empty unless no samples are taken