In [2]:
!pip3 install kaggle



In [17]:
!touch /root/.kaggle/kaggle.json
!echo '{"username":"nhuantd","key":"bd50957194c60fa1c8e8afb3232bead4"}' > /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json
!cd ./input/shopee-product-matching && kaggle competitions download shopee-product-matching
!unzip train.csv.zip

touch: /root/.kaggle/kaggle.json: No such file or directory
zsh:1: no such file or directory: /root/.kaggle/kaggle.json
chmod: /root/.kaggle/kaggle.json: No such file or directory
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.8/site-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.8/site-packages/kaggle/api/kaggle_api_extended.py", line 164, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /Users/stevetran/.kaggle. Or use the environment method.
unzip:  cannot find or open train.csv.zip, train.csv.zip.zip or train.csv.zip.ZIP.


In [None]:
from typing import List, Union

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

from modelling.models import TextProductMatch


# Run configuration shared by the tokenisation, model-building and training code.
params = dict(
    # Width of the one-hot label input / classification head.
    # NOTE(review): must be >= the number of distinct `label_group` values in
    # train.csv, otherwise the one-hot encoding step fails — confirm this value.
    N_CLASSES=11011,
    # Every title is padded / truncated to this many tokens.
    MAX_LEN=70,
    # Checkpoint name used for both the encoder weights and the tokenizer.
    MODEL_NAME='bert-base-multilingual-uncased',
    # Pooling mode and metric name handed to TextProductMatch.
    POOLING="global_avg_1d",
    EPOCHS=5,
    BATCH_SIZE=32,
    METRIC="circle_cl",
)

# Tokenizer and pretrained encoder are loaded once and reused by the code below.
tokenizer = transformers.AutoTokenizer.from_pretrained(params["MODEL_NAME"])
word_model = transformers.TFAutoModel.from_pretrained(params["MODEL_NAME"])


def encoder(titles: Union[str, List[str]]):
    """Tokenize product titles into fixed-length model inputs.

    Args:
        titles: The titles to encode. A single string is treated as a batch of
            one title (previously it was silently tokenized character by
            character).

    Returns:
        A tuple ``(input_ids, att_masks, token_type_ids)`` of int32 NumPy
        arrays, each of shape ``(len(titles), params["MAX_LEN"])``.
    """
    if isinstance(titles, str):
        titles = [titles]
    else:
        # Materialise so that any iterable of strings is accepted.
        titles = list(titles)

    max_len = params["MAX_LEN"]

    # Keep the empty-batch contract explicit instead of relying on how the
    # tokenizer treats an empty list.
    if not titles:
        return (np.zeros((0, max_len), dtype='int32'),
                np.zeros((0, max_len), dtype='int32'),
                np.zeros((0, max_len), dtype='int32'))

    # One batched call replaces the per-title loop; asking for NumPy output
    # avoids building a TensorFlow tensor per row just to copy it back out.
    enc = tokenizer(titles,
                    padding="max_length",
                    max_length=max_len,
                    truncation=True,
                    add_special_tokens=True,
                    return_tensors='np',
                    return_attention_mask=True,
                    return_token_type_ids=True)

    # Cast to int32 to match the dtype of the Keras input layers.
    input_ids = np.asarray(enc["input_ids"], dtype='int32')
    att_masks = np.asarray(enc["attention_mask"], dtype='int32')
    token_type_ids = np.asarray(enc["token_type_ids"], dtype='int32')

    return input_ids, att_masks, token_type_ids


def main():
    """Build the title-matching model and train it with 5-fold cross-validation.

    The model feeds the token ids, attention mask and token type ids through the
    pretrained ``word_model`` and passes its first output, together with the
    one-hot labels, to ``TextProductMatch``. Titles and ``label_group`` values
    are read from ``train.csv`` in the working directory.

    Raises:
        ValueError: if ``train.csv`` contains more distinct label groups than
            ``params["N_CLASSES"]``.
    """
    max_len = params["MAX_LEN"]
    n_classes = params["N_CLASSES"]

    ids = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    att = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    tok = tf.keras.layers.Input((max_len,), dtype=tf.int32)

    # `shape` has to be a tuple; `(n_classes)` without the comma is a plain int.
    labels_onehot = tf.keras.layers.Input(shape=(n_classes,), dtype=tf.int32)

    x = word_model(ids, attention_mask=att, token_type_ids=tok)[0]
    x = TextProductMatch(n_classes,
                         params["POOLING"],
                         metric=params["METRIC"],
                         use_fc=True)([x, labels_onehot])

    # A flat input list keeps the data passed to fit() trivially aligned with
    # the model inputs.
    model = tf.keras.Model(inputs=[ids, att, tok, labels_onehot], outputs=[x])
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    print("Loading data")
    dat = pd.read_csv("train.csv")
    input_ids, att_masks, token_type_ids = encoder(dat["title"].tolist())

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(dat["label_group"].tolist())
    n_labels = len(label_encoder.classes_)
    if n_labels > n_classes:
        # Fail with an actionable message instead of an index error from
        # to_categorical.
        raise ValueError(
            "train.csv contains {} label groups but params['N_CLASSES'] is {}".format(
                n_labels, n_classes))
    y = tf.keras.utils.to_categorical(y, num_classes=n_classes)

    # Snapshot of the untrained weights so that every fold starts from the same
    # state; otherwise later folds validate on samples already trained on.
    initial_weights = model.get_weights()

    cv = KFold(5, random_state=4111, shuffle=True)
    # KFold only needs the number of samples, so splitting over `y` is enough;
    # the encoded inputs are three separate arrays and are sliced individually.
    for train_idx, test_idx in cv.split(y):
        model.set_weights(initial_weights)

        y_train, y_test = y[train_idx], y[test_idx]
        train_inputs = [input_ids[train_idx], att_masks[train_idx],
                        token_type_ids[train_idx], y_train]
        test_inputs = [input_ids[test_idx], att_masks[test_idx],
                       token_type_ids[test_idx], y_test]

        # Re-compiling with a new optimizer also resets the optimizer state.
        # NOTE(review): the loss assumes TextProductMatch outputs probabilities
        # rather than logits — confirm against its implementation.
        model.compile(
            optimizer=tf.keras.optimizers.Adam(),
            loss=tf.keras.losses.CategoricalCrossentropy(),
            # Accuracy() checks exact equality of predictions and targets;
            # one-hot targets need the arg-max based CategoricalAccuracy.
            metrics=[tf.keras.metrics.CategoricalAccuracy()],
        )

        callbacks = [
            tf.keras.callbacks.TensorBoard(write_graph=False)
        ]

        # Train on the training fold only and validate on the held-out fold.
        # The labels are passed twice: once as a model input and once as target.
        model.fit(train_inputs, y_train,
                  epochs=params["EPOCHS"],
                  batch_size=params["BATCH_SIZE"],
                  validation_data=(test_inputs, y_test),
                  callbacks=callbacks)