In [2]:
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
#~/.cache/huggingface/transformers$ 중복 주의 in tensorflowCPU
import pandas as pd
from tokenizers import BertWordPieceTokenizer
from tensorflow.keras.utils import to_categorical

In [3]:
import numpy as np # linear algebra
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout, SpatialDropout1D, BatchNormalization, GlobalAveragePooling1D 

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [4]:
config = tf.compat.v1.ConfigProto() 
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)

In [5]:
#IMP DATA FOR CONFIG
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 3
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 192

REPLICAS:  1


In [6]:
import pandas as pd
train = pd.read_csv("../data/train.csv")
test = pd.read_csv('../data/test_x.csv')
sub = pd.read_csv('../data/sample_submission.csv')

In [7]:
def fast_encode(texts, tokenizer, chunk_size=256, max_len=512):
    """
    Encoder for encoding the text into sequence of integers for BERT Input
    """
    tokenizer.enable_truncation(max_length=max_len)
    tokenizer.enable_padding(length=max_len)
    #if enable_padding:
    #    tokenizer.enable_padding(max_length=max_len)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [8]:
#IMP DATA FOR CONFIG

AUTO = tf.data.experimental.AUTOTUNE


# Configuration
EPOCHS = 3
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 192

### 사전학습 Tokenizer DistilBert(기존 bert tokenizer)

In [9]:
# First load the real tokenizer
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# Save the loaded tokenizer locally
tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=False, wordpieces_prefix=##)

In [10]:
from sklearn.model_selection import train_test_split
x_train, x_valid , y_train, y_valid = train_test_split(train.text.values, train.author.values, 
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)

In [11]:
x_train = fast_encode(x_train.astype(str), fast_tokenizer, max_len=MAX_LEN)
x_valid = fast_encode(x_valid.astype(str), fast_tokenizer, max_len=MAX_LEN)
x_test = fast_encode(test.text.astype(str), fast_tokenizer, max_len=MAX_LEN)
y_train = to_categorical(y_train)
y_valid = to_categorical(y_valid)

100%|██████████| 172/172 [00:03<00:00, 55.76it/s]
100%|██████████| 43/43 [00:00<00:00, 65.01it/s]
100%|██████████| 77/77 [00:01<00:00, 39.04it/s]


In [33]:
trn = fast_encode(train.text.astype(str), fast_tokenizer, max_len=MAX_LEN)
tst = fast_encode(test.text.astype(str), fast_tokenizer, max_len=MAX_LEN)

y = train['author'].values
print(trn[0].shape, tst[0].shape, y.shape)
#(54879, 100) (19617, 100) (54879,)

100%|██████████| 215/215 [00:03<00:00, 55.06it/s]
100%|██████████| 77/77 [00:01<00:00, 40.64it/s]


(192,) (192,) (54879,)


In [29]:
print(trn.shape[0], tst.shape[0], y.shape)

54879 19617 (54879,)


In [12]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)

In [13]:
def build_model(transformer, max_len=512):
    """
    function for training the BERT model
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(5, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [14]:
%%time
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
    )
    #transformer_layer.trainable = False
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

Some layers from the model checkpoint at distilbert-base-multilingual-cased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist TFBaseModelOutput(last_hi 134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 5)                 3845      
Total params: 134,737,925
Trainable params: 134,737,925
Non-trainable params: 0
_________________________________________________________________
CPU times: user 9.43 s, sys: 2.84 s, total: 12.3 s
Wall time: 13.2 s


In [15]:
EPOCHS

3

In [None]:
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Epoch 1/3
  20/2743 [..............................] - ETA: 3:05:19 - loss: 0.5331 - accuracy: 0.2469

In [None]:
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS*2
)

In [20]:
from sklearn import metrics
def roc_auc(predictions,target):
    '''
    This methods returns the AUC Score when given the Predictions
    and Labels
    '''
    
    fpr, tpr, thresholds = metrics.roc_curve(target, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

In [1]:
#scores = model.predict(x_valid)
#print("Auc: %.4f%%" % (roc_auc(scores,y_valid)))

In [None]:
pred = model.predict(test_dataset, verbose=1)
sub.to_csv('Att_Bert_NoCV.csv', index=False)#1.6094

In [22]:
pred

array([[0.22269672, 0.1832239 , 0.17082787, 0.21379298, 0.12907481],
       [0.21312165, 0.16466647, 0.19293475, 0.23145074, 0.12725165],
       [0.34081018, 0.17865244, 0.16295913, 0.23505121, 0.18112281],
       ...,
       [0.3177494 , 0.19475693, 0.18126225, 0.2432454 , 0.16435102],
       [0.2707198 , 0.18079194, 0.17996547, 0.23241323, 0.16291106],
       [0.20487002, 0.23129016, 0.24473265, 0.23980466, 0.1762262 ]],
      dtype=float32)

In [19]:
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
p_val = np.zeros((trn.shape[0], 5))
p_tst = np.zeros((tst.shape[0], 5))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf =  build_model(transformer_layer, max_len=MAX_LEN)
    
    clf.fit(trn[i_trn], 
            to_categorical(y[i_trn]),
            validation_data=(trn[i_val], to_categorical(y[i_val])),
            epochs=3,
            batch_size=32,
            )
    p_val[i_val, :] = clf.predict(trn[i_val])
    p_tst += clf.predict(tst) / 5
    
    del clf
    clear_session()
    gc.collect()

training model for CV #1
Epoch 1/3


In [None]:
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')