In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models
from tensorflow.keras.utils import multi_gpu_model
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold
from transformers import TFElectraModel, ElectraTokenizer
# Expose GPUs 0-3 to this process; the training loop later requests gpus=4.
# NOTE(review): this assignment runs after `import tensorflow` above. It only
# takes effect if TensorFlow has not initialised CUDA yet -- confirm, or move
# the assignment above the TensorFlow import.
os.environ["CUDA_VISIBLE_DEVICES"] = '0,1,2,3'
print(tf.__version__)
# `tf.test.is_gpu_available()` is deprecated; query the physical device list
# instead and keep displaying a boolean as before.
len(tf.config.list_physical_devices('GPU')) > 0

In [13]:
# Order rows by query and then by reply so all replies to one query sit together.
SORT_KEYS = ['query_id', 'reply_id']
df_train = pd.read_csv('data/train.csv').sort_values(by=SORT_KEYS)
df_test = pd.read_csv('data/test.csv').sort_values(by=SORT_KEYS)

In [14]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,query_id,reply_id,query,reply,label
0,0,0,采荷一小是分校吧,杭州市采荷第一小学钱江苑校区，杭州市钱江新城实验学校。,1
1,0,1,采荷一小是分校吧,是的,0
2,0,2,采荷一小是分校吧,这是5楼,0
3,1,0,毛坯吗？,因为公积金贷款贷的少,0
4,1,1,毛坯吗？,是呢,0


In [15]:
# Token budget per (query, reply) pair: used as the tokenizer's `max_length`
# and as the fixed length of every model input layer.
MAX_SEQUENCE_LENGTH = 100
# Local directory passed to `from_pretrained` for both the tokenizer and the
# ELECTRA encoder.
MODEL_PATH = 'model/electra_180g_large_tf/'

In [16]:
# Load the tokenizer from the same local checkpoint directory as the encoder.
tokenizer = ElectraTokenizer.from_pretrained(MODEL_PATH)

In [17]:
# Encode every (query, reply) pair as a two-segment input.
#
# `padding='max_length'` pads every row to exactly MAX_SEQUENCE_LENGTH, which is
# the fixed length the model's Input layers declare. `padding=True` would only
# pad to the longest sequence in each call, so train and test could end up with
# different widths, and neither is guaranteed to equal MAX_SEQUENCE_LENGTH.
encode_kwargs = dict(max_length=MAX_SEQUENCE_LENGTH,
                     truncation=True,
                     padding='max_length',
                     return_tensors='np')

inputs_train = tokenizer(df_train['query'].values.tolist(),
                         df_train['reply'].values.tolist(),
                         **encode_kwargs)
inputs_test = tokenizer(df_test['query'].values.tolist(),
                        df_test['reply'].values.tolist(),
                        **encode_kwargs)

# Training targets as a plain NumPy array so they can be indexed positionally.
labels_train = df_train.label.values

In [18]:
def create_model():
    """Build the ELECTRA-based binary classifier.

    Three int32 inputs of length MAX_SEQUENCE_LENGTH (input ids, token type
    ids, attention mask) are fed to the pretrained encoder loaded from
    MODEL_PATH. The encoder's first output is summarised three ways -- the
    vector at position 0, the mean over the sequence axis and the max over the
    sequence axis -- and the concatenation goes through a single sigmoid unit.

    Returns:
        An uncompiled Keras model whose inputs are ordered
        [input_ids, token_type_ids, attention_mask].
    """
    seq_shape = (MAX_SEQUENCE_LENGTH,)
    input_ids = layers.Input(seq_shape, dtype=tf.int32)
    token_type_ids = layers.Input(seq_shape, dtype=tf.int32)
    attention_mask = layers.Input(seq_shape, dtype=tf.int32)

    encoder = TFElectraModel.from_pretrained(MODEL_PATH)
    encoder_outputs = encoder({'input_ids': input_ids,
                               'token_type_ids': token_type_ids,
                               'attention_mask': attention_mask})
    sequence_output = encoder_outputs[0]

    # NOTE(review): no mask is passed to the pooling layers, so padded
    # positions presumably take part in the mean/max -- confirm this is intended.
    first_token = sequence_output[:, 0]
    mean_pooled = layers.GlobalAveragePooling1D()(sequence_output)
    max_pooled = layers.GlobalMaxPool1D()(sequence_output)

    features = layers.concatenate([first_token, mean_pooled, max_pooled])
    probability = layers.Dense(1, activation='sigmoid')(features)

    return models.Model(inputs=[input_ids, token_type_ids, attention_mask],
                        outputs=probability)

In [19]:
# Five folds grouped by query (see the split call in the training loop), so
# replies to one query never straddle train and validation.
gkf = GroupKFold(n_splits=5)

# Prediction buffers are allocated as float32 by row count. `np.zeros_like` on
# the label column would inherit that column's dtype (integer for 0/1 labels),
# truncating sigmoid outputs, and would require the test frame to have a
# `label` column at all.
oof = np.zeros(len(df_train), dtype=np.float32)  # out-of-fold predictions on train
sub = np.zeros(len(df_test), dtype=np.float32)   # fold-averaged predictions on test

In [None]:
# The test inputs are the same for every fold, so build them once. They are
# materialised as a list (not a `.values()` view) so Keras receives the same
# kind of container as the train/validation inputs below.
inputs_sub = list(inputs_test.values())

for fold, (trn_idx, val_idx) in enumerate(gkf.split(df_train, groups=df_train.query_id)):
    print(f'Fold {fold + 1}/{gkf.n_splits}')

    # Slice every encoded array with this fold's positional indices.
    inputs_trn = [x[trn_idx] for x in inputs_train.values()]
    inputs_val = [x[val_idx] for x in inputs_train.values()]
    labels_trn = labels_train[trn_idx]
    labels_val = labels_train[val_idx]

    # Reset Keras global state, then build a fresh model for this fold.
    K.clear_session()
    model = create_model()
    # NOTE(review): `multi_gpu_model` is deprecated in newer TensorFlow releases
    # in favour of `tf.distribute.MirroredStrategy` -- confirm the installed
    # version still provides it.
    model = multi_gpu_model(model, gpus=4)
    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate=2e-5),
                  metrics=[keras.metrics.AUC()])
    model.fit(inputs_trn, labels_trn,
              validation_data=(inputs_val, labels_val),
              epochs=3,
              batch_size=64)

    # `predict` returns shape (n, 1); flatten to (n,) so it can be written into
    # the 1-D out-of-fold buffer and accumulated into the 1-D test buffer.
    oof[val_idx] = model.predict(inputs_val, batch_size=512).ravel()
    sub += model.predict(inputs_sub, batch_size=512).ravel() / gkf.n_splits