In [1]:
import os
import numpy as np
import pandas as pd
import warnings
from gensim.models import Word2Vec
from tqdm import tqdm
import random
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

from keras import backend as K
from keras.preprocessing import text, sequence
from keras import Model
from keras.layers import Conv1D, Embedding, Input, Bidirectional, CuDNNLSTM, Dense, Concatenate, Masking, LSTM, SpatialDropout1D
from keras.layers import BatchNormalization, Dropout, Activation, Add
from keras.layers import GlobalMaxPool1D, GlobalAveragePooling1D, GlobalAvgPool1D, GlobalMaxPooling1D
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, Callback
from keras.utils import to_categorical
from keras_radam import RAdam
from keras_lookahead import Lookahead
import keras.backend.tensorflow_backend as KTF


from keras_multi_head import MultiHead, MultiHeadAttention
from keras_self_attention import SeqSelfAttention
from keras_position_wise_feed_forward import FeedForward
from keras_layer_normalization import LayerNormalization

# Build a TensorFlow session with `allow_growth` enabled on the GPU options
# and register it as the session used by the Keras backend.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
KTF.set_session(session)

# Lift the pandas display limits on both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Suppress every Python warning from this point on.
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# !pip install keras-rectified-adam
# !pip install keras-lookahead

In [3]:
def fix_seed(seed):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed: Integer seed passed to `random.seed`, `np.random.seed` and
            `tf.set_random_seed`.
    """
    # `random` is imported by this notebook but was previously left unseeded,
    # so anything drawing from it would differ from run to run.
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

# Global seed: applied to the RNGs here and reused as `random_state` for the
# StratifiedKFold splits in the training and prediction cells.
seed = 2020
fix_seed(seed)

In [4]:
# Read the tab-separated train and test files and stack them into a single
# frame so that both are tokenised with the same vocabulary. Rows whose
# `label` is null after stacking are treated as the test set later on.
df_train = pd.read_csv('raw_data/train_set.csv', sep='\t')
df_test = pd.read_csv('raw_data/test_a.csv', sep='\t')
# `DataFrame.append` is deprecated (and removed in pandas 2.0). `pd.concat`
# with `ignore_index=True` yields the same stacked frame with a fresh
# 0..n-1 index, replacing the former append + reset_index(drop=True) pair.
df_data = pd.concat([df_train, df_test], ignore_index=True)
df_data.shape

(250000, 2)

In [5]:
# Preview the first rows of the combined train + test frame.
df_data.head()

Unnamed: 0,label,text
0,2.0,2967 6758 339 2021 1854 3731 4109 3792 4149 15...
1,11.0,4464 486 6352 5619 2465 4802 1452 3137 5778 54...
2,3.0,7346 4068 5074 3747 5681 6093 1777 2226 7354 6...
3,2.0,7159 948 4866 2109 5520 2490 211 3956 5520 549...
4,3.0,3646 3055 3055 2490 4659 6065 3370 5814 2465 5...


In [6]:
# --- Tokenisation / embedding settings ---------------------------------------
max_words_num = None   # forwarded to Tokenizer(num_words=...)
seq_len = 1000         # every sequence is padded / truncated to this length
embedding_dim = 128    # number of columns of the embedding matrix built below
col = 'text'           # column of df_data that is tokenised

print('Generate seqs')
os.makedirs('seqs', exist_ok=True)
seq_path = 'seqs/seqs_{}_{}.npy'.format(max_words_num, seq_len)
word_index_path = 'seqs/word_index_{}_{}.npy'.format(max_words_num, seq_len)

if os.path.exists(seq_path) and os.path.exists(word_index_path):
    # Both cache files are present: reuse them instead of re-tokenising.
    seqs = np.load(seq_path)
    # The vocabulary dict was stored through np.save, hence allow_pickle + item().
    word_index = np.load(word_index_path, allow_pickle=True).item()
else:
    # Fit the tokenizer on every text (train and test) and encode them.
    texts = df_data[col].values.tolist()
    tokenizer = text.Tokenizer(num_words=max_words_num, lower=False, filters='')
    tokenizer.fit_on_texts(texts)
    encoded_texts = tokenizer.texts_to_sequences(texts)
    # Pad at the end; when a text is too long, drop tokens from its start.
    seqs = sequence.pad_sequences(encoded_texts, maxlen=seq_len,
                                  padding='post', truncating='pre')
    word_index = tokenizer.word_index

    # Persist both artefacts so later runs take the cached branch.
    np.save(seq_path, seqs)
    np.save(word_index_path, word_index)

# NOTE: the Word2Vec pre-training below is disabled, so `embedding` is never
# filled in and remains an all-zero matrix.
# print('Generate embedding')
# os.makedirs('embedding', exist_ok=True)
# embedding_path = 'embedding/w2v_{}_{}.m'.format(col, embedding_dim)
# if not os.path.exists(embedding_path):
#     print('Training w2v')
#     model = Word2Vec([[word for word in sentence.split(' ')] for sentence in df_data[col].values],
#                       size=embedding_dim, window=20, workers=32, seed=seed, min_count=1, sg=1, hs=1)

#     model.save(embedding_path)
# else:
#     model = Word2Vec.load(embedding_path)

# One row per vocabulary entry plus one extra row
# (presumably index 0, which the tokenizer does not assign -- TODO confirm).
embedding = np.zeros((len(word_index) + 1, embedding_dim))
# for word, i in tqdm(word_index.items()):
#     embedding_vector = model[word] if word in model else None
#     if embedding_vector is not None:
#         embedding[i] = embedding_vector

Generate seqs


In [7]:
# Class distribution of the labels (value_counts skips the null labels).
df_data['label'].value_counts()

0.0     38918
1.0     36945
2.0     31425
3.0     22133
4.0     15016
5.0     12232
6.0      9985
7.0      8841
8.0      7847
9.0      5878
10.0     4920
11.0     3131
12.0     1821
13.0      908
Name: label, dtype: int64

# 模型训练

In [8]:
# Output folders: fold checkpoints, submission files, saved probabilities.
for output_dir in ('model', 'sub', 'prob'):
    os.makedirs(output_dir, exist_ok=True)

In [9]:
# Rows that carry a label form the training pool; rows without one are the
# rows to predict.
has_label = df_data['label'].notnull()
all_index = df_data.index[has_label].tolist()
test_index = df_data.index[~has_label].tolist()

In [10]:
def _encoder_block(x, head_num=16, ff_units=128, dropout_rate=0.01):
    """Apply one self-attention + feed-forward block with residual connections.

    Args:
        x: Input sequence tensor.
        head_num: Number of heads given to `MultiHeadAttention`.
        ff_units: Units given to the `FeedForward` layer.
        dropout_rate: Rate of the two `Dropout` layers in the block.

    Returns:
        The output tensor of the block's final `LayerNormalization`.
    """
    # Attention sub-layer: residual add, layer norm, then dropout.
    attn = MultiHeadAttention(head_num=head_num)(x)
    attn = Dropout(dropout_rate)(attn)
    attn = Add()([x, attn])
    attn = LayerNormalization()(attn)
    attn = Dropout(dropout_rate)(attn)

    # Feed-forward sub-layer: residual add around it, then layer norm.
    ff = FeedForward(ff_units)(attn)
    out = Add()([attn, ff])
    return LayerNormalization()(out)


def build_model(emb, seq_len, num_classes=14):
    """Build and compile the attention + BiLSTM text classifier.

    Args:
        emb: Embedding matrix. Only its shape is used, to size the
            `Embedding` layer; its values are not loaded as weights.
        seq_len: Length of the (padded) input token sequences.
        num_classes: Size of the softmax output layer. Defaults to 14, the
            value that was previously hard-coded.

    Returns:
        A Keras `Model` compiled with sparse categorical cross-entropy,
        `Adam(1e-4)` and the accuracy metric.
    """
    inp = Input(shape=(seq_len,))

    emb_layer = Embedding(
        input_dim=emb.shape[0],
        output_dim=emb.shape[1],
        input_length=seq_len)(inp)

    sdrop = SpatialDropout1D(rate=0.2)
    emb_layer = sdrop(emb_layer)

    # Two stacked encoder blocks; layers are created in the same order as the
    # former hand-unrolled code so saved weights keep loading.
    mha1_out = _encoder_block(emb_layer)
    mha2_out = _encoder_block(mha1_out)

    lstm = Bidirectional(LSTM(128, return_sequences=True))(mha2_out)

    # Summarise the sequence with both average and max pooling over time.
    avg_pool = GlobalAveragePooling1D()(lstm)
    max_pool = GlobalMaxPool1D()(lstm)

    x = Concatenate()([avg_pool, max_pool])

    x = Dense(128, activation='relu')(x)
    x = BatchNormalization()(x)

    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)

    x = Dropout(0.2)(x)

    out = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=inp, outputs=out)
    # Sparse loss: targets are integer class ids rather than one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(1e-4),
                  metrics=['accuracy'])

    return model

In [11]:
class Evaluator(Callback):
    """Keras callback that computes macro-F1 on a validation set each epoch.

    The score is written to `logs['val_f1']` so that callbacks running after
    this one in the callback list can monitor it.

    Args:
        validation_data: Indexable whose element 0 is the validation inputs
            and element 1 the matching integer labels.
    """

    def __init__(self, validation_data):
        super().__init__()
        self.best_val_f1 = 0.
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def evaluate(self):
        """Return the macro-averaged F1 of the current model on the validation set."""
        y_true = self.y_val
        # Class id = position of the highest predicted probability.
        y_pred = self.model.predict(self.x_val).argmax(axis=1)
        f1 = f1_score(y_true, y_pred, average='macro')
        return f1

    def on_epoch_end(self, epoch, logs=None):
        """Score the model, track the best value and publish it via `logs`."""
        # The signature allows logs=None; item assignment on None would raise
        # TypeError, so fall back to a throwaway dict in that case.
        if logs is None:
            logs = {}
        val_f1 = self.evaluate()
        if val_f1 > self.best_val_f1:
            self.best_val_f1 = val_f1
        logs['val_f1'] = val_f1
        print(f'val_f1: {val_f1:.5f}, best_val_f1: {self.best_val_f1:.5f}')

In [None]:
bs = 32
monitor = 'val_f1'

# StratifiedKFold.split yields positions *within* `all_index`, not rows of
# `seqs`. Map them through `all_index` so the fold selection stays correct
# even if the labelled rows are not the first rows of `df_data`.
labeled_index = np.asarray(all_index)
# Loop-invariant: the full label column, aligned with the rows of `seqs`.
label = df_data['label'].values

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (train_pos, val_pos) in enumerate(kfold.split(all_index, df_data.iloc[all_index]['label'])):
    train_index = labeled_index[train_pos]
    val_index = labeled_index[val_pos]

    train_x = seqs[train_index]
    val_x = seqs[val_index]

    train_y = label[train_index]
    val_y = label[val_index]

    # Keep only the weights of the epoch with the best validation macro-F1.
    model_path = 'model/transformer_{}.h5'.format(fold_id)
    checkpoint = ModelCheckpoint(model_path, monitor=monitor, verbose=1, save_best_only=True, mode='max', save_weights_only=True)
    earlystopping = EarlyStopping(monitor=monitor, patience=5, verbose=1, mode='max')
    reduce_lr = ReduceLROnPlateau(monitor=monitor, factor=0.1, patience=2, mode='max', verbose=1)

    model = build_model(embedding, seq_len)
    # Evaluator must come first in the list: it writes `val_f1` into the logs
    # that the three callbacks after it monitor.
    model.fit(train_x, train_y, batch_size=bs, epochs=30,
              validation_data=(val_x, val_y),
              callbacks=[Evaluator(validation_data=(val_x, val_y)), checkpoint, reduce_lr, earlystopping], verbose=1, shuffle=True)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 160000 samples, validate on 40000 samples
Epoch 1/30
val_f1: 0.85578, best_val_f1: 0.85578

Epoch 00001: val_f1 improved from -inf to 0.85578, saving model to model/transformer_0.h5
Epoch 2/30
val_f1: 0.90114, best_val_f1: 0.90114

Epoch 00002: val_f1 improved from 0.85578 to 0.90114, saving model to model/transformer_0.h5
Epoch 3/30
val_f1: 0.91846, best_val_f1: 0.91846

Epoch 00003: val_f1 improved from 0.90114 to 0.91846, saving model to model/transformer_0.h5
Epoch 4/30

# 模型预测

In [None]:
# Must match the split used for training so each checkpoint is evaluated on
# the rows it did not see.
n_splits = 5

# StratifiedKFold.split yields positions *within* `all_index`; map them
# through `all_index` before indexing `seqs`.
labeled_index = np.asarray(all_index)
# Loop-invariant: the test inputs are the same for every fold.
test_x = seqs[test_index]

# One row of 14 class probabilities per labelled row / per test row.
oof_pred = np.zeros((len(all_index), 14))
test_pred = np.zeros((len(test_index), 14))

kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
for fold_id, (_, val_pos) in enumerate(kfold.split(all_index, df_data.iloc[all_index]['label'])):
    model = build_model(embedding, seq_len)
    model_path = 'model/transformer_{}.h5'.format(fold_id)
    model.load_weights(model_path)

    # NOTE: `bs` is defined in the training cell.
    val_index = labeled_index[val_pos]
    val_x = seqs[val_index]
    prob = model.predict(val_x, batch_size=bs, verbose=1)
    # `oof_pred` has one row per entry of `all_index`, so it is indexed by
    # position rather than by row id.
    oof_pred[val_pos] = prob

    # Average the test probabilities over the folds.
    prob = model.predict(test_x, batch_size=bs, verbose=1)
    test_pred += prob / n_splits

In [None]:
# Out-of-fold macro-F1 over all labelled rows.
# A single `.loc[rows, cols]` plus `.copy()` replaces the chained
# `.loc[...][[...]]` lookup, so the new column is added to an independent
# frame rather than to a possible view of `df_data`.
df_oof = df_data.loc[all_index, ['label']].copy()
df_oof['predict'] = np.argmax(oof_pred, axis=1)
f1score = f1_score(df_oof['label'], df_oof['predict'], average='macro')
print(f1score)

In [18]:
# Show the out-of-fold macro-F1 score.
f1score

0.9363675328073761

In [None]:
# Persist the averaged test probabilities and the out-of-fold probabilities,
# tagging both file names with the out-of-fold F1 score.
test_prob_path = 'prob/sub_5fold_transformer_{}.npy'.format(f1score)
oof_prob_path = 'prob/oof_5fold_transformer_{}.npy'.format(f1score)
np.save(test_prob_path, test_pred)
np.save(oof_prob_path, oof_pred)

In [None]:
# Submission file: one predicted class id (argmax of the averaged fold
# probabilities) per test row, written without the index column.
sub = pd.DataFrame({'label': np.argmax(test_pred, axis=1)})
sub.to_csv('sub/transformer_{}.csv'.format(f1score), index=False)