In [None]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
# Use fully-qualified option names: the bare 'max_columns' / 'max_rows'
# shorthands are regex-matched against every registered option and become
# ambiguous (OptionError) on pandas versions that also register
# 'styler.render.max_columns' / 'styler.render.max_rows'.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import gc
import logging
from collections import Counter

from tqdm.autonotebook import *

import gensim
from gensim.models import FastText, Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import keras
from keras import layers
from keras import callbacks

from bert4keras.snippets import sequence_padding, DataGenerator

from keras_multi_head import MultiHead, MultiHeadAttention
from keras_self_attention import SeqSelfAttention
from keras_position_wise_feed_forward import FeedForward
from keras_layer_normalization import LayerNormalization

In [None]:
def _to_token_ids(raw_text):
    """Split a whitespace-separated string and convert every piece to int."""
    return [int(token) for token in raw_text.split()]


# Both files are read as tab-separated tables.
df_train = pd.read_csv('raw_data/train_set.csv', sep='\t')
df_test = pd.read_csv('raw_data/test_a.csv', sep='\t')

# Replace the raw 'text' strings with lists of integer token ids.
df_train['text'] = df_train['text'].apply(_to_token_ids)
df_test['text'] = df_test['text'].apply(_to_token_ids)

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split reproducible.
# NOTE: this rebinds df_train, so re-running only this cell splits the
# already reduced frame again -- re-run the loading cell first.
df_train, df_valid = train_test_split(
    df_train,
    test_size=0.2,
    random_state=2020,
)

In [None]:
# --- Model dimensions -------------------------------------------------------
vocabulary_size = 7600   # input_dim of the embedding layer
embedding_dim = 128      # output_dim of the embedding layer
num_classes = 14         # width of the final softmax layer

# --- Batching ---------------------------------------------------------------
maxlen = 256             # every token sequence is truncated / zero-padded to this length
batch_size = 128         # samples per batch passed to data_generator

In [None]:
def load_data(df):
    """Convert a DataFrame into a list of ``(text, label)`` samples.

    Args:
        df: DataFrame with a ``text`` column and a ``label`` column.

    Returns:
        A list with one ``(text, int(label))`` tuple per row, in row order.
        ``text`` is passed through unchanged; ``label`` is coerced with
        ``int()``.
    """
    # Iterate the two columns directly; ``iterrows`` would build a Series
    # object for every row only to read two fields from it.
    return [(text, int(label)) for text, label in zip(df['text'], df['label'])]

In [None]:
# Turn both splits into lists of (text, label) samples.
train_data, valid_data = map(load_data, (df_train, df_valid))

In [None]:
class data_generator(DataGenerator):
    """Batch generator producing fixed-length token-id batches with labels."""

    def __init__(self, data, batch_size=32, buffer_size=None, random=False):
        super().__init__(data, batch_size, buffer_size)
        # Remembered here and handed to __iter__ by forfit().
        self.random = random

    def __iter__(self, random=False):
        """Yield ``([token_ids], labels)`` batches for one pass over the data.

        ``random`` is forwarded to ``self.sample`` (presumably it toggles
        shuffling -- confirm against the bert4keras ``DataGenerator``).
        """
        ids_buffer, label_buffer = [], []
        for is_end, (text, label) in self.sample(random):
            # Bring every sequence to exactly ``maxlen`` items:
            # cut long ones, right-pad short ones with 0.
            if len(text) > maxlen:
                token_ids = text[:maxlen]
            else:
                token_ids = text + [0] * (maxlen - len(text))
            ids_buffer.append(token_ids)
            label_buffer.append([label])
            # Emit when the batch is full or the data is exhausted.
            if is_end or len(ids_buffer) == self.batch_size:
                ids_batch = sequence_padding(ids_buffer)
                label_batch = sequence_padding(label_buffer)
                yield [ids_batch], label_batch
                ids_buffer, label_buffer = [], []

    def forfit(self):
        """Yield batches endlessly, restarting ``__iter__`` after each pass."""
        while True:
            yield from self.__iter__(self.random)

In [None]:
# Training batches are drawn with random=True; validation keeps the default
# random=False.
train_generator = data_generator(train_data, batch_size=batch_size, random=True)
valid_generator = data_generator(valid_data, batch_size=batch_size)

In [None]:
def _encoder_block(x, head_num=16, ff_units=128, dropout_rate=0.01):
    """Apply one attention + feed-forward block to ``x`` and return the result.

    Layer order: MultiHeadAttention -> Dropout -> residual Add ->
    LayerNormalization -> Dropout -> FeedForward -> residual Add ->
    LayerNormalization.

    Args:
        x: input tensor of the block.
        head_num: ``head_num`` passed to ``MultiHeadAttention``.
        ff_units: first argument passed to ``FeedForward``.
        dropout_rate: rate of both ``Dropout`` layers.
    """
    attn = MultiHeadAttention(head_num=head_num)(x)
    attn = layers.Dropout(dropout_rate)(attn)
    attn = layers.Add()([x, attn])
    attn = LayerNormalization()(attn)
    attn = layers.Dropout(dropout_rate)(attn)
    ff = FeedForward(ff_units)(attn)
    out = layers.Add()([attn, ff])
    return LayerNormalization()(out)


def build_model(num_blocks=2):
    """Build and compile the classifier.

    Architecture: Embedding -> SpatialDropout1D -> ``num_blocks`` encoder
    blocks -> bidirectional LSTM -> concatenated global average/max pooling
    -> two Dense+BatchNormalization stages -> Dropout -> softmax over
    ``num_classes``.

    Args:
        num_blocks: number of stacked encoder blocks (default 2).

    Returns:
        A ``keras.Model`` compiled with sparse categorical cross-entropy,
        Adam(1e-4) and the accuracy metric.
    """
    inp = layers.Input(shape=(maxlen,))

    emb = layers.Embedding(
        input_dim=vocabulary_size,
        output_dim=embedding_dim,
        input_length=maxlen
    )(inp)
    emb = layers.SpatialDropout1D(rate=0.2)(emb)

    # Stack the encoder blocks; each one consumes the previous block's output.
    encoded = emb
    for _ in range(num_blocks):
        encoded = _encoder_block(encoded)

    lstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(encoded)

    # Summarise the sequence with both average and max pooling.
    avg_pool = layers.GlobalAveragePooling1D()(lstm)
    max_pool = layers.GlobalMaxPool1D()(lstm)
    x = layers.Concatenate()([avg_pool, max_pool])

    x = layers.Dense(128, activation='relu')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(64, activation='relu')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dropout(0.2)(x)

    out = layers.Dense(num_classes, activation='softmax')(x)
    model = keras.Model(inputs=inp, outputs=out)
    # Sparse loss: the generator supplies integer class ids, not one-hot rows.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(1e-4),
                  metrics=['accuracy'])

    return model

model = build_model()

In [None]:
# Print a summary of the model built above.
model.summary()

In [None]:
class Evaluator(keras.callbacks.Callback):
    """Callback that scores macro-F1 on ``valid_generator`` after every epoch.

    The score is stored in ``logs['val_f1']`` so that callbacks monitoring
    ``'val_f1'`` can read it, and the best score seen so far is kept in
    ``best_val_f1``.

    The base class is referenced as ``keras.callbacks.Callback`` rather than
    through the bare ``callbacks`` name, because that name is later rebound
    to the callback list and re-running this cell would otherwise fail.
    """

    def __init__(self):
        super().__init__()
        self.best_val_f1 = 0.

    def evaluate(self):
        """Return the macro-averaged F1 of ``self.model`` over ``valid_generator``."""
        y_true, y_pred = [], []
        for x, y in valid_generator:
            # data_generator wraps each label in a one-element list, so flatten
            # the batch to 1-D to match the argmax predictions.
            y_true.append(np.ravel(y))
            y_pred.append(self.model.predict(x).argmax(axis=1))
        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)
        return f1_score(y_true, y_pred, average='macro')

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate, track the best score and publish ``val_f1`` into ``logs``."""
        val_f1 = self.evaluate()
        if val_f1 > self.best_val_f1:
            self.best_val_f1 = val_f1
        # ``logs`` defaults to None; only publish when a dict was supplied.
        if logs is not None:
            logs['val_f1'] = val_f1
        print(f'val_f1: {val_f1:.5f}, best_val_f1: {self.best_val_f1:.5f}')

In [None]:
# The list keeps the name ``callbacks`` because the fit cell passes it by that
# name. That rebinds the imported ``callbacks`` module, so the callback classes
# are reached through ``keras.callbacks`` to keep this cell re-runnable.
# Evaluator comes first: it writes 'val_f1' into the epoch logs that the
# checkpoint and LR scheduler below monitor.
# NOTE(review): early stopping watches 'val_accuracy' while the other two
# watch 'val_f1' -- confirm this is intended.
callbacks = [
    Evaluator(),
    keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        mode='max',
        patience=5,
        verbose=1
    ),
    keras.callbacks.ModelCheckpoint(
        './models/model.h5',
        monitor='val_f1',
        save_weights_only=True,
        save_best_only=True,
        verbose=1,
        mode='max'
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_f1',
        factor=0.1,
        patience=2,
        verbose=1,
        mode='max',
        # ``min_delta`` is the current name of the deprecated ``epsilon`` argument.
        min_delta=1e-6
    )
]

In [None]:
# forfit() never terminates, so the number of batches per pass is given
# explicitly for both the training and the validation generator.
train_steps = len(train_generator)
valid_steps = len(valid_generator)

model.fit(
    train_generator.forfit(),
    steps_per_epoch=train_steps,
    validation_data=valid_generator.forfit(),
    validation_steps=valid_steps,
    epochs=100,
    callbacks=callbacks,
)

In [None]:
# load_data() reads a 'label' column, so the test frame gets a placeholder
# value of 0 before it is converted.
df_test['label'] = 0
test_data = load_data(df_test)
test_generator = data_generator(test_data, batch_size=batch_size)

In [None]:
# Iterate the finite generator once and predict batch by batch (the same
# pattern Evaluator.evaluate uses) instead of the deprecated
# ``predict_generator`` over the endless ``forfit()`` stream.
# The placeholder labels yielded alongside each batch are ignored.
result = np.concatenate([model.predict(x) for x, _ in test_generator])
result = result.argmax(axis=1)

In [None]:
# Overwrite the placeholder labels with the predictions and write only that
# column, without the index.
df_test['label'] = result
df_test[['label']].to_csv('submission.csv', index=False)