# 基于文本卷积网络的Baseline
* 线上0.9258

In [None]:
# Install the required packages (use the Aliyun PyPI mirror when installing extensions)
!pip install pandas==0.23.4 -i "https://mirrors.aliyun.com/pypi/simple/"
!pip install numpy==1.19.0 -i "https://mirrors.aliyun.com/pypi/simple/"
!pip install wrapt --ignore-installed -i "https://mirrors.aliyun.com/pypi/simple/"
!pip install tensorflow==1.14.0 -i "https://mirrors.aliyun.com/pypi/simple/"
!pip install keras==2.3.1 -i "https://mirrors.aliyun.com/pypi/simple/"
!pip install bert4keras -i "https://mirrors.aliyun.com/pypi/simple/"

In [None]:
import numpy as np
import pandas as pd

from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam

from bert4keras.snippets import sequence_padding, DataGenerator

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# EDA

In [None]:
%%time
# Load the training set and test set A; both files are tab-separated.
df_train = pd.read_csv('datalab/72510/train_set.csv', sep='\t')
df_test = pd.read_csv('datalab/72510/test_a.csv', sep='\t')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

## 类别分布
* 科技类的样本数量最多：38918
* 星座类的样本数量最少：908

In [None]:
# Count the training samples per class label to inspect class balance.
df_train['label'].value_counts()

In [None]:
%%time
# Turn each whitespace-separated text string into a list of integer token ids.
df_train['text'] = df_train['text'].apply(lambda raw: [int(tok) for tok in raw.split()])
df_test['text'] = df_test['text'].apply(lambda raw: [int(tok) for tok in raw.split()])

## 文本长度分布
* 训练集有20万条文本，测试集a有5万条文本
* 训练集和测试集a的文本长度分布基本一致，应该是从相同分布的数据集中划分出来的
* 训练集：文本长度的均值约为908字符，中值为676字符，最短文本为2字符，最长文本为57921字符
* 测试集a：文本长度的均值约为910字符，中值为676字符，最短文本为14字符，最长文本为41861字符

In [None]:
# Summary statistics of the per-text token count in the training set.
df_train['text'].map(len).describe()

In [None]:
# Summary statistics of the per-text token count in the test set.
df_test['text'].map(len).describe()

## 构造词典
* 总共6869个字符，从0到7549，中间不连续，有缺失
* 出现频率最高的字符Top10：3750，648，900，3370，6122，4464，7399，4939，3659，4811

In [None]:
%%time
# Count how often each token id occurs across the texts.
# NOTE(review): only df_test is scanned here; confirm whether df_train
# should be counted as well.
vocab = {}
for tokens in df_test['text']:
    for token_id in tokens:
        vocab[token_id] = vocab.get(token_id, 0) + 1

In [None]:
# Number of distinct token ids that were counted.
len(vocab)

In [None]:
# Order the (token id, count) pairs by token id; ids are unique dict keys,
# so the natural tuple ordering is decided by the id alone.
chars = sorted(vocab.items())

In [None]:
# The ten smallest token ids with their counts.
chars[:10]

In [None]:
# The ten largest token ids with their counts.
chars[-10:]

In [None]:
# Rank the (token id, count) pairs from most to least frequent and show the
# ten most common ones. Counts are integers, so sorting on the negated count
# gives a descending order while keeping ties in their original order.
chars = sorted(vocab.items(), key=lambda item: -item[1])
chars[:10]

# 训练

## 超参数

In [None]:
# Hyperparameters and global settings for the TextCNN baseline.
SEED = 2020  # random_state of the train/validation split
num_classes = 14  # number of units in the softmax output layer
vocabulary_size = 7600  # Embedding input_dim; presumably chosen to exceed the largest token id — TODO confirm

maxlen = 1024  # every text is truncated or zero-padded to this many tokens
batch_size = 512  # samples per batch for all data generators
embedding_dim = 256  # size of each token embedding vector
num_filters = 512  # number of filters in each convolution branch
filter_sizes = [3, 4, 5]  # convolution window heights in tokens, one branch per size
drop = 0.5  # dropout rate applied before the output layer
lr = 1e-4  # learning rate passed to Adam
epochs = 20  # maximum number of training epochs (early stopping may end training sooner)

## 加载数据

In [None]:
# Hold out 20% of the training data for validation, seeded with SEED.
# NOTE(review): the split is not stratified although the class sizes differ a lot;
# consider passing `stratify=` — confirm before changing, it alters the split.
# NOTE: this rebinds df_train, so re-running the cell splits the already-reduced frame again.
df_train, df_valid = train_test_split(df_train, test_size=0.2, random_state=SEED)

In [None]:
def load_data(df):
    """Convert a DataFrame into a list of ``(text, label)`` samples.

    Args:
        df: DataFrame with a ``text`` column and a ``label`` column.

    Returns:
        A list with one ``(text, int(label))`` tuple per row, in row order.
    """
    # Zip the two columns directly: iterrows() builds a Series object for
    # every row, which is needlessly slow on a large frame.
    return [(text, int(label)) for text, label in zip(df['text'], df['label'])]

In [None]:
# Convert both splits into lists of (text, label) samples for the generators.
train_data = load_data(df_train)
valid_data = load_data(df_valid)

In [None]:
class data_generator(DataGenerator):
    """Batch generator yielding ``([token_ids], labels)`` pairs.

    Every text is cut or zero-padded to ``maxlen`` tokens before batching.
    """

    def __init__(self, data, batch_size=32, buffer_size=None, random=False):
        # Keep the `random` flag so that forfit() can forward it to __iter__().
        super().__init__(data, batch_size, buffer_size)
        self.random = random

    def __iter__(self, random=False):
        """Yield one batch at a time; `random` is passed on to self.sample()."""
        token_rows, label_rows = [], []
        for is_end, (text, label) in self.sample(random):
            # Pad with zeros up to maxlen, then slice: a negative repeat count
            # gives an empty list, so over-long texts are simply truncated.
            padded = text + [0] * (maxlen - len(text))
            token_rows.append(padded[:maxlen])
            label_rows.append([label])
            batch_full = len(token_rows) == self.batch_size
            if batch_full or is_end:
                yield [sequence_padding(token_rows)], sequence_padding(label_rows)
                token_rows, label_rows = [], []

    def forfit(self):
        """Loop over the data forever, as needed for generator-based training."""
        while True:
            yield from self.__iter__(self.random)

In [None]:
train_generator = data_generator(train_data, batch_size, random=True)
valid_generator = data_generator(valid_data, batch_size)

## 构建模型（文本卷积网络）

In [None]:
# Input: a fixed-length sequence of token ids.
inputs = Input(shape=(maxlen,), dtype='int32')

# Embedding layer, reshaped to add a trailing channel axis for Conv2D.
embedding = Embedding(
    input_dim=vocabulary_size,
    output_dim=embedding_dim,
    input_length=maxlen
)(inputs)
reshape = Reshape((maxlen, embedding_dim, 1))(embedding)

# Convolution layers: one branch per filter size, each kernel spanning the
# full embedding width. The settings shared by all branches are kept in one place.
conv_kwargs = dict(padding='valid', kernel_initializer='normal', activation='relu')
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), **conv_kwargs)(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), **conv_kwargs)(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), **conv_kwargs)(reshape)

# Pooling layers: each pool window covers all maxlen - k + 1 positions that a
# 'valid' convolution with window height k produces, i.e. max over the sequence.
pool_kwargs = dict(strides=(1, 1), padding='valid')
maxpool_0 = MaxPool2D(pool_size=(maxlen - filter_sizes[0] + 1, 1), **pool_kwargs)(conv_0)
maxpool_1 = MaxPool2D(pool_size=(maxlen - filter_sizes[1] + 1, 1), **pool_kwargs)(conv_1)
maxpool_2 = MaxPool2D(pool_size=(maxlen - filter_sizes[2] + 1, 1), **pool_kwargs)(conv_2)

# Output layers: join the pooled branches, flatten, apply dropout, classify.
concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=num_classes, activation='softmax')(dropout)

model = Model(inputs=inputs, outputs=output)
model.summary()

# The sparse loss is used because the generators yield integer class ids
# rather than one-hot vectors.
optimizer = Adam(lr=lr)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

## 回调函数

In [None]:
class Evaluator(Callback):
    """Compute the macro-F1 on the validation data after every epoch.

    The score is written into the epoch ``logs`` under ``val_f1`` so that
    callbacks listed after this one can monitor it. The best score seen so
    far is kept in ``best_val_f1``.
    """

    def __init__(self):
        super().__init__()
        self.best_val_f1 = 0.

    def evaluate(self):
        """Return the macro-averaged F1 of ``self.model`` over ``valid_generator``."""
        y_true, y_pred = [], []
        for x, y in valid_generator:
            # Labels are batched as single-element rows; flatten them so they
            # line up with the 1-D argmax predictions.
            y_true.append(np.asarray(y).ravel())
            y_pred.append(self.model.predict(x).argmax(axis=1))
        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)
        return f1_score(y_true, y_pred, average='macro')

    def on_epoch_end(self, epoch, logs=None):
        """Score the model, track the best F1 and publish ``val_f1`` in ``logs``."""
        val_f1 = self.evaluate()
        self.best_val_f1 = max(self.best_val_f1, val_f1)
        # `logs` defaults to None; writing into it unconditionally would raise
        # a TypeError when the hook is called without a logs dict.
        if logs is not None:
            logs['val_f1'] = val_f1
        print(f'val_f1: {val_f1:.5f}, best_val_f1: {self.best_val_f1:.5f}')

In [None]:
# Evaluator has to come first: it writes `val_f1` into the epoch logs, which
# the ModelCheckpoint below monitors.
# NOTE(review): early stopping watches `val_loss` while the checkpoint selects
# on `val_f1` — confirm that mixing the two criteria is intended.
evaluator = Evaluator()
early_stopping = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
checkpoint = ModelCheckpoint(
    'best_model.weights',
    monitor='val_f1',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)
callbacks = [evaluator, early_stopping, checkpoint]

## 拟合模型

In [None]:
# The generators loop forever, so the number of batches per pass has to be
# given explicitly for both training and validation.
train_steps = len(train_generator)
valid_steps = len(valid_generator)
model.fit(
    train_generator.forfit(),
    steps_per_epoch=train_steps,
    validation_data=valid_generator.forfit(),
    validation_steps=valid_steps,
    epochs=epochs,
    callbacks=callbacks
)

# 预测

## 加载数据

In [None]:
# load_data() reads a `label` column, so give the unlabeled test frame a dummy one.
df_test['label'] = 0
test_data = load_data(df_test)
# `random` is left at its default (False); presumably this keeps the batches
# in row order so predictions line up with df_test — confirm.
test_generator = data_generator(test_data, batch_size)

## 模型预测

In [None]:
# Restore the weights that ModelCheckpoint saved for the best validation F1;
# without this the prediction would use whatever weights the last epoch left.
model.load_weights('best_model.weights')
# Class probabilities for every test sample, one row per sample.
result = model.predict_generator(test_generator.forfit(), steps=len(test_generator))
# Keep only the most probable class id per sample.
result = result.argmax(axis=1)

## 将结果处理为提交的格式

In [None]:

# Replace the dummy labels with the predictions and write only the `label`
# column (without the index) to the submission file.
df_test['label'] = result
df_test.to_csv('submission.csv', index=False, columns=['label'])