<a href="https://colab.research.google.com/github/YapingWu/GoogleColab/blob/main/keras/CharCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

参考资料：
1.   https://blog.csdn.net/liuchonge/article/details/70947995
2.   https://github.com/mhjabreel/CharCnn_Keras

论文：Character-level Convolutional Networks for Text Classification

# 加载数据

In [None]:
import os

import pandas as pd
from keras.initializers import RandomNormal
from keras.layers import Embedding, Dense, Conv1D, MaxPool1D, AlphaDropout, Flatten
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [None]:
# Global configuration shared by the data-loading, split and model cells.
SEED = 7  # random_state handed to train_test_split
# One '/content/<name>.txt' file per entry; the list position is the class label.
data_names = ['myspace', 'phpbb', 'rockyou']
# Characters that receive their own integer code.
# NOTE: '-' appears twice in this string, so the code of its first occurrence
# is never produced by char_to_int (the later occurrence wins in the dict).
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
alphabet_size = len(alphabet)
length = 100  # fixed sequence length used by pad_sequences and the Embedding layer
no_of_classes = 3
# Codes start at 1; code 0 is reserved (padding and out-of-alphabet characters).
char_to_int = {ch: code for code, ch in enumerate(alphabet, start=1)}
int_to_char = {code: ch for code, ch in enumerate(alphabet, start=1)}

In [None]:
# Load every password list and label each row with the position of its
# dataset in `data_names`.
data_list = []
for index, data_name in enumerate(data_names):
    print('数据集名称：%s' % data_name)
    file_name = '/content/' + data_name + '.txt'
    # dtype=str and na_filter=False keep each password exactly as written:
    # without them pandas infers types, so all-digit passwords can turn into
    # numbers and values such as "NA" or "null" become NaN (later encoded as
    # the literal text "nan").
    # NOTE(review): read_csv still splits on ',' and honours '"' quoting, so
    # passwords containing those characters may not load verbatim -- confirm
    # against the raw files.
    for df_tmp in pd.read_csv(file_name, header=None, names=['pwd'], dtype=str,
                              na_filter=False, chunksize=5000000):
        df_tmp['category'] = index
        print('loading... %s' % df_tmp.shape[0])
        data_list.append(df_tmp)
data = pd.concat(data_list, ignore_index=True)

print('对输入输出进行编码...')
# Encode x: lower-case each password and map every character to its integer
# code; characters outside the alphabet map to 0. A dict lookup replaces the
# linear `char in alphabet` scan and yields the same codes.
x = data['pwd'].apply(lambda s: [char_to_int.get(char, 0) for char in str(s).lower()])
# Pad at the end with zeros up to `length` columns.
x = pad_sequences(x, length, padding='post')
# One-hot encode the dataset index as the target.
y = to_categorical(data['category'], num_classes=no_of_classes, dtype='int32')
print("x Shape: %s, y Shape: %s" % (x.shape, y.shape))

In [None]:
# Split into training, validation and test sets.
# `logger` is never defined in this notebook, so the message is printed like
# every other progress message here.
print("划分训练集、验证集和测试集")
# First hold out 20% of the rows, then cut that hold-out in half:
# 80% train, 10% validation, 10% test. Both splits use the same SEED.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=SEED)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=SEED)

print("x_train Shape: %s, y_train Shape: %s" % (x_train.shape, y_train.shape))
print("x_val Shape: %s, y_val Shape: %s" % (x_val.shape, y_val.shape))
print("x_test Shape: %s, y_test Shape: %s" % (x_test.shape, y_test.shape))

# 创建和训练模型

In [None]:
# Hyper-parameters for training and for the network built in the model cell.
batch_size = 128
epochs = 2
# One [filters, kernel_size, pool_size] triple per convolution layer;
# a pool_size of None means no max-pooling follows that convolution.
conv_layers = (
    [[256, 7, 3] for _ in range(2)]         # two wide convolutions, each pooled
    + [[256, 3, None] for _ in range(3)]    # three narrow convolutions, no pooling
    + [[256, 3, 3]]                         # final narrow convolution, pooled
)
fully_layers = [1024] * 2  # units of each dense layer before the softmax output
embedding_size = 50
th = 1e-6  # not referenced by any other cell visible in this notebook
dropout_p = 0.5
optimizer = 'adam'
initializer_stddev = 0.05
loss = 'categorical_crossentropy'

In [None]:
print('创建CNN模型...')
model = Sequential()
# Embedding layer: one trainable vector per character code, plus code 0.
model.add(Embedding(input_dim=alphabet_size + 1, output_dim=embedding_size, input_length=length))
# Convolution layers
for num_filters, filter_width, pool_size in conv_layers:
    # padding='same' stops each convolution from shortening the time axis.
    # With the default 'valid' padding and length = 100 the axis shrinks
    # 100 -> 94 -> 31 -> 25 -> 8 -> 6 -> 4 -> 2, which leaves the last
    # kernel-3 convolution without a valid output, so the model cannot be built.
    model.add(Conv1D(filters=num_filters,
                     kernel_size=filter_width,
                     padding='same',
                     kernel_initializer=RandomNormal(mean=0, stddev=initializer_stddev),
                     activation='tanh'))
    if pool_size is not None:
        model.add(MaxPool1D(pool_size))
# Fully connected layers
model.add(Flatten())
for units in fully_layers:
    model.add(Dense(units, activation='selu', kernel_initializer='lecun_normal'))
    model.add(AlphaDropout(dropout_p))
model.add(Dense(no_of_classes, activation='softmax'))

# Build and compile model
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
print('创建模型：\n')
model.summary()

print('开始训练CNN模型')
model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          epochs=epochs,
          batch_size=batch_size,
          verbose=1)
# Keep the test loss in its own name: assigning it to `loss` would overwrite
# the loss-function name that model.compile() reads and break a re-run of
# this cell.
test_loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
print("Model Accuracy on test: %.2f%%, Loss: %.2f" % (accuracy * 100, test_loss))
model_file = './model/cnn.h5'
# Create the target directory first; saving fails if it does not exist.
os.makedirs(os.path.dirname(model_file), exist_ok=True)
print("保存模型：%s" % model_file)
model.save(model_file)

# 其他命令

In [None]:
# Extract the uploaded archive /content/raw_data.zip into the current working directory.
# NOTE(review): presumably this archive holds the <name>.txt files read by the
# data-loading cell, in which case it must be run before that cell -- confirm.
!unzip '/content/raw_data.zip'