<a href="https://colab.research.google.com/github/YapingWu/GoogleColab/blob/main/keras/CharCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

参考资料：
1.   https://blog.csdn.net/liuchonge/article/details/70947995
2.   https://github.com/mhjabreel/CharCnn_Keras

论文：Character-level Convolutional Networks for Text Classification

# 加载数据

In [1]:
import os
import time

import pandas as pd
from keras.callbacks import TensorBoard
from keras.initializers import RandomNormal
from keras.layers import Embedding, Dense, Conv1D, MaxPool1D, AlphaDropout, Flatten
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# --- Global configuration ---------------------------------------------------
SEED = 7                 # random_state used when shuffling the merged data
data_size = 40000        # maximum number of rows kept per source file
no_of_classes = 3        # number of target classes for the one-hot labels
length = 100             # fixed length every encoded password is padded/truncated to
data_names = ['myspace', 'phpbb', 'rockyou']

# Character vocabulary. Note that '-' occurs twice in this string, so
# alphabet_size counts it twice and one index ends up unused in char_to_int.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
alphabet_size = len(alphabet)

# Lookup tables between characters and 1-based indices (index 0 is reserved).
char_to_int = {c: i for i, c in enumerate(alphabet, start=1)}
int_to_char = {i: c for i, c in enumerate(alphabet, start=1)}

In [7]:

def encode_xy(x, y):
    """Encode passwords as padded index sequences and labels as one-hot rows.

    Args:
        x: pandas Series of passwords; each value is converted with str() and
            lower-cased before encoding.
        y: class labels accepted by to_categorical.

    Returns:
        (x_, y_): x_ is the result of pad_sequences with maxlen ``length`` and
        post-padding; y_ is the int32 one-hot encoding with ``no_of_classes``
        columns.
    """
    def _to_indices(s):
        # Characters outside the alphabet map to the reserved index 0.
        return [char_to_int.get(ch, 0) for ch in str(s).lower()]

    sequences = x.apply(_to_indices)
    x_ = pad_sequences(sequences, maxlen=length, padding='post')
    y_ = to_categorical(y, num_classes=no_of_classes, dtype='int32')
    return x_, y_

def generate_data(data, data_ratio=0.6):
    """Endlessly yield encoded mini-batches from the leading part of ``data``.

    Args:
        data: DataFrame with a 'pwd' column and a 'category' column.
        data_ratio: fraction of the rows, counted from the top, to cycle over.

    Yields:
        (x, y) as returned by encode_xy for consecutive slices of
        ``batch_size`` rows (``batch_size`` is read from module level). After
        the last slice the generator wraps around to the first one.

    Raises:
        ValueError: if the selected portion contains no rows.
    """
    print('生成数据...')
    n = int(data.shape[0] * data_ratio)
    data = data.iloc[:n]
    if data.shape[0] == 0:
        raise ValueError('generate_data: no rows selected (data_ratio={})'.format(data_ratio))
    # Ceiling division. The previous `int(rows / batch_size) + 1` produced an
    # extra, empty batch whenever rows was an exact multiple of batch_size.
    num_batch = (data.shape[0] + batch_size - 1) // batch_size
    i = 0
    while True:
        cur_data = data.iloc[batch_size * i: batch_size * (i + 1)]
        # Encode the inputs and labels of the current slice.
        x, y = encode_xy(cur_data['pwd'], cur_data['category'])
        i = (i + 1) % num_batch
        yield x, y

def load_data():
    """Read the password lists, label them by source and return them shuffled.

    For every name in ``data_names`` the file '/content/<name>.txt' is read as
    a single 'pwd' column, a 'category' column holding the position of the
    name in ``data_names`` is added, and at most ``data_size`` leading rows
    are kept.

    Returns:
        DataFrame with columns 'pwd' and 'category', shuffled with
        random_state=SEED (the shuffled index is kept as-is).
    """
    print('开始加载数据...')
    data_list = []
    for index, data_name in enumerate(data_names):
        file_name = '/content/{}.txt'.format(data_name)
        # dtype=str keeps all-numeric passwords verbatim (no loss of leading
        # zeros through numeric parsing); na_filter=False keeps literal
        # passwords such as "NA", "null" or "nan" instead of turning them
        # into NaN.
        df_tmp = pd.read_csv(file_name, header=None, names=['pwd'],
                             dtype=str, na_filter=False)
        df_tmp['category'] = index
        data_list.append(df_tmp[:data_size])  # keep only the configured number of rows
        print('{}：{}'.format(data_name, df_tmp.shape[0]))

    data = pd.concat(data_list, ignore_index=True)  # merge all sources
    data = data.sample(frac=1.0, random_state=SEED)  # shuffle
    print('训练模型的总数据量：{}'.format(data.shape[0]))
    return data

# 创建和训练模型

In [9]:
# Hyperparameters

# Training schedule
batch_size = 128
epochs = 10
optimizer = 'adam'
loss = 'categorical_crossentropy'

# Network architecture.
# Each conv_layers entry is [number of filters, kernel width, pool size];
# a pool size of None means no pooling layer follows that convolution.
embedding_size = 50
conv_layers = [
    [256, 7, 3],
    [256, 7, 3],
    [256, 3, None],
    [256, 3, None],
    [256, 3, None],
    [256, 3, 3],
]
fully_layers = [1024, 1024]
dropout_p = 0.5
initializer_stddev = 0.05
th = 1e-6

# TensorBoard log directory, made unique per run with the current timestamp.
log_name = './logs/{}'.format(time.time())

In [4]:
# Load the merged, shuffled data set, then display its (rows, columns) shape.
data = load_data()  # load the data
data.shape

开始加载数据...
myspace：37144
phpbb：184379
rockyou：14339373
训练模型的总数据量：117144


(117144, 2)

In [5]:
# Carve validation and test sets out of the shuffled data.
# The leading train_ratio share of the rows is left for training; the
# remainder is split evenly: validation rows follow the training rows
# directly, test rows are taken from the very end.
train_ratio = 0.8
print("划分验证集和测试集")
train_size = int(len(data) * train_ratio)
test_size = int(len(data) * (1 - train_ratio) / 2)

val_data = data.iloc[train_size:train_size + test_size]
test_data = data.iloc[-test_size:]

# Encode both hold-out sets once, up front.
x_val, y_val = encode_xy(val_data['pwd'], val_data['category'])
x_test, y_test = encode_xy(test_data['pwd'], test_data['category'])

print("x_val Shape: %s, y_val Shape: %s" % (x_val.shape, y_val.shape))
print("x_test Shape: %s, y_test Shape: %s" % (x_test.shape, y_test.shape))

划分验证集和测试集
x_val Shape: (11714, 100), y_val Shape: (11714, 3)
x_test Shape: (11714, 100), y_test Shape: (11714, 3)


In [None]:
print('创建CNN模型...')
model = Sequential()
# Embedding layer: index 0 is reserved (padding / characters outside the
# alphabet), hence alphabet_size + 1 input indices.
model.add(Embedding(input_dim=alphabet_size + 1, output_dim=embedding_size, input_length=length))
# Convolution layers: one Conv1D per conv_layers entry, optionally followed
# by max pooling when a pool size is given.
for num_filters, filter_width, pool_size in conv_layers:
    model.add(Conv1D(filters=num_filters,
                     kernel_size=filter_width,
                     # A fresh initializer per layer, driven by the
                     # initializer_stddev hyperparameter; recent Keras versions
                     # warn that a reused unseeded instance returns identical
                     # values on every call.
                     kernel_initializer=RandomNormal(mean=0.0, stddev=initializer_stddev),
                     activation='tanh',
                     padding='same'))
    if pool_size is not None:
        model.add(MaxPool1D(pool_size))
# Fully connected layers
model.add(Flatten())
for units in fully_layers:
    model.add(Dense(units, activation='selu', kernel_initializer='lecun_normal'))
    model.add(AlphaDropout(dropout_p))
model.add(Dense(no_of_classes, activation='softmax'))

# Build and compile model
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
print('创建模型：')
model.summary()

print('开始训练CNN模型')
tensorboard = TensorBoard(log_dir=log_name)
# One epoch = one pass over the training share only. The generator below
# cycles over the first train_ratio of the rows, so the step count must be
# derived from that share, not from the whole data set (ceiling division).
num_batch = (int(data.shape[0] * train_ratio) + batch_size - 1) // batch_size
# Start training
model.fit(generate_data(data, data_ratio=train_ratio),
          validation_data=(x_val, y_val),
          steps_per_epoch=num_batch,
          epochs=epochs,
          verbose=1,
          callbacks=[tensorboard])
# Separate names for the evaluation results: assigning to `loss` here would
# overwrite the loss-function hyperparameter and break a re-run of this cell.
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print("Model Accuracy on test: %.2f%%, Loss: %.2f" % (test_accuracy * 100, test_loss))


创建CNN模型...
创建模型：
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           3450      
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 100, 256)          89856     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 33, 256)           0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 33, 256)           459008    
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 11, 256)           0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 11, 256)           196864    
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 1

In [None]:
# Persist the trained model as an HDF5 file.
model_file = './model/cnn.h5'
# Make sure the target directory exists; saving fails if './model' is missing.
os.makedirs(os.path.dirname(model_file), exist_ok=True)
print("保存模型：%s" % model_file)
model.save(model_file)

# 其他命令

In [1]:
# Unpack the raw password lists (phpbb.txt, rockyou.txt, myspace.txt per the listing below).
# load_data() reads '/content/<name>.txt', so this has to run before the data-loading cell.
# NOTE(review): extraction goes to the current working directory — presumably /content on Colab; confirm.
!unzip '/content/raw_data.zip'

Archive:  /content/raw_data.zip
  inflating: phpbb.txt               
  inflating: rockyou.txt             
  inflating: myspace.txt             
