In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display

from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.models import Model

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

  'Matplotlib is building the font cache using fc-list. '
Using MXNet backend


In [2]:
# 数据读取
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# read data
train = pd.read_csv('data/train.csv').fillna(' ')
test = pd.read_csv('data/test.csv').fillna(' ')
submission = pd.read_csv('data/sample_submission.csv')

# 单独保存comment_text
train_text = train['comment_text']
test_text = test['comment_text']
# 获得y_train
y_train = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

# 连接所有文字用于分词
all_text = pd.concat([train_text, test_text], axis = 0, ignore_index = True)
# glove预训练数据
EMBEDDING_FILE = 'words_vector/glove.840B.300d.txt'

In [3]:
y_train.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,1,1,1,0,1,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [4]:
# 原始数据可视化分析
# all_text = all_text[:100]

In [5]:
# 数据预处理
MAX_NUM_WORDS = 100000
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 300

# 分词
tokenizer = text.Tokenizer(num_words = MAX_NUM_WORDS)
tokenizer.fit_on_texts(all_text)
sequences = tokenizer.texts_to_sequences(all_text)

# 分词完成
# Pads sequences to the same length， return(len(sequence, maxlen))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)


# A dictionary of words and their uniquely assigned integers
word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index))

# summarize what was learned
# print(tokenizer.word_counts) # A dictionary of words and their counts
# print(tokenizer.document_count) # A dictionary of words and how many documents each appeared in.
# print(tokenizer.word_docs) # An integer count of the total number of documents that were used to fit the Tokenizer.

Number of Unique Tokens 394787


In [6]:
# 重塑train与test数据
x_train = data[:len(train_text)]
x_test = data[len(train_text):]

# 拆分train数据
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 255)

In [8]:
# 加入Glove预训练词
embeddings_index = {}

# 读取glove文件
f = open(EMBEDDING_FILE, encoding = 'utf-8')
for line in f:

    # 按空格分词
    values = line.split()
    word = ''.join(values[:-300])
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index[word] = coefs
# 关闭glove文件
f.close()

print('Total %s word vectors in glove.840B.300d.' % len(embeddings_index))

Total 2195892 word vectors in glove.840B.300d.


In [11]:
from keras.initializers import Constant

# 生成embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# 载入预训练词向量作为Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [13]:
# 构建CNN模型
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype = 'int32')
embedded_sequence = embedding_layer(sequence_input)

x = Conv1D(128, 5, activation='relu')(embedded_sequence)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)

preds = Dense(6, activation='softmax')(x)

model = Model(inputs=sequence_input, outputs=preds)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
# 开始训练
model.fit(x_train, y_train, batch_size=256, epochs=2, validation_data=(x_val, y_val), verbose = 2)

# 计算验证集团数据得分
y_val_pred = model.predict(x_val, verbose = 2)
score = roc_auc_score(y_val, y_val_pred)
print("Validation data ROC-AUC score: ", score)

Train on 127656 samples, validate on 31915 samples
Epoch 1/2


  force_init=force_init)


 - 104s - loss: 0.2274 - acc: 0.9633 - val_loss: 0.2234 - val_acc: 0.9642
Epoch 2/2
 - 37s - loss: 0.2231 - acc: 0.9640 - val_loss: 0.2215 - val_acc: 0.9647


NameError: name 'y_pred' is not defined

In [22]:
submission = pd.read_csv('data/sample_submission.csv')

In [24]:
# test数据预测值
y_prediction = model.predict(x_test, batch_size =1024)

# 生成语言各分类概率
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_prediction

# 输出submission.csv
submission.to_csv('submission.csv', index=False)