In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display

from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences

from keras.initializers import Constant
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.models import Model

In [5]:
# Load the data
# Path to the pre-trained GloVe vectors
EMBEDDING_FILE = 'words_vector/glove.840B.300d.txt'

# Label columns
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Read both CSV files, replacing missing values with a single space
train = pd.read_csv('data/train.csv').fillna(value=' ')
test = pd.read_csv('data/test.csv').fillna(value=' ')

# Keep the comment_text columns on their own
train_text, test_text = train['comment_text'], test['comment_text']

# Concatenate train and test comments (train first) for tokenization
all_text = pd.concat((train_text, test_text))

In [None]:
# Exploratory visualization of the raw data


In [34]:
# Data preprocessing
MAX_NUM_WORDS = 100000       # vocabulary cap passed to the Tokenizer
MAX_SEQUENCE_LENGTH = 200    # every comment is padded/truncated to this many tokens
EMBEDDING_DIM = 300          # width of one word vector (glove.840B.300d)

# Tokenize: fit the vocabulary on train + test comments, then map each
# comment to a list of integer word ids.
tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(all_text)
sequences = tokenizer.texts_to_sequences(all_text)
# Pad sequences to the same length; the result has shape
# (len(sequences), MAX_SEQUENCE_LENGTH).
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# A dictionary of words and their uniquely assigned integers
word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index))

# Summarize what was learned. Only a 10-item sample of each dictionary is
# printed: the full dictionaries hold one entry per unique token and would
# flood the notebook output.
print(list(tokenizer.word_counts.items())[:10])  # word -> number of occurrences
print(tokenizer.document_count)  # integer count of documents used to fit the Tokenizer
print(list(tokenizer.word_docs.items())[:10])  # word -> number of documents it appeared in

In [None]:
# Load the pre-trained GloVe word vectors into a dict: word -> float32 vector
embeddings_index = {}

# `with` guarantees the file is closed even if parsing a line raises.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line in f:
        # Split from the right so that exactly EMBEDDING_DIM numeric fields are
        # peeled off; a plain split() misparses glove.840B tokens that
        # themselves contain spaces (the float conversion then fails).
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # Blank or truncated line: no complete vector to store.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors in glove.840B.300d.' % len(embeddings_index))

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer id is i. Row 0 stays all-zero because word_index ids start at 1
# and pad_sequences pads with 0.
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        # id falls outside the matrix (beyond the vocabulary cap)
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Load the pre-trained word vectors into the Embedding layer.
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [None]:
# Build the CNN model
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three Conv1D stages; the first two are followed by max pooling and the
# last by global max pooling, then a dense layer.
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)

# One independent probability per label. The task is multi-label (a comment
# can carry several labels at once), so the output uses sigmoid rather than
# softmax: softmax would force the outputs to sum to 1, which contradicts
# the per-label binary_crossentropy loss below.
preds = Dense(len(class_names), activation='sigmoid')(x)

model = Model(inputs=sequence_input, outputs=preds)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model and score the test set
VALIDATION_SPLIT = 0.1   # fraction of the training rows held out for validation
SEED = 42                # fixed seed so the train/validation split is reproducible

# `data` was built from pd.concat([train_text, test_text]), so its first
# len(train) rows are the training comments and the remaining rows are the
# test comments.
num_train = len(train)
x_labeled = data[:num_train]
y_labeled = train[class_names].values
x_test = data[num_train:]

# Shuffle once, then carve off the validation rows.
shuffled = np.random.RandomState(SEED).permutation(num_train)
num_val = int(VALIDATION_SPLIT * num_train)
val_idx, train_idx = shuffled[:num_val], shuffled[num_val:]
x_train, y_train = x_labeled[train_idx], y_labeled[train_idx]
x_val, y_val = x_labeled[val_idx], y_labeled[val_idx]

# Start training
model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_val, y_val))
# Predicted values for the test comments
prediction = model.predict(x_test, batch_size=1024)

# Build the per-class probability table: one column per label, plus the id
submission = pd.DataFrame(prediction, columns=class_names)
submission.insert(0, 'id', test['id'].values)


In [None]:
# Write the submission frame to submission.csv, omitting the index column
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Per-label logistic regression: cross-validate, fit, and predict each label
# column, then write the predictions to submission.csv.
# NOTE(review): LogisticRegression, cross_val_score, train_features and
# test_features are not imported or defined in any cell visible in this
# notebook — confirm where they come from before running this cell.
# NOTE(review): this cell reassigns `submission` and writes 'submission.csv',
# the same file name the earlier export cell writes.
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})

# Iterate over the six labels in class_names
for class_name in class_names:
    # Training targets for this label column
    train_target = train[class_name]
    # Create the logistic regression classifier
    classifier = LogisticRegression(C=0.1, solver='sag')
    # Evaluate the model with the mean ROC-AUC over 3 cross-validation folds
    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    # Record the score
    scores.append(cv_score)
    
    print('CV score for class {} is {}'.format(class_name, cv_score))
    # Fit on the training features
    classifier.fit(train_features, train_target)
    # Store column 1 of predict_proba as this label's predicted score
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

# Save the output file
submission.to_csv('submission.csv', index=False)


In [None]:
# submission