In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import re

from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from nltk import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, SpatialDropout1D, concatenate
from keras.layers import Bidirectional, GRU
from keras.models import Model
from keras.callbacks import Callback

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [2]:
# 数据读取
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# read data
train = pd.read_csv('data/train.csv').fillna(' ')
test = pd.read_csv('data/test.csv').fillna(' ')
submission = pd.read_csv('data/sample_submission.csv')

# 单独保存comment_text
train_text = train['comment_text'].str.lower()
test_text = test['comment_text'].str.lower()
# 获得y_train
y_train = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

# 连接所有文字用于分词
all_text = pd.concat([train_text, test_text], axis = 0, ignore_index = True)
# glove预训练数据
EMBEDDING_FILE = 'words_vector/glove.840B.300d.txt'

In [3]:
y_train.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,1,1,1,0,1,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [4]:
# 原始数据可视化分析
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
"""
# 数据预处理
# all_text = all_text[:100]
def lemmatize_all(sentence):
    wnl = WordNetLemmatizer()
    for word, tag in pos_tag(word_tokenize(sentence)):
        if tag.startswith("NN"):
            yield wnl.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            yield wnl.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            yield wnl.lemmatize(word, pos='a')
        elif tag.startswith('R'):
            yield wnl.lemmatize(word, pos='r')
            
        else:
            yield word

def text_cleaned_process(text_raw):    
    text_raw = str(text_raw)
    text_raw = str(text_raw.lower())
    text_raw = re.sub(r'[^a-zA-Z]', ' ', text_raw)
    
    words = text_raw.split()
    
    # 移除长度小于3的词语
    words2 = []
    for i in words:
        if len(i) >= 0:
            words2.append(i)
    # 去停止词
    stops = set(stopwords.words('english'))
    
    result_text = []
    result_text = " ".join([w for w in words2 if not w in stops])
    
    return(" ".join(lemmatize_all(result_text)))

# 去掉数字
all_text.replace({r'[^\x00-\x7F]+':''},regex=True,inplace=True)

num_all_text = all_text.size

# 输出清洗后的数据
all_text_cleaned = []

for i in range(0, num_all_text):
    all_text_cleaned.append(text_cleaned_process(all_text[i]))

# 构建pd形式
all_text_cleaned = pd.Series(all_text_cleaned)
    

special_character_removal=re.compile(r'[^a-z\d ]',re.IGNORECASE)
replace_numbers=re.compile(r'\d+',re.IGNORECASE)

def text_to_words(text):
    # 去掉一些特殊字符
    text = text.split()
    text = ' '.join(text)
    
    # 进一步去掉特殊字母
    text=special_character_removal.sub('',text)
    
    # 用空格代替数字
    text=replace_numbers.sub('', text)
    
    # 去停止词
    stops = set(stopwords.words("english"))
    text = text.split()
    text = [w for w in text if not w in stops]

    # 去时态后缀
    '''
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    '''
    text = ' '.join(text)
     
    return text

# 输出清洗后数据
all_text_cleaned = []
for text in all_text:
    all_text_cleaned.append(text_to_words(text))

# 构建pd格式
all_text_cleaned = pd.Series(all_text_cleaned)
"""

'\n# 数据预处理\n# all_text = all_text[:100]\ndef lemmatize_all(sentence):\n    wnl = WordNetLemmatizer()\n    for word, tag in pos_tag(word_tokenize(sentence)):\n        if tag.startswith("NN"):\n            yield wnl.lemmatize(word, pos=\'n\')\n        elif tag.startswith(\'VB\'):\n            yield wnl.lemmatize(word, pos=\'v\')\n        elif tag.startswith(\'JJ\'):\n            yield wnl.lemmatize(word, pos=\'a\')\n        elif tag.startswith(\'R\'):\n            yield wnl.lemmatize(word, pos=\'r\')\n            \n        else:\n            yield word\n\ndef text_cleaned_process(text_raw):    \n    text_raw = str(text_raw)\n    text_raw = str(text_raw.lower())\n    text_raw = re.sub(r\'[^a-zA-Z]\', \' \', text_raw)\n    \n    words = text_raw.split()\n    \n    # 移除长度小于3的词语\n    words2 = []\n    for i in words:\n        if len(i) >= 0:\n            words2.append(i)\n    # 去停止词\n    stops = set(stopwords.words(\'english\'))\n    \n    result_text = []\n    result_text = " ".join([w for w

In [8]:
# 原始数据可视化分析
# all_text = all_text[:100]
# print("Len of all_text_cleaned:", len(all_text_cleaned))

# print("Text[0] before cleaned: \n", all_text[6])
# print("Text[0] after cleaned: \n", all_text_cleaned[6])

In [9]:
# 数据预处理
MAX_NUM_WORDS = 100000
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 300

# 分词
tokenizer = Tokenizer(num_words = MAX_NUM_WORDS)
tokenizer.fit_on_texts(all_text)
sequences = tokenizer.texts_to_sequences(all_text)

# 分词完成
# Pads sequences to the same length， return(len(sequence, maxlen))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)


# A dictionary of words and their uniquely assigned integers
word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index))

# summarize what was learned
# print(tokenizer.word_counts) # A dictionary of words and their counts
# print(tokenizer.document_count) # A dictionary of words and how many documents each appeared in.
# print(tokenizer.word_docs) # An integer count of the total number of documents that were used to fit the Tokenizer.

Number of Unique Tokens 394787


In [10]:
# 重塑train与test数据
x_train = data[:len(train_text)]
x_test = data[len(train_text):]

print("Len of x_train: ", len(x_train))
print("Len of y_train: ", len(y_train))

# 拆分train数据
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.15, random_state = 255)

Len of x_train:  159571
Len of y_train:  159571


In [11]:
# 加入Glove预训练词
embeddings_index = {}

# 读取glove文件
f = open(EMBEDDING_FILE, encoding = 'utf-8')
for line in f:

    # 按空格分词
    values = line.split()
    word = ''.join(values[:-300])
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index[word] = coefs
# 关闭glove文件
f.close()

print('Total %s word vectors in glove.840B.300d.' % len(embeddings_index))

Total 2195892 word vectors in glove.840B.300d.


In [12]:
# 生成embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# 载入预训练词向量作为Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [13]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [14]:
# 构建CNN模型
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype = 'int32')
# 加入词向量
embedded_sequence = embedding_layer(sequence_input)

x = SpatialDropout1D(0.2)(embedded_sequence)

x = Bidirectional(GRU(128, return_sequences = True, unroll = True))(x)
# model0
conv_0 = Conv1D(128, 1, kernel_initializer='normal', activation = 'relu')(x)
drop_0= Dropout(0.4)(conv_0)
max_pool0= MaxPooling1D(pool_size = 4)(drop_0)
gru_0= GRU(100, dropout=0.2, recurrent_dropout=0.2)(max_pool0)
# model1
conv_1 = Conv1D(128, 2, kernel_initializer='normal', activation = 'relu')(x)
drop_1= Dropout(0.45)(conv_1)
max_pool1 = MaxPooling1D(pool_size = 4)(drop_1)
gru_1= GRU(100, dropout=0.2, recurrent_dropout=0.2)(max_pool1)
# model2
conv_2 = Conv1D(128, 4, kernel_initializer='normal', activation = 'relu')(x)
drop_2= Dropout(0.5)(conv_2)
max_pool2 = MaxPooling1D(pool_size = 4)(drop_2)
gru_2= GRU(100, dropout=0.2, recurrent_dropout=0.2)(max_pool2)

# 模型融合
conv_sum = concatenate([gru_0, gru_1, gru_2], axis = 1)

# 压缩成对应6个标签
#conv_sum = Flatten()(conv_sum)
dense1 = Dense(100, activation='relu')(conv_sum)
preds = Dense(6, activation = "sigmoid")(dense1)

# 生成模型
model = Model(inputs = sequence_input, outputs = preds)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [15]:
# 开始训练
# 回调函数查看训练得分
ROC_AUC = RocAucEvaluation(validation_data = (x_val, y_val), interval = 1)

history = model.fit(x_train, y_train, batch_size = 256, epochs = 5, validation_data = (x_val, y_val), verbose = 2, 
                   callbacks = [ROC_AUC])

# 计算验证集数据得分
# y_val_pred = model.predict(x_val, verbose = 2)
# score = roc_auc_score(y_val, y_val_pred)
# print("Validation data ROC-AUC score: ", score)

Train on 135635 samples, validate on 23936 samples
Epoch 1/5
 - 482s - loss: 0.0608 - acc: 0.9785 - val_loss: 0.0448 - val_acc: 0.9832

 ROC-AUC - epoch: 1 - score: 0.978918
Epoch 2/5
 - 384s - loss: 0.0453 - acc: 0.9826 - val_loss: 0.0413 - val_acc: 0.9840

 ROC-AUC - epoch: 2 - score: 0.986701
Epoch 3/5
 - 384s - loss: 0.0422 - acc: 0.9835 - val_loss: 0.0399 - val_acc: 0.9843

 ROC-AUC - epoch: 3 - score: 0.988243
Epoch 4/5
 - 384s - loss: 0.0399 - acc: 0.9843 - val_loss: 0.0395 - val_acc: 0.9844

 ROC-AUC - epoch: 4 - score: 0.988890
Epoch 5/5
 - 383s - loss: 0.0380 - acc: 0.9849 - val_loss: 0.0391 - val_acc: 0.9845

 ROC-AUC - epoch: 5 - score: 0.989630


In [16]:
submission = pd.read_csv('data/sample_submission.csv')

In [17]:
# test数据预测值
y_prediction = model.predict(x_test, batch_size =1024, verbose = 1)

# 生成语言各分类概率
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_prediction

# 输出submission.csv
submission.to_csv('submission.csv', index=False)



In [18]:
# 初步检查输出结果
submission.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.996242,0.3177698,0.979641,0.144245,0.918645,0.562831
1,0000247867823ef7,0.002732,4.287464e-06,0.000414,5.2e-05,0.000103,3.3e-05
2,00013b17ad220c46,0.001898,8.5612e-06,0.000472,5.6e-05,0.000108,2.5e-05
3,00017563c3f7919a,0.000737,1.246457e-06,0.000145,4.6e-05,4.3e-05,8e-06
4,00017695ad8997eb,0.047193,0.0001591872,0.007537,0.000803,0.001472,0.000275
5,0001ea8717f6de06,0.001328,1.380382e-06,0.000197,3.8e-05,7.8e-05,1.1e-05
6,00024115d4cbde0f,0.003797,1.66087e-06,0.000212,5.6e-05,0.000106,1.4e-05
7,000247e83dcc1211,0.555155,0.001232047,0.056318,0.001231,0.108808,0.00382
8,00025358d4737918,0.083074,2.363799e-05,0.003229,0.000322,0.008495,0.000262
9,00026d1092fe71cc,0.000383,5.514399e-07,6.6e-05,1.5e-05,9e-06,4e-06
