In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_union

from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from nltk import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [31]:
# 数据读取
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# read data
train = pd.read_csv('data/train.csv').fillna(' ')
test = pd.read_csv('data/test.csv').fillna(' ')
submission = pd.read_csv('data/sample_submission.csv')

# 单独保存comment_text
train_text = train['comment_text'].str.lower()
test_text = test['comment_text'].str.lower()
# 获得y_train
y_train = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

# 连接所有文字用于分词
train_text = train_text
test_text = test_text
all_text = pd.concat([train_text, test_text], axis = 0, ignore_index = True)

In [3]:
y_train.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,1,1,1,0,1,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [4]:
# 原始数据可视化分析
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# 数据预处理
# all_text = all_text[:100]
def lemmatize_all(sentence):
    wnl = WordNetLemmatizer()
    for word, tag in pos_tag(word_tokenize(sentence)):
        if tag.startswith("NN"):
            yield wnl.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            yield wnl.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            yield wnl.lemmatize(word, pos='a')
        elif tag.startswith('R'):
            yield wnl.lemmatize(word, pos='r')
            
        else:
            yield word

def text_cleaned_process(text_raw):    
    text_raw = str(text_raw)
    text_raw = str(text_raw.lower())
    text_raw = re.sub(r'[^a-zA-Z]', ' ', text_raw)
    
    words = text_raw.split()
    
    # 移除长度小于3的词语
    words2 = []
    for i in words:
        if len(i) >= 0:
            words2.append(i)
    # 去停止词
    stops = set(stopwords.words('english'))
    
    result_text = []
    result_text = " ".join([w for w in words2 if not w in stops])
    
    return(" ".join(lemmatize_all(result_text)))

# 去掉数字
all_text.replace({r'[^\x00-\x7F]+':''},regex=True,inplace=True)

num_all_text = all_text.size

# 输出清洗后的数据
all_text_cleaned = []

for i in range(0, num_all_text):
    all_text_cleaned.append(text_cleaned_process(all_text[i]))

# 构建pd形式
all_text_cleaned = pd.Series(all_text_cleaned)

In [7]:
# 原始数据可视化分析
# all_text = all_text[:100]
print("Len of all_text_cleaned:", len(all_text_cleaned))
# 数据清理前后对比
print("Text[0] before cleaned: \n", all_text[0])
print("Text[0] after cleaned: \n", all_text_cleaned[0])

Len of all_text_cleaned: 312735
Text[0] before cleaned: 
 explanation
why the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27
Text[0] after cleaned: 
 explanation edits make username hardcore metallica fan revert vandalisms closure gas vote new york doll fac please remove template talk page since retire


In [32]:
# 数据分词处理
vectorizer = TfidfVectorizer(
    ngram_range=(1,2), 
    analyzer = 'word',           
    min_df=3, 
    max_df=0.9, 
    strip_accents='unicode', 
    use_idf=1,
    smooth_idf=1, 
    sublinear_tf=1 )

# 获取向量模型
# vectorizer = make_union(word_vectorizer, word_vectorizer)

# 训练tf-idf模型
vectorizer.fit(all_text_cleaned)

# 获取词向量
train_features = vectorizer.transform(train_text)
test_features = vectorizer.transform(test_text)

In [33]:
# 拆分train数据，用于验证模型好坏。
x_train, x_val, y_train, y_val = train_test_split(train_features, y_train, test_size = 0.05, random_state = 255)
print("x_train shape:", x_train.shape[0], x_train.shape[1])
print("y_train shape:", y_train.shape)

x_train shape: 151592 544855
y_train shape: (151592, 6)


In [34]:
# 基于NBSVM模型修改为NB+LR模型
# 贝叶斯特征方程
def pr(x, y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

# 模型建立
def get_model(x,y):
    y = y.values
    # 贝叶斯概率
    r = np.log(pr(x,1,y) / pr(x,0,y))
    # 机器学习模型导入
    classifier = LogisticRegression(C=4, dual=True)
    # svm 速度太慢
    # classifier = SVC(kernel = 'linear', random_state = 255, C = 4, probability=True)
    # 向量相乘
    x_nb = x.multiply(r)
    # 返回训练后模型
    return classifier.fit(x_nb, y), r

In [35]:
# 保存得分
from sklearn.metrics import roc_auc_score

submission = pd.DataFrame.from_dict({'id': test['id']})
scores = []

print("Training starts!")
for class_name in class_names:
    # 依次读取训练目标数据
    train_target = y_train[class_name]
    print("Training starts! ", class_name)
    # 导入并测试模型
    classifier, r = get_model(x_train, train_target)
    
    # 模型验证roc-auc score
    preds_val = classifier.predict_proba(x_val.multiply(r))[:, 1]
    # 计算roc-auc得分
    roc_auc = roc_auc_score(y_val[class_name], preds_val)
    # 保存并输出得分
    scores.append(roc_auc)
    print("ROC-AUC Score: ", roc_auc)
    
    # 输出test数据预测概率
    submission[class_name] = classifier.predict_proba(test_features.multiply(r))[:, 1]

# 计算所有类型的平均得分
print('Total mean ROC-AUC score is {}'.format(np.mean(scores)))

Training starts!
Training starts!  toxic
ROC-AUC Score:  0.9745194732890752
Training starts!  severe_toxic
ROC-AUC Score:  0.9806229520986113
Training starts!  obscene
ROC-AUC Score:  0.9893626238529425
Training starts!  threat
ROC-AUC Score:  0.9877854598784831
Training starts!  insult
ROC-AUC Score:  0.9737127417804029
Training starts!  identity_hate
ROC-AUC Score:  0.9857800538068858
Total mean ROC-AUC score is 0.9826305507844001


In [36]:
# 输出结果文件 
submission.to_csv('submission_tf-idf_nblr.csv', index=False)

In [37]:
# 初步检查输出结果
submission.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999954,0.022472,0.999854,0.002673,0.972567,0.159247
1,0000247867823ef7,0.002022,0.000495,0.000883,5.6e-05,0.002245,0.000345
2,00013b17ad220c46,0.019914,0.000875,0.003614,0.000112,0.00486,0.000597
3,00017563c3f7919a,0.001712,0.000734,0.001881,0.000299,0.002172,0.000397
4,00017695ad8997eb,0.025718,0.000776,0.004733,0.00015,0.006023,0.000362
5,0001ea8717f6de06,0.00198,0.000222,0.001759,0.000127,0.002114,0.000433
6,00024115d4cbde0f,0.00086,4.2e-05,0.000865,4.3e-05,0.001213,0.000766
7,000247e83dcc1211,0.544172,0.000839,0.004133,0.000104,0.013233,0.000438
8,00025358d4737918,0.073006,0.000593,0.006754,0.00092,0.017635,0.001338
9,00026d1092fe71cc,0.001725,0.000234,0.000997,6.6e-05,0.002036,0.000347
