In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_union

from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from nltk import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [2]:
# Load the data
# Names of the label columns; one model is trained per label later on.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Read the train/test CSV files, replacing missing values with a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('data/train.csv', 'data/test.csv')
)
submission = pd.read_csv('data/sample_submission.csv')

# Keep lower-cased copies of the comment text on their own.
train_text, test_text = (
    frame['comment_text'].str.lower()
    for frame in (train, test)
)

# Label matrix for the training set (same columns as class_names).
y_train = train[class_names]

# Stack train and test comments (train first) into one corpus for tokenization.
all_text = pd.concat([train_text, test_text], ignore_index=True)

In [3]:
# Preview the first ten rows of the label columns.
y_train.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,1,1,1,0,1,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [4]:
# Exploratory look at the raw training data: summary statistics per column.
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
import nltk
# Fetch the NLTK resources used by the preprocessing code in this notebook.
# A resource that is already installed is reported as up-to-date and not re-downloaded.
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
nltk.download('punkt')  # tokenizer data; presumably what word_tokenize relies on -- confirm for the installed NLTK version
nltk.download('averaged_perceptron_tagger')  # tagger model; presumably what pos_tag relies on -- confirm
nltk.download('wordnet')  # WordNet data; presumably what WordNetLemmatizer relies on -- confirm

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Data preprocessing
# all_text = all_text[:100]
def lemmatize_all(sentence):
    """Lazily yield one lemmatized token per token of ``sentence``.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. The first tag prefix that matches decides which WordNet
    part of speech is passed to the lemmatizer:

    * ``NN*`` -> ``'n'``
    * ``VB*`` -> ``'v'``
    * ``JJ*`` -> ``'a'``
    * ``R*``  -> ``'r'``

    Tokens whose tag matches none of these prefixes are yielded unchanged.
    """
    # (tag prefix, WordNet pos) pairs, tried in this order.
    prefix_to_pos = (("NN", 'n'), ('VB', 'v'), ('JJ', 'a'), ('R', 'r'))
    lemmatizer = WordNetLemmatizer()

    for token, tag in pos_tag(word_tokenize(sentence)):
        wordnet_pos = next(
            (pos for prefix, pos in prefix_to_pos if tag.startswith(prefix)),
            None,
        )
        if wordnet_pos is None:
            yield token
        else:
            yield lemmatizer.lemmatize(token, pos=wordnet_pos)

# Cache for stop-word sets so the NLTK corpus file is read once, not once per document.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def text_cleaned_process(text_raw, min_length=3):
    """Normalize one comment into a space-separated string of lemmas.

    Steps, in order:

    1. Coerce to ``str`` and lower-case.
    2. Replace every character that is not an ASCII letter with a space
       (this removes digits and punctuation).
    3. Drop words shorter than ``min_length`` characters.
    4. Drop English stop words.
    5. Lemmatize the remaining words with ``lemmatize_all``.

    Parameters
    ----------
    text_raw : object
        The raw comment; anything accepted by ``str()``.
    min_length : int, default 3
        Minimum number of characters a word must have to be kept. The
        original check was ``len(word) >= 0``, which is always true and so
        never removed anything, contradicting its own "remove words shorter
        than 3" comment. Pass ``min_length=0`` to get that old no-op
        behaviour back.

    Returns
    -------
    str
        The cleaned, lemmatized text (empty string if nothing survives).
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', str(text_raw).lower())

    stops = _english_stopwords()
    kept_words = [
        word for word in letters_only.split()
        if len(word) >= min_length and word not in stops
    ]

    return " ".join(lemmatize_all(" ".join(kept_words)))

# Strip every run of non-ASCII characters from the combined corpus.
# (The pattern matches anything outside \x00-\x7F; digits are not touched here,
# they are replaced later by the letters-only regex in text_cleaned_process.)
all_text = all_text.replace({r'[^\x00-\x7F]+': ''}, regex=True)

num_all_text = all_text.size

# Clean every comment in corpus order and wrap the results in a pandas Series.
all_text_cleaned = pd.Series(
    [text_cleaned_process(all_text[row]) for row in range(num_all_text)]
)

In [7]:
# Sanity-check the cleaning step: corpus size plus one before/after example.
print("Len of all_text_cleaned:", len(all_text_cleaned))

for caption, corpus in (
    ("Text[0] before cleaned: \n", all_text),
    ("Text[0] after cleaned: \n", all_text_cleaned),
):
    print(caption, corpus[0])

Len of all_text_cleaned: 312735
Text[0] before cleaned: 
 explanation
why the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27
Text[0] after cleaned: 
 explanation edits make username hardcore metallica fan revert vandalisms closure gas vote new york doll fac please remove template talk page since retire


In [8]:
# TF-IDF feature extraction
# Word-level model: unigrams and bigrams, at most 10,000 features.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000)

# Character-level model: 1- to 4-grams, at most 50,000 features.
# `stop_words` is intentionally not passed: it only applies to analyzer='word'.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    max_features=50000)

# Concatenate the word and character feature spaces.
# (Previously word_vectorizer was passed twice, which duplicated the word
# features and left char_vectorizer unused.)
vectorizer = make_union(word_vectorizer, char_vectorizer)

# Fit the TF-IDF models on the cleaned train+test corpus.
vectorizer.fit(all_text_cleaned)

# Transform the *cleaned* text, i.e. the same representation the vocabulary was
# learned from. all_text_cleaned was built from the train comments followed by
# the test comments, so the first len(train_text) rows are the training rows.
n_train = len(train_text)
assert len(all_text_cleaned) == n_train + len(test_text), \
    "all_text_cleaned must contain exactly the train rows followed by the test rows"

train_features = vectorizer.transform(all_text_cleaned.iloc[:n_train])
test_features = vectorizer.transform(all_text_cleaned.iloc[n_train:])

In [9]:
# The training data does not need to be split
# x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.10, random_state = 255)


In [None]:
# Per-label cross-validation scores and the submission frame
# (an 'id' column plus one predicted-probability column per label).
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})

# Hyper-parameters shared by the CV estimator and the final estimator.
svc_params = dict(kernel='rbf', random_state=255, gamma=0.1, C=0.3)

# Train one binary classifier per label.
for class_name in class_names:
    # Target column for the current label.
    train_target = train[class_name]

    # Commented-out alternative model:
    #classifier = LogisticRegression(C=4, dual=True)

    # ROC-AUC only needs a ranking score, so the cross-validated estimator is
    # built without probability calibration to avoid its extra fitting cost.
    cv_classifier = SVC(**svc_params)
    cv_score = np.mean(cross_val_score(cv_classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Final model on the full training set. probability=True is required:
    # without it SVC does not provide predict_proba and the call below fails.
    classifier = SVC(probability=True, **svc_params)
    classifier.fit(train_features, train_target)

    # Probability of the positive class (column 1) for every test comment.
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

# Mean CV score over all labels.
print('Total CV score is {}'.format(np.mean(scores)))


In [None]:
# Write the predictions to the submission CSV (without the DataFrame index).
submission.to_csv('submission_tf-idf_svm.csv', index=False)

In [None]:
# Quick visual check of the first ten rows of the submission.
submission.head(10)