In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
pd.set_option('max_row', 1000)

In [2]:
data = pd.read_csv("data/train_data.csv")

In [73]:
X = data['sentence']
Y = data['label']
null_filter = X.notnull() & Y.notnull()
X = X.loc[null_filter]
Y = Y.loc[null_filter]

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, stratify = Y, random_state = 42)

In [74]:
X_TEST_TEXT = X_test.copy()

In [132]:
vectorizer = TfidfVectorizer( ngram_range=(1, 5),max_features = 4000)

In [76]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [77]:
from sklearn.naive_bayes import MultinomialNB

In [87]:
model = MultinomialNB()

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9298831385642737

In [90]:
pred = (model.predict_proba(X_test) > 0.3)[:,1]
# pred = model.predict(X_test)

In [91]:
from sklearn.metrics import roc_auc_score, confusion_matrix
roc_auc_score(y_test, pred)

0.8320098027081219

In [92]:
confusion_matrix(y_test, pred)

array([[994,   1],
       [ 68, 135]])

In [93]:
false_prediction = pred !=  y_test
find_sensitive = (pred == 1) & (y_test == 1)

### 没有找到的敏感词

In [94]:
pd.DataFrame(zip(X_TEST_TEXT[false_prediction],y_test[false_prediction], model.predict(X_test)[false_prediction]), columns = ['Text',"True Value","Prediction"])

Unnamed: 0,Text,True Value,Prediction
0,香港 澳门 彩票网,1,0
1,核弹,1,0
2,弹药 制造 方法,1,0
3,阴唇 太大 了,1,0
4,用 按摩棒 好爽,1,0
5,你 呻吟 的 声 让 我 好爽,1,0
6,你 看过 兽交 吗,1,0
7,你 听 过 秘唇 吗,1,0
8,你 真是 个 白痴,1,0
9,爽 啊 来 一个,1,0


## 找到的敏感词

In [95]:
pd.DataFrame(zip(X_TEST_TEXT[find_sensitive],y_test[find_sensitive], pred[find_sensitive]), columns = ['Text',"True Value","Prediction"])

Unnamed: 0,Text,True Value,Prediction
0,陰戶 真的 很 带感,1,True
1,麻醉枪 猎枪 军用,1,True
2,我 那天 被 人 操 了,1,True
3,军刀 电棍 销售,1,True
4,高压 狗,1,True
5,有没有 迷幻药 跟 我 整点,1,True
6,冷 钢刀 具 专卖,1,True
7,黑火药 比例,1,True
8,暴奸 是不是 比 强奸 更爽,1,True
9,供应 麻醉 箭,1,True


In [96]:
from sklearn.pipeline import Pipeline

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, stratify = Y, random_state = 42)
X_TEST_TEXT = X_test.copy()

In [133]:
pipe = Pipeline([('vec',TfidfVectorizer( ngram_range=(1, 5),max_features = 4000)),
         ('model', MultinomialNB(class_prior=[0.2,0.1]))])

In [134]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('vec', TfidfVectorizer(max_features=4000, ngram_range=(1, 5))),
                ('model', MultinomialNB(class_prior=[0.2, 0.1]))])

In [135]:
roc_auc_score(y_test, pipe.predict(X_test))

0.819142510582469

In [137]:
pipe.predict(['那个 气枪 我 想要'])

array([1])

In [138]:
pipe.predict(['今天 你 手淫 了 没'])

array([0])