In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
pd.set_option('max_row', 1000)

In [2]:
data = pd.read_csv("data/train_data.csv")

In [3]:
X = data['sentence']
Y = data['label']
null_filter = X.notnull() & Y.notnull()
X = X.loc[null_filter]
Y = Y.loc[null_filter]

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, stratify = Y, random_state = 42)

In [4]:
X_TEST_TEXT = X_test.copy()

In [132]:
vectorizer = TfidfVectorizer( ngram_range=(1, 5),max_features = 4000)

In [5]:
vectorizer = CountVectorizer(ngram_range=(1, 5),max_features = 4000)

In [6]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [7]:
from sklearn.naive_bayes import MultinomialNB

In [8]:
model = MultinomialNB()

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9248747913188647

In [9]:
pred = (model.predict_proba(X_test) > 0.3)[:,1]
# pred = model.predict(X_test)

In [10]:
from sklearn.metrics import roc_auc_score, confusion_matrix
roc_auc_score(y_test, pred)

0.8135653637646361

In [11]:
confusion_matrix(y_test, pred)

array([[972,  23],
       [ 71, 132]])

In [12]:
false_prediction = pred !=  y_test
find_sensitive = (pred == 1) & (y_test == 1)

### 没有找到的敏感词

In [13]:
pd.DataFrame(zip(X_TEST_TEXT[false_prediction],y_test[false_prediction], model.predict(X_test)[false_prediction]), columns = ['Text',"True Value","Prediction"])

Unnamed: 0,Text,True Value,Prediction
0,香港 澳门 彩票网,1,0
1,核弹,1,0
2,弹药 制造 方法,1,0
3,生于 心有 节于 外 谓之 音,0,0
4,至于 破解 谜思 是 彼得,0,1
5,日本 当局 也 让 她 随时 报告 孙之 行止 年 以 患病 为 由 回到 静冈 老家,0,0
6,各类 地形 面积 依序 为 高原 山地 平原 盆地 丘陵,0,1
7,暴奸 是不是 比 强奸 更爽,1,0
8,年末 中国工商银行 中国建设银行 和 中国银行 名列 世界银行 前十名,0,0
9,阴唇 太大 了,1,0


## 找到的敏感词

In [14]:
pd.DataFrame(zip(X_TEST_TEXT[find_sensitive],y_test[find_sensitive], pred[find_sensitive]), columns = ['Text',"True Value","Prediction"])

Unnamed: 0,Text,True Value,Prediction
0,陰戶 真的 很 带感,1,True
1,麻醉枪 猎枪 军用,1,True
2,我 那天 被 人 操 了,1,True
3,军刀 电棍 销售,1,True
4,高压 狗,1,True
5,有没有 迷幻药 跟 我 整点,1,True
6,冷 钢刀 具 专卖,1,True
7,黑火药 比例,1,True
8,供应 麻醉 箭,1,True
9,出售 步枪,1,True


In [16]:
from sklearn.pipeline import Pipeline

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, stratify = Y, random_state = 42)
X_TEST_TEXT = X_test.copy()

In [18]:
pipe = Pipeline([('vec',TfidfVectorizer( ngram_range=(1, 5),max_features = 4000)),
         ('model', MultinomialNB(class_prior=[0.2,0.1]))])


pipe = Pipeline([('vec',CountVectorizer( ngram_range=(1, 5),max_features = 4000)),
         ('model', MultinomialNB(class_prior=[0.2,0.1]))])

In [19]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('vec', CountVectorizer(max_features=4000, ngram_range=(1, 5))),
                ('model', MultinomialNB(class_prior=[0.2, 0.1]))])

In [20]:
roc_auc_score(y_test, pipe.predict(X_test))

0.8160284179518281

In [21]:
pipe.predict(['那个 气枪 我 想要'])

array([1])

In [23]:
pipe.predict(['你 特么 太 淫荡 了'])

array([1])