### 新闻抄袭模型

In [1]:
import re
import jieba
import timeit
def token(string):
    return ' '.join(re.findall(r'[^\d\W]+', string))
def cut(string):
    return ' '.join(jieba.cut(string))
def clock(func):
    def clocked(*args, **kwargs):
        t0 = timeit.default_timer()
        result = func(*args, **kwargs)
        elapsed = timeit.default_timer() - t0
        name = func.__name__
        arg_str = ', '.join(repr(arg) for arg in args)
        print(elapsed,'s')
        #print('[%0.8fs] %s(%s) -> %r' % (elapsed, name, arg_str, result))
        return result
    return clocked
def typeof(variate):
    type=None
    if isinstance(variate,int):
        type = "int"
    elif isinstance(variate,str):
        type = "str"
    elif isinstance(variate,float):
        type = "float"
    elif isinstance(variate,list):
        type = "list"
    elif isinstance(variate,tuple):
        type = "tuple"
    elif isinstance(variate,dict):
        type = "dict"
    elif isinstance(variate,set):
        type = "set"
    print(type)
    return type

In [2]:
import pandas as pd
dataset = pd.read_csv('sqlResult_1558435.csv', encoding='gb18030').fillna(' ')
train = [cut(token(dataset['content'][i])) for i in range(len(dataset))]
y = [1 if dataset['source'][i] == '新华社' else 0 for i in range(len(dataset))]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Administrator\AppData\Local\Temp\jieba.cache
Loading model cost 0.743 seconds.
Prefix dict has been built succesfully.


#### 1. 数据不平衡

In [3]:
data_pos = sum(y)/len(y)
print(data_pos)

0.8778051801676133


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, token_pattern=r"(?u)\b\w+\b", max_df = 1.0,
                                 stop_words = None,vocabulary = None )
train_vec = vectorizer.fit_transform(train)

x_train, x_test, y_train, y_test = train_test_split(train_vec, y, test_size = 0.2)

In [5]:
len(vectorizer.vocabulary_)

5000

#### 2.评估指标

In [13]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
@clock
def get_performance(clf, x_, y_):
    y_hat = clf.predict(x_)
    print('f1_score is: {}'.format(f1_score(y_, y_hat)))
    print('accuracy is: {}'.format(accuracy_score(y_, y_hat)))
    print('percision is: {}'.format(precision_score(y_, y_hat)))
    print('recall is: {}'.format(recall_score(y_, y_hat)))
    #print('roc_auc is: {}'.format(roc_auc_score(y_, y_hat)))
    #print('confusion matrix: \n{}'.format(confusion_matrix(y_, y_hat, labels=[0, 1])))

#### 3. 模型

In [None]:
##### KNeighborsClassifier
"""
时间：几乎不需要训练，测试时间长，因为需要和所有的训练集作计算。
空间：占用内存大（需要保存所有样本）
用法：算法原理简单，超参数少
"""

In [7]:
from sklearn.neighbors import KNeighborsClassifier
@clock
def KNN(x_train, y_train):
    clf = KNeighborsClassifier(n_neighbors = 5, weights = 'uniform', algorithm = 'auto')
    clf.fit(x_train, y_train)
    return clf
#clf.fit(X_test,y_test)
get_performance(KNN(x_train,y_train),x_test,y_test)

0.028504303769475284 s
f1_score is: 0.8935138691692349
accuracy is: 0.8284327400546784
percision is: 0.9839829151094501
recall is: 0.8182798426994798
72.51482860189614 s


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
@clock
def KNN(x_train, y_train):
    clf = KNeighborsClassifier()
    param_grid = { 'n_neighbors' : [i for i in range(1,3)]}
    grid = GridSearchCV(clf, param_grid, cv = 2, scoring='accuracy',n_jobs = -1)
    clf = grid.fit(x_train, y_train)
    clf.fit(x_train, y_train)
    print(clf.best_params_)
    return clf
#clf.fit(X_test,y_test)
get_performance(KNN(x_train,y_train),x_test,y_test)

{'n_neighbors': 1}
194.28275526579648 s
f1_score is: 0.9484034331941202
accuracy is: 0.9124588517547285
percision is: 0.9847708802840948
recall is: 0.914626411264747
64.72239086886356 s


In [10]:
##### BeyesisClassifier
"""
时间：训练和测试均很快
空间：占用内存少
用法：条件独立假设，通常不一定可行；不需要调参
"""

In [12]:
@clock
def GNB(x_train, y_train):
    from sklearn.naive_bayes import BernoulliNB
    gnb = BernoulliNB()
    gnb.fit(x_train, y_train)
    return gnb

#clf.fit(X_test,y_test)
get_performance(GNB(x_train,y_train),x_test,y_test)

0.2201603363319009 s
f1_score is: 0.8884335001847683
accuracy is: 0.8147073592590526
percision is: 0.9444325405328191
recall is: 0.8387035392617024
0.08664537942968309 s


In [17]:
##### LogisticRegression Classifier
"""
时间：线性模型，训练和预测的速度快
空间：占用内存少
用法：模型简单，但是泛化能力强
"""

In [22]:
@clock
def LG(x_train, y_train):
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(random_state=0, class_weight='balanced' 
                             ,solver='liblinear').fit(x_train, y_train)
    clf.fit(x_train, y_train)
    return clf

#clf.fit(X_test,y_test)
get_performance(LG(x_train,y_train),x_test,y_test)

2.895269985026971 s
f1_score is: 0.9943888286679845
accuracy is: 0.9901802153657312
percision is: 0.9996794871794872
recall is: 0.9891538754281365
0.03404405297987978 s


In [24]:
##### SVM Classifier
"""
缺点：模型本质上是求解二次规划问题，复杂且时间开销大，不实用于大型数据集
优点：
1）占用内存少，只需保存少数支持向量，通过支持向量便可以计算出分类器
2）利用核函数性质可以轻松地映射到高维空间进行分类，非常高效
3）即使在数据维度比样本数量大的情况下仍然有效
4）需要调优的超参数：C-误差惩罚项；核函数选择
"""

In [27]:
@clock
def SVM(x_train, y_train):
    from sklearn.svm import SVC
    clf = SVC(C=1.0, class_weight='balanced',kernel='rbf')
    clf.fit(x_train, y_train)
    return clf

#clf.fit(X_test,y_test)
get_performance(SVM(x_train,y_train),x_test,y_test)



6608.065565968143 s
f1_score is: 0.706880997293529
accuracy is: 0.6011828376945824
percision is: 0.9998839907192575
recall is: 0.5466827349993657
653.6304521249003 s


In [None]:
##### RandomForest Classifier
"""
缺点：
1）小数据或者低维数据（特征较少的数据），可能不能产生很好的分类
2）相比用于分类，回归的效果差一些
优点：
1）相比同类算法，准确率高
2）有效地运行在大数据、高维度、有缺失值的输入样本
3）能够评估各个特征在分类问题上的重要性
4）在生成过程中，能够获取到内部生成误差的一种无偏估计
"""

In [18]:
@clock
def RF(x_train, y_train):
    from sklearn.ensemble import RandomForestClassifier
    
    clf = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=0)
    clf.fit(x_train, y_train)
    return clf

#clf.fit(X_test,y_test)
get_performance(RF(x_train,y_train),x_test,y_test)

0.3704868411049347 s
f1_score is: 0.9350093607916554
accuracy is: 0.8779780170730347
percision is: 0.8779507785032646
recall is: 1.0
0.0829685387795962 s
