# 新华社新闻抄袭自动判别
任务要求：
1. 构建一个机器学习模型，判断这个文章是不是新华社的
2. 当这个模型的acc 大于 0.8778， recall， precision，f1等值都较高的时候
3. 用该模型 判断一篇文章是否是新华社的文章，如果判断出来是新华社的，但是，它的source并不是新华社的，那么，我们就说，这个文章是抄袭的新华社的文章
4. Text Representation uses "https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html"

## 数据预处理

### 获取数据

In [3]:
import pandas as pd
import numpy as np
import math
import jieba

In [4]:
filename = 'E:\\MYGIT\\DataSources\\sqlResult_1558435.csv'
pandas_data = pd.read_csv(filename, encoding='gb18030')
pandas_data = pandas_data.dropna(subset = ['content']) #剔除内容为空的行

In [5]:
content = pandas_data['content'].tolist()
source = pandas_data['source'].tolist()

In [7]:
source = [name if isinstance(name, str) else 'unknow' for name in source] #把没有来源的信息标记为unknow

### content清洗切词，source分好标签

In [8]:
article_labels = [1 if name.strip() == '新华社' else 0 for name in source]

In [9]:
print(sum(article_labels), len(article_labels),len(content))

78661 87054 87054


以上可见正负样本非常不均衡，如何解决？

In [7]:
import os
from pyltp import Segmentor
#获取停用词集
def get_stopwords():
    stopwords = []
    with open('stopwords.txt') as f:
        line_str = f.readline()
        while line_str!= '':
            line_str = line_str.strip()
            stopwords.append(line_str)
            line_str = f.readline()
    return set(stopwords)

def text_deal_cut(text_list):
    stopwords = get_stopwords()
    cws_model_path = 'E:/MYGIT/Project/ltp_data/cws.model'
    
    segmentor = Segmentor()  # 初始化实例
    segmentor.load(cws_model_path)  # 加载模型 
    corpus = []
    i = 0
    for string in text_list:
        i += 1
        if(i%3000 == 0):print(i)
        string = string.strip()
        string_temp = ''
        words = list(segmentor.segment(string))
        for word in words:
            if word not in stopwords:
                string_temp += word + ' '    
        corpus.append(string_temp)
    segmentor.release()  # 释放模型
    return corpus

In [8]:
len(content)

87054

In [9]:
corpus = text_deal_cut(content)

3000
6000
9000
12000
15000
18000
21000
24000
27000
30000
33000
36000
39000
42000
45000
48000
51000
54000
57000
60000
63000
66000
69000
72000
75000
78000
81000
84000
87000


到目前为止，已经获得了处理好的标签，和分词好的文章

In [12]:
import pickle
with open('./temp_file/corpus_list','wb') as f:
    pickle.dump(corpus,f)
with open('./temp_file/corpus_list_label','wb') as f:
    pickle.dump(article_labels,f)

In [1]:
#读取
import pickle
try:
    print(corpus[0])
    print(article_labels[:10])
except NameError:
    with open('./temp_file/corpus_list','rb') as f:
        corpus = pickle.load(f)
    with open('./temp_file/corpus_list_label','rb') as f:
        corpus_label = pickle.load(f)


In [2]:
corpus_label[:10]

[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

In [3]:
corpus[0]

'本周 6月 12日 小米 手机 15 款 机型 外 机型 暂停 更新 发布 含 开发版 体验版 内测 稳定 版 暂 受 影响 确保 工程师 精力 系统 优化 工作 有人 猜测 精力 主要 MIUI 研发 之中  MIUI 去年 5月 发布 年 有余 更新换代  MIUI 确切 信息 等待 官方 消息 '

In [4]:
ss = corpus[0]

In [5]:
ss.replace('\r\n','').replace('。','')

'本周 6月 12日 小米 手机 15 款 机型 外 机型 暂停 更新 发布 含 开发版 体验版 内测 稳定 版 暂 受 影响 确保 工程师 精力 系统 优化 工作 有人 猜测 精力 主要 MIUI 研发 之中  MIUI 去年 5月 发布 年 有余 更新换代  MIUI 确切 信息 等待 官方 消息 '

In [6]:
corpus = [line.replace('\r\n','').replace('。','') for line in corpus]
corpus[0]

'本周 6月 12日 小米 手机 15 款 机型 外 机型 暂停 更新 发布 含 开发版 体验版 内测 稳定 版 暂 受 影响 确保 工程师 精力 系统 优化 工作 有人 猜测 精力 主要 MIUI 研发 之中  MIUI 去年 5月 发布 年 有余 更新换代  MIUI 确切 信息 等待 官方 消息 '

## TFIDF向量化

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
vectorized = TfidfVectorizer(max_features= 10000) #设置文本单词个数最大值
X = vectorized.fit_transform(corpus)

In [21]:
vectorized = TfidfVectorizer(max_features= 5000) #设置文本单词个数最大值

In [22]:
sub_samples = corpus[:1000]

In [46]:
X = vectorized.fit_transform(corpus)

In [47]:
X.shape

(87054, 10000)

## 各类模型测试分析

In [2]:
import time
def clock(func):
    def clocked(*args, **kwargs):
        t0 = time.time()
        result = func(*args, **kwargs)
        elapsed = time.time() - t0
        name = func.__name__
        print('函数 {} 运行时间:{:.2f}s'.format(name,elapsed))
        return result
    return clocked

### 模型评估函数

In [3]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd

#@clock


def get_performance(clf, x_, y_):
    y_hat = clf.predict(x_)
    #result=pd.DataFrame({'F1':f1_score(y_, y_hat),
    #              'ACC':accuracy_score(y_, y_hat),
    #              'Precision':precision_score(y_, y_hat),
    #              'Recall':recall_score(y_, y_hat)})   #不设置index
    return [format(f1_score(y_, y_hat),'.4f'),format(accuracy_score(y_, y_hat),'.4f'),
            format(f1_score(y_, y_hat),'.4f'),format(recall_score(y_, y_hat),'.4f')]
    #print('f1_score is: {}'.format(f1_score(y_, y_hat)))
    #print('accuracy is: {}'.format(accuracy_score(y_, y_hat)))
    #print('percision is: {}'.format(precision_score(y_, y_hat)))
    #print('recall is: {}'.format(recall_score(y_, y_hat)))
    #print('roc_auc is: {}'.format(roc_auc_score(y_, y_hat)))
    #print('confusion matrix: \n{}'.format(confusion_matrix(y_, y_hat, labels=[0, 1])))
    #print(classification_report(y_, y_hat))

### 数据预处理

In [4]:
from sklearn.model_selection import train_test_split
import random

In [5]:
corpus_0 = [a for a,i in zip(corpus, corpus_label) if i==0]
corpus_1 = [a for a,i in zip(corpus, corpus_label) if i==1]

均衡样本,这里把类别为1的砍掉一部分,只为调试试验

In [6]:
X = corpus_0 + random.sample(corpus_1, len(corpus_0))

In [7]:
y = [0]*len(corpus_0) + [1]*len(corpus_0)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
vectorized = TfidfVectorizer(max_features= 5000) #设置文本单词个数最大值
X = vectorized.fit_transform(X)
X = X.toarray()

In [9]:
#选取30%左右数据作为测试数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [10]:
print(sum(y_train),len(y_train))

5847 11750


### 各类模型建立

In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

In [26]:
from sqlalchemy import create_engine
engine = create_engine("mysql://root:forever@127.0.0.1:3306/sklearn_result", echo=False)

In [41]:
def KNN(X_train, X_test, y_train, y_test,metric='minkowski'):
    nbrs = KNeighborsClassifier(n_neighbors=5, algorithm='brute',metric =metric)
    nbrs.fit(X_train, y_train)
    return get_performance(nbrs,X_test,y_test)

def logical_regression(X_train, X_test, y_train, y_test, solver='liblinear',C=1, penalty='l2', l1_ratio=None):
    clf = LogisticRegression(solver=solver, C=C, penalty=penalty,l1_ratio=l1_ratio)
    clf.fit(X_train, y_train)
    return get_performance(clf,X_test,y_test)

def decision_tree(X_train, X_test, y_train, y_test,criterion = 'gini',max_depth=100):
    clf = DecisionTreeClassifier(criterion = criterion,max_depth= max_depth)
    clf.fit(X_train, y_train)
    return get_performance(clf,X_test,y_test)  

def Navie_baye(X_train, X_test, y_train, y_test, norm = False, method=1):
    #method 只能是 0,1,2,3
    method_list = [SVC, MultinomialNB,ComplementNB,BernoulliNB]
    if method == 2:
        clf = method_list[method](norm = norm)
        clf.fit(X_train, y_train)
    else:
        clf = method_list[method]()
        clf.fit(X_train, y_train)
    return get_performance(clf,X_test,y_test) 

#SVM核函数的选择和参数比较重要，不合适的选择可能导致一直无法找到解，或求解时间过长
def svm_svc(X_train, X_test, y_train, y_test, C=1, kernel='linear', gamma='auto'):
    clf = SVC(C=C, kernel=kernel, gamma= gamma,cache_size=1000.0,max_iter=300)
    clf.fit(X_train, y_train)
    return get_performance(clf,X_test,y_test) 
    
def svm_nusvc(X_train, X_test, y_train, y_test, nu=0.5, kernel='linear', gamma='auto'):
    clf = NuSVC(nu=nu, kernel=kernel, gamma= gamma,cache_size=1000.0,max_iter=300)
    clf.fit(X_train, y_train)
    return get_performance(clf,X_test,y_test) 

def svm_linearsvc(X_train, X_test, y_train, y_test, penalty='l2',loss='squared_hinge', C=1):
    clf = LinearSVC(penalty=penalty,loss=loss, C=C)
    clf.fit(X_train, y_train)
    return get_performance(clf,X_test,y_test) 

def random_forest(X_train, X_test, y_train, y_test,n_estimators=10,criterion='gini',max_depth=None):
    clf = RandomForestClassifier(n_estimators=n_estimators,criterion=criterion,max_depth=max_depth)
    clf.fit(X_train, y_train)
    return get_performance(clf,X_test,y_test)

In [53]:
#存放结果
array_0 = np.zeros((1,5))
colums = ['Method','F1', 'ACC','Precision','Recall']
result = pd.DataFrame(array_0, index=[0], columns=colums,dtype='float32')
for n_estimators in [10, 15, 20]:   
    for criterion in ['gini', 'entropy']: #['rbf']:#
        for max_depth in [80, 120, 160]: 
            new_result = random_forest(X_train, X_test, y_train, y_test,
                          n_estimators=n_estimators,criterion=criterion,max_depth=max_depth)
            result.loc[len(result)] = ['n:'+str(n_estimators)+'-'+
                                      criterion+ '-depth:'+str(max_depth)]+new_result
#result.to_sql('de_tree_0', con=engine ,if_exists='replace')

In [58]:
result['ACC']=result['ACC'].astype('float')
result.sort_values('ACC',ascending=False)

Unnamed: 0,Method,F1,ACC,Precision,Recall
8,n:15-gini-depth:120,0.9828,0.9827,0.9828,0.9772
15,n:20-gini-depth:160,0.98,0.9799,0.98,0.9705
18,n:20-entropy-depth:160,0.9779,0.978,0.9779,0.9654
12,n:15-entropy-depth:160,0.9774,0.9774,0.9774,0.9698
16,n:20-entropy-depth:80,0.9769,0.977,0.9769,0.965
14,n:20-gini-depth:120,0.9768,0.9768,0.9768,0.9662
10,n:15-entropy-depth:80,0.9766,0.9766,0.9766,0.9674
13,n:20-gini-depth:80,0.9765,0.9766,0.9765,0.9643
1,n:10-gini-depth:80,0.9764,0.9762,0.9764,0.9729
7,n:15-gini-depth:80,0.976,0.976,0.976,0.967


In [55]:
result.to_sql('forest_0', con=engine ,if_exists='replace')

In [15]:
#存放结果
array_0 = np.zeros((1,5))
colums = ['Method','F1', 'ACC','Precision','Recall']
result = pd.DataFrame(array_0, index=[0], columns=colums,dtype='float32')
for C in [0.5, 1, 5]:   
    for kernel in ['linear', 'poly', 'rbf', 'sigmoid']: #['rbf']:#
        for gamma in ['auto', 'scale']: 
            new_result = svm_svc(X_train, X_test, y_train, y_test,C=C, kernel=kernel, gamma= gamma)
            result.loc[len(result)] = ['C:'+str(C)+'-'+ kernel+'-'+ gamma]+new_result
#result.to_sql('de_tree_0', con=engine ,if_exists='replace')



In [27]:
result.to_sql('SVM_0', con=engine ,if_exists='replace')



In [23]:
result['ACC']=result['ACC'].astype('float')

In [25]:
result.sort_values('ACC',ascending=False)

Unnamed: 0,Method,F1,ACC,Precision,Recall
18,C:5-linear-scale,0.9351,0.9335,0.9351,0.9485
17,C:5-linear-auto,0.9351,0.9335,0.9351,0.9485
10,C:1-linear-scale,0.9338,0.9313,0.9338,0.958
9,C:1-linear-auto,0.9338,0.9313,0.9338,0.958
14,C:1-rbf-scale,0.9247,0.9271,0.9247,0.8845
16,C:1-sigmoid-scale,0.9273,0.9245,0.9273,0.9525
6,C:0.5-rbf-scale,0.9228,0.9196,0.9228,0.9505
24,C:5-sigmoid-scale,0.9198,0.9192,0.9198,0.9167
22,C:5-rbf-scale,0.9222,0.919,0.9222,0.9497
1,C:0.5-linear-auto,0.9223,0.9184,0.9223,0.9584


In [18]:
X_train.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [44]:
#存放结果
array_0 = np.zeros((1,5))
colums = ['Method','F1', 'ACC','Precision','Recall']
result = pd.DataFrame(array_0, index=[0], columns=colums)
methods = ['GaussianNB', 'MultinomialNB','ComplementNB','BernoulliNB']
for i,method in enumerate(methods):   
    if i==2:
        new_result = Navie_baye(X_train, X_test, y_train, y_test,method=i,norm=True)
        result.loc[len(result)] = [method+' norm: True']+new_result
        new_result = Navie_baye(X_train, X_test, y_train, y_test,method=i,norm=False)
        result.loc[len(result)] = [method+' norm: False']+new_result
    else:
        new_result = Navie_baye(X_train, X_test, y_train, y_test,method=i)
        result.loc[len(result)] = [method]+new_result
#result.to_sql('de_tree_0', con=engine ,if_exists='replace')

In [49]:
result.to_sql('Navie_baye_0', con=engine ,if_exists='replace')



In [46]:
result

Unnamed: 0,Method,F1,ACC,Precision,Recall
0,0,0.0,0.0,0.0,0.0
1,GaussianNB,0.9176,0.917,0.9176,0.9144
2,MultinomialNB,0.8779,0.8824,0.8779,0.8362
3,ComplementNB norm: True,0.8839,0.8824,0.8839,0.8849
4,ComplementNB norm: False,0.8782,0.8826,0.8782,0.837
5,BernoulliNB,0.8285,0.8195,0.8285,0.8621


In [154]:
#存放结果
array_0 = np.zeros((1,5))
colums = ['Method','F1', 'ACC','Precision','Recall']
result = pd.DataFrame(array_0, index=[0], columns=colums)
#KNN模型测试
for method in ['gini', 'entropy']:   
    for depth in range(100,140,10):
        new_result = decision_tree(X_train, X_test, y_train, y_test,criterion=method,max_depth=depth)
        result.loc[len(result)] = [method+' max_dep:'+str(depth)]+new_result
#result.to_sql('de_tree_0', con=engine ,if_exists='replace')

In [None]:
result.to_sql('de_tree_0', con=engine ,if_exists='replace')

In [45]:
result

Unnamed: 0,Method,F1,ACC,Precision,Recall
0,0,0.0,0.0,0.0,0.0
1,GaussianNB,0.9176,0.917,0.9176,0.9144
2,MultinomialNB,0.8779,0.8824,0.8779,0.8362
3,ComplementNB norm: True,0.8839,0.8824,0.8839,0.8849
4,ComplementNB norm: False,0.8782,0.8826,0.8782,0.837
5,BernoulliNB,0.8285,0.8195,0.8285,0.8621


In [98]:
#存放结果
array_0 = np.zeros((1,5))
colums = ['Method','F1', 'ACC','Precision','Recall']
result = pd.DataFrame(array_0, index=[0], columns=colums)
#KNN模型测试
methods = ['cosine', 'euclidean', 'minkowski']
for method in methods:   
    new_result = KNN(X_train, X_test, y_train, y_test,method)
    result.loc[len(result)] = [method]+new_result
    
#result.to_sql('lo_re_1', con=engine ,if_exists='replace')

In [86]:
result #用cosine度量方式模型准确率要高

Unnamed: 0,Method,F1,ACC,Precision,Recall
0,0,0.0,0.0,0.0,0.0
1,cosine,0.845,0.8485,0.845,0.8401
2,euclidean,0.7284,0.6984,0.7284,0.8227
3,minkowski,0.7284,0.6984,0.7284,0.8227


In [145]:
#存放结果
array_0 = np.zeros((1,5))
colums = ['Method','F1', 'ACC','Precision','Recall']
result = pd.DataFrame(array_0, index=[0], columns=colums)
solvers = ['newton-cg','lbfgs', 'liblinear', 'sag', 'saga']
for penalty in ['l1','l2','elasticnet','none']:
    try:
        if penalty == 'elasticnet':
            for i in [0.1, 0.2 ,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
                new_result = logical_regression(X_train, X_test, y_train, y_test, 
                                                penalty=penalty,l1_ratio=i, solver='saga',C=5)
                result.loc[len(result)]= ['l1:'+str(i)+'l2:'+str(1-i)]+new_result
        else:
            new_result = logical_regression(X_train, X_test, y_train, y_test, penalty=penalty)
            result.loc[len(result)]= [penalty]+new_result
    except:
        pass



In [141]:
np.linspace(0.1, 0.9, 9)

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [146]:
result

Unnamed: 0,Method,F1,ACC,Precision,Recall
0,0,0.0,0.0,0.0,0.0
1,l1,0.9686,0.9694,0.9686,0.9592
2,l2,0.9443,0.9476,0.9443,0.9043
3,l1:0.1l2:0.9,0.9652,0.9666,0.9652,0.9418
4,l1:0.2l2:0.8,0.9663,0.9676,0.9663,0.9447
5,l1:0.3l2:0.7,0.9668,0.968,0.9668,0.9455
6,l1:0.4l2:0.6,0.9678,0.969,0.9678,0.9483
7,l1:0.5l2:0.5,0.9674,0.9686,0.9674,0.9483
8,l1:0.6l2:0.4,0.9693,0.9704,0.9693,0.9515
9,l1:0.7l2:0.30000000000000004,0.9713,0.9722,0.9713,0.9552


In [87]:
from sqlalchemy import create_engine
engine = create_engine("mysql://root:forever@127.0.0.1:3306/sklearn_result", echo=False)

In [147]:
result.to_sql('lo_re_1', con=engine ,if_exists='replace')

In [115]:
#存放结果
array_0 = np.zeros((1,5))
colums = ['Method','F1', 'ACC','Precision','Recall']
result = pd.DataFrame(array_0, index=[0], columns=colums)
for pene in np.linspace(1,10,10):
    try:      
        new_result = logical_regression(X_train, X_test, y_train, y_test,C=c)
        c = str(format(c,'.2f'))
        result.loc[len(result)] = ['C='+c]+new_result
    except:
        pass
#result.to_sql('knn_1', con=engine ,if_exists='replace')

In [117]:
result.to_sql('lo_re_0', con=engine ,if_exists='replace')