# 新华社新闻抄袭自动判别
任务要求：
1. 构建一个机器学习模型，判断这个文章是不是新华社的
2. 当这个模型的acc 大于 0.8778， recall， precision，f1等值都较高的时候
3. 用该模型 判断一篇文章是否是新华社的文章，如果判断出来是新华社的，但是，它的source并不是新华社的，那么，我们就说，这个文章是抄袭的新华社的文章
4. Text Representation uses "https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html"

## 数据预处理

### 获取数据

In [4]:
import pandas as pd
import numpy as np
import math
import jieba

In [5]:
filename = 'E:\\MYGIT\\DataSources\\sqlResult_1558435.csv'
pandas_data = pd.read_csv(filename, encoding='gb18030')
pandas_data = pandas_data.dropna(subset = ['content']) #剔除内容为空的行

In [6]:
content = pandas_data['content'].tolist()
source = pandas_data['source'].tolist()

In [7]:
source = [name if isinstance(name, str) else 'unknow' for name in source] #把没有来源的信息标记为unknow

### content清洗切词，source分好标签

In [8]:
article_labels = [1 if name.strip() == '新华社' else 0 for name in source]

In [9]:
print(sum(article_labels), len(article_labels),len(content))

78661 87054 87054


以上可见正负样本非常不均衡，如何解决？

In [7]:
import os
from pyltp import Segmentor
#获取停用词集
def get_stopwords():
    stopwords = []
    with open('stopwords.txt') as f:
        line_str = f.readline()
        while line_str!= '':
            line_str = line_str.strip()
            stopwords.append(line_str)
            line_str = f.readline()
    return set(stopwords)

def text_deal_cut(text_list):
    stopwords = get_stopwords()
    cws_model_path = 'E:/MYGIT/Project/ltp_data/cws.model'
    
    segmentor = Segmentor()  # 初始化实例
    segmentor.load(cws_model_path)  # 加载模型 
    corpus = []
    i = 0
    for string in text_list:
        i += 1
        if(i%3000 == 0):print(i)
        string = string.strip()
        string_temp = ''
        words = list(segmentor.segment(string))
        for word in words:
            if word not in stopwords:
                string_temp += word + ' '    
        corpus.append(string_temp)
    segmentor.release()  # 释放模型
    return corpus

In [8]:
len(content)

87054

In [9]:
corpus = text_deal_cut(content)

3000
6000
9000
12000
15000
18000
21000
24000
27000
30000
33000
36000
39000
42000
45000
48000
51000
54000
57000
60000
63000
66000
69000
72000
75000
78000
81000
84000
87000


到目前为止，已经获得了处理好的标签，和分词好的文章

In [25]:
import pickle
with open('./temp_file/corpus_list','wb') as f:
    pickle.dump(corpus,f)

In [3]:
#读取
import pickle
try:
    print(corpus[0])
except NameError:
    with open('./temp_file/corpus_list','rb') as f:
        corpus = pickle.load(f)


In [22]:
article_labels[:10]

[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

In [23]:
corpus[0]

'本周 6月 12日 小米 手机 15 款 机型 外 机型 暂停 更新 发布 含 开发版 体验版 内测 稳定 版 暂 受 影响 确保 工程师 精力 系统 优化 工作 有人 猜测 精力 主要 MIUI 研发 之中 \r\n MIUI 去年 5月 发布 年 有余 更新换代 。\r\n MIUI 确切 信息 等待 官方 消息 '

In [125]:
ss = corpus[0]

In [129]:
ss.replace('\r\n','').replace('。','')

'本周 6月 12日 小米 手机 15 款 机型 外 机型 暂停 更新 发布 含 开发版 体验版 内测 稳定 版 暂 受 影响 确保 工程师 精力 系统 优化 工作 有人 猜测 精力 主要 MIUI 研发 之中  MIUI 去年 5月 发布 年 有余 更新换代  MIUI 确切 信息 等待 官方 消息 '

In [24]:
corpus = [line.replace('\r\n','').replace('。','') for line in corpus]
corpus[0]

'本周 6月 12日 小米 手机 15 款 机型 外 机型 暂停 更新 发布 含 开发版 体验版 内测 稳定 版 暂 受 影响 确保 工程师 精力 系统 优化 工作 有人 猜测 精力 主要 MIUI 研发 之中  MIUI 去年 5月 发布 年 有余 更新换代  MIUI 确切 信息 等待 官方 消息 '

## TFIDF向量化

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [11]:
vectorized = TfidfVectorizer(max_features= 10000) #设置文本单词个数最大值

In [30]:
sub_samples = corpus[:1000]

In [12]:
X = vectorized.fit_transform(corpus)

In [13]:
X.shape

(87054, 10000)

## 逻辑回归模型

In [14]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics

In [15]:
y = np.array(article_labels)

In [57]:
def logical_model(X,y,test_rate=0.4):
    X_train, X_test, y_train, y_test = train_test_split\
    (X, y, test_size=test_rate, random_state=0)
    clf = LogisticRegression(random_state=0, solver='lbfgs')
    clf.fit(X_train,y_train)
    predict_prob_y = clf.predict(X_test)
    acc = metrics.precision_score(y_test, predict_prob_y)
    recall = metrics.recall_score(y_test, predict_prob_y)
    f1 = metrics.f1_score(y_test, predict_prob_y)
    auc = metrics.roc_auc_score(y_test, predict_prob_y)
    return [acc,recall,f1,auc]
    #print(classification_report(y_test, predict_prob_y))

class Logical_model:
    def __init__(self,corpus,y):
        self.vectorized, self.model = self.__build_model(corpus, y)
        
    def __build_model(self,corpus,y):
        vectorized = TfidfVectorizer(max_features= 10000)
        X = vectorized.fit_transform(corpus)
        clf = LogisticRegression(random_state=0, solver='lbfgs')
        clf.fit(X,y)
        return vectorized, clf
    
    def predict(self, input_corpus):
        #input_corpus should be a list or array
        X = self.vectorized.transform(input_corpus)
        predict_prob_y = self.model.predict(X)
        return predict_prob_y
    def is_copy(self, input_corpus, true_label):
        input_corpus = [input_corpus]
        X = self.vectorized.transform(input_corpus)
        predict_prob_y = self.model.predict(X)
        
        if predict_prob_y[0]==1 and true_label==0:
            print('This article is cpoyed')
        else:
            print('This article is ture')

In [53]:
Logistic_Regression = logical_model(X,y,0.4)

In [58]:
logistic_model = Logical_model(corpus, y)

In [55]:
print('predict:{} \n   True:{}'.format(logistic_model.predict(corpus[12:20]), y[12:20]))

predict:[0 0 0 0 0 0 0 0] 
   True:[0 0 0 0 0 0 0 0]


In [59]:
predict_label = logistic_model.predict(corpus)

In [67]:
error = 0
test_index = []
for i in range(len(y)):
    if predict_label[i] != y[i] and y[i]==0:
        error += 1
        test_index.append(i)
        #print(i)
print(error)

1159


In [69]:
import random
random_index = random.choice(test_index)
print(random_index)

6050


In [70]:
logistic_model.is_copy(corpus[random_index], y[random_index])

This article is cpoyed


In [85]:
logistic_model.is_copy(corpus[500], y[500])

This article is ture


In [None]:
import pandas as pd

## KNN模型

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [71]:
#模型参数获取
def Knn_model(X, y,test_rate=0.4):    
    X_train, X_test, y_train, y_test = train_test_split\
    (X, y, test_size=test_rate, random_state=0) 
    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(X_train, y_train)
    predict_prob_y = neigh.predict(X_test)
    acc = metrics.precision_score(y_test, predict_prob_y)
    recall = metrics.recall_score(y_test, predict_prob_y)
    f1 = metrics.f1_score(y_test, predict_prob_y)
    auc = metrics.roc_auc_score(y_test, predict_prob_y)
    return [acc,recall,f1,auc]
    #print(classification_report(y_test, predict_prob_y))

#模型
class Knn_Model:
    def __init__(self,corpus,y):
        self.vectorized, self.model = self.__build_model(corpus, y)
        
    def __build_model(self,corpus,y):
        vectorized = TfidfVectorizer(max_features= 10000)
        X = vectorized.fit_transform(corpus)
        neigh = KNeighborsClassifier(n_neighbors=5)
        neigh.fit(X,y)
        return vectorized, neigh
    
    def predict(self, input_corpus):
        #input_corpus should be a list or array
        X = self.vectorized.transform(input_corpus)
        predict_prob_y = self.model.predict(X)
        return predict_prob_y
    def is_copy(self, input_corpus, true_label):
        input_corpus = [input_corpus]
        X = self.vectorized.transform(input_corpus)
        predict_prob_y = self.model.predict(X)
        
        if predict_prob_y[0]==1 and true_label==0:
            print('This article is cpoyed')
        else:
            print('This article is ture')

In [29]:
KNN = Knn_model(X, y)

In [73]:
knn_model = Knn_Model(corpus, y)

In [None]:
print('predict:{} \n   True:{}'.format(logistic_model.predict(corpus[12:20]), y[12:20]))

In [74]:
predict_label = knn_model.predict(corpus)

In [75]:
print(len(predict_label), len(y))

87054 87054


In [76]:
error = 0
test_index = []
for i in range(len(y)):
    if predict_label[i] != y[i] and y[i]==0:
        error += 1
        test_index.append(i)
        #print(i)
print(error)

4499


In [77]:
import random
random_index = random.choice(test_index)
print(random_index)

6274


In [78]:
knn_model.is_copy(corpus[random_index], y[random_index])

This article is cpoyed


In [92]:
515 in test_index

False

In [93]:
knn_model.is_copy(corpus[515], y[515])

This article is ture


比较结果：
1. 限制Tfidf向量大小,而不是按输入数据最大值，一定程度上不影响结果
2. 这是因为sklearn模块先对词排序，选取词频前n个词作为 Tfidf的向量元素
3. 对于在整个corpus中出现次数极少的词对文章影响比较小

max_features : int or None (default=None)

    If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.

In [32]:
array_2d = np.array([Logistic_Regression, KNN])
colums = ['ACC', 'RECALL','F1','AUC']
index = ['Logistic_Regression', 'KNN']

In [35]:
result = pd.DataFrame(array_2d, index=index, columns=colums)

In [36]:
result

Unnamed: 0,ACC,RECALL,F1,AUC
Logistic_Regression,0.973997,0.996983,0.985356,0.872967
KNN,0.925957,0.995394,0.959421,0.622323


根据以上结果，选择逻辑回归模型