In [6]:
import numpy as np
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# 加载停用词
from sklearn.metrics.pairwise import cosine_similarity
# Load the stopword list, one stopword per line.
with open('chinese_stopwords.txt', 'r', encoding='utf-8') as file:
    # Strip only the line terminator. A blind ``[:-1]`` slice would cut the
    # last character off the final stopword when the file does not end with
    # a newline.
    stopwords = [line.rstrip('\n') for line in file]
    
# Load the data
news = pd.read_csv('sqlResult.csv', encoding='gb18030')

# Handle missing values: preview a few rows that have no content, then keep
# only the rows whose content is present
content_missing = news['content'].isna()
print(news.loc[content_missing].head())
news = news.loc[~content_missing]
print(news.shape)

# Tokenization
def split_text(text):
    """Segment *text* with jieba and return the tokens joined by single spaces.

    Spaces and newlines are removed before segmentation, and every token that
    appears in the module-level ``stopwords`` collection is dropped.

    Args:
        text: The raw article text (a ``str``).

    Returns:
        A single string of the remaining tokens separated by ``' '``.
    """
    text = text.replace(' ', '').replace('\n', '')
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan of the stopword list for every token.
    stopword_set = set(stopwords)
    tokens = jieba.cut(text.strip())
    return ' '.join(w for w in tokens if w not in stopword_set)
# print(news.iloc[0].content)
# print(split_text(news.iloc[0].content))
import pickle, os

# Build the tokenized corpus, or reuse the one cached by a previous run.
corpus = None
if os.path.exists('corpus.pkl'):
    # NOTE: pickle can execute arbitrary code while loading; only load cache
    # files that were produced by this script.
    with open('corpus.pkl', 'rb') as file:
        corpus = pickle.load(file)
    if len(corpus) != len(news):
        # The cache was built from a different data set. Its rows would no
        # longer line up with ``news``, so discard it and rebuild.
        corpus = None

if corpus is None:
    corpus = [split_text(str(content)) for content in news.content]
    if corpus:
        # Show one tokenized sample as a sanity check
        print(corpus[0])
    print(len(corpus))
    with open('corpus.pkl', 'wb') as file:
        pickle.dump(corpus, file)

# Compute the TF-IDF matrix of the corpus in two steps:
# 1) raw term counts (min_df=0.015 drops terms found in under 1.5% of documents)
countvectorizer = CountVectorizer(encoding='gb18030', min_df=0.015)
countvector = countvectorizer.fit_transform(corpus)
# 2) re-weight the counts with TF-IDF
tfidfTransformer = TfidfTransformer()
tfidf = tfidfTransformer.fit_transform(countvector)
print(tfidf.shape)

# Label each article: 1 when its *source* mentions Xinhua ('新华'), else 0.
# The label must come from the ``source`` column; deriving it from the article
# body would mark every article that merely mentions Xinhua as our own.
label = [1 if '新华' in str(source) else 0 for source in news.source]
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Split the data. The sparse TF-IDF matrix is passed directly: both
# train_test_split and MultinomialNB accept sparse input, so there is no need
# to materialize a dense copy of the whole matrix.
X_train, X_test, y_train, y_test = train_test_split(tfidf, label, test_size=0.3, random_state=33)
clf = MultinomialNB()
clf.fit(X_train, y_train)
# y_predict = clf.predict(X_test)
# Predict on every article (training rows included) so that predictions can be
# compared with the labels row by row.
prediction = clf.predict(tfidf)
labels = np.array(label)
compare_news_index = pd.DataFrame({'prediction': prediction, 'labels': labels})
# Indices of suspicious articles: predicted as Xinhua but not labelled as Xinhua
copy_news_index = compare_news_index[(compare_news_index['prediction'] == 1) & (compare_news_index['labels'] == 0)].index
# Indices of all articles labelled as Xinhua
xinhuashe_news_index = compare_news_index[(compare_news_index['labels'] == 1)].index
print('可疑文章数：' + str(len(copy_news_index)))

from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer

# Scale every TF-IDF row to unit norm before clustering
normalizer = Normalizer()
scaled_array = normalizer.fit_transform(tfidf.toarray())

if os.path.exists('label.pkl'):
    # Reuse the cluster labels saved by a previous run
    with open('label.pkl', 'rb') as file:
        k_labels = pickle.load(file)
else:
    # Cluster the articles into 25 groups with KMeans and cache the labels
    kmeans = KMeans(n_clusters=25)
    k_labels = kmeans.fit_predict(scaled_array)
    with open('label.pkl', 'wb') as file:
        pickle.dump(k_labels, file)
    print('k_labels.shape', k_labels.shape)

if os.path.exists('id_class.pkl'):
    # Reuse the mapping saved by a previous run
    with open('id_class.pkl', 'rb') as file:
        id_class = pickle.load(file)
else:
    # Map each article's positional index to its cluster label and cache it
    id_class = dict(enumerate(k_labels))
    with open('id_class.pkl', 'wb') as file:
        pickle.dump(id_class, file)


if not os.path.exists('class_id.pkl'):
    from collections import defaultdict
    # Map each cluster label to the set of article indices in that cluster,
    # counting only the articles labelled as Xinhua.
    class_id = defaultdict(set)
    # Build the lookup set once. Calling ``.tolist()`` inside the loop would
    # rebuild and linearly scan the whole list for every article.
    xinhuashe_index_set = set(xinhuashe_news_index.tolist())
    for index, class_ in id_class.items():
        if index in xinhuashe_index_set:
            class_id[class_].add(index)
    with open('class_id.pkl', 'wb') as file:
        pickle.dump(class_id, file)
else:
    # Reuse the mapping saved by a previous run
    with open('class_id.pkl', 'rb') as file:
        class_id = pickle.load(file)

# Find similar texts
def find_similar_text(cpindex, top=10):
    """Rank the Xinhua articles in the same cluster as article *cpindex*.

    Candidates are the indices stored in ``class_id`` for the cluster that
    ``id_class`` assigns to *cpindex*. Each one is scored by the cosine
    similarity between its TF-IDF row and the row of *cpindex*.

    Args:
        cpindex: Row index of the article to compare.
        top: Maximum number of results to return.

    Returns:
        A list of at most *top* ``(index, similarity)`` pairs, most similar
        first, where ``similarity`` is the array returned by
        ``cosine_similarity``.
    """
    query_row = tfidf[cpindex]
    candidates = class_id[id_class[cpindex]]
    scored = [(i, cosine_similarity(query_row, tfidf[i])) for i in candidates]
    # Most similar first
    scored.sort(key=lambda pair: pair[1][0], reverse=True)
    return scored[:top]

# Inspect the article at position 3352 and its most similar Xinhua articles
cpindex = 3352
similar_list = find_similar_text(cpindex)
print(similar_list)
print('怀疑抄袭：\n', news.content.iloc[cpindex])
# Take the top-ranked candidate as the presumed original
similar2, _ = similar_list[0]
print('相似原文:\n', news.content.iloc[similar2])


import editdistance
# Edit distance between the tokenized texts of the two articles
print('编辑距离：', editdistance.eval(corpus[cpindex], corpus[similar2]))

         id author     source content  \
100   89517    NaN  中国证券报?中证网     NaN   
103   89514    NaN  中国证券报?中证网     NaN   
997   88620    NaN        央广网     NaN   
1273  88344    NaN        央广网     NaN   
1282  88335    NaN        央广网     NaN   

                                                feature  \
100   {"type":"公司","site":"中证网","commentNum":"0","jo...   
103   {"type":"公司","site":"中证网","commentNum":"0","jo...   
997   {"type":"时事要闻","site":"参考消息","commentNum":"0",...   
1273  {"type":"IT业界","site":"参考消息","commentNum":"0",...   
1282  {"type":"IT业界","site":"参考消息","commentNum":"0",...   

                                 title  \
100       天和防务股东未来6个月内计划减持不超过480万股公司股份   
103                    晶盛机电调整限制性股票回购价格   
997              [主播不在家]第二季：主播陈亮体验垃圾清运   
1273                LKK洛可可：想象力经济时代或已到来   
1282  CES2017：京东发布两款叮咚智能音箱新品 开放Alpha平台   

                                                    url  
100   http://www.cs.com.cn/ssgs/gsxw/201706/t2017062...  
103   http://www.cs.com.cn/ss