# In [None]:
import pandas as pd
import numpy as np
import jieba

# Load the data
news = pd.read_csv('sqlResult.csv', encoding='gb18030')
print(news.shape)
print(news.head())

# Look at rows whose `content` column is missing, then drop them
news.loc[news['content'].isna()].head()
news = news.dropna(subset=['content'])
print(news.shape)

# Load the stopwords
with open('chinese_stopwords.txt', 'r', encoding='utf-8') as file:
	# rstrip('\n') removes only a trailing newline; slicing off the last
	# character would also truncate the final word when the file does not end
	# with a newline. A set is used because split_text tests every token for
	# membership, which is O(1) on a set instead of a linear scan of a list.
	stopwords = {line.rstrip('\n') for line in file}
	
# Tokenization
def split_text(text):
	"""Segment `text` with jieba and join the tokens that are not stopwords.

	Spaces and newline characters are removed from `text` before it is
	segmented. The surviving tokens are returned as one space-separated string.
	"""
	compact = text.replace(' ', '').replace('\n', '')
	kept = (token for token in jieba.cut(compact) if token not in stopwords)
	return ' '.join(kept)

# Tokenize every article body into one space-separated string per article
corpus = [split_text(str(article)) for article in news.content]
corpus

# Compute TF-IDF for the corpus
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Step 1: term counts per document
countvectorizer = CountVectorizer(min_df=0.015, encoding='gb18030')
countvector = countvectorizer.fit_transform(corpus)

# Step 2: re-weight the counts into TF-IDF
tfidftransformer = TfidfTransformer()
tfidf = tfidftransformer.fit_transform(countvector)
print(tfidf.shape)

# Label each article: 1 when its source mentions '新华社', otherwise 0
label = [int('新华社' in str(origin)) for origin in news.source]
label

# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split

# The sparse TF-IDF matrix is passed as-is: train_test_split and the naive
# Bayes models accept scipy sparse input, so there is no need to materialize a
# dense copy of the whole matrix. The seed is fixed so that the split, and the
# metrics computed from it, are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
	tfidf, label, test_size=0.3, random_state=42
)

# Train and evaluate two naive Bayes classifiers on the same split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB


def _fit_and_report(classifier):
	"""Fit `classifier` on the training split, print test metrics, return the predictions."""
	classifier.fit(X_train, y_train)
	predicted = classifier.predict(X_test)
	print('准确率：', accuracy_score(y_test, predicted))
	print('精确率：', precision_score(y_test, predicted))
	print('召回率：', recall_score(y_test, predicted))
	return predicted


model = MultinomialNB()
y_predict = _fit_and_report(model)

# BernoulliNB is fitted last, so `model` refers to it from here on
model = BernoulliNB()
y_predict = _fit_and_report(model)

# Use the model to predict the style of every article
prediction = model.predict(tfidf.toarray())
labels = np.array(label)

# compare_news_index has two columns: `prediction` is the predicted style,
# `labels` is the true label
compare_news_index = pd.DataFrame({'prediction': prediction, 'labels': labels})

# Rows predicted as 1 but labelled 0 are the possible copies
copy_news_index = compare_news_index.loc[
	compare_news_index['prediction'].eq(1) & compare_news_index['labels'].eq(0)
].index

# Articles that really are from Xinhua (true label 1)
xinhuashe_news_index = compare_news_index.loc[compare_news_index['labels'].eq(1)].index
print('可能为copy的新闻条数：', len(copy_news_index))

from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans

# Normalize every document vector before clustering
normalizer = Normalizer()
scaled_array = normalizer.fit_transform(tfidf.toarray())

# Cluster all documents with KMeans. The seed is fixed so the cluster
# assignments, and every result derived from them further down, are the same
# on every run instead of depending on the random initialization.
kmeans = KMeans(n_clusters=25, random_state=42)
k_labels = kmeans.fit_predict(scaled_array)
print(k_labels.shape)

# Build id_class: maps each document's 0-based row position (from enumerate)
# to the cluster label KMeans assigned to it
id_class = dict(enumerate(k_labels))
from collections import defaultdict

# Build the lookup set once. Calling .tolist() inside the loop rebuilt the list
# and scanned it linearly on every iteration, which made the loop quadratic.
xinhuashe_ids = set(xinhuashe_news_index.tolist())

# class_id: cluster label -> set of document positions, restricted to the
# documents listed in xinhuashe_news_index
class_id = defaultdict(set)
for index, class_ in id_class.items():
	if index in xinhuashe_ids:
		class_id[class_].add(index)

# Find similar articles
from sklearn.metrics.pairwise import cosine_similarity
def find_similar_text(cpindex, top=10):
	"""Rank the articles in `cpindex`'s cluster by cosine similarity to it.

	Candidates are the document positions stored in `class_id` for the cluster
	that `id_class` assigns to `cpindex`. Returns a list of at most `top`
	(position, similarity) tuples, most similar first, where similarity is the
	value returned by `cosine_similarity` for the two TF-IDF rows.
	"""
	candidates = class_id[id_class[cpindex]]
	target_row = tfidf[cpindex]
	scored = [(doc_id, cosine_similarity(target_row, tfidf[doc_id])) for doc_id in candidates]
	# Sort in descending order of similarity
	scored.sort(key=lambda pair: pair[1][0], reverse=True)
	return scored[:top]

# Inspect one article by its row position
cpindex = 3352
print('是否在新华社', cpindex in xinhuashe_news_index)
print('是否在copy_news', cpindex in copy_news_index)

similar_list = find_similar_text(cpindex)
print(similar_list)

print('怀疑抄袭：\n', news['content'].iloc[cpindex])
# Take the top-ranked match as the similar original article
similar2 = similar_list[0][0]
print('相似的原文：\n', news['content'].iloc[similar2])