# 判别新闻来源是否是新华社（即新闻是否抄袭）

In [1]:
# 模块载入

In [1]:
import jieba
import re
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# 交互方式，用户能对图像进行操作

In [4]:
%matplotlib notebook

# 数据预处理

In [5]:
news = pd.read_csv('../datasource/sqlResult_1558435.csv', encoding='gb18030')

In [6]:
news.head()  # 第4条来源为新华社

Unnamed: 0,id,author,source,content,feature,title,url
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm
2,89615,,快科技@http://www.kkj.cn/,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""18""...",一加手机5细节曝光：3300mAh、充半小时用1天,http://www.cnbeta.com/articles/tech/623601.htm
3,89614,,新华社,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n,"{""type"":""国际新闻"",""site"":""环球"",""commentNum"":""0"",""j...",葡森林火灾造成至少62人死亡 政府宣布进入紧急状态（组图）,http://world.huanqiu.com/hot/2017-06/10866126....
4,89613,胡淑丽_MN7479,深圳大件事,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...,"{""type"":""新闻"",""site"":""网易热门"",""commentNum"":""978"",...",44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随,http://news.163.com/17/0618/00/CN617P3Q0001875...


In [7]:
len(news)

89611

[Pandas处理缺失值](https://blog.csdn.net/dss_dssssd/article/details/82814673)

In [8]:
# 处理缺失值

In [9]:
news_dropna = news.dropna(subset=['source', 'content'])

In [10]:
len(news_dropna)

87052

In [11]:
def transfrom(line):
    """Turn one news row into a supervised-learning sample.

    ``line`` is indexed by the keys ``'source'`` and ``'content'``.
    The result is a two-element Series labelled ``['y', 'content']`` where
    ``y`` is 1 when the source is exactly '新华社' and 0 otherwise.
    """
    is_xinhua = line['source'] == '新华社'
    values = [int(is_xinhua), line['content']]
    return pd.Series(values, index=['y', 'content'])

In [12]:
# 根据新闻来源将数据转为监督学习任务

In [13]:
data = news_dropna.apply(transfrom, axis=1)

In [14]:
data.head()

Unnamed: 0,y,content
0,0,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...
1,0,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...
2,0,此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\r\n...
3,1,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\r\n
4,0,（原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\r\n@深圳交警微博称：昨日清...


In [15]:
corpus = data.content.to_list()

In [16]:
# 使用astype实现Dataframe字段类型转换

In [17]:
# Cast the labels to a plain integer array. The builtin `int` is used because
# the `np.int` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
y = data.y.values.astype(int)

In [18]:
y.shape

(87052,)

In [19]:
len(corpus)

87052

# 使用TF-IDF进行文本向量化

In [20]:
# Tokenise every article with jieba so TF-IDF can split on whitespace.
# `mask` records, per article, whether it was a string and therefore kept;
# it is later used to filter the label array to the same rows.
corpus_cut = []
mask = []
for sentence in tqdm(corpus):
    is_text = isinstance(sentence, str)
    mask.append(is_text)
    if is_text:
        # Keep only word characters (drops punctuation and whitespace) before cutting.
        sentence = ''.join(re.findall(r'\w+', sentence))
        corpus_cut.append(' '.join(jieba.cut(sentence=sentence)))

  0%|                                                                                        | 0/87052 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\JEREMY~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.850 seconds.
Prefix dict has been built succesfully.
100%|███████████████████████████████████████████████████████████████████████████| 87052/87052 [03:33<00:00, 408.57it/s]


In [21]:
len(corpus_cut)

87052

In [22]:
y = y[mask]

In [23]:
len(y)

87052

In [24]:
corpus_cut[0]

'此外 自 本周 6 月 12 日起 除 小米 手机 6 等 15 款 机型 外 其余 机型 已 暂停 更新 发布 含 开发 版 体验版 内测 稳定版 暂不受 影响 以 确保 工程师 可以 集中 全部 精力 进行 系统优化 工作 有人 猜测 这 也 是 将 精力 主要 用到 MIUI9 的 研发 之中 MIUI8 去年 5 月 发布 距今已有 一年 有余 也 是 时候 更新换代 了 当然 关于 MIUI9 的 确切 信息 我们 还是 等待 官方消息'

In [25]:
# 中文不比英文，词语之间有着空格的自然分割，所以我们首先要进行分词处理

In [26]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=300)  # 模型

In [27]:
X = vectorizer.fit_transform(corpus_cut)

In [28]:
X = X.toarray()

In [29]:
X.shape

(87052, 300)

# 建模

In [30]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [31]:
# 设置随机种子保证每次随机的结果是一样的

In [32]:
random_state = 2019

In [33]:
X_train, x_test, Y_train, y_test = train_test_split(X, y, random_state=random_state, test_size=0.15)

In [34]:
X_train.shape, x_test.shape

((73994, 300), (13058, 300))

## KNN

In [35]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=42, test_size=0.15)

In [36]:
x_train.shape, x_valid.shape

((62894, 300), (11100, 300))

In [37]:
knc = KNeighborsClassifier(n_jobs=-1)

In [38]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform')

In [39]:
knc.score(x_valid, y_valid)

0.9175675675675675

In [40]:
y_pred = knc.predict(x_valid)

In [41]:
y_pred_prob = knc.predict_proba(x_valid)

In [42]:
y_pred.shape, y_pred_prob.shape

((11100,), (11100, 2))

In [43]:
precision_score(y_valid, y_pred)

0.9552164372631159

In [44]:
recall_score(y_valid, y_pred)

0.9535995220551629

In [45]:
f1_score(y_valid, y_pred)

0.9544072948328267

In [46]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9008588518754408

- **调整参数k=3**

In [47]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=62, test_size=0.15)

In [48]:
knc = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)

In [49]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
           weights='uniform')

In [50]:
y_pred = knc.predict(x_valid)

In [51]:
y_pred_prob = knc.predict_proba(x_valid)

In [52]:
knc.score(x_valid, y_valid)

0.9307207207207208

In [53]:
precision_score(y_valid, y_pred)

0.9542907696844589

In [54]:
recall_score(y_valid, y_pred)

0.9696364362764682

In [55]:
f1_score(y_valid, y_pred)

0.9619024027743374

In [56]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.8839187552143312

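- 与k=5相比，recall和f1提高了，precision基本持平（0.9552 → 0.9543，略有下降），ROC AUC却下降了（0.9009 → 0.8839）；注意两次实验划分验证集时使用的random_state不同（42与62），结果并非严格可比。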

- **调整参数k=7**

In [57]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=62, test_size=0.15)

In [58]:
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1)

In [59]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
           weights='uniform')

In [60]:
y_pred = knc.predict(x_valid)

In [61]:
# predict_proba返回预测属于某标签的概率

In [62]:
y_pred_prob = knc.predict_proba(x_valid)

In [63]:
y_pred.shape, y_pred_prob.shape

((11100,), (11100, 2))

In [64]:
knc.score(x_valid, y_valid)

0.9181981981981981

In [65]:
precision_score(y_valid, y_pred)

0.9538384845463609

In [66]:
recall_score(y_valid, y_pred)

0.9555533359968038

In [67]:
f1_score(y_valid, y_pred)

0.9546951402055682

In [68]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9183032750405395

- 使用距离作为权重

In [69]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=72, test_size=0.15)

In [70]:
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance')

In [71]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
           weights='distance')

In [72]:
y_pred = knc.predict(x_valid)

In [73]:
y_pred_prob = knc.predict_proba(x_valid)

In [74]:
y_pred.shape, y_pred_prob.shape

((11100,), (11100, 2))

In [75]:
knc.score(x_valid, y_valid)

0.9277477477477477

In [76]:
precision_score(y_valid, y_pred)

0.958291956305859

In [77]:
recall_score(y_valid, y_pred)

0.9619218500797448

In [78]:
f1_score(y_valid, y_pred)

0.9601034722913143

In [79]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9307580442957069

- **使用距离作为近邻样本权重，效果有明显提升**

- 尝试提高leaf_size的大小，默认为30

In [80]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=92, test_size=0.15)

In [81]:
# ??KNeighborsClassifier

In [82]:
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance', leaf_size=50)

In [83]:
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=50, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
           weights='distance')

In [84]:
y_pred = knc.predict(x_valid)

In [85]:
y_pred_prob = knc.predict_proba(x_valid)

In [112]:
knc.score(x_valid, y_valid)

0.9318918918918919

In [113]:
precision_score(y_valid, y_pred)

0.960087370929309

In [114]:
recall_score(y_valid, y_pred)

0.9646847565841979

In [115]:
f1_score(y_valid, y_pred)

0.9623805732484076

In [116]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9372612143495018

- 效果不错，继续增加leaf-size

In [117]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=92, test_size=0.15)

In [118]:
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance', leaf_size=70)
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=70, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
           weights='distance')

In [119]:
y_pred = knc.predict(x_valid)
y_pred_prob = knc.predict_proba(x_valid)

In [120]:
knc.score(x_valid, y_valid)

0.9315315315315316

In [121]:
precision_score(y_valid, y_pred)

0.960528932193279

In [122]:
recall_score(y_valid, y_pred)

0.9637869114126097

In [123]:
f1_score(y_valid, y_pred)

0.962155163828304

In [124]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9365350296834066

- 增大到70后效果不佳，保持30

- **使用grid-search确定超参数**

In [125]:
from sklearn.model_selection import GridSearchCV

In [127]:
parameters = {'n_neighbors': [3, 5, 7],'leaf_size': [30, 45, 70]}

In [128]:
knc = KNeighborsClassifier(n_jobs=5, weights='distance')

In [130]:
clf = GridSearchCV(knc, parameters, cv=5, scoring='roc_auc', verbose=5, n_jobs=5)

In [129]:
# ??GridSearchCV

In [131]:
clf

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=5, n_neighbors=5, p=2,
           weights='distance'),
       fit_params=None, iid='warn', n_jobs=5,
       param_grid={'leaf_size': [30, 45, 70], 'n_neighbors': [3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=5)

In [132]:
# clf.fit(X_train, Y_train)  # 在AI Studio测试，结果表明roc_auc_score()的结果为0.9229514467820347

In [134]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=92, test_size=0.15)

In [135]:
knc = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance', leaf_size=40) 
knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=40, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
           weights='distance')

In [136]:
y_pred = knc.predict(x_valid) 
y_pred_prob = knc.predict_proba(x_valid)

In [137]:
knc.score(x_valid, y_valid)

0.9318918918918919

In [138]:
precision_score(y_valid, y_pred)

0.960087370929309

In [139]:
recall_score(y_valid, y_pred)

0.9646847565841979

In [140]:
f1_score(y_valid, y_pred)

0.9623805732484076

In [141]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9372612143495018

- **针对KNN模型的最佳参数暂时确定为n_neighbors=7, weights='distance', leaf_size=40**

## Naive Bayes

In [142]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=1002, test_size=0.15)

In [143]:
gnb = GaussianNB()

In [144]:
gnb.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [145]:
y_pred = gnb.predict(x_valid)

In [146]:
y_pred_prob = gnb.predict_proba(x_valid)

In [147]:
gnb.score(x_valid, y_valid)

0.807027027027027

In [148]:
precision_score(y_valid, y_pred)

0.9981153411232567

In [149]:
recall_score(y_valid, y_pred)

0.7887995233839737

In [150]:
f1_score(y_valid, y_pred)

0.8811980033277869

In [152]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9410166438307452

- **贝叶斯好像没啥好调的，训练速度也比KNN快多了，roc也更高。**

## Logistic Regression

In [153]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, random_state=1002, test_size=0.15)

In [159]:
lr = LogisticRegression(n_jobs=-1)

In [160]:
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)

In [161]:
lr.score(x_valid, y_valid)

0.9807207207207207

In [162]:
y_pred = lr.predict(x_valid)
y_pred_prob = lr.predict_proba(x_valid)

In [163]:
precision_score(y_valid, y_pred)

0.9876323340259227

In [164]:
recall_score(y_valid, y_pred)

0.9911627445139509

In [165]:
roc_auc_score(y_valid, y_pred_prob[:, 1])

0.9937303744000685

- **完全不需要调参**

# 利用测试集选定模型

- **KNN**

In [168]:
knc.score(x_test, y_test)

0.9261755245826313

In [169]:
y_pred = knc.predict(x_test)
y_pred_prob = knc.predict_proba(x_test)

In [170]:
precision_score(y_test, y_pred)

0.9586574230639161

In [171]:
recall_score(y_test, y_pred)

0.9597934653800576

In [172]:
roc_auc_score(y_test, y_pred_prob[:, 1])

0.9230684805264014

- **Gaussian Naive Bayes**

In [175]:
gnb.score(x_test, y_test)

0.79621687854189

In [178]:
y_pred = gnb.predict(x_test)
y_pred_prob = gnb.predict_proba(x_test)

In [179]:
precision_score(y_test, y_pred)

0.9984751116436118

In [180]:
recall_score(y_test, y_pred)

0.7759437954968681

In [181]:
f1_score(y_test, y_pred)

0.8732555370326268

In [182]:
roc_auc_score(y_test, y_pred_prob[:, 1])

0.9375509641130992

- **Logistic Regression**

In [183]:
lr.score(x_test, y_test)

0.9790932761525502

In [184]:
y_pred = lr.predict(x_test)
y_pred_prob = lr.predict_proba(x_test)

In [185]:
precision_score(y_test, y_pred)

0.9856914401144685

In [186]:
recall_score(y_test, y_pred)

0.9912815303876756

In [187]:
f1_score(y_test, y_pred)

0.9884785819793206

In [188]:
roc_auc_score(y_test, y_pred_prob[:, 1])

0.9906651640078233

**逻辑回归即使在未见过的测试集上表现依然很好，最后选定逻辑回归模型;
<br>性能达到某个点可以定义为：在这个点训练集误差依然减小而验证集的误差不再下降反而开始上升也就是模型开始过拟合了。**

## 抄袭候选者

- **找出所有预测为1，实际为0的文章作为抄袭的候选者**

In [189]:
y_pred = lr.predict(X)

In [190]:
y_pred.shape, y.shape

((87052,), (87052,))

In [191]:
len(news_dropna)

87052

In [195]:
news_dropna.head(2)

Unnamed: 0,id,author,source,content,feature,title,url,y
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm,0
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm,0


In [None]:
# 新建1列y

In [193]:
news_dropna['y'] = y

In [196]:
news_dropna['y_pred'] = y_pred

In [198]:
news_dropna.head(2)

Unnamed: 0,id,author,source,content,feature,title,url,y,y_pred
0,89617,,快科技@http://www.kkj.cn/,此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""37""...",小米MIUI 9首批机型曝光：共计15款,http://www.cnbeta.com/articles/tech/623597.htm,0,0
1,89616,,快科技@http://www.kkj.cn/,骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",骁龙835在Windows 10上的性能表现有望改善,http://www.cnbeta.com/articles/tech/623599.htm,0,0


In [199]:
# 实际为0，预测为1

In [200]:
copy_news = news_dropna[(news_dropna.y == 0) & (news_dropna.y_pred == 1)]

In [201]:
copy_news.head(2)

Unnamed: 0,id,author,source,content,feature,title,url,y,y_pred
51,89566,,新华网,戈壁的大漠黄沙曾掩埋了无数西域古道，而如今一条大漠天路正顽强地与黄沙“搏斗”，在乌兰布和、腾...,"{""type"":""国内新闻"",""site"":""环球"",""commentNum"":""0"",""j...",大漠变通途——世界上最长的穿越沙漠高速公路建设纪实,http://china.huanqiu.com/hot/2017-06/10866392....,0,1
56,89561,,央视新闻,很快，不少人主动添加记者为好友，询问是否需要扫描软件，并声称这些扫描软件能够攻破摄像头的IP...,"{""type"":""科技"",""site"":""cnbeta"",""commentNum"":""15""...",大量家庭摄像头遭入侵 有人兜售IP地址给偷窥者,http://www.cnbeta.com/articles/tech/623631.htm,0,1


In [202]:
len(copy_news.source)

1106

In [203]:
copy_sources =  set(copy_news.source.to_list())

In [204]:
len(copy_sources)

276

# 什么是数据思维？

- **[数据思维](http://www.appadhoc.com/blog/data-thinking-is-the-most-important/)的最核心是利用数据解决问题，利用数据解决问题的最核心是要深度了解需求，了解真正要解决什么样的问题，解决问题背后的真实目的是什么。在解决问题的过程中我们使用数据的方法，通常可以叫量化的方法。**

# 什么是机器学习思维？

- **机器学习思维就是从大量数据中学习出一种规则，这个规则可以将输入$x$映射至输出$y$，而不像传统方法那样由人工编写各种繁琐的规则；此外，机器学习模型可以利用不断增加的数据持续迭代，优化模型的表现。**

# 使用第4节课讲解的edit distance，在涉嫌抄袭的文章中，找到其重复的文字与被修改的文字

In [205]:
def edit_distance(string1, string2):
    """
    Levenshtein edit distance for turning ``string1`` into ``string2``.

    Parameters
    ----------
    string1, string2 : sequences supporting ``len`` and indexing (e.g. ``str``)

    Returns
    -------
    (distance, duplication) : tuple
        ``distance`` is the minimum number of single-element insertions,
        deletions and substitutions. ``duplication`` is a list holding
        ``string1[i-1]`` for every DP cell ``(i, j)`` whose two characters are
        equal, in row-major order. It therefore records every equal character
        pair visited, not only the pairs on one optimal alignment, so a
        character may appear several times.

    The same tuple shape is returned for every input, including two empty
    strings, so callers can always unpack the result.
    """
    len1, len2 = len(string1), len(string2)

    # If either side is empty the distance is the length of the other side
    # (0 when both are empty) and nothing can be duplicated.
    if len1 == 0 or len2 == 0:
        return max(len1, len2), []

    dp = [[0] * (len2 + 1) for _ in range(len1 + 1)]
    duplication = []

    # First row / column: build the prefix purely by insertions / deletions.
    for j in range(1, len2 + 1):
        dp[0][j] = j
    for i in range(1, len1 + 1):
        dp[i][0] = i

    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            if string1[i - 1] == string2[j - 1]:
                # Equal characters cost nothing; remember the shared character.
                dp[i][j] = dp[i - 1][j - 1]
                duplication.append(string1[i - 1])
            else:
                # Cheapest of delete, insert, substitute, plus one edit.
                dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1
    return dp[-1][-1], duplication

In [207]:
# 测试edit_distance()

In [208]:
edit_distance('garcg', 'kacfg')

(3, ['g', 'a', 'c', 'g'])

- **拿第一条候选抄袭新闻为例**

In [215]:
source_copy_news = copy_news.iloc[0, 3]

In [216]:
source_copy_news

'戈壁的大漠黄沙曾掩埋了无数西域古道，而如今一条大漠天路正顽强地与黄沙“搏斗”，在乌兰布和、腾格里、巴丹吉林三大沙漠中穿行，成为世界上最长的穿越沙漠高速公路，这就是北京至乌鲁木齐的京新高速公路。京新高速全长2540公里，建成后北京到新疆的行车里程将缩短1300公里，大漠变通途，通疆达海的梦想即将实现。新华社记者邓华 摄\r\n'

In [217]:
# Keep only word characters (drops punctuation and whitespace). The pattern is
# a raw string: a plain '\w' is an invalid escape sequence in a normal literal.
source_copy_news = ''.join(re.findall(r'\w+', source_copy_news))

In [218]:
source_copy_news

'戈壁的大漠黄沙曾掩埋了无数西域古道而如今一条大漠天路正顽强地与黄沙搏斗在乌兰布和腾格里巴丹吉林三大沙漠中穿行成为世界上最长的穿越沙漠高速公路这就是北京至乌鲁木齐的京新高速公路京新高速全长2540公里建成后北京到新疆的行车里程将缩短1300公里大漠变通途通疆达海的梦想即将实现新华社记者邓华摄'

In [221]:
def get_target_news(source_copy_news, idx, news_df):
    """
    Find the article in ``news_df`` with the smallest edit distance to
    ``source_copy_news``.

    Parameters
    ----------
    source_copy_news : str
        Text of the suspected copy; passed to ``edit_distance`` unchanged.
    idx : int
        Row *position* (0-based, ``iloc`` style) to skip, normally the position
        of the suspected copy itself. It is compared with the positional loop
        counter, not with the DataFrame index label.
    news_df : pandas.DataFrame
        Frame whose second column (position 1) holds the article text.

    Returns
    -------
    (target_news, duplication) : tuple
        The cleaned text of the closest candidate and the ``duplication`` list
        that ``edit_distance`` returned for it. Both are ``None`` when no
        candidate was compared.
    """
    # Raw-string pattern, compiled once instead of on every row.
    word_chars = re.compile(r'\w+')
    min_distance = float('inf')
    duplication = target_news = None
    # Walk the text column once rather than doing a scalar iloc lookup per row.
    for i, candidate_news in enumerate(news_df.iloc[:, 1]):
        if i == idx:
            continue
        # Skip missing / non-text content, as the tokenisation step does.
        if not isinstance(candidate_news, str):
            continue
        candidate_news = ''.join(word_chars.findall(candidate_news))
        distance, tmp_duplication = edit_distance(source_copy_news, candidate_news)
        # Strict '<' keeps the first candidate on ties.
        if distance < min_distance:
            min_distance = distance
            target_news = candidate_news
            duplication = tmp_duplication
    return target_news, duplication

In [222]:
# get_target_news skips a row by *position*, while 51 is only the index label of
# the first candidate. Rows were dropped by dropna, so label and position are
# not guaranteed to match; translate the label into its position in `data`.
source_position = data.index.get_loc(copy_news.index[0])
target_news, duplication = get_target_news(source_copy_news, source_position, data)

In [223]:
target_news

'这是6月18日拍摄的京新高速公路新疆哈密境内路段目前京北京新新疆高速公路新疆哈密至甘肃明水段贯通主体工程已完工预计6月底通车京新高速全长2540公里是世界上最长的穿越沙漠高速公路建成后北京到新疆的行车里程将缩短1300公里大漠变通途通疆达海的梦想即将实现新华社图责任编辑邹少欢'

In [224]:
set(duplication)

{'0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '上',
 '世',
 '京',
 '全',
 '公',
 '到',
 '北',
 '华',
 '即',
 '变',
 '后',
 '大',
 '实',
 '将',
 '建',
 '想',
 '成',
 '摄',
 '新',
 '是',
 '最',
 '梦',
 '沙',
 '海',
 '漠',
 '现',
 '界',
 '疆',
 '的',
 '短',
 '社',
 '程',
 '穿',
 '缩',
 '至',
 '行',
 '越',
 '路',
 '车',
 '达',
 '这',
 '途',
 '通',
 '速',
 '里',
 '长',
 '高'}