# Claim-sentence Relevance

## Loading

In [1]:
import torch
from tqdm import tqdm
import json
import random

In [2]:
fn_static_embeddings = torch.load(
    '../ROT/data/Weibo/FN_bert-base-chinese_embeddings_static.pt')
fn_dynamic_embeddings = torch.load(
    '../ROT/data/Weibo/FN_bert-base-chinese_embeddings_dynamic.pt')
dn_static_embeddings = torch.load(
    '../ROT/data/Weibo/DN_bert-base-chinese_embeddings_static.pt')
dn_dynamic_embeddings = torch.load(
    '../ROT/data/Weibo/DN_bert-base-chinese_embeddings_dynamic.pt')

len(fn_static_embeddings), len(fn_dynamic_embeddings), len(dn_static_embeddings), len(dn_dynamic_embeddings)

(11934, 11934, 27505, 27505)

In [3]:
with open('../../dataset/Weibo/raw/FN_11934_filtered.json', 'r') as f:
    FN = json.load(f)

with open('../../dataset/Weibo/raw/DN_27505_filtered.json', 'r') as f:
    DN = json.load(f)

fnOid2item = {fn['_id']: fn for fn in FN}
dnOid2item = {dn['_id']: dn for dn in DN}

fnIdx2item = {i: fn for i, fn in enumerate(FN)}
dnIdx2item = {i: dn for i, dn in enumerate(DN)}

fnOid2idx = {fn['_id']: i for i, fn in enumerate(FN)}
dnOid2idx = {dn['_id']: i for i, dn in enumerate(DN)}

len(FN), len(DN)

(11934, 27505)

In [4]:
def pytorch_euclidean_distance(a, b):
    return torch.dist(a, b)

In [11]:
def get_sim_scores(fn_emb, dn_emb):
    score_dict = dict()

    for qidx, fn in enumerate(tqdm(FN)):
        dn_oids = fn['debunking_ids']

        for did in dn_oids:
            dn = dnOid2item[did]
            didx = dnOid2idx[did]

            query = fn_emb[qidx]
            sentences = dn_emb[didx]

            items = [pytorch_euclidean_distance(
                query, sent) for sent in sentences]

            # Scale
            m, M = min(items), max(items)
            items = [1 - (x - m) / (M - m + 1e-8) for x in items]

            if qidx not in score_dict.keys():
                score_dict[qidx] = {didx: items}
            else:
                score_dict[qidx][didx] = items
                
    return score_dict

In [12]:
scores_dynamic = get_sim_scores(fn_dynamic_embeddings, dn_dynamic_embeddings)

100%|██████████| 11934/11934 [00:46<00:00, 258.20it/s]


In [13]:
scores_static = get_sim_scores(fn_static_embeddings, dn_static_embeddings)

100%|██████████| 11934/11934 [00:46<00:00, 253.97it/s]


In [14]:
def print_in_color(s, cint=31, end='\n'):
    print('\x1b[{}m{}\x1b[0m'.format(cint, s), end=end)

In [18]:
sorted(scores_dynamic[0][19], reverse=True)

[tensor(1.),
 tensor(0.9864),
 tensor(0.9283),
 tensor(0.8933),
 tensor(0.8823),
 tensor(0.7885),
 tensor(0.7509),
 tensor(0.7465),
 tensor(0.7306),
 tensor(0.7234),
 tensor(0.7197),
 tensor(0.7056),
 tensor(0.6803),
 tensor(0.6667),
 tensor(0.6539),
 tensor(0.6325),
 tensor(0.6219),
 tensor(0.5724),
 tensor(0.5424),
 tensor(0.5198),
 tensor(0.4378),
 tensor(0.3717),
 tensor(0.3205),
 tensor(0.1436),
 tensor(0.)]

## Compare Dynamic & Static

In [25]:
TOP = 3

qidx = random.randint(0, len(FN) - 1)
fn = FN[qidx]

print('[Fake News]\n{}'.format(fn['content_all']))

for did in fn['debunking_ids']:
    didx = dnOid2idx[did]
    dn = DN[didx]

    s_dynamic = scores_dynamic[qidx][didx]
    s_static = scores_static[qidx][didx]

    top_s_dynamic = sorted(s_dynamic, reverse=True)[:TOP]
    top_s_static = sorted(s_static, reverse=True)[:TOP]

    print('\n---------------------------------------------')
    print('qidx = {}, didx = {}\n'.format(qidx, didx))

    print('[Debunking News]')
    for j, sent in enumerate(dn['content_all']):
        print('[Sent-{}]['.format(j), end='')

        print_func = print_in_color if s_dynamic[j] in top_s_dynamic else print
        print_func('Dynamic: {:.3f}'.format(s_dynamic[j]), end='')

        print(', ', end='')

        print_func = print_in_color if s_static[j] in top_s_static else print
        print_func('Static: {:.3f}'.format(s_static[j]), end='')

        print(']\t{}'.format(sent))

[Fake News]
新冠病毒来源于美国的证据来了。应该是美国那家军事实验室泄露@魔法部之声 求大神解释

---------------------------------------------
qidx = 11580, didx = 22830

[Debunking News]
[Sent-0][[31mDynamic: 0.839[0m, Static: 0.581]	#每日疫情快报#【最新辟谣】1. 新型冠状病毒是实验室制造的生物武器？
[Sent-1][[31mDynamic: 1.000[0m, [31mStatic: 1.000[0m]	辟谣：2月19日，在世卫组织东地中海区域办事处新闻发布会上，世卫组织东地中海区域主任称，没有证据表明新型冠状病毒是实验室制造的，也没有证据表明新冠病毒是以生物武器的身份制造出来的，新冠病毒来自动物界。
[Sent-2][Dynamic: 0.484, Static: 0.316]	 2. 新冠肺炎预警人之一，艾芬医生去世？
[Sent-3][[31mDynamic: 0.736[0m, [31mStatic: 0.787[0m]	辟谣：2月20日13时许，武汉经济广播官方微博发布消息称，2月20日中午12点44分，仍在一线工作的武汉市中心医院急诊科主任艾芬，利用午饭间隙，向所有关心关注她的朋友表示感谢。
[Sent-4][Dynamic: 0.000, Static: 0.000]	 3. 网传江苏省学校3月2日陆续开学？
[Sent-5][Dynamic: 0.685, Static: 0.611]	辟谣：这条信息来源于前几天网上流传的江苏某地开会会议讨论稿上的信息。
[Sent-6][Dynamic: 0.370, Static: 0.339]	江苏省政府、省教育厅、市政府、市教育局等官方网站、微信公众号、微博等从未发布过类似信息。
[Sent-7][Dynamic: 0.531, [31mStatic: 0.652[0m]	江苏省教育厅只在2月6日发布过通知：根据疫情发展情况，省政府决定，全省各级各类学校(高校、中小学、中职学校、幼儿园、托育机构)2月底前不开学。
[Sent-8][Dynamic: 0.282, Static: 0.509]	 【紧急寻人】最新患者同乘信息公示：1

# Pattern-sentence Relevance

## Loading

In [3]:
import torch
from tqdm import tqdm
import json
import random
import numpy as np
import pickle

In [2]:
with open('../../dataset/Weibo/raw/FN_11934_filtered.json', 'r') as f:
    FN = json.load(f)

with open('../../dataset/Weibo/raw/DN_27505_filtered.json', 'r') as f:
    DN = json.load(f)

fnOid2item = {fn['_id']: fn for fn in FN}
dnOid2item = {dn['_id']: dn for dn in DN}

fnIdx2item = {i: fn for i, fn in enumerate(FN)}
dnIdx2item = {i: dn for i, dn in enumerate(DN)}

fnOid2idx = {fn['_id']: i for i, fn in enumerate(FN)}
dnOid2idx = {dn['_id']: i for i, dn in enumerate(DN)}

len(FN), len(DN)

(11934, 27505)

## Kmeans clutering cases

In [4]:
with open('./data/Weibo/kmeans.pkl', 'rb') as f:
    kmeans = pickle.load(f)
    
kmeans

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=1)

In [5]:
centers = np.load('./data/Weibo/kmeans_cluster_centers.npy')
centers.shape

(20, 768)

In [7]:
centers

array([[-6.77433375e-04,  1.15634606e-03, -4.01611179e-03, ...,
         7.70760106e-03,  7.26240608e-03,  7.32099289e-03],
       [-5.01510243e-03,  1.07961863e-03, -4.01010013e-03, ...,
        -1.24469933e-03,  7.92319694e-03, -7.95894576e-03],
       [ 9.37229458e-04,  3.48859839e-03, -6.49430670e-03, ...,
         2.98523055e-03, -4.44983851e-03, -2.69852351e-03],
       ...,
       [-5.26533446e-04,  5.26453579e-03,  4.87067658e-03, ...,
        -4.07386656e-03, -8.10667741e-03,  5.22823139e-03],
       [ 4.73691415e-03,  5.07404897e-03,  2.19203026e-03, ...,
         7.53568484e-05, -3.14491691e-03,  6.93745064e-03],
       [-5.34702413e-04,  4.28005703e-04, -5.26110912e-04, ...,
        -4.80198137e-03, -3.76390257e-03, -1.32672381e-02]])

In [9]:
y = kmeans.labels_
X = np.load('./data/Weibo/clustering_X.npy')
X.shape, y.shape

((253547, 768), (253547,))

In [10]:
dataIdx = pickle.load(open('./data/Weibo/clustering_X_dataIdx.pkl', 'rb'))
len(dataIdx)

253547

In [12]:
y

array([ 6,  6, 15, ..., 15,  3, 15], dtype=int32)

In [13]:
dataIdx

{0: (1, 2245, 1),
 1: (1, 2245, 5),
 2: (1, 2245, 6),
 3: (1, 2245, 11),
 4: (1, 2245, 12),
 5: (1, 2245, 16),
 6: (1, 2245, 17),
 7: (1, 2245, 19),
 8: (1, 2245, 20),
 9: (1, 2245, 21),
 10: (1, 2245, 24),
 11: (1, 2245, 26),
 12: (1, 2245, 30),
 13: (1, 2245, 31),
 14: (1, 2245, 33),
 15: (1, 2245, 36),
 16: (1, 2245, 39),
 17: (1, 2245, 43),
 18: (1, 2245, 44),
 19: (1, 2245, 45),
 20: (1, 2245, 46),
 21: (1, 2245, 47),
 22: (1, 2245, 48),
 23: (1, 2245, 49),
 24: (1, 2245, 51),
 25: (1, 9901, 2),
 26: (1, 9901, 6),
 27: (1, 9901, 8),
 28: (1, 9901, 15),
 29: (1, 9901, 16),
 30: (1, 9901, 19),
 31: (1, 9901, 20),
 32: (1, 9901, 22),
 33: (1, 9901, 23),
 34: (1, 9901, 31),
 35: (1, 9901, 40),
 36: (1, 9901, 41),
 37: (1, 9901, 48),
 38: (1, 9901, 52),
 39: (1, 9901, 54),
 40: (1, 9901, 56),
 41: (1, 9901, 59),
 42: (1, 9901, 60),
 43: (1, 1343, 1),
 44: (3, 22, 3),
 45: (3, 22, 9),
 46: (3, 22, 10),
 47: (3, 22, 46),
 48: (4, 1141, 1),
 49: (4, 1141, 13),
 50: (4, 1141, 14),
 51: (4,

In [11]:
np.where(y==0)

(array([    27,     28,     73, ..., 253529, 253530, 253543]),)

In [17]:
SHOW = 50

# C = random.randint(0, 19)
# C = 0

samples = np.where(y == C)[0].tolist()
print('C = {}, samples = {} ({:.3%})\n'.format(
    C, len(samples), len(samples)/len(y)))
print('-'*25)

for cidx in samples[:SHOW]:
    qidx, didx, sidx = dataIdx[cidx]
    print('[qidx={}, didx={}, sidx={}]\t{}'.format(
        qidx, didx, sidx, DN[didx]['content_all'][sidx]))

print('\n', '-'*25, '\n')

for cidx in random.sample(samples, min(SHOW, len(samples))):
    qidx, didx, sidx = dataIdx[cidx]
    print('[qidx={}, didx={}, sidx={}]\t{}'.format(
        qidx, didx, sidx, DN[didx]['content_all'][sidx]))

C += 1

C = 3, samples = 14756 (5.820%)

-------------------------
[qidx=1, didx=9901, sidx=19]	是不是肉鸡产业中的潜规则？”
[qidx=4, didx=26510, sidx=22]	➋小龙虾是日本人的阴谋？
[qidx=4, didx=26510, sidx=31]	➍小龙虾爱脏水，重金属很多？
[qidx=4, didx=17266, sidx=25]	那小龙虾最爱“污水”的传说又是真的吗？
[qidx=4, didx=26731, sidx=10]	而诸多以前出现过的说法更是“老生常谈”。
[qidx=4, didx=19526, sidx=76]	网上也有种说法，认为是“洗虾粉”导致了横纹肌溶解症。
[qidx=4, didx=19533, sidx=22]	美国FDA的毒素检测也没发现致病物质，只能用“哈夫病”结案。
[qidx=4, didx=26153, sidx=22]	 为了适应污染的环境，克氏原螯虾有一套独家排毒机制——蜕壳。
[qidx=4, didx=7288, sidx=1]	[吃瓜]】在#世界杯# “标配”——“小龙虾+啤酒”被热捧之时，一则号称“吃小龙虾会导致哈夫病和肺吸虫病”“小龙虾致命”“全世界都不敢吃”“中国人蒙在鼓里”的消息又火了。
[qidx=4, didx=17964, sidx=35]	肺吸虫是一个特别会“抱大腿”的主儿。
[qidx=4, didx=17964, sidx=64]	食用小龙虾后出现的横纹肌溶解症，称之为Haff病。
[qidx=4, didx=9963, sidx=10]	小龙虾的真名叫“克氏原螯虾”，学名叫Procambarus clarkii，是一种如假包换的淡水虾。
[qidx=7, didx=49, sidx=18]	也因此，2010年微博造假“异军突起”，如影响恶劣的“金庸被去世”、“鲁迅作品大撤退”等假新闻都来源于微博。
[qidx=7, didx=49, sidx=259]	“年度假新闻”评选引起主管部门的高度重视。
[qidx=7, didx=49, sidx=270]	尽管“假”始终与“真”相伴，且虚假新闻报道成因复杂，所以无数新闻报道中夹杂着若干“客里空”也不足为奇。
[qidx=7, didx=49, 

# Key Sentence Selection (when initialization)

In [7]:
import torch
from tqdm import tqdm
import json
import random
import numpy as np

In [26]:
def print_in_color(s, cint=31, end='\n'):
    print('\x1b[{}m{}\x1b[0m'.format(cint, s), end=end)

In [3]:
with open('../../dataset/Weibo/raw/FN_11934_filtered.json', 'r') as f:
    FN = json.load(f)

with open('../../dataset/Weibo/raw/DN_27505_filtered.json', 'r') as f:
    DN = json.load(f)

fnOid2item = {fn['_id']: fn for fn in FN}
dnOid2item = {dn['_id']: dn for dn in DN}

fnIdx2item = {i: fn for i, fn in enumerate(FN)}
dnIdx2item = {i: dn for i, dn in enumerate(DN)}

fnOid2idx = {fn['_id']: i for i, fn in enumerate(FN)}
dnOid2idx = {dn['_id']: i for i, dn in enumerate(DN)}

len(FN), len(DN)

(11934, 27505)

In [2]:
fn_embeddings = torch.load(
    '../ROT/data/Weibo/FN_bert-base-chinese_embeddings_static.pt')
dn_embeddings = torch.load(
    '../ROT/data/Weibo/DN_bert-base-chinese_embeddings_static.pt')

len(fn_embeddings), len(dn_embeddings)

(11934, 27505)

In [10]:
fn_embeddings[0].shape

torch.Size([768])

In [8]:
memory = np.load('./data/Weibo/kmeans_cluster_centers.npy')
memory.shape

(20, 768)

In [4]:
def pytorch_euclidean_distance(a, b):
    return torch.dist(a, b).item()

In [5]:
def get_claim_sentence_scores(fn_emb=fn_embeddings, dn_emb=dn_embeddings):
    score_dict = dict()

    for qidx, fn in enumerate(tqdm(FN)):
        dn_oids = fn['debunking_ids']

        for did in dn_oids:
            dn = dnOid2item[did]
            didx = dnOid2idx[did]

            query = fn_emb[qidx]
            sentences = dn_emb[didx]

            items = [pytorch_euclidean_distance(
                query, sent) for sent in sentences]

            # Scale
            m, M = min(items), max(items)
            items = [1 - (x - m) / (M - m + 1e-8) for x in items]

            if qidx not in score_dict.keys():
                score_dict[qidx] = {didx: items}
            else:
                score_dict[qidx][didx] = items

    return score_dict

In [11]:
def get_pattern_sentence_scores(fn_emb=fn_embeddings, dn_emb=dn_embeddings):
    score_dict = dict()

    for qidx, fn in enumerate(tqdm(FN)):
        dn_oids = fn['debunking_ids']

        for did in dn_oids:
            dn = dnOid2item[did]
            didx = dnOid2idx[did]

            Q = fn_emb[qidx]
            sentences = dn_emb[didx]

            items = []
            for S in sentences:
                distances = torch.norm((S - Q) - memory, p=2, dim=1)
                center_idx = torch.argmin(distances).item()
                items.append(distances[center_idx].item())

            # Scale
            m, M = min(items), max(items)
            items = [1 - (x - m) / (M - m + 1e-8) for x in items]

            if qidx not in score_dict.keys():
                score_dict[qidx] = {didx: items}
            else:
                score_dict[qidx][didx] = items

    return score_dict

In [12]:
claim_scores = get_claim_sentence_scores()

100%|██████████| 11934/11934 [00:45<00:00, 264.56it/s]


In [13]:
pattern_scores = get_pattern_sentence_scores()

100%|██████████| 11934/11934 [02:41<00:00, 74.02it/s] 


In [20]:
lmd_Q = 0.6
lmd_P = 0.4

scores = dict()

for qidx in tqdm(claim_scores):
    for didx, Q_scores in claim_scores[qidx].items():
        P_scores = pattern_scores[qidx][didx]
        res_scores = [lmd_Q * Q_scores[i] + lmd_P * P_scores[i]
                      for i in range(len(Q_scores))]

        if qidx not in scores.keys():
            scores[qidx] = {didx: res_scores}
        else:
            scores[qidx][didx] = res_scores

100%|██████████| 11934/11934 [00:15<00:00, 777.66it/s] 


In [36]:
TOP = 3

qidx = random.randint(0, len(FN) - 1)
fn = FN[qidx]

print('[Fake News]\n{}'.format(fn['content_all']))

for did in fn['debunking_ids']:
    didx = dnOid2idx[did]
    dn = DN[didx]

    s_claim = claim_scores[qidx][didx]
    s_pattern = pattern_scores[qidx][didx]
    s_selection = scores[qidx][didx]

    top_s_claim = sorted(s_claim, reverse=True)[:TOP]
    top_s_pattern = sorted(s_pattern, reverse=True)[:TOP]
    top_s_selection = sorted(s_selection, reverse=True)[:TOP]

    print('\n---------------------------------------------')
    print('qidx = {}, didx = {}\n'.format(qidx, didx))

    print('[Debunking News]')
    for j, sent in enumerate(dn['content_all']):
        print('[Sent-{}]['.format(j), end='')

        print_func = print_in_color if s_claim[j] in top_s_claim else print
        print_func('Claim: {:.3f}'.format(s_claim[j]), end='')

        print(', ', end='')

        print_func = print_in_color if s_pattern[j] in top_s_pattern else print
        print_func('Pattern: {:.3f}'.format(s_pattern[j]), end='')

        print(', ', end='')
        
        print_func = print_in_color if s_selection[j] in top_s_selection else print
        print_func('Result: {:.3f}'.format(s_selection[j]), end='')
        
        print_func(']\t{}'.format(sent))

[Fake News]
周萌萌，女，6岁半，于2011年1月31日，在四川省彭州市被拐，已报案。其母亲因思念过度住院，希望借助网络的力量寻求帮助，如网友有线索，请联系萌萌爸爸13962672031。

---------------------------------------------
qidx = 278, didx = 14754

[Debunking News]
[Sent-0][Claim: 0.741, Pattern: 0.765, Result: 0.751]	#平安守护# 【辟谣：锦绣花园一个小孩被拐走！
[Sent-1][[31mClaim: 1.000[0m, [31mPattern: 1.000[0m, [31mResult: 1.000[0m[31m]	假的】 近日，鄢陵的朋友圈有人在传一则“十万火急”的寻人信息，信息大致内容是这样的：寻人启事13940292999，有线索酬金10万帮忙扩散，今天上午一个三岁多小女孩在锦绣花园小区附近被人拐走了，从监控中看是被一个四十多岁男人抱走了，现大人都急疯了。[0m
[Sent-2][[31mClaim: 0.849[0m, [31mPattern: 0.931[0m, [31mResult: 0.882[0m[31m]	 经小编向相关部门核实，我局并未接到这一警情，这是一则假消息，网络谣言，谣言！[0m
[Sent-3][Claim: 0.000, Pattern: 0.000, Result: 0.000]	谣言！
[Sent-4][[31mClaim: 0.806[0m, [31mPattern: 0.888[0m, [31mResult: 0.839[0m[31m]	当家长的心情都是一样的，可以理解，但是也请学会辨别，免得好心变坏事咯！[0m
[Sent-5][Claim: 0.300, Pattern: 0.308, Result: 0.303]	 不信谣！
[Sent-6][Claim: 0.235, Pattern: 0.245, Result: 0.239]	不传谣！

---------------------------------------------
qidx = 278, didx = 16094

[Debunking