# Claim-sentence Relevance

## Loading

In [1]:
import torch
from tqdm import tqdm
import json
import random

In [2]:
# Load the pre-computed embedding files for the fake-news (FN) and
# debunking-news (DN) collections. The file names indicate they come from
# bert-base-uncased; how they were produced is not shown in this notebook.
fn_static_embeddings = torch.load(
    '../ROT/data/Twitter/FN_bert-base-uncased_embeddings_static.pt')
dn_static_embeddings = torch.load(
    '../ROT/data/Twitter/DN_bert-base-uncased_embeddings_static.pt')

# Sanity check: these counts should match len(FN) and len(DN) of the raw JSON.
len(fn_static_embeddings), len(dn_static_embeddings)

(10003, 1703)

In [3]:
# Raw corpora: FN holds the fake-news records, DN the debunking-news records.
with open('../../dataset/Twitter/raw/FN_10003.json', 'r') as f:
    FN = json.load(f)
with open('../../dataset/Twitter/raw/DN_1703.json', 'r') as f:
    DN = json.load(f)

# list position -> record
fnIdx2item = dict(enumerate(FN))
dnIdx2item = dict(enumerate(DN))

# record '_id' -> list position (a repeated '_id' keeps its last position)
fnOid2idx = {item['_id']: idx for idx, item in fnIdx2item.items()}
dnOid2idx = {item['_id']: idx for idx, item in dnIdx2item.items()}

# record '_id' -> record
fnOid2item = {oid: FN[idx] for oid, idx in fnOid2idx.items()}
dnOid2item = {oid: DN[idx] for oid, idx in dnOid2idx.items()}

len(FN), len(DN)

(10003, 1703)

In [4]:
def pytorch_euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between tensors ``a`` and ``b``.

    The result is whatever ``torch.dist`` returns, i.e. a tensor, not a
    Python float.
    """
    distance = torch.dist(a, b, p=2)
    return distance

In [5]:
def get_sim_scores(fn_emb, dn_emb):
    """Score the sentences of every linked debunking article against its claim.

    Iterates over the module-level ``FN`` list. For each item (position
    ``qidx``) and each id in its ``'debunking_ids'``, the distance between
    ``fn_emb[qidx]`` and every element of ``dn_emb[didx]`` is computed with
    ``pytorch_euclidean_distance``. The distances are min-max scaled and
    flipped, so the closest sentence scores 1 and the farthest roughly 0.

    Args:
        fn_emb: indexable by ``qidx``; one claim embedding per ``FN`` item.
        dn_emb: indexable by ``didx``; each entry is iterated sentence by
            sentence.

    Returns:
        dict: ``{qidx: {didx: [score for each sentence]}}``. Items without
        any debunking id do not appear.
    """
    score_dict = {}

    for qidx, fn in enumerate(tqdm(FN)):
        for did in fn['debunking_ids']:
            didx = dnOid2idx[did]
            claim_vec = fn_emb[qidx]

            dists = []
            for sent_vec in dn_emb[didx]:
                dists.append(pytorch_euclidean_distance(claim_vec, sent_vec))

            # Min-max scale, then invert so that "near" means "high score".
            # The epsilon keeps the division defined when all distances tie.
            lo, hi = min(dists), max(dists)
            span = hi - lo + 1e-8
            scaled = [1 - (d - lo) / span for d in dists]

            score_dict.setdefault(qidx, {})[didx] = scaled

    return score_dict

In [7]:
# Claim-vs-sentence relevance for every (fake news, debunking article) pair,
# computed on the static embeddings loaded above.
scores_static = get_sim_scores(fn_static_embeddings, dn_static_embeddings)

100%|██████████| 10003/10003 [00:19<00:00, 512.29it/s]


In [8]:
def print_in_color(s, cint=31, end='\n'):
    """Print ``s`` wrapped in an ANSI colour escape sequence.

    Args:
        s: value to print; formatted with ``str.format``.
        cint: integer placed in the escape sequence (default 31).
        end: forwarded to ``print`` as its ``end`` argument.
    """
    wrapped = '\x1b[{}m{}\x1b[0m'.format(cint, s)
    print(wrapped, end=end)

# Pattern-sentence Relevance

## Loading

In [10]:
import torch
from tqdm import tqdm
import json
import random
import numpy as np
import pickle

## KMeans clustering cases

In [11]:
# Restore the object stored in kmeans.pkl (its repr below shows a KMeans
# estimator with n_clusters=20).
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by a trusted run of this project.
with open('./data/Twitter/kmeans.pkl', 'rb') as f:
    kmeans = pickle.load(f)
    
kmeans

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=1)

In [12]:
# Cluster centres saved as a NumPy array; one row per cluster.
# Presumably identical to kmeans.cluster_centers_ (TODO confirm).
centers = np.load('./data/Twitter/kmeans_cluster_centers.npy')
centers.shape

(20, 768)

In [13]:
# Eyeball the raw centre values.
centers

array([[ 0.00289531,  0.00877545,  0.0021697 , ...,  0.00582432,
        -0.0035272 ,  0.00153809],
       [ 0.00199338,  0.00984488,  0.00322162, ...,  0.00520925,
         0.0019047 ,  0.00330786],
       [ 0.00275042,  0.00926284,  0.00019148, ...,  0.0036177 ,
        -0.00020178,  0.00082505],
       ...,
       [-0.00283085,  0.00656329, -0.00259037, ...,  0.00964924,
         0.00480074,  0.00332236],
       [-0.00035096,  0.00258983, -0.00139927, ..., -0.00233068,
        -0.00351499, -0.0012395 ],
       [ 0.00250381,  0.00711796,  0.00624046, ...,  0.00766808,
         0.00494238,  0.00088399]])

In [15]:
# y: the cluster label the fitted estimator assigned to each fitted row.
# X: matrix loaded from clustering_X.npy; presumably the rows the estimator
# was fitted on (TODO confirm). The displayed shapes should agree on the
# first dimension.
y = kmeans.labels_
X = np.load('./data/Twitter/clustering_X.npy')
X.shape, y.shape

((117576, 768), (117576,))

In [16]:
# dataIdx maps a row index of the clustering matrix to a 3-tuple that the
# cluster-browsing cell unpacks as (qidx, didx, sidx).
# The file is opened in a `with` block so the handle is closed right after
# loading instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by a trusted run of this project.
with open('./data/Twitter/clustering_X_dataIdx.pkl', 'rb') as f:
    dataIdx = pickle.load(f)
len(dataIdx)

117576

In [17]:
# Peek at the label vector.
y

array([15, 14, 14, ..., 16,  6,  5], dtype=int32)

In [18]:
# NOTE(review): this displays the entire mapping (one entry per clustered
# row); consider showing only a small slice to keep the output readable.
dataIdx

{0: (1, 1, 4),
 1: (1, 1, 5),
 2: (1, 1, 11),
 3: (1, 1, 17),
 4: (1, 1, 20),
 5: (1, 1, 24),
 6: (1, 1, 25),
 7: (1, 1, 26),
 8: (1, 1, 27),
 9: (1, 1, 29),
 10: (1, 1, 30),
 11: (1, 1, 36),
 12: (1, 1, 37),
 13: (1, 1, 38),
 14: (1, 1, 43),
 15: (3, 3, 0),
 16: (3, 3, 2),
 17: (3, 3, 3),
 18: (3, 3, 5),
 19: (3, 3, 29),
 20: (3, 3, 30),
 21: (3, 3, 32),
 22: (4, 4, 2),
 23: (4, 4, 7),
 24: (4, 4, 12),
 25: (4, 4, 13),
 26: (5, 5, 0),
 27: (5, 5, 7),
 28: (5, 5, 10),
 29: (5, 5, 11),
 30: (5, 5, 12),
 31: (5, 5, 13),
 32: (5, 5, 22),
 33: (5, 5, 27),
 34: (5, 5, 28),
 35: (5, 5, 42),
 36: (5, 5, 63),
 37: (5, 5, 64),
 38: (5, 5, 71),
 39: (5, 5, 74),
 40: (5, 5, 76),
 41: (5, 5, 80),
 42: (5, 5, 85),
 43: (5, 5, 87),
 44: (5, 5, 88),
 45: (5, 5, 90),
 46: (5, 5, 98),
 47: (5, 5, 100),
 48: (5, 5, 106),
 49: (5, 5, 108),
 50: (5, 5, 109),
 51: (5, 5, 110),
 52: (5, 5, 111),
 53: (5, 5, 114),
 54: (5, 5, 117),
 55: (5, 5, 119),
 56: (5, 5, 120),
 57: (5, 5, 124),
 58: (5, 5, 134),
 59: 

In [19]:
# Row indices of the points whose label is 0.
np.where(y==0)

(array([   246,    247,    248, ..., 117278, 117279, 117360]),)

In [22]:
SHOW = 50

# `C` is the id of the cluster being inspected. The cell increments it at the
# end, so re-running the cell steps through the clusters one after another.
# On a fresh kernel `C` does not exist yet; fall back to cluster 0 instead of
# failing with NameError. To jump to a specific cluster, assign `C` in a
# separate cell before running this one, e.g. `C = random.randint(0, 19)`.
C = globals().get('C', 0)


def _print_cluster_sentences(cluster_rows):
    """Print the debunking sentence behind each clustering row index."""
    for cidx in cluster_rows:
        qidx, didx, sidx = dataIdx[cidx]
        print('[qidx={}, didx={}, sidx={}]\t{}'.format(
            qidx, didx, sidx, DN[didx]['content'][sidx]))


# All clustering rows labelled with cluster C.
samples = np.where(y == C)[0].tolist()
print('C = {}, samples = {} ({:.3%})\n'.format(
    C, len(samples), len(samples)/len(y)))
print('-'*25)

# First the leading SHOW members in row order ...
_print_cluster_sentences(samples[:SHOW])

print('\n', '-'*25, '\n')

# ... then a random draw of at most SHOW members for a less biased view.
_print_cluster_sentences(random.sample(samples, min(SHOW, len(samples))))

# Advance so the next run of this cell shows the following cluster.
C += 1

C = 1, samples = 13249 (11.268%)

-------------------------
[qidx=5, didx=5, sidx=10]	He said it was the best time of his life.
[qidx=8, didx=8, sidx=37]	"We are going to send the message that the party of Lincoln and Reagan and the presidency of the United States will never be held by a con artist."
[qidx=13, didx=14, sidx=19]	According to a schedule released by the White House, the president plans to campaign Tuesday through Friday next week for Hillary Clinton.
[qidx=13, didx=14, sidx=22]	On Tuesday exactly one week before Election Day he will head to Ohio to campaign for Clinton at a "Get Out The Early Vote" campaign in Columbus.
[qidx=22, didx=22, sidx=45]	For some time families had to live on the streets and it all seemed chaotic.
[qidx=24, didx=10, sidx=3]	A chapter in the book suggests that the Clinton family and Russia each may have benefited from a "pay-for-play" scheme while Hillary Clinton was secretary of state, involving the transfer of U.S. uranium reserves to the new Ru

# Key Sentence Selection (at initialization)

In [23]:
import torch
from tqdm import tqdm
import json
import random
import numpy as np

In [24]:
def print_in_color(s, cint=31, end='\n'):
    """Print ``s`` surrounded by an ANSI colour escape sequence.

    ``cint`` is the integer inserted into the escape sequence (default 31)
    and ``end`` is passed straight through to ``print``.
    """
    colored_text = '\x1b[{}m{}\x1b[0m'.format(cint, s)
    print(colored_text, end=end)

In [25]:
# Alias the static embeddings loaded at the top of the notebook under shorter
# names. This cell fails on a fresh kernel unless that loading cell ran first.
fn_embeddings = fn_static_embeddings
dn_embeddings = dn_static_embeddings

len(fn_embeddings), len(dn_embeddings)

(10003, 1703)

In [26]:
# Shape of a single claim embedding.
fn_embeddings[0].shape

torch.Size([768])

In [27]:
# The saved cluster centres, used here as the pattern `memory` that
# get_pattern_sentence_scores compares sentence-claim offsets against.
# Note this is a NumPy array, not a torch tensor.
memory = np.load('./data/Twitter/kmeans_cluster_centers.npy')
memory.shape

(20, 768)

In [28]:
def pytorch_euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b`` as a float.

    The value computed by ``torch.dist`` is converted to a plain Python
    number with ``.item()``.
    """
    distance = torch.dist(a, b, p=2)
    return distance.item()

In [29]:
def get_claim_sentence_scores(fn_emb=fn_embeddings, dn_emb=dn_embeddings):
    """Score each debunking sentence by its closeness to the claim embedding.

    Walks the module-level ``FN`` list. For every item (position ``qidx``)
    and every id in its ``'debunking_ids'``, the distance from
    ``fn_emb[qidx]`` to each element of ``dn_emb[didx]`` is measured with
    ``pytorch_euclidean_distance``. Distances are min-max scaled and
    flipped, so the nearest sentence gets 1 and the farthest roughly 0.

    Args:
        fn_emb: claim embeddings indexed by ``qidx``. The default is the
            module-level ``fn_embeddings`` object bound when this function
            was defined.
        dn_emb: per-article sentence embeddings indexed by ``didx``. The
            default is bound the same way.

    Returns:
        dict: ``{qidx: {didx: [score for each sentence]}}``.
    """
    score_dict = {}

    for qidx, fn in enumerate(tqdm(FN)):
        for did in fn['debunking_ids']:
            didx = dnOid2idx[did]
            claim_vec = fn_emb[qidx]

            dists = []
            for sent_vec in dn_emb[didx]:
                dists.append(pytorch_euclidean_distance(claim_vec, sent_vec))

            # Min-max scale and invert; the epsilon guards against a zero
            # range when every sentence is equally distant.
            lo, hi = min(dists), max(dists)
            span = hi - lo + 1e-8
            scaled = [1 - (d - lo) / span for d in dists]

            score_dict.setdefault(qidx, {})[didx] = scaled

    return score_dict

In [30]:
def get_pattern_sentence_scores(fn_emb=fn_embeddings, dn_emb=dn_embeddings):
    """Score each debunking sentence by how well it matches a stored pattern.

    Walks the module-level ``FN`` list. For every claim ``Q`` and every
    sentence ``S`` of a linked debunking article, the offset ``S - Q`` is
    compared with each row of the module-level ``memory`` array. The
    distance to the nearest row is kept as the raw value for that sentence.
    Raw values are then min-max scaled and flipped, so the sentence whose
    offset lies closest to some row gets 1 and the worst one roughly 0.

    Args:
        fn_emb: claim embeddings indexed by ``qidx``. The default is the
            module-level ``fn_embeddings`` object bound when this function
            was defined.
        dn_emb: per-article sentence embeddings indexed by ``didx``. The
            default is bound the same way.

    Returns:
        dict: ``{qidx: {didx: [score for each sentence]}}``.
    """
    score_dict = {}

    for qidx, fn in enumerate(tqdm(FN)):
        for did in fn['debunking_ids']:
            didx = dnOid2idx[did]
            Q = fn_emb[qidx]

            nearest = []
            for S in dn_emb[didx]:
                # NOTE(review): `memory` comes from np.load, so this mixes a
                # tensor with an ndarray and relies on implicit conversion.
                # Confirm the dtype/behaviour on the torch version in use.
                distances = torch.norm((S - Q) - memory, p=2, dim=1)
                center_idx = torch.argmin(distances).item()
                nearest.append(distances[center_idx].item())

            # Min-max scale and invert; the epsilon guards against a zero
            # range when every sentence has the same raw value.
            lo, hi = min(nearest), max(nearest)
            span = hi - lo + 1e-8
            scaled = [1 - (v - lo) / span for v in nearest]

            score_dict.setdefault(qidx, {})[didx] = scaled

    return score_dict

In [31]:
# Called without arguments, so the embeddings bound as defaults when the
# function was defined are used.
claim_scores = get_claim_sentence_scores()

100%|██████████| 10003/10003 [00:05<00:00, 1921.30it/s]


In [32]:
# Called without arguments, so the embeddings bound as defaults when the
# function was defined are used.
pattern_scores = get_pattern_sentence_scores()

100%|██████████| 10003/10003 [01:01<00:00, 161.43it/s]


In [33]:
# Weights of the claim-relevance and pattern-relevance terms in the final
# per-sentence selection score.
lmd_Q = 0.6
lmd_P = 0.4

# scores[qidx][didx] -> list with one fused score per sentence.
scores = {}

for qidx in tqdm(claim_scores):
    for didx, Q_scores in claim_scores[qidx].items():
        P_scores = pattern_scores[qidx][didx]

        # Weighted sum of the two score lists, sentence by sentence.
        res_scores = [lmd_Q * q_val + lmd_P * P_scores[i]
                      for i, q_val in enumerate(Q_scores)]

        scores.setdefault(qidx, {})[didx] = res_scores

100%|██████████| 10003/10003 [00:00<00:00, 21989.18it/s]


In [34]:
TOP = 3

# Pick one fake-news item at random and show how its debunking sentences
# were scored.
qidx = random.randint(0, len(FN) - 1)
fn = FN[qidx]

print('[Fake News]\n{}'.format(fn['content']))

for did in fn['debunking_ids']:
    didx = dnOid2idx[did]
    dn = DN[didx]

    s_claim = claim_scores[qidx][didx]
    s_pattern = pattern_scores[qidx][didx]
    s_selection = scores[qidx][didx]

    # The TOP highest values of each score list. Highlighting tests value
    # membership, so tied scores can colour more than TOP sentences.
    top_s_claim = sorted(s_claim, reverse=True)[:TOP]
    top_s_pattern = sorted(s_pattern, reverse=True)[:TOP]
    top_s_selection = sorted(s_selection, reverse=True)[:TOP]

    print('\n---------------------------------------------')
    print('qidx = {}, didx = {}\n'.format(qidx, didx))

    print('[Debunking News]')

    # (format string, score list, top values) for each printed column.
    columns = (
        ('Claim: {:.3f}', s_claim, top_s_claim),
        ('Pattern: {:.3f}', s_pattern, top_s_pattern),
        ('Result: {:.3f}', s_selection, top_s_selection),
    )

    for j, sent in enumerate(dn['content']):
        print('[Sent-{}]['.format(j), end='')

        for position, (template, values, best) in enumerate(columns):
            if position > 0:
                print(', ', end='')
            print_func = print_in_color if values[j] in best else print
            print_func(template.format(values[j]), end='')

        # The closing bracket and the sentence text reuse the printer chosen
        # for the last ("Result") column, so selected sentences are coloured.
        print_func(']\t{}'.format(sent))

[Fake News]
BEFORE YOU DONATE SOMETHING TO THINK ABOUT BEFORE you MAKE CONTRIBUTIONS : As you open your pockets to do a good thing and make yourself feel good , please kee the followi facts in mind : The American Red Cross President and CEO Marsha J . Evans ' salary for the year was $ 651.957 plus expenses MARCH DIMES It iS called the March of Dimes because only a dime for every I dollar is given to the needy . The United way President Brian Gallagher receives a $ 375,000 base salary along with numerous expense benefits . UNICEF CEO caryl M . Stem receives Sl per year ( 100k permonth ) plus all expenses including a ROLLS ROYCE . Less than 5 cents of your donated dollar goes to the cause . GOODWILL CEO and owner Mark Curran profits $ 2.3 million a year . Goodwill iS a very catchy name for his You donate to his business and tt & rP he sells the items for PROFIT . He pays nothing for his products and pays his workers minimum wage ! Nice Guy . $ 0.00 qoes to help anyone ! Stop giving to th