# Statistics

In [1]:
import pandas as pd
import json
import random

In [3]:
# Read the filtered FN records from their JSON dump.
with open('../raw/FN_11934_filtered.json', 'r') as fn_file:
    FN = json.load(fn_file)

# Read the filtered DN records from their JSON dump.
with open('../raw/DN_27505_filtered.json', 'r') as dn_file:
    DN = json.load(dn_file)

# Cell output: size of each collection.
len(FN), len(DN)

(11934, 27505)

## Matching Pairs (Total)

In [4]:
# One entry per FN record: how many ids its 'debunking_ids' list holds.
matching_num = [len(record['debunking_ids']) for record in FN]
df = pd.Series(matching_num, name='num').to_frame()
df.shape[0]

11934

In [5]:
df.describe()

Unnamed: 0,num
count,11934.0
mean,7.237473
std,10.306454
min,1.0
25%,1.0
50%,3.0
75%,8.0
max,138.0


In [6]:
df[df['num']>25].describe()

Unnamed: 0,num
count,851.0
mean,37.296122
std,13.72809
min,26.0
25%,30.0
50%,36.0
75%,38.0
max,138.0


In [7]:
sum(matching_num)

86372

## Number of Sentences per DN Item

In [8]:
# One entry per DN record: the length of its 'content_all' list.
sents_num = [len(record['content_all']) for record in DN]
df = pd.Series(sents_num, name='num').to_frame()
df.shape[0]

27505

In [9]:
df.describe()

Unnamed: 0,num
count,27505.0
mean,15.208653
std,20.060215
min,1.0
25%,4.0
50%,7.0
75%,19.0
max,356.0


In [10]:
df[df['num']<=5].describe()

Unnamed: 0,num
count,10942.0
mean,3.216871
std,1.330441
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


In [11]:
df[df['num']<=3].describe()

Unnamed: 0,num
count,6010.0
mean,2.176539
std,0.785084
min,1.0
25%,2.0
50%,2.0
75%,3.0
max,3.0


In [12]:
# Fraction of rows with num <= 3 and with num <= 5 (mean of a boolean mask == count / total).
float((df['num'] <= 3).mean()), float((df['num'] <= 5).mean())

(0.21850572623159426, 0.39781857844028357)

# Split By Events

## Loading

In [1]:
import json
from sklearn.model_selection import train_test_split
import numpy as np
import csv

In [2]:
# Load the FN records.
with open('../raw/FN_11934_filtered.json') as fn_file:
    fns = json.load(fn_file)

# Load the DN records.
with open('../raw/DN_27505_filtered.json') as dn_file:
    dns = json.load(dn_file)

In [3]:
fns[2322]

{'_id': '5f0ecf97da7c229f16a4dde1',
 'content': '发表了博文 《中日国民素质之比较》 - 联合国公布全球国民素质道德水平调查及排名 前十名的国家 后十名的国家 1 日本 168 印度 2 美国 167 中国 3 法国 166 中日国民素质之比较',
 'event': '5f5e357d63fc4d00ff6be48d',
 'time': 1389004680000,
 'time_format': '2014-01-06 18:38:00',
 'debunking_ids': ['5f1172acb9cce3edf3143112',
  '5f315f59aaa0d4d489f2b32c',
  '5f1172acb9cce3edf31436de',
  '5f2ea72f8d1bbb4d1141af99',
  '5f1172acb9cce3edf3143559',
  '5f0c71f1efe4e78f5bffe037',
  '5f1172acb9cce3edf3143627'],
 'content_all': '发表了博文 《中日国民素质之比较》 - 联合国公布全球国民素质道德水平调查及排名 前十名的国家 后十名的国家 1 日本 168 印度 2 美国 167 中国 3 法国 166 中日国民素质之比较'}

In [4]:
dns[344]

{'_id': '5f0c71f1efe4e78f5b00a4dd',
 'content': '【那些年我们听过的谣言之滴血传艾滋】 11年11月出现一个消息：最近不要到外面吃东西，尤其大盘鸡、烧烤和凉拌菜，有艾滋病感染者将血滴到食物里。其实稍有防艾知识的都知道，滴血食物不可能传播艾滋，但对艾滋缺乏了解的人来说确实“恐慌”。卫生部新闻办公室立刻回应：纯属谣言。网页链接  http://t.cn/zjWAEbN ',
 'time': 1356278400000,
 'time_format': '2012-12-24 00:00:00',
 'content_all': ['【那些年我们听过的谣言之滴血传艾滋】 11年11月出现一个消息：最近不要到外面吃东西，尤其大盘鸡、烧烤和凉拌菜，有艾滋病感染者将血滴到食物里。',
  '其实稍有防艾知识的都知道，滴血食物不可能传播艾滋，但对艾滋缺乏了解的人来说确实“恐慌”。',
  '卫生部新闻办公室立刻回应：纯属谣言。',
  '网页链接 URL']}

In [5]:
# Build the per-event summary.
# format: event id -> [number of FN items in the event,
#                      len(debunking_ids) of the FIRST FN item seen for that event]
# NOTE(review): the second slot is never updated for later items of the same
# event, so it is only an event-level debunking count if every item of an
# event shares the same debunking_ids — TODO confirm against the data.
fnids = [fn['_id'] for fn in fns]
events = {}
for fn in fns:
    event_id = fn['event']
    if event_id in events:
        events[event_id][0] += 1
        continue
    events[event_id] = [1, len(fn['debunking_ids'])]

len(events)

2880

In [6]:
# Bind a second name to the SAME dict object as `events` (no copy is made),
# then display how many events it holds.
random_split_events = events
len(random_split_events)

2880

In [7]:
random_split_events

{'5f5e37e063fc4d00ff6bedcc': [1, 1],
 '5f5e37e063fc4d00ff6bece5': [19, 4],
 '5f5e37e063fc4d00ff6bea59': [1, 6],
 '5f5e37e063fc4d00ff6bea68': [1, 1],
 '5f62d347e59e3fdac3165e9d': [63, 30],
 '5f5e37e063fc4d00ff6bec63': [1, 3],
 '5f5e37e063fc4d00ff6bec64': [1, 2],
 '5f5e357d63fc4d00ff6be562': [46, 3],
 '5f5e37e063fc4d00ff6bed6a': [1, 1],
 '5f5e37e063fc4d00ff6be8bf': [2, 1],
 '5f5e37e063fc4d00ff6be9f0': [1, 2],
 '5f60b8a086fdb04acac62eac': [11, 2],
 '5f5e37e063fc4d00ff6be847': [3, 1],
 '5f5e37e063fc4d00ff6bedbd': [1, 1],
 '5f5e37e063fc4d00ff6be807': [1, 3],
 '5f5e37e063fc4d00ff6bf214': [18, 1],
 '5f5e37e063fc4d00ff6be8be': [12, 1],
 '5f5e37e063fc4d00ff6bea0c': [1, 1],
 '5f5e37e063fc4d00ff6be663': [5, 1],
 '5f5e357d63fc4d00ff6be45c': [23, 1],
 '5f5e37e063fc4d00ff6be80f': [1, 3],
 '5f5e37e063fc4d00ff6be808': [1, 3],
 '5f5e37e063fc4d00ff6bf222': [1, 1],
 '5f5e37e063fc4d00ff6bf21e': [1, 1],
 '5f5e357d63fc4d00ff6be4f2': [2, 2],
 '5f5e37e063fc4d00ff6bf227': [6, 1],
 '5f5e37e063fc4d00ff6be661': [

In [8]:
# l: [[event_id, n_fake_news, bucket], ...] ordered by ascending n_fake_news
ranked_events = sorted(
    ([event_id, counts[0]] for event_id, counts in random_split_events.items()),
    key=lambda pair: pair[1])

# The bucket (position in the ordering // 20) is the auxiliary label used
# for the stratified train-test split.
l = [pair + [position // 20] for position, pair in enumerate(ranked_events)]

In [9]:
l

[['5f5e37e063fc4d00ff6bedcc', 1, 0],
 ['5f5e37e063fc4d00ff6bea59', 1, 0],
 ['5f5e37e063fc4d00ff6bea68', 1, 0],
 ['5f5e37e063fc4d00ff6bec63', 1, 0],
 ['5f5e37e063fc4d00ff6bec64', 1, 0],
 ['5f5e37e063fc4d00ff6bed6a', 1, 0],
 ['5f5e37e063fc4d00ff6be9f0', 1, 0],
 ['5f5e37e063fc4d00ff6bedbd', 1, 0],
 ['5f5e37e063fc4d00ff6be807', 1, 0],
 ['5f5e37e063fc4d00ff6bea0c', 1, 0],
 ['5f5e37e063fc4d00ff6be80f', 1, 0],
 ['5f5e37e063fc4d00ff6be808', 1, 0],
 ['5f5e37e063fc4d00ff6bf222', 1, 0],
 ['5f5e37e063fc4d00ff6bf21e', 1, 0],
 ['5f5e37e063fc4d00ff6be8e8', 1, 0],
 ['5f5e37e063fc4d00ff6befb3', 1, 0],
 ['5f5e37e063fc4d00ff6bf201', 1, 0],
 ['5f5e37e063fc4d00ff6bea5d', 1, 0],
 ['5f5e37e063fc4d00ff6bf511', 1, 0],
 ['5f5e37e063fc4d00ff6bea0e', 1, 0],
 ['5f5e37e063fc4d00ff6beb5d', 1, 1],
 ['5f5e37e063fc4d00ff6bea13', 1, 1],
 ['5f60b8a086fdb04acac62fb5', 1, 1],
 ['5f5e37e063fc4d00ff6bf6bb', 1, 1],
 ['5f5e37e063fc4d00ff6bedae', 1, 1],
 ['5f5e37e063fc4d00ff6bed89', 1, 1],
 ['5f5e37e063fc4d00ff6beb10', 1, 1],
 

In [14]:
# x: event ids (first element of each entry of `l`)
# y: stratification bucket (third element of each entry of `l`), i.e. the
#    entry's position in the n_fake_news-sorted list // 20 — not n_fake_news // 20
# The middle element (n_fake_news) is discarded.
x, _, y = list(zip(*l))

len(x), len(y)

(2880, 2880)

## For Time-unaware Experiments

### Train+Val : Test = 8:2

In [32]:
# Fixed seed so that Restart & Run All reproduces the same event split.
# The original call was unseeded, so every run produced a different split;
# the outputs recorded below come from such an unseeded run and will not
# match a seeded re-run exactly.
SPLIT_SEED = 42

x_train_val, x_test, y_train_val, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED)
print(len(x_train_val), len(x_test))

# Totals over the train+val events: slot 0 and slot 1 of each `events` entry
# (fake-news count and recorded debunking count).
train_val_f_sum = sum(events[event_id][0] for event_id in x_train_val)
train_val_d_sum = sum(events[event_id][1] for event_id in x_train_val)

# Same totals over the test events.
test_f_sum = sum(events[event_id][0] for event_id in x_test)
test_d_sum = sum(events[event_id][1] for event_id in x_test)

# Cell output: totals and test / (train+val) ratios for both slots.
(train_val_f_sum, test_f_sum, test_f_sum / train_val_f_sum,
 train_val_d_sum, test_d_sum, test_d_sum / train_val_d_sum)

2304 576


(9548, 2386, 0.2498952660242983, 8838, 2228, 0.25209323376329484)

### Train : Val = 7:1

In [33]:
1 / 7 

0.14285714285714285

In [48]:
# Fixed seed so the train/val split is reproducible on a fresh kernel.
# The original call was unseeded; the outputs recorded below come from such
# an unseeded run and will not match a seeded re-run exactly.
VAL_SPLIT_SEED = 42

x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=0.12, stratify=y_train_val,
    random_state=VAL_SPLIT_SEED)
print(len(x_train), len(x_val))

# Totals over the train events: slot 0 and slot 1 of each `events` entry
# (fake-news count and recorded debunking count).
train_f_sum = sum(events[event_id][0] for event_id in x_train)
train_d_sum = sum(events[event_id][1] for event_id in x_train)

# Same totals over the val events.
val_f_sum = sum(events[event_id][0] for event_id in x_val)
val_d_sum = sum(events[event_id][1] for event_id in x_val)

# Cell output: totals and val / train ratios for both slots.
(train_f_sum, val_f_sum, val_f_sum / train_f_sum,
 train_d_sum, val_d_sum, val_d_sum / train_d_sum)

2027 277


(8356, 1192, 0.14265198659645764, 7654, 1184, 0.15469035798275413)

### Export

In [50]:
len(x_train), len(x_val), len(x_test)

(2027, 277, 576)

In [52]:
# Write each event-id split to its own JSON file. The `with` block closes
# (and therefore flushes) every file; the original passed bare open() handles
# to json.dump and never closed them.
for split_path, split_events in (
        ('./data/events.train', x_train),
        ('./data/events.dev', x_val),
        ('./data/events.test', x_test)):
    with open(split_path, 'w') as split_file:
        json.dump(split_events, split_file)

## Check

In [22]:
# Reload the exported splits; `with` closes each file handle (the original
# left the handles returned by open() unclosed).
with open('./data/events.train') as split_file:
    train_events = json.load(split_file)
with open('./data/events.dev') as split_file:
    val_events = json.load(split_file)
with open('./data/events.test') as split_file:
    test_events = json.load(split_file)

len(train_events), len(val_events), len(test_events)

(2027, 277, 576)

In [23]:
a = set(train_events)
b = set(val_events)
c = set(test_events)

len(a), len(b), len(c)

(2027, 277, 576)

In [24]:
a.intersection(b), a.intersection(c), c.intersection(b)

(set(), set(), set())

# Split By Claim

## Loading

In [1]:
import pandas as pd
import json
import random
import numpy as np
from tqdm import tqdm

In [2]:
# Read the filtered FN records from their JSON dump.
with open('../raw/FN_11934_filtered.json', 'r') as fn_file:
    FN = json.load(fn_file)

# Read the filtered DN records from their JSON dump.
with open('../raw/DN_27505_filtered.json', 'r') as dn_file:
    DN = json.load(dn_file)

# Cell output: size of each collection.
len(FN), len(DN)

(11934, 27505)

In [3]:
# Lookup tables keyed by '_id': the record itself and its position in the list.
fnOid2item, fnOid2idx = {}, {}
for position, fn in enumerate(FN):
    fnOid2item[fn['_id']] = fn
    fnOid2idx[fn['_id']] = position

dnOid2item, dnOid2idx = {}, {}
for position, dn in enumerate(DN):
    dnOid2item[dn['_id']] = dn
    dnOid2idx[dn['_id']] = position

In [4]:
dnOid2sentsLen = {dn['_id']:len(dn['content_all']) for dn in DN}

In [5]:
list(dnOid2sentsLen.items())[:5]

[('5fef066911f159c49f84b0e8', 28),
 ('5fed832711f159c49f84b0e3', 189),
 ('5f64bf8c11f159c49f84af7e', 6),
 ('5f64bf8b11f159c49f84af79', 6),
 ('5f64bf8c11f159c49f84af7b', 6)]

In [6]:
FN[0]

{'_id': '5f0ed06ada7c229f16a54452',
 'content': '再发一条简体中文的：谢才萍，江湖上人称“谢姐”，是重庆目前已逮捕的十九个黑恶团伙首犯中唯一一位女性，江湖地位甚高。谢才萍与文强是“亲戚关系”：她是文强的弟媳。让人瞠目的是，谢才萍在个人生活上极度荒淫。重庆市打黑成果展显示，她长期包养十六个年轻男子供自己玩乐。',
 'event': '5f5e37e063fc4d00ff6bedcc',
 'time': 1255276800000,
 'time_format': '2009-10-12 00:00:00',
 'debunking_ids': ['6002d4fbbc9086321270c5dc'],
 'content_all': '再发一条简体中文的：谢才萍，江湖上人称“谢姐”，是重庆目前已逮捕的十九个黑恶团伙首犯中唯一一位女性，江湖地位甚高。谢才萍与文强是“亲戚关系”：她是文强的弟媳。让人瞠目的是，谢才萍在个人生活上极度荒淫。重庆市打黑成果展显示，她长期包养十六个年轻男子供自己玩乐。'}

In [7]:
def get_qids(event_file):
    """Collect the '_id' of every FN record whose 'event' is listed in a file.

    Args:
        event_file: path to a JSON file whose decoded content is the
            collection of event ids to keep.

    Returns:
        List of FN '_id' values, in the order the records appear in the
        module-level ``FN`` list.

    Side effects:
        Prints the number of collected ids and the number of distinct ids.
    """
    with open(event_file, 'r') as f:
        # A set gives O(1) membership tests; the original scanned the decoded
        # list once per FN record.
        events = set(json.load(f))

    qids = [fn['_id'] for fn in FN if fn['event'] in events]

    print(len(qids), len(set(qids)))
    return qids

In [8]:
# Resolve the FN ids belonging to each event split (train, dev, test in that order).
train_qids = get_qids('./data/events.train')
val_qids = get_qids('./data/events.dev')
test_qids = get_qids('./data/events.test')

8356 8356
1192 1192
2386 2386


In [10]:
a = set(train_qids)
b = set(val_qids)
c = set(test_qids)

a.intersection(b), a.intersection(c), b.intersection(c), len(a), len(b), len(c)

(set(), set(), set(), 8356, 1192, 2386)

In [12]:
len(test_qids) / len(FN), len(val_qids) / len(FN)

(0.199932964638847, 0.09988268811798223)

In [12]:
bm25_arr = np.load('../../../preprocess/BM25/data/bm25_scores_(11934, 27505).npy')

In [14]:
def analysis_qid2dids(qid2dids):
    """Summarise how many candidates per query are listed debunking ids.

    For each (qid, dids) pair, counts the distinct ids shared between ``dids``
    and ``fnOid2item[qid]['debunking_ids']``, then returns ``describe()`` of
    those counts (single column ``matching_num``).
    """
    overlap_sizes = [
        len(set(candidate_ids) & set(fnOid2item[query_id]['debunking_ids']))
        for query_id, candidate_ids in qid2dids.items()
    ]
    return pd.DataFrame({'matching_num': overlap_sizes}).describe()

In [15]:
def transfer2lineDf(qid2dids):
    """Flatten a qid -> candidate-ids mapping into one row per (qid, did) pair.

    Columns, in order: 'qid', 'qidx' (``fnOid2idx[qid]``), 'did',
    'didx' (``dnOid2idx[did]``) and 'label' (1 when ``did`` appears in
    ``fnOid2item[qid]['debunking_ids']``, otherwise 0).
    """
    columns = {'qid': [], 'qidx': [], 'did': [], 'didx': [], 'label': []}

    for query_id, candidate_ids in qid2dids.items():
        repeat = len(candidate_ids)
        columns['qid'].extend([query_id] * repeat)
        columns['qidx'].extend([fnOid2idx[query_id]] * repeat)
        columns['did'].extend(candidate_ids)
        columns['didx'].extend(dnOid2idx[cand] for cand in candidate_ids)
        columns['label'].extend(
            int(cand in fnOid2item[query_id]['debunking_ids'])
            for cand in candidate_ids)

    return pd.DataFrame(columns)


def transfer2df(qid2dids):
    """Build one row per query, keeping its candidates as list-valued cells.

    Columns, in order: 'qid', 'qidx' (``fnOid2idx[qid]``), 'did' (the
    candidate list itself), 'didx' (list of ``dnOid2idx[did]``) and 'label'
    (list with 1 where the candidate appears in
    ``fnOid2item[qid]['debunking_ids']``, otherwise 0).
    """
    columns = {'qid': [], 'qidx': [], 'did': [], 'didx': [], 'label': []}

    for query_id, candidate_ids in qid2dids.items():
        columns['qid'].append(query_id)
        columns['qidx'].append(fnOid2idx[query_id])
        columns['did'].append(candidate_ids)
        columns['didx'].append([dnOid2idx[cand] for cand in candidate_ids])
        columns['label'].append([
            int(cand in fnOid2item[query_id]['debunking_ids'])
            for cand in candidate_ids])

    return pd.DataFrame(columns)

In [16]:
def analysis_lineDf(df):
    """Print pair statistics for a line-level frame and return ``df.describe()``.

    Printed values: number of distinct 'qid' values, number of distinct 'did'
    values, the summed length of ``fnOid2item[qid]['debunking_ids']`` over the
    distinct qids, and the number of rows whose 'label' equals 1.
    """
    unique_qids = set(df['qid'])
    unique_dids = set(df['did'])

    total_pairs = sum(
        len(fnOid2item[query_id]['debunking_ids']) for query_id in unique_qids)
    top50_pairs = int((df['label'] == 1).sum())

    summary_line = 'qids = {}, dids = {}, total pairs = {}, top50 pairs = {}'.format(
        len(unique_qids), len(unique_dids), total_pairs, top50_pairs)
    print(summary_line)

    return df.describe()

## Attach ONE positive sample for BM25's null results

In [17]:
bm25_arr[0]

array([147.65676972, 175.41249475, 152.70300736, ..., 147.03297213,
       175.86724546, 150.45702979])

In [18]:
def get_dids_from_qids(qids, TOP=50):
    """Select the TOP highest-scoring DN candidates for every query id.

    For each qid, the row ``bm25_arr[fnOid2idx[qid]]`` is ranked in descending
    order and the positions are mapped to ids through ``DN[position]['_id']``.
    If none of the TOP candidates appears in
    ``fnOid2item[qid]['debunking_ids']``, the last candidate is replaced by
    the best-ranked id outside the TOP that does.

    Args:
        qids: iterable of query ids (keys of ``fnOid2idx`` / ``fnOid2item``).
        TOP: number of candidates kept per query.

    Returns:
        dict mapping each qid to its list of candidate ids.

    Side effects:
        Shows a tqdm progress bar and prints the number of queries, the
        expected candidate total (queries * TOP) and the actual total.
    """
    qid2dids = dict()

    for qid in tqdm(qids):
        # First: select the TOP candidates by descending score.
        scores = bm25_arr[fnOid2idx[qid]]
        scores_rank = scores.argsort()[::-1]
        candidates = [DN[r]['_id'] for r in scores_rank[:TOP]]

        # Set membership keeps the fallback scan below cheap.
        positives = set(fnOid2item[qid]['debunking_ids'])

        # The TOP candidates do not contain any positive.
        if positives.isdisjoint(candidates):
            for r in scores_rank[TOP:]:
                did = DN[r]['_id']
                if did in positives:
                    candidates[-1] = did
                    # Note: this `break` was forgotten at the time of the ACL
                    # submission (without it the loop keeps overwriting and
                    # ends up with the lowest-ranked positive instead).
                    break

        qid2dids[qid] = candidates

    # Expected total uses TOP (the original hard-coded 50 here).
    print(len(qid2dids), len(qid2dids) * TOP,
          sum(len(v) for v in qid2dids.values()))
    print()

    return qid2dids

In [19]:
train_qid2dids = get_dids_from_qids(qids=train_qids)
val_qid2dids = get_dids_from_qids(qids=val_qids)
test_qid2dids = get_dids_from_qids(qids=test_qids)

100%|██████████| 8356/8356 [00:32<00:00, 258.56it/s]
  2%|▏         | 18/1192 [00:00<00:06, 173.16it/s]

8356 417800 417800



100%|██████████| 1192/1192 [00:05<00:00, 218.14it/s]
  1%|          | 24/2386 [00:00<00:10, 235.04it/s]

1192 59600 59600



100%|██████████| 2386/2386 [00:08<00:00, 267.88it/s]

2386 119300 119300






In [20]:
analysis_qid2dids(train_qid2dids)

Unnamed: 0,matching_num
count,8356.0
mean,3.422212
std,4.464883
min,1.0
25%,1.0
50%,2.0
75%,4.0
max,50.0


In [21]:
analysis_qid2dids(val_qid2dids)

Unnamed: 0,matching_num
count,1192.0
mean,2.799497
std,3.271263
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,18.0


In [22]:
analysis_qid2dids(test_qid2dids)

Unnamed: 0,matching_num
count,2386.0
mean,3.036463
std,3.568193
min,1.0
25%,1.0
50%,2.0
75%,4.0
max,32.0


In [23]:
train_line_df = transfer2lineDf(train_qid2dids)
val_line_df = transfer2lineDf(val_qid2dids)
test_line_df = transfer2lineDf(test_qid2dids)

In [24]:
val_df = transfer2df(val_qid2dids)
test_df = transfer2df(test_qid2dids)

In [25]:
analysis_lineDf(train_line_df)

qids = 8356, dids = 17385, total pairs = 63873, top50 pairs = 28596


Unnamed: 0,qidx,didx,label
count,417800.0,417800.0,417800.0
mean,6001.999282,12253.957738,0.068444
std,3466.80054,7264.769062,0.252507
min,1.0,0.0,0.0
25%,2885.75,6773.0,0.0
50%,6247.5,11580.0,0.0
75%,8931.25,17954.0,0.0
max,11933.0,27504.0,1.0


In [26]:
analysis_lineDf(val_line_df)

qids = 1192, dids = 8353, total pairs = 7374, top50 pairs = 3337


Unnamed: 0,qidx,didx,label
count,59600.0,59600.0,59600.0
mean,5544.696309,11975.893909,0.05599
std,3227.497234,7298.715065,0.229904
min,12.0,1.0,0.0
25%,3145.75,6330.0,0.0
50%,4886.0,11355.0,0.0
75%,8388.25,17418.0,0.0
max,11913.0,27504.0,1.0


In [27]:
analysis_lineDf(test_line_df)

qids = 2386, dids = 11715, total pairs = 15125, top50 pairs = 7245


Unnamed: 0,qidx,didx,label
count,119300.0,119300.0,119300.0
mean,6052.903185,12187.515331,0.060729
std,3459.273897,7329.480866,0.238834
min,0.0,0.0,0.0
25%,3121.0,6684.0,0.0
50%,5592.0,11500.0,0.0
75%,9161.0,17931.25,0.0
max,11928.0,27503.0,1.0


In [28]:
analysis_lineDf(pd.concat([train_line_df, val_line_df, test_line_df]))

qids = 11934, dids = 19047, total pairs = 86372, top50 pairs = 39178


Unnamed: 0,qidx,didx,label
count,596700.0,596700.0,596700.0
mean,5966.5,12212.899948,0.065658
std,3445.051931,7281.607675,0.247683
min,0.0,0.0,0.0
25%,2983.0,6684.0,0.0
50%,5966.5,11536.0,0.0
75%,8950.0,17940.0,0.0
max,11933.0,27504.0,1.0


In [29]:
# Dump the three line-level frames as tab-separated files.
# NOTE(review): index=None / header=None are passed through unchanged;
# presumably they suppress the index column and the header row (pandas
# documents these options as booleans) — verify on the written files.
for out_path, line_frame in (
        ('./data/top50.train.line', train_line_df),
        ('./data/top50.val.line', val_line_df),
        ('./data/top50.test.line', test_line_df)):
    line_frame.to_csv(out_path, index=None, header=None, sep='\t')

In [30]:
val_df.head()

Unnamed: 0,qid,qidx,did,didx,label
0,5f0ed06ada7c229f16a55c17,12,"[5ff5e362c60dfb55fc52b972, 5ff967deefd97ff86d6...","[7061, 27006, 16677, 14492, 24503, 19881, 453,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,5f0ed06ada7c229f16a55c1d,18,"[5ff57ff6c60dfb55fc52b944, 5f2ea72f8d1bbb4d114...","[1203, 20158, 14492, 27006, 18, 27261, 27339, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,5f0ed06ada7c229f16a55c1c,24,"[5f1172acb9cce3edf31425da, 5f1172acb9cce3edf31...","[9233, 11057, 14701, 9057, 9229, 20500, 1457, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,5f0ed040da7c229f16a51432,51,"[5f61d4bd11c8070a282e796e, 5f1172acb9cce3edf31...","[66, 11046, 11385, 26092, 10768, 68, 9571, 159...","[1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,5f0ed06ada7c229f16a55c1b,148,"[5f0c71f1efe4e78f5b003534, 5f315f59aaa0d4d489f...","[190, 20500, 27261, 11388, 17329, 9648, 15047,...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [31]:
test_df.head()

Unnamed: 0,qid,qidx,did,didx,label
0,5f0ed06ada7c229f16a54452,0,"[6002d4fbbc9086321270c5dc, 5ff55038c60dfb55fc5...","[19, 21, 8931, 17396, 6883, 13539, 11103, 9365...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,5f0ed06ada7c229f16a53f06,2,"[5fe5acd511f159c49f84b012, 5fe5acd511f159c49f8...","[91, 29, 27, 25, 26, 24, 22, 17, 18, 3180, 543...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,5f0ed06ada7c229f16a542cb,5,"[5fef252111f159c49f84b0fa, 5fef26f911f159c49f8...","[31, 49, 37, 19315, 20021, 184, 9837, 3180, 11...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,5f0ed06ada7c229f16a55040,15,"[5f2ea72f8d1bbb4d1141aa43, 5f1172acb9cce3edf31...","[17301, 11733, 2923, 17956, 19748, 14283, 1801...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5f0ed040da7c229f16a524f2,30,"[5f62cacffff4579430a77648, 5f1172acb9cce3edf31...","[64, 11139, 1257, 1245, 737, 1598, 8225, 440, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [32]:
# Dump the per-query (list-valued) val and test frames as tab-separated files.
# NOTE(review): index=None / header=None are passed through unchanged;
# presumably they suppress the index column and the header row — verify.
for out_path, grouped_frame in (
        ('./data/top50.val', val_df),
        ('./data/top50.test', test_df)):
    grouped_frame.to_csv(out_path, index=None, header=None, sep='\t')