In [1]:
import sys
from importlib import reload
reload(sys)

import TD_RvNN_exp
import math

import theano
from theano import tensor as T
import numpy as np
from numpy.testing import assert_array_almost_equal

import time
import datetime
import random
from evaluate import *

In [2]:
# Paths for the CLEARumor training and test data, relative to the notebook's working directory.
# trainPath: zip archive opened below; the train/dev label keys and the training threads are read from it
# testPath: zip archive opened below; the test threads are read from it
trainPath = "../resource/rumoureval-2019-training-data.zip"
testPath = "../resource/rumoureval-2019-test-data.zip"
testLabelPath = "../resource/final-eval-key.json" # evaluation key (JSON file) that contains the test labels

In [3]:
# load data => find the labels for training and test data and put them into a dict structured as needed in RvNN approach
# at first handle zip files as in CLEARumor implementation
import json
from typing import Dict, Optional
from zipfile import ZipFile
def get_archive_directory_structure(archive: ZipFile) -> Dict:
    """Mirror the file listing of a zip archive as nested dictionaries.

    The first path component of every member (the archive's top-level
    directory) is dropped. Each remaining directory becomes a nested dict
    keyed by its name, and each file becomes an entry mapping its file name
    to the full member path inside the archive (usable with ``archive.read``).

    Args:
        archive: An opened zip archive.

    Returns:
        The nested ``{directory: {...}, filename: member_path}`` mapping.
    """
    tree = {}
    # Directory entries end with '/' and carry no file of their own.
    members = (name for name in archive.namelist() if not name.endswith('/'))
    for member in members:
        parts = member.split('/')[1:]  # drop the top-level directory
        node = tree
        # Walk (and create on demand) the chain of directories above the file.
        for folder in parts[:-1]:
            node = node.setdefault(folder, {})
        node[parts[-1]] = member
    return tree

In [4]:
# Open the training archive and index its members by path.
# The archive is deliberately left open: later cells read the thread files from it.
training_data_archive = ZipFile(trainPath)
training_data_contents = get_archive_directory_structure(
        training_data_archive)
# The train and dev label keys are JSON files stored inside the training archive.
train_labels = json.loads(training_data_archive.read(training_data_contents['train-key.json']))
dev_labels = json.loads(training_data_archive.read(training_data_contents['dev-key.json']))
# The test labels live in a separate JSON file on disk; the context manager
# makes sure the file handle is closed again once the labels are parsed.
with open(testLabelPath, 'r') as test_label_file:
    test_labels = json.load(test_label_file)

In [5]:
# Load all labels (train/dev/test) into one dictionary and collect the IDs of
# the training and test posts in two separate lists.
#   indexDic: original post ID (string) -> simple running index (avoids handling 18 digit numbers)
#   labelDic: simple running index      -> lower-cased label
labelDic, indexDic = {}, {}
eid = 1 # running index; starts at one and is increased after every accepted post
train_IDs, test_IDs = [], []
TWITTER_ID_LENGTH = 18 # for now only IDs of this length (the twitter IDs) are considered
# Each split is paired with the list that collects its IDs; the dev split is
# merged into the training IDs.
for split_labels, split_IDs in [(train_labels, train_IDs),
                                (dev_labels, train_IDs),
                                (test_labels, test_IDs)]:
    for (idx, label) in split_labels['subtaskbenglish'].items():
        if len(idx) != TWITTER_ID_LENGTH:
            continue
        indexDic[idx] = eid # keep connection between simple index and original ID for look-ups later
        labelDic[eid] = label.lower()
        split_IDs.append(idx)
        eid += 1 # increase index by one for the next post
# eid has already been increased past the last assigned index, so this is the
# next free index; keep it to continue counting upwards for the reply indices later
highest_source_eid = eid

In [6]:
# Inspect the mapping from original post ID (string) to the simple running index assigned above.
indexDic

{'500288349924782080': 1,
 '500308076004929537': 2,
 '544282227035869184': 3,
 '529695367680761856': 4,
 '544324444773433348': 5,
 '544350712365207552': 6,
 '500389488217309184': 7,
 '552806309540528128': 8,
 '552978099357237248': 9,
 '524956129017995264': 10,
 '500332933098385408': 11,
 '544271362022338560': 12,
 '544514564407427072': 13,
 '553587672137334785': 14,
 '500363126294863876': 15,
 '553474188259102720': 16,
 '500394061887709184': 17,
 '544309275141885952': 18,
 '552811386259386370': 19,
 '524925987239120897': 20,
 '576323086888361984': 21,
 '544314234541469696': 22,
 '544381485591982083': 23,
 '553505242554175489': 24,
 '544278985249550337': 25,
 '553586897168392192': 26,
 '524975705206304769': 27,
 '529653029747064832': 28,
 '577258317942149120': 29,
 '544367462012432384': 30,
 '529654186791944192': 31,
 '552792913910833152': 32,
 '524923462398513152': 33,
 '524952883343925249': 34,
 '544491151118860289': 35,
 '544515538383564801': 36,
 '524925050739490816': 37,
 '50028042

In [7]:
# generate the tree from the zip file data
# training threads: the 'twitter-english' sub-tree of the training archive listing
twitter_english = training_data_contents['twitter-english']
# the test archive stays open because later cells read individual files from it
test_data_archive = ZipFile(testPath)
test_data_contents = get_archive_directory_structure(test_data_archive)
# test threads: the 'twitter-en-test-data' sub-tree of the test archive listing
twitter_en_test_data = test_data_contents['twitter-en-test-data']

In [8]:
# calculate parent_num, indexP and indexC for the treeDic
def calc_parent_num(tree_branch: Dict) -> int:
    """Return the nesting depth of a thread-structure tree.

    Every dictionary level counts as one; any value that is not a dictionary
    (for example a list or a scalar leaf) counts as zero. An empty dictionary
    therefore has depth 1.

    Args:
        tree_branch: The (sub-)tree to measure. Called recursively on the
            values, so it may also receive non-dict leaves.

    Returns:
        The number of nested dictionary levels along the deepest path.
    """
    # Check against the builtin ``dict``: ``typing.Dict`` is an annotation
    # alias and not meant to be used as a runtime type in isinstance().
    if not isinstance(tree_branch, dict):
        return 0
    if not tree_branch:
        return 1
    return 1 + max(calc_parent_num(child) for child in tree_branch.values())
    
def find_parent_node(tree_branch: Dict, reply_idx: str) -> Optional[str]:
    """Search a nested thread structure for the parent of ``reply_idx``.

    The tree is walked depth-first. A key is the parent of ``reply_idx`` when
    ``reply_idx`` is a key of the dictionary stored under it (or an element of
    a list/tuple/set stored under it).

    Args:
        tree_branch: Nested dictionary whose keys are post ids.
        reply_idx: Id of the reply to look up. It must have the same type as
            the keys of ``tree_branch``; the caller passes it as a string.

    Returns:
        The key of the parent node, or ``None`` when ``reply_idx`` does not
        occur below any key (this includes the keys of ``tree_branch``
        itself, which have no parent inside this branch).
    """
    for indexP, sub_branch in tree_branch.items():
        if isinstance(sub_branch, dict):
            if reply_idx in sub_branch:  # direct child of indexP
                return indexP
            # Not a direct child: descend into the subtree.
            parent_node = find_parent_node(sub_branch, reply_idx)
            if parent_node is not None:
                return parent_node
        elif isinstance(sub_branch, (list, tuple, set)):
            if reply_idx in sub_branch:
                return indexP
        # Any other leaf (string, number, None) cannot hold children. It is
        # skipped so that a string leaf is not substring-matched and a scalar
        # leaf does not raise a TypeError on the membership test.
    return None

In [9]:
# create treeDic as it is needed for the RvNN:
#   treeDic[source ID][post ID] = {'parent': parent post ID (or 'None' for the source post),
#                                  'parent_num': number of reply levels in the thread}
treeDic = {}
for archive, topics in [(training_data_archive, twitter_english.items()),
                        (test_data_archive, twitter_en_test_data.items())]:
    for topic, threads in topics:
        for thread in threads.values():
            # get the source information as a Dict (contains all info about the source post)
            source_information = json.loads(archive.read(list(thread['source-tweet'].values())[0]))
            eid = source_information['id'] # get the source ID
            post_structure = json.loads(archive.read(thread['structure.json'])) # get the thread structure as a Dict
            parent_num = calc_parent_num(post_structure) # calculate the number of reply levels in each thread structure
            indexC = eid # initialize post index with source post index
            # Key the thread by its own source ID. (Using a variable left over
            # from an earlier cell here would put every thread into one entry.)
            # NOTE(review): the key is the raw source ID from the tweet JSON, not
            # the simple index stored in indexDic - confirm which one the RvNN code expects.
            if eid not in treeDic: # create empty entry first to make the key accessible
                treeDic[eid] = {}
            treeDic[eid][indexC] = {'parent':'None', 'parent_num':parent_num}
            if 'replies' in thread: # not every thread has a "replies" entry; skip the loop in that case
                for reply in thread['replies'].values(): # for every reply post
                    # get the reply information as a Dict (contains all info about the reply post)
                    reply_information = json.loads(archive.read(reply))
                    indexC = reply_information['id'] # indexC = reply ID, named indexC for consistency with
                    # the RvNN implementation
                    # find out the parent of each reply node; the ID is passed as a string
                    # because the keys of the parsed structure.json are strings.
                    # indexP is None when the reply does not appear in the structure.
                    indexP = find_parent_node(post_structure, str(indexC))
                    treeDic[eid][indexC] = {'parent':indexP, 'parent_num':parent_num} # put everything at the right place