In [1]:
import json
import pickle
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
from bs4 import BeautifulSoup
from nltk import word_tokenize
# from hltc_preprocess.tweets import clean_tweet, tokenize_tweets
from nltk.tokenize import TweetTokenizer
from spacy.lang.en import English
import operator

In [2]:
! ls /home/nayeon7lee/misinfo_data/pheme_extended_kochkina

ls: cannot access '/home/nayeon7lee/misinfo_data/pheme_extended_kochkina': No such file or directory


In [3]:
! ls /home/nayeon7lee/misinfo_data/pheme_extended_kochkina/threads

ls: cannot access '/home/nayeon7lee/misinfo_data/pheme_extended_kochkina/threads': No such file or directory


In [6]:
root_path = "/home/nayeon7lee/misinfo_data/"

In [7]:
base_path = root_path+"pheme_extended_kochkina/threads"

# Liar

In [42]:
def liar_df2objects(df):
    objs = []
    for _, row in df.iterrows():
        obj = {
            'id': row['id'],
            'label': row['label'],
            'text': row['text'],
            'subject': row['subject'],
            'speaker_info': {'speaker': row['speaker'],
                            'job': row['speaker_job'],
                            'state': row['state'],
                            'party': row['party']},
            'credibility': {'barely_true': row['barely_true'],
                            'false': row['false'],
                            'half_true': row['half_true'],
                            'mostly_true': row['mostly_true'],
                            'pants_on_fire': row['pants_on_fire']},
            'location': row['location']
        }
        objs += obj,
    return objs

def load_liar(root_dir):
    liar_train_path = '{}/{}'.format(root_dir, 'liar/train.tsv')
    liar_dev_path = '{}/{}'.format(root_dir, 'liar/valid.tsv')
    liar_test_path = '{}/{}'.format(root_dir, 'liar/test.tsv')

    liar_headers = ['id', 'label', 'text', 'subject','speaker', 'speaker_job',
                    'state','party','barely_true','false','half_true',
                    'mostly_true','pants_on_fire','location']

    liar_train = pd.read_csv(liar_train_path, sep='\t', names=liar_headers)
    liar_dev = pd.read_csv(liar_dev_path, sep='\t', names=liar_headers)
    liar_test = pd.read_csv(liar_test_path, sep='\t', names=liar_headers)

    return liar_df2objects(liar_train), liar_df2objects(liar_dev), liar_df2objects(liar_test)

In [48]:
def filter_short(liar_data):
    return [data for data in liar_data if len(word_tokenize(data['text']))>=5]

In [50]:
liar_train, liar_dev, liar_test = load_liar(root_path)
print(len(liar_train), len(liar_dev), len(liar_test))
liar_train, liar_dev, liar_test = filter_short(liar_train), filter_short(liar_dev), filter_short(liar_test)
print(len(liar_train), len(liar_dev), len(liar_test))
with open('liar_train.pickle', 'wb') as handle:
    pickle.dump(liar_train, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('liar_dev.pickle', 'wb') as handle:
    pickle.dump(liar_dev, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('liar_test.pickle', 'wb') as handle:
    pickle.dump(liar_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

10240 1284 1267
10210 1280 1260


In [45]:
liar_train[0]['text']

'Says the Annies List political group supports third-trimester abortions on demand.'

# Webis

In [51]:
def load_webis(annotation_path, articles_path):
    # overview: XML,portal,orientation,veracity,url
    annotation_df = pd.read_csv(annotation_path)
    webis_data = []
    for _, anno in annotation_df.iterrows():
        # read xml file
        article = {}
        article['id'] = anno['XML']
        article['portal'] = anno['portal']
        article['political_orientation'] = anno['orientation']
        article['veracity_label'] = anno['veracity']
        with open("{}/{}".format(articles_path, anno['XML'])) as fp:
            xml = BeautifulSoup(fp)

            article['text'] = xml.find('maintext').text.encode('utf8').decode('utf8')
            article['author'] = xml.find('author').text
            article['title'] = xml.find('title').text

#             hls = xml.find_all('hyperlink')
#             hls_array = []
#             for hl in hls:
#                 hl_obj = { 'start': hl.find('start').string,
#                         'end': hl.find('end').string }
#                 if 'href' in hl.attrs.keys():
#                     hl_obj['href'] = hl['href']
#                 hls_array += hl_obj,
#             article['hyperlinks'] = hls_array

#             paragraph_spans = xml.find_all('paragraph')
#             ps_array = []
#             for ps in paragraph_spans:
#                 ps_array += (ps.find('start').string, ps.find('end').string),
#             article['paragraph_spans'] = ps_array

#             quote_spans = xml.find_all('quote')
#             quote_array = []
#             for quote in quote_spans:
#                 quote_array += (quote.find('start').string, quote.find('end').string),
#             article['quote_spans'] = quote_array

        webis_data += article,

    return webis_data

webis_annotation = os.path.join(root_path, 'webis/overview.csv')
webis_article_path = os.path.join(root_path, 'webis/articles')
webis_articles = load_webis(webis_annotation, webis_article_path)

In [52]:
print(len(webis_articles))
webis_articles = filter_short(webis_articles)
print(len(webis_articles))

1627
1604


In [53]:
with open('webis.pickle', 'wb') as handle:
    pickle.dump(webis_articles, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Clickbait

In [90]:
with open('/home/nayeon7lee/misinfo_data/clickbait/truth.jsonl', 'r') as label_file:
    label_list = [json.loads(item) for item in list(label_file)]
    id2label_dict = {item['id']:item['truthClass'] for item in label_list}

In [91]:
with open('/home/nayeon7lee/misinfo_data/clickbait/instances.jsonl', 'r') as json_file:
    data_list = [json.loads(item) for item in list(json_file)]

In [92]:
new_clickbait_data = []
for data in data_list:
    data['label'] = id2label_dict[data['id']]
    new_clickbait_data.append(data)

In [93]:
def filter_short_clickbait(data):
    return [d for d in data if len(word_tokenize(clean_txt(" ".join(d['postText']))))>=5]

In [94]:
import re
def clean_txt(text,
              remove_stopwords=False,
              remove_nonalphanumeric=False,
              use_number_special_token=False,
              remove_numbers=False,
              separate_contractions=False,
              separate_punctuations=False
              ):

    text = text.lower()
    if separate_contractions:
        text = re.sub(r"\'s", " \'s", text)
        text = re.sub(r"\'ve", " \'ve", text)
        text = re.sub(r"n\'t", " n\'t", text)
        text = re.sub(r"\'re", " \'re", text)
        text = re.sub(r"\'d", " \'d", text)
        text = re.sub(r"\'ll", " \'ll", text)

    text = re.sub(r"-", " ", text)
#     text = re.sub(r"/", " ", text)
    text = re.sub(r"[a-zA-Z]+\/[a-zA-Z]+", " ", text)
    text = re.sub(r"\n", " ", text)

    if remove_nonalphanumeric:
        text = re.sub(r'([^\s\w\']|_)+', " ", text)

    if use_number_special_token:
        text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>", text)
    elif remove_numbers:
        text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "", text)

    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
        text = " ".join(text)

    # Remove URL
    text = re.sub(r"(http)\S+", "", text)
    text = re.sub(r"(www)\S+", "", text)
    text = re.sub(r"(href)\S+", "", text)
    # Remove multiple spaces
    text = re.sub(r"[ \s\t\n]+", " ", text)

    # remove repetition
    text = re.sub(r"([!?.]){2,}", r"\1", text)
    text = re.sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2", text)

    return text.strip()

In [95]:
print(len(new_clickbait_data))
new_clickbait_data = filter_short_clickbait(new_clickbait_data)
print(len(new_clickbait_data))

19538
19055


In [63]:
print(len(new_clickbait_data))
new_clickbait_data = filter_short_clickbait(new_clickbait_data)
print(len(new_clickbait_data))

19538
19051


In [64]:
with open('clickbait_sns.pickle', 'wb') as handle:
    pickle.dump(new_clickbait_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('filename.pickle', 'rb') as handle:
#     b = pickle.load(handle)

# PHEME (mine) - explode everything

### 1. rumour detection

In [8]:
'thread_anno_by_events'
all_event_thread_info = {}

events = [d for d in os.listdir(base_path) if d!='.DS_Store']
for event in tqdm(events):
    threads = [d for d in os.listdir('{}/{}'.format(base_path, event))]
    print(event, len(threads))

    thread_info = {}
    rumour_threads = [d for d in os.listdir('{}/{}/rumours'.format(base_path, event)) if d != '.DS_Store']
    non_rumour_threads = [d for d in os.listdir('{}/{}/non-rumours'.format(base_path, event))]

    for r_id in rumour_threads:
        # process lvl2 rumour veracity annotation (true, false, unverified rumour)
        with open('{}/{}/rumours/{}/annotation.json'.format(base_path, event, r_id)) as json_file:
            # anno fields: links - evidences/information about the rumour. category - rumour type. 
            # misinfo & true lables are used to determine labels for each thread
            anno = json.load(json_file)
            veracity_label = convert_annotations(anno)
            evidence_arr = anno['links']
            rumour_category = anno['category'] # TODO check what this is later 

        # combine lvl1 rumour detection labels with lvl2 labels
        thread_info[r_id] = {
            'thread_id': r_id,
            'rumour_label': 'rumour',
            'veracity_label': veracity_label,
            'evidence': evidence_arr,
            'rumour_category': rumour_category
        }

    for r_id in non_rumour_threads:
        # non-rumour doesn't have lvl2 annotation.
        thread_info[r_id] = {'thread_id': r_id, 'rumour_label':'non-rumour'}
    
    all_event_thread_info[event]=thread_info
    
# with open('./thread_annotations_by_event.json', 'w') as outfile:
#     json.dump(all_event_thread_info, outfile)

# # checking for the count
# for k in all_event_thread_info:
#     print(k, len(all_event_thread_info[k]))

FileNotFoundError: [Errno 2] No such file or directory: '/home/nayeon7lee/misinfo_data/pheme_extended_kochkina/threads'

In [123]:
# stance_detection_dataset = []

In [124]:
event_to_thread_rumour_detection_dataset = {}
event_to_thread_veracity_detection_dataset = {}
# event_to_stance_detection_dataset = {}


events = [d for d in os.listdir(base_path) if d!='.DS_Store']
for event in tqdm(events):
    rumour_threads = [d for d in os.listdir('{}/{}/rumours'.format(base_path, event)) if d != '.DS_Store']
    non_rumour_threads = [d for d in os.listdir('{}/{}/non-rumours'.format(base_path, event)) if d != '.DS_Store']
    rumour_detection_dataset = []
    rumour_veracity_dataset = []

    for r_id in rumour_threads:
        with open('{}/{}/rumours/{}/structure.json'.format(base_path, event, r_id)) as json_file:
            thread_tree = json.load(json_file)
            branches = tree2branches(thread_tree)
            thread_info = all_event_thread_info[event][r_id]
            
            if len(branches)==0:
                branches = [[r_id]]
            # start adding these branch with labels into the dataset array
            for branch in branches:
                rumour_detection_dataset.append({'thread_id': r_id, 'branch': branch, 'detection_label': 'rumour',
                                                'evidence': thread_info['evidence'], 
                                                 'rumour_category': thread_info['rumour_category']})
                 
                rumour_veracity_dataset.append({'thread_id': r_id, 'branch': branch, 'veracity_label': veracity_label,
                                                'evidence': thread_info['evidence'], 
                                                 'rumour_category': thread_info['rumour_category']})
                
    for r_id in non_rumour_threads:
        with open('{}/{}/non-rumours/{}/structure.json'.format(base_path, event, r_id)) as json_file:
            thread_tree = json.load(json_file)
            branches = tree2branches(thread_tree)
            
            if len(branches)==0:
                branches = [[r_id]]

            for branch in branches:
                rumour_detection_dataset.append({'thread_id': r_id, 'branch': branch, 'detection_label': 'non-rumour'})
                
    
    event_to_thread_rumour_detection_dataset[event] = rumour_detection_dataset
    event_to_thread_veracity_detection_dataset[event] = rumour_veracity_dataset
    
# with open('./event_to_thread_rumour_detection_dataset.pickle', 'wb') as handle:
#     pickle.dump(event_to_thread_rumour_detection_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('./event_to_thread_veracity_detection_dataset.pickle', 'wb') as handle:
#     pickle.dump(event_to_thread_veracity_detection_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 9/9 [00:02<00:00,  4.38it/s]


### 2. Stance Detection Dataset

In [125]:
tweet_level_annotation = root_path + '/pheme/annotations/en-scheme-annotations.json'

tweet2stance = defaultdict(dict)

source_stances = []
reply_stances = []

for _, line in enumerate(open(tweet_level_annotation, 'r')):
    line = json.loads(line)
    
    threadid = line['threadid']
    tweetid = line['tweetid']
    
    if threadid == tweetid: # source tweet
        source_stances.append({'thread_id': threadid, 'tweet_id': tweetid, 'event': line['event'], 
                                           'stance_label': line['support']})
    else: # replying tweet
        reply_stances.append({'thread_id': threadid, 'tweet_id': tweetid, 'event': line['event'], 
                                           'stance_label': line['responsetype-vs-source']})

In [126]:
with open('./source_stance_dataset.pickle', 'wb') as handle:
    pickle.dump(source_stances, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('./reply_stances_dataset.pickle', 'wb') as handle:
    pickle.dump(reply_stances, handle, protocol=pickle.HIGHEST_PROTOCOL)

# PHEME - paper

#### Lvl 1 rumour verification anno ( rumour vs non-rumour) AND lvl2 rumour veracity anno (true, false, unverified) from PHEME data

In [4]:
'''convertion code from PHEME creators. Label: True = rumour is true. False = rumour is false'''
def convert_annotations(annotation, string = True):
    if 'misinformation' in annotation.keys() and 'true'in annotation.keys():
        if int(annotation['misinformation'])==0 and int(annotation['true'])==0:
            if string:
                label = "unverified"
            else:
                label = 2
        elif int(annotation['misinformation'])==0 and int(annotation['true'])==1 :
            if string:
                label = "true"
            else:
                label = 1
        elif int(annotation['misinformation'])==1 and int(annotation['true'])==0 :
            if string:
                label = "false"
            else:
                label = 0
        elif int(annotation['misinformation'])==1 and int(annotation['true'])==1:
            print ("OMG! They both are 1!")
            print(annotation['misinformation'])
            print(annotation['true'])
            label = None
            
    elif 'misinformation' in annotation.keys() and 'true' not in annotation.keys():
        # all instances have misinfo label but don't have true label
        if int(annotation['misinformation'])==0:
            if string:
                label = "unverified"
            else:
                label = 2
        elif int(annotation['misinformation'])==1:
            if string:
                label = "false"
            else:
                label = 0
                
    elif 'true' in annotation.keys() and 'misinformation' not in annotation.keys():
        print ('Has true not misinformation')
        label = None
    else:
        print('No annotations')
        label = None
           
    return label

In [99]:
'thread anno w/o event'
thread_info = {}
events = [d for d in os.listdir(base_path) if d!='.DS_Store']
for event in tqdm(events):
    threads = [d for d in os.listdir('{}/{}'.format(base_path, event))]
    print(event, len(threads))

    rumour_threads = [d for d in os.listdir('{}/{}/rumours'.format(base_path, event)) if d != '.DS_Store']
    non_rumour_threads = [d for d in os.listdir('{}/{}/non-rumours'.format(base_path, event))]

    for r_id in rumour_threads:
        # process lvl2 rumour veracity annotation (true, false, unverified rumour)
        with open('{}/{}/rumours/{}/annotation.json'.format(base_path, event, r_id)) as json_file:
            # anno fields: links - evidences/information about the rumour. category - rumour type. 
            # misinfo & true lables are used to determine labels for each thread
            anno = json.load(json_file)
            veracity_label = convert_annotations(anno)
            evidence_arr = anno['links']
            rumour_category = anno['category'] # TODO check what this is later 

        # combine lvl1 rumour detection labels with lvl2 labels
        thread_info[r_id] = {
            'thread_id': r_id,
            'rumour_label': 'rumour',
            'veracity_label': veracity_label,
            'evidence': evidence_arr,
            'rumour_category': rumour_category
        }

    for r_id in non_rumour_threads:
        # non-rumour doesn't have lvl2 annotation.
        thread_info[r_id] = {'thread_id': r_id, 'rumour_label':'non-rumour'}
    
    event_name = event.split('-')[0]
    
# with open('./thread_annotations.json', 'w') as outfile:
#     json.dump(thread_info, outfile)

# # checking for the count
# for k in thread_info:
#     print(k, len(thread_info[k]))

 33%|███▎      | 3/9 [00:00<00:00, 21.58it/s]

('charliehebdo-all-rnr-threads', 3)
('ebola-essien-all-rnr-threads', 3)
('ferguson-all-rnr-threads', 3)
('germanwings-crash-all-rnr-threads', 3)
('gurlitt-all-rnr-threads', 3)
('ottawashooting-all-rnr-threads', 3)


100%|██████████| 9/9 [00:00<00:00, 25.43it/s]


('prince-toronto-all-rnr-threads', 3)
('putinmissing-all-rnr-threads', 3)
('sydneysiege-all-rnr-threads', 3)
('576709199146717185', 5)
('524950117192044544', 2)
('500360308289187840', 2)
('553535829529100288', 5)
('498303288295817217', 2)
('552823301899575296', 2)
('500283877567770624', 5)
('498274934553317376', 2)
('544338575240609792', 5)
('544517440332627968', 5)
('499530401136668672', 2)
('536832547062165504', 5)
('500303238810574849', 5)
('553589051044151296', 5)
('580371845997682688', 5)
('553590835850514433', 5)
('552814627399012352', 2)
('524952322703900672', 2)
('544519486498676736', 2)
('500252508884070400', 2)
('498283038338711552', 2)
('553534768537939968', 5)
('498463733447163905', 2)
('552797729504886784', 2)
('525000220371734528', 2)
('544343367279456256', 5)
('553589121776898048', 5)
('525060425184858112', 5)
('544309943533600769', 2)
('544284823041212416', 5)
('544382892378714113', 5)
('552825783337897984', 2)
('552790875492073472', 5)
('553490358748061696', 2)
('52499

('553587735613952001', 5)
('553542282100871168', 2)
('552836968410021888', 2)
('553160909208559616', 2)
('498273703428620289', 2)
('500320616428601344', 2)
('524950159160250368', 2)
('525002842021445632', 5)
('525027317551079424', 5)
('529692457337118721', 5)
('524983050187579393', 5)
('552835794646212608', 2)
('544298507868200961', 2)
('553491604917334016', 2)
('524950321064607744', 2)
('553166766977740801', 2)
('576417507277467648', 2)
('499642374428307457', 5)
('524957517294886912', 5)
('498530946338258944', 2)
('553144712920834048', 5)
('552847959864274944', 2)
('552821977061556225', 2)
('524958227768020992', 5)
('544444661859749890', 2)
('524999488067633152', 5)
('553473629854646272', 2)
('544364000897945600', 5)
('553588913747808256', 5)
('553558126147149825', 2)
('500397058541379584', 2)
('552986187364065281', 5)
('552808658510561280', 2)
('553474514496278528', 5)
('552818301530025984', 2)
('525070811439579138', 2)
('544518647881170945', 5)
('552811128326475776', 2)
('5531082428

In [15]:
'thread_anno_by_events'
all_event_thread_info = {}

events = [d for d in os.listdir(base_path) if d!='.DS_Store']
for event in tqdm(events):
    threads = [d for d in os.listdir('{}/{}'.format(base_path, event))]
    print(event, len(threads))

    thread_info = {}
    rumour_threads = [d for d in os.listdir('{}/{}/rumours'.format(base_path, event)) if d != '.DS_Store']
    non_rumour_threads = [d for d in os.listdir('{}/{}/non-rumours'.format(base_path, event))]

    for r_id in rumour_threads:
        # process lvl2 rumour veracity annotation (true, false, unverified rumour)
        with open('{}/{}/rumours/{}/annotation.json'.format(base_path, event, r_id)) as json_file:
            # anno fields: links - evidences/information about the rumour. category - rumour type. 
            # misinfo & true lables are used to determine labels for each thread
            anno = json.load(json_file)
            veracity_label = convert_annotations(anno)
            evidence_arr = anno['links']
            rumour_category = anno['category'] # TODO check what this is later 

        # combine lvl1 rumour detection labels with lvl2 labels
        thread_info[r_id] = {
            'thread_id': r_id,
            'rumour_label': 'rumour',
            'veracity_label': veracity_label,
            'evidence': evidence_arr,
            'rumour_category': rumour_category
        }

    for r_id in non_rumour_threads:
        # non-rumour doesn't have lvl2 annotation.
        thread_info[r_id] = {'thread_id': r_id, 'rumour_label':'non-rumour'}
    
    event_name = event.split('-')[0]
    all_event_thread_info[event_name]=thread_info
    
# with open('./thread_annotations_by_event.json', 'w') as outfile:
#     json.dump(all_event_thread_info, outfile)

# # checking for the count
# for k in all_event_thread_info:
#     print(k, len(all_event_thread_info[k]))

100%|██████████| 9/9 [00:00<00:00, 54.50it/s]

charliehebdo-all-rnr-threads 3
ebola-essien-all-rnr-threads 3
ferguson-all-rnr-threads 3
{'is_rumour': 'rumour', 'category': 'Mike Brown was shot 10 times', 'misinformation': '1', 'true': 0, 'links': [{'link': 'http://edition.cnn.com/2014/08/10/justice/missouri-police-involved-shooting/index.html', 'mediatype': 'news-media', 'position': 'observing'}], 'is_turnaround': 0}
germanwings-crash-all-rnr-threads 3
gurlitt-all-rnr-threads 3
ottawashooting-all-rnr-threads 3
prince-toronto-all-rnr-threads 3
putinmissing-all-rnr-threads 3
sydneysiege-all-rnr-threads 3





#### Lvl 3 tweet level annotation for Stance detection (from PHEME-journalism data)

In [87]:
! ls /home/nayeon7lee/misinfo_data/pheme/threads/en/charliehebdo

552783667052167168  552834961762709505	553476490315431937  553548567420628992
552785375161499649  552848620375261184	553476880339599360  553549686129561600
552791196247269378  552978099357237248	553478289474740224  553550301886955520
552791578893619200  552978184413921281	553486439129038848  553558982476828674
552792544132997121  552982613288157184	553489393202499584  553566026030272512
552792802309181440  552984502063337472	553501357156876290  553575232867672064
552792913910833152  552996335319007233	553503184174710784  553576010898497536
552793679082311680  553107921081749504	553505242554175489  553579224402235393
552802654641225728  553152395371630592	553506608203169792  553586860334010368
552805488631758849  553160652567498752	553508098825261056  553586897168392192
552806309540528128  553164985460068352	553512735192141826  553587013409325058
552806757672964097  553184482241814530	553518472798683136  553587303172833280
552810448324943872  553197863971610624	5535314134596

In [48]:
tweet_level_annotation = root_path + '/pheme/annotations/en-scheme-annotations.json'

tweet2stance = defaultdict(dict)
for _, line in enumerate(open(tweet_level_annotation, 'r')):
    line = json.loads(line)
    
    threadid = line['threadid']
    tweetid = line['tweetid']
    
    if threadid == tweetid: # source tweet
        tweet2stance[threadid][tweetid] = {'thread_id': threadid, 'tweet_id': tweetid, 'event': line['event'], 
                                           'support': line['support'], 'evidence_form': line['evidentiality'], 
                                           'certainty': line['certainty']}
    else: # replying tweet
        tweet2stance[threadid][tweetid] = {'thread_id': threadid, 'tweet_id': tweetid, 'event': line['event'], 
                                           'stance_to_source': line['responsetype-vs-source']}
        if 'responsetype-vs-previous' in line.keys():
            tweet2stance[threadid][tweetid]['stance_to_prev'] = line['responsetype-vs-previous']
        if 'evidentiality' in line.keys(): 
            tweet2stance[threadid][tweetid]['evidence_form'] = line['evidentiality']
        if 'certainty' in line.keys():
            tweet2stance[threadid][tweetid]['certainty'] = line['certainty']

In [56]:
with open('./tweet_annotations.json', 'wb') as outfile:
    json.dump(tweet2stance, outfile)
#     pickle.dump(tweet2stance, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Create thread_by_event - branches from threads-structure and tweet text, grouped by events 

In [111]:
from copy import deepcopy
# taken from BranchLSTM  https://github.com/kochkinaelena/branchLSTM
def tree2branches(root):
    node = root
    if len(list(node.values())[0])==0:
        return []
    parent_tracker = []
    parent_tracker.append(root)
    branch = []
    branches = []
    i = 0
    
    while True:
        node_name = list(node.keys())[i]
        #print node_name
        branch.append(node_name)
        # get children of the node
        first_child = list(node.values())[i]
        # actually all chldren, all tree left under this node
        if first_child != []:  # if node has children
            node = first_child      # walk down
            parent_tracker.append(node)
            siblings = list(first_child.keys())
            i = 0  # index of a current node
        else:
            branches.append(deepcopy(branch))
            i = siblings.index(node_name)  # index of a current node
            # if the node doesn't have next siblings
            while i+1 >= len(siblings):
                if node is parent_tracker[0]:  # if it is a root node
                    return branches
                del parent_tracker[-1]
                del branch[-1]
                node = parent_tracker[-1]      # walk up ... one step
                node_name = branch[-1]
                siblings = list(node.keys())
                i = siblings.index(node_name)
            i = i+1    # ... walk right
            del branch[-1]
# branches = tree2branches(tree)
# for b in branches:
#     print(b)

In [112]:
# get general thread first from PHEME
'Note: thread2branches_by_event - having all branches as one instance'
# threads_by_event = {'event_name': [{'thread_id': threadid, 'branches': branch_arr}]} How it should look like
all_event_threads = {}

no_branch_cnt = 0
yes_branch_cnt = 0
events = [d for d in os.listdir(base_path) if d!='.DS_Store']
for event in tqdm(events):
    threads = []
    rumour_threads = [d for d in os.listdir('{}/{}/rumours'.format(base_path, event)) if d != '.DS_Store']
    non_rumour_threads = [d for d in os.listdir('{}/{}/non-rumours'.format(base_path, event)) if d != '.DS_Store']

    for r_id in rumour_threads:
        with open('{}/{}/rumours/{}/structure.json'.format(base_path, event, r_id)) as json_file:
            thread_tree = json.load(json_file)
            branches = tree2branches(thread_tree)
            if len(branches)==0:
                branches = [[r_id]]
            threads += {'thread_id': r_id, 'branches': branches},

    for r_id in non_rumour_threads:
        with open('{}/{}/non-rumours/{}/structure.json'.format(base_path, event, r_id)) as json_file:
            thread_tree = json.load(json_file)
            branches = tree2branches(thread_tree)
            if len(branches)==0:
                branches = [[r_id]]
            threads += {'thread_id': r_id, 'branches': branches},
    
    event_name = event.split('-')[0]
    all_event_threads[event_name] = threads
    print(event_name, len(threads))
    
# print("yes, no", yes_branch_cnt, no_branch_cnt)
# with open('./thread2branches_by_event.pickle', 'wb') as handle:
#     pickle.dump(all_event_threads, handle, protocol=pickle.HIGHEST_PROTOCOL)

# checking for the count
all_cnt = 0
for k in all_event_threads:
    all_cnt += len(all_event_threads[k])
    print(k, len(all_event_threads[k]))
print(all_cnt)

 11%|█         | 1/9 [00:00<00:06,  1.19it/s]

charliehebdo 2079
ebola 14


 44%|████▍     | 4/9 [00:01<00:02,  1.96it/s]

ferguson 1143
germanwings 469
gurlitt 138


 89%|████████▉ | 8/9 [00:01<00:00,  3.32it/s]

ottawashooting 890
prince 233
putinmissing 238


100%|██████████| 9/9 [00:02<00:00,  3.80it/s]

sydneysiege 1221
charliehebdo 2079
ebola 14
ferguson 1143
germanwings 469
gurlitt 138
ottawashooting 890
prince 233
putinmissing 238
sydneysiege 1221
6425





In [115]:
# get general thread first from PHEME
'Note: thread2branches_by_event_split_branch - split the branches into multiple instance'
'Will end up with instance with same threadid but different branch'
# threads_by_event = {'event_name': [{'thread_id': threadid, 'branches': branch_arr}]} How it should look like
all_event_threads = {}

no_branch_cnt = 0
yes_branch_cnt = 0
events = [d for d in os.listdir(base_path) if d!='.DS_Store']
for event in tqdm(events):
    threads = []
    rumour_threads = [d for d in os.listdir('{}/{}/rumours'.format(base_path, event)) if d != '.DS_Store']
    non_rumour_threads = [d for d in os.listdir('{}/{}/non-rumours'.format(base_path, event)) if d != '.DS_Store']

    for r_id in rumour_threads:
        with open('{}/{}/rumours/{}/structure.json'.format(base_path, event, r_id)) as json_file:
            thread_tree = json.load(json_file)
            branches = tree2branches(thread_tree)
            
            if len(branches)==0:
                branches = [[r_id]]
                
            for branch in branches:
                threads += {'thread_id': r_id, 'branch': branch},

    for r_id in non_rumour_threads:
        with open('{}/{}/non-rumours/{}/structure.json'.format(base_path, event, r_id)) as json_file:
            thread_tree = json.load(json_file)
            branches = tree2branches(thread_tree)
            
            if len(branches)==0:
                branches = [[r_id]]

            for branch in branches:
                threads += {'thread_id': r_id, 'branch': branch},

    event_name = event.split('-')[0]
    all_event_threads[event_name] = threads
    print(event_name, len(threads))
    
with open('./thread2branches_by_event_split_branch.pickle', 'wb') as handle:
    pickle.dump(all_event_threads, handle, protocol=pickle.HIGHEST_PROTOCOL)

# checking for the count
all_cnt = 0
for k in all_event_threads:
    all_cnt += len(all_event_threads[k])
    print(k, len(all_event_threads[k]))
print(all_cnt)

 33%|███▎      | 3/9 [00:00<00:01,  5.06it/s]

charliehebdo 23619
ebola 162
ferguson 14519
germanwings 3053
gurlitt 154


100%|██████████| 9/9 [00:00<00:00, 11.25it/s]

ottawashooting 8351
prince 543
putinmissing 534
sydneysiege 14684





charliehebdo 23619
ebola 162
ferguson 14519
germanwings 3053
gurlitt 154
ottawashooting 8351
prince 543
putinmissing 534
sydneysiege 14684
65619


In [None]:
# find the subset of PHEME threads that has additional lvl3 information
# hmmm actually are these needed?


#### ID to tweet text

In [109]:
def process_id2text(threads, lvl_1_path, id2text):
    for r_id in threads:
        # for each rumour_thead, there is ONE source-tweet, and MULTIPLE reations

        # {base}/{event}/{rumours or non-rumour)}/{thread-id}/source-tweets/{rumour-id}.json
        with open('{}/{}/{}/{}/source-tweets/{}.json'.format(base_path, event, lvl_1_path, r_id, r_id)) as json_file:
            tweet_json = json.load(json_file)
            id2text[tweet_json['id_str'].decode("utf-8")] = tweet_json['text']
            
        reaction_tweets = [d for d in os.listdir('{}/{}/{}/{}/reactions'.format(base_path, event, lvl_1_path, r_id))
                          if 'json' in d]
        for r_tweet_id in reaction_tweets:
            with open('{}/{}/{}/{}/reactions/{}'.format(base_path, event, lvl_1_path, r_id, r_tweet_id)) as json_file:
                tweet_json = json.load(json_file)
                id2text[tweet_json['id_str'].decode("utf-8")] = tweet_json['text']
    return id2text

In [98]:
events = [d for d in os.listdir(base_path) if d!='.DS_Store']
id2text = {}

for event in tqdm(events):
    threads = [d for d in os.listdir('{}/{}'.format(base_path, event))]

    rumour_threads = [d for d in os.listdir('{}/{}/rumours'.format(base_path, event)) if d != '.DS_Store']
    non_rumour_threads = [d for d in os.listdir('{}/{}/non-rumours'.format(base_path, event)) if d != '.DS_Store']

    id2text = process_id2text(rumour_threads, 'rumours', id2text)
    id2text = process_id2text(non_rumour_threads, 'non-rumours', id2text)

# with open('./id2text.pickle', 'wb') as handle:
#     pickle.dump(id2text, handle, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/9 [00:00<?, ?it/s]

charliehebdo-all-rnr-threads


 11%|█         | 1/9 [00:06<00:49,  6.15s/it]

ebola-essien-all-rnr-threads
ferguson-all-rnr-threads


 33%|███▎      | 3/9 [00:08<00:28,  4.72s/it]

germanwings-crash-all-rnr-threads


 44%|████▍     | 4/9 [00:09<00:17,  3.46s/it]

gurlitt-all-rnr-threads
ottawashooting-all-rnr-threads


 78%|███████▊  | 7/9 [00:11<00:03,  1.90s/it]

prince-toronto-all-rnr-threads
putinmissing-all-rnr-threads


 89%|████████▉ | 8/9 [00:11<00:01,  1.36s/it]

sydneysiege-all-rnr-threads


100%|██████████| 9/9 [00:13<00:00,  1.73s/it]


#### ID to tweet info mapping 

In [43]:
useful_tweet_fields = ['text','id_str','favorite_count','retweeted','entities','retweet_count','favorited',
                 'coordinates','created_at','place']
# entities = entities automatically detected and extracted by twitter api - provides span and text if any detected.

useful_user_fields = ['id_str', 'verified','followers_count', 'listed_count', 'statuses_count', 'description', 'friends_count', 'location',
'name', 'lang', 'favourites_count', 'screen_name', 'url', 'created_at','time_zone']
# listed_count = number of public lists that this user is a member of
# statuses_count = number of tweets/retweets
# description = self description
# favourites_count = number of Tweets this user has liked in the account’s lifetime
# url = user provided url associated with their profile

def format_tweet(tweet_obj):
    filtered_tweet_obj = {}
    for field in tweet_obj:
        field = field.decode("utf-8")

        if field == 'user':
            user_obj = {}
            for f in tweet_obj['user']:
                if f in useful_user_fields:
                    user_obj[f] = tweet_obj['user'][f]
            filtered_tweet_obj['user']=user_obj
        elif field in useful_tweet_fields:
            filtered_tweet_obj[field]=tweet_obj[field]
    
    return filtered_tweet_obj

In [44]:
def process_id2tweet(threads, lvl_1_path, id2tweet):
    for r_id in threads:
        # for each rumour_thead, there is ONE source-tweet, and MULTIPLE reations

        # {base}/{event}/{rumours or non-rumour)}/{thread-id}/source-tweets/{rumour-id}.json
        with open('{}/{}/{}/{}/source-tweets/{}.json'.format(base_path, event, lvl_1_path, r_id, r_id)) as json_file:
            tweet_json = json.load(json_file)
            tweet_json = format_tweet(tweet_json)
            id2tweet[tweet_json['id_str'].decode("utf-8")] = tweet_json
            
        reaction_tweets = [d for d in os.listdir('{}/{}/{}/{}/reactions'.format(base_path, event, lvl_1_path, r_id))
                          if 'json' in d]
        for r_tweet_id in reaction_tweets:
            with open('{}/{}/{}/{}/reactions/{}'.format(base_path, event, lvl_1_path, r_id, r_tweet_id)) as json_file:
                tweet_json = json.load(json_file)
                tweet_json = format_tweet(tweet_json)
                id2tweet[tweet_json['id_str'].decode("utf-8")] = tweet_json
    return id2tweet

In [45]:
events = [d for d in os.listdir(base_path) if d!='.DS_Store']
id2tweet = {}

for event in tqdm(events):
    threads = [d for d in os.listdir('{}/{}'.format(base_path, event))]
    print(event)

    rumour_threads = [d for d in os.listdir('{}/{}/rumours'.format(base_path, event)) if d != '.DS_Store']
    non_rumour_threads = [d for d in os.listdir('{}/{}/non-rumours'.format(base_path, event)) if d != '.DS_Store']

    id2tweet = process_id2tweet(rumour_threads, 'rumours', id2tweet)
    id2tweet = process_id2tweet(non_rumour_threads, 'non-rumours', id2tweet)

  0%|          | 0/9 [00:00<?, ?it/s]

charliehebdo-all-rnr-threads


 11%|█         | 1/9 [00:14<01:56, 14.54s/it]

ebola-essien-all-rnr-threads
ferguson-all-rnr-threads


 33%|███▎      | 3/9 [00:24<01:09, 11.62s/it]

germanwings-crash-all-rnr-threads


 44%|████▍     | 4/9 [00:25<00:43,  8.62s/it]

gurlitt-all-rnr-threads
ottawashooting-all-rnr-threads


 67%|██████▋   | 6/9 [00:30<00:20,  6.71s/it]

prince-toronto-all-rnr-threads


 78%|███████▊  | 7/9 [00:30<00:09,  4.80s/it]

putinmissing-all-rnr-threads


 89%|████████▉ | 8/9 [00:30<00:03,  3.45s/it]

sydneysiege-all-rnr-threads


100%|██████████| 9/9 [00:40<00:00,  5.18s/it]


In [46]:
with open('./id2tweet.pickle', 'wb') as handle:
    pickle.dump(id2tweet, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Check for the match between the branch and tweet labels

Tweet Stance Label

In [24]:
! cat /home/nayeon7lee/misinfo_data/pheme/threads/en/charliehebdo/552783667052167168/annotation.json

{"is_rumour":"rumour","category":"(At least) 10 people are dead at Charlie Hebdo offices","misinformation":0,"true":"1","links":[{"link":"http:\/\/www.bbc.co.uk\/news\/live\/world-europe-30710777?ns_mchannel=social","mediatype":"news-media","position":"for"},{"link":"http:\/\/news.sky.com\/story\/1403662\/paris-attack-manhunt-as-armed-killers-flee","mediatype":"news-media","position":"for"}],"is_turnaround":0}

In [77]:
tweet_rumour_eval_path = "/home/nayeon7lee/misinfo_data/rumoureval2017"

train: /home/nayeon7lee/misinfo_data/rumoureval2017/semeval2017-task8-dataset/traindev/rumoureval-subtaskA-train.json

dev: /home/nayeon7lee/misinfo_data/rumoureval2017/semeval2017-task8-dataset/traindev/rumoureval-subtaskA-dev.json

test: /home/nayeon7lee/misinfo_data/rumoureval2017/subtaska-gold_standard_test.json

In [100]:
all_stance_label_dict = {}
for name in ['semeval2017-task8-dataset/traindev/rumoureval-subtaskA-train.json', 
            'semeval2017-task8-dataset/traindev/rumoureval-subtaskA-dev.json', 'subtaska-gold_standard_test.json']:
    for idx, line in enumerate(open("{}/{}".format(tweet_rumour_eval_path, name), 'r')):
        label_dict = json.loads(line)
        all_stance_label_dict.update(label_dict)

with open('./rumoureval_tweet_annotations.pickle', 'wb') as handle:
    pickle.dump(all_stance_label_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

Thread Label - each branch separate

In [114]:
all_event_threads = {}

no_tweet_rid = 0
yes_tweet_rid = 0
mismatch = 0

events = [d for d in os.listdir(base_path) if d!='.DS_Store']
for event in tqdm(events):
    threads = []
    rumour_threads = [d for d in os.listdir('{}/{}/rumours'.format(base_path, event)) if d != '.DS_Store']
    non_rumour_threads = [d for d in os.listdir('{}/{}/non-rumours'.format(base_path, event)) if d != '.DS_Store']
    print(event)
    for r_id in rumour_threads:
        with open('{}/{}/rumours/{}/structure.json'.format(base_path, event, r_id)) as json_file:
            thread_tree = json.load(json_file)
            branches = tree2branches(thread_tree)
            
            if len(branches)==0:
                branches = [[r_id]]
            
            if branches[0][0] in tweet2stance[r_id]:
                yes_tweet_rid += 1
            for branch in branches:
                threads += {'thread_id': r_id, 'branch': branch},
                
                if '521311862118711296”:[],”521313155004497920”:[],”521313840416714752' in branch:
                    print(r_id, branch)
                    print("here!!!")

#                 branch_labels = [all_stance_label_dict[tweet_id] for tweet_id in branch
#                                 if tweet_id in all_stance_label_dict]
                
#                 if len(branch_labels) not in [0,1] and len(branch) != len(branch_labels):
#                     mismatch += 1
#                     print(r_id, branch)
#                     print(len(branch), len(branch_labels))
#                     print([all_stance_label_dict[tweet_id] if tweet_id in all_stance_label_dict else -1
#                                      for tweet_id in branch])

                    
    for r_id in non_rumour_threads:
        with open('{}/{}/non-rumours/{}/structure.json'.format(base_path, event, r_id)) as json_file:
            thread_tree = json.load(json_file)
            branches = tree2branches(thread_tree)
            
            if len(branches)==0:
                branches = [[r_id]]

            for branch in branches:
                threads += {'thread_id': r_id, 'branch': branch},

    event_name = event.split('-')[0]
    all_event_threads[event_name] = threads
    print(event_name, len(threads))
    
print(no_tweet_rid, yes_tweet_rid)

  0%|          | 0/9 [00:00<?, ?it/s]

charliehebdo-all-rnr-threads


 33%|███▎      | 3/9 [00:00<00:01,  4.03it/s]

charliehebdo 23619
ebola-essien-all-rnr-threads
ebola 162
ferguson-all-rnr-threads
ferguson 14519
germanwings-crash-all-rnr-threads


 67%|██████▋   | 6/9 [00:00<00:00,  5.31it/s]

germanwings 3053
gurlitt-all-rnr-threads
gurlitt 154
ottawashooting-all-rnr-threads
ottawashooting 8351
prince-toronto-all-rnr-threads
prince 543
putinmissing-all-rnr-threads
putinmissing 534
sydneysiege-all-rnr-threads


100%|██████████| 9/9 [00:00<00:00,  9.64it/s]

sydneysiege 14684
0 297



