In [222]:
import pandas as pd
import numpy as np
import os
import json

In [223]:
def read_json_file(path):
    """Load and return the parsed content of a JSON file.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The deserialized document, i.e. whatever ``json.load`` produces
        (a dict for the label and post files used in this notebook).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # The context manager closes the handle even when parsing fails, and the
    # explicit encoding keeps decoding independent of the platform default.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [224]:
# Dataset locations, relative to the notebook's working directory.
# Twitter threads:
training_dataset_directory = 'datasets/rumoureval-2019-training-data/twitter-english'
test_dataset_directory = 'datasets/rumoureval-2019-test-data/twitter-en-test-data'

# Reddit threads:
training_dataset_reddit_directory = 'datasets/rumoureval-2019-training-data/reddit-training-data'
test_dataset_reddit_directory = 'datasets/rumoureval-2019-test-data/reddit-test-data'

# Label files (train and dev keys are both used for training):
training_labels_json = 'datasets/rumoureval-2019-training-data/train-key.json'
training_labels_json_2 = 'datasets/rumoureval-2019-training-data/dev-key.json'
test_labels_json = 'datasets/final-eval-key.json'

# Merge the train and dev labels into one dict; on a duplicate id the dev
# entry wins, exactly as a dict.update() with the dev labels would do.
training_labels_dict = {
    **read_json_file(training_labels_json)['subtaskaenglish'],
    **read_json_file(training_labels_json_2)['subtaskaenglish'],
}
test_labels_dict = read_json_file(test_labels_json)['subtaskaenglish']

In [225]:
class Tweet:
    """One post of a conversation thread, with an optional category label.

    The constructor stores the text, the post id and the parent id as given.
    The URL count is reduced to a boolean flag. ``category`` and
    ``user_metadata`` start out as ``None``.
    """

    def __init__(self, post_content, post_id, parent_post_id=None, external_urls_count=0):
        # Only the presence of URLs is remembered, not how many there were.
        has_external_urls = external_urls_count > 0

        self.post_content = post_content
        self.post_id = post_id
        self.category = None  # assigned later through add_category()
        self.parent_post_id = parent_post_id
        self.external_urls = has_external_urls
        self.user_metadata = None  # never populated in the code shown here

    def add_category(self, category):
        """Set (or overwrite) the category label of this post."""
        self.category = category
        
#     def __repr__(self):
#         return f"tweet {self.post_id}: {self.post_content}, category: {self.category}"

In [226]:
class SourceTweet:
    """A thread-opening post together with the replies collected for it."""

    def __init__(self, tweet: Tweet):
        # The wrapped post, and an initially empty list that add_reply() fills.
        self.tweet = tweet
        self.replies = list()

    def add_reply(self, reply: Tweet):
        """Append ``reply`` to the end of this thread's reply list."""
        self.replies.append(reply)
        
#     def __repr__(self):
#         return f"source {self.tweet}: {len(self.replies)} replies"

In [227]:
def read_tweets_dataset(dataset_dir_path, labels_dict):
    """Read a Twitter dataset directory into (source, reply) pairs per topic.

    Directory layout, as traversed below::

        <dataset_dir_path>/<topic>/<thread>/source-tweet/<one JSON file>
        <dataset_dir_path>/<topic>/<thread>/replies/<one JSON file per reply>

    Args:
        dataset_dir_path: Directory whose sub-directories are the topics.
        labels_dict: Mapping from reply id (as a string) to its category.

    Returns:
        dict mapping each topic name to a list of ``(SourceTweet, Tweet)``
        tuples, one tuple per reply. Topics, threads and replies are visited
        in name order, so the result is reproducible across file systems.

    Raises:
        IndexError: If a thread's ``source-tweet`` directory holds no file.
        KeyError: If a reply id is missing from ``labels_dict`` or a post
            JSON lacks one of the fields read here.
    """

    def _entries(dir_path, want_dirs):
        # Visible sub-directories (or files) of dir_path, sorted by name.
        # os.scandir yields entries in arbitrary order, and hidden entries
        # such as .DS_Store or .ipynb_checkpoints are not dataset content.
        with os.scandir(dir_path) as scan:
            kept = [
                entry for entry in scan
                if not entry.name.startswith('.')
                and (entry.is_dir() if want_dirs else entry.is_file())
            ]
        return sorted(kept, key=lambda entry: entry.name)

    topic_to_tuples_map = {}  # {topic_name: [(SourceTweet, Tweet), ...]}

    for topic_entry in _entries(dataset_dir_path, want_dirs=True):
        # DirEntry.name is the last path component on every OS; splitting the
        # path on a backslash only worked with Windows-style separators.
        topic_name = topic_entry.name
        tweet_pairs = []

        for thread_entry in _entries(topic_entry.path, want_dirs=True):
            source_dir = os.path.join(thread_entry.path, 'source-tweet')
            source_tweet_path = _entries(source_dir, want_dirs=False)[0].path
            source_tweet_json = read_json_file(source_tweet_path)

            source_tweet = SourceTweet(Tweet(
                source_tweet_json['text'],
                source_tweet_json['id'],
                source_tweet_json['in_reply_to_status_id'],
                len(source_tweet_json['entities']['urls']),
            ))

            replies_dir = os.path.join(thread_entry.path, 'replies')
            # A thread without a replies directory simply contributes no pairs.
            reply_entries = (
                _entries(replies_dir, want_dirs=False)
                if os.path.isdir(replies_dir) else []
            )
            for reply_entry in reply_entries:
                reply_tweet_json = read_json_file(reply_entry.path)

                # Every reply is parented to the thread's source post.
                reply_tweet = Tweet(
                    reply_tweet_json['text'],
                    reply_tweet_json['id'],
                    source_tweet.tweet.post_id,
                    len(reply_tweet_json['entities']['urls']),
                )
                reply_tweet.add_category(labels_dict[str(reply_tweet_json['id'])])

                source_tweet.add_reply(reply_tweet)
                tweet_pairs.append((source_tweet, reply_tweet))

        topic_to_tuples_map[topic_name] = tweet_pairs

    return topic_to_tuples_map
       

In [228]:
def read_reddit_dataset(dataset_dir_path, labels_dict):
    """Read a Reddit dataset directory into (source, reply) pairs per thread.

    Directory layout, as traversed below::

        <dataset_dir_path>/<thread>/source-tweet/<one JSON file>
        <dataset_dir_path>/<thread>/replies/<one JSON file per reply>

    The thread directory name is used as the "topic" key of the result.

    Args:
        dataset_dir_path: Directory whose sub-directories are the threads.
        labels_dict: Mapping from reply id (as a string) to its category.

    Returns:
        dict mapping each thread directory name to a list of
        ``(SourceTweet, Tweet)`` tuples, one tuple per reply that has a
        ``body``. Threads and replies are visited in name order, so the
        result is reproducible across file systems.

    Raises:
        IndexError: If a thread's ``source-tweet`` directory holds no file.
        KeyError: If a reply id is missing from ``labels_dict`` or a post
            JSON lacks one of the fields read here.
    """

    def _entries(dir_path, want_dirs):
        # Visible sub-directories (or files) of dir_path, sorted by name.
        # os.scandir yields entries in arbitrary order, and hidden entries
        # such as .DS_Store or .ipynb_checkpoints are not dataset content.
        with os.scandir(dir_path) as scan:
            kept = [
                entry for entry in scan
                if not entry.name.startswith('.')
                and (entry.is_dir() if want_dirs else entry.is_file())
            ]
        return sorted(kept, key=lambda entry: entry.name)

    topic_to_tuples_map = {}  # {topic_name: [(SourceTweet, Tweet), ...]}

    for thread_entry in _entries(dataset_dir_path, want_dirs=True):
        # DirEntry.name is the last path component on every OS; splitting the
        # path on a backslash only worked with Windows-style separators.
        topic_name = thread_entry.name
        tweet_pairs = []

        source_dir = os.path.join(thread_entry.path, 'source-tweet')
        source_tweet_path = _entries(source_dir, want_dirs=False)[0].path
        source_tweet_json = read_json_file(source_tweet_path)

        # The submission record is the first child of the listing.
        submission = source_tweet_json['data']['children'][0]['data']
        content = submission['title'] + ' ' + submission['selftext']
        # Occurrences of the substring "http" stand in for a URL count, so
        # links written without a scheme (e.g. "www...") are not counted.
        source_tweet = SourceTweet(
            Tweet(content, submission['id'], None, content.count("http"))
        )

        replies_dir = os.path.join(thread_entry.path, 'replies')
        # A thread without a replies directory simply contributes no pairs.
        reply_entries = (
            _entries(replies_dir, want_dirs=False)
            if os.path.isdir(replies_dir) else []
        )
        for reply_entry in reply_entries:
            reply_data = read_json_file(reply_entry.path)['data']

            # Records without a 'body' field are skipped (presumably they are
            # not actual comments -- confirm against the dataset).
            if 'body' not in reply_data:
                continue

            body = reply_data['body']
            reply_tweet = Tweet(body, reply_data['id'],
                                source_tweet.tweet.post_id, body.count('http'))
            reply_tweet.add_category(labels_dict[str(reply_tweet.post_id)])
            source_tweet.add_reply(reply_tweet)
            tweet_pairs.append((source_tweet, reply_tweet))

        topic_to_tuples_map[topic_name] = tweet_pairs

    return topic_to_tuples_map

In [229]:
def create_df(tuples_map):
    """Flatten a ``{topic: [(source, reply), ...]}`` map into a DataFrame.

    Each (source, reply) pair becomes one row holding the topic, the source
    post text, the reply text, the reply's ``external_urls`` flag and the
    reply's category.

    Args:
        tuples_map: Mapping from topic name to a list of
            ``(source, reply)`` pairs, where ``source.tweet.post_content``,
            ``reply.post_content``, ``reply.external_urls`` and
            ``reply.category`` are readable.

    Returns:
        pandas.DataFrame with columns ``topic``, ``source_tweet``, ``reply``,
        ``external_urls`` and ``category``, rows in iteration order.
    """
    column_names = ['topic', 'source_tweet', 'reply', 'external_urls', 'category']
    rows = [
        (topic_name, source.tweet.post_content, answer.post_content,
         answer.external_urls, answer.category)
        for topic_name, pairs in tuples_map.items()
        for source, answer in pairs
    ]
    return pd.DataFrame(rows, columns=column_names)

In [230]:
# Twitter
training_topic_to_tweets_map = read_tweets_dataset(training_dataset_directory, training_labels_dict)
test_topic_to_tweets_map = read_tweets_dataset(test_dataset_directory, test_labels_dict)

# Reddit
training_topic_to_reddit_map = read_reddit_dataset(training_dataset_reddit_directory, training_labels_dict)
test_topic_to_reddit_map = read_reddit_dataset(test_dataset_reddit_directory, test_labels_dict)

All DataFrames

In [235]:
# One DataFrame per (platform, split); each row is a (source, reply) pair.
training_tweets_df = create_df(tuples_map=training_topic_to_tweets_map)
test_tweets_df = create_df(tuples_map=test_topic_to_tweets_map)
training_reddit_df = create_df(tuples_map=training_topic_to_reddit_map)
test_reddit_df = create_df(tuples_map=test_topic_to_reddit_map)

In [236]:
# Display the Twitter training pairs produced by create_df.
training_tweets_df

Unnamed: 0,topic,source_tweet,reply,external_urls,category
0,charliehebdo,France: 10 people dead after shooting at HQ of...,MT @euronews France: 10 dead after shooting at...,False,comment
1,charliehebdo,France: 10 people dead after shooting at HQ of...,@j0nathandavis They who? Stupid and partial op...,False,deny
2,charliehebdo,France: 10 people dead after shooting at HQ of...,"@nanoSpawn Socialists, Antisemites, anti zioni...",False,comment
3,charliehebdo,France: 10 people dead after shooting at HQ of...,@euronews @TradeDesk_Steve A French crime of p...,False,query
4,charliehebdo,France: 10 people dead after shooting at HQ of...,"@euronews LOL. 5 million Muslims in France, wh...",False,comment
...,...,...,...,...,...
5238,sydneysiege,Police confirm that #sydneysiege is finally ov...,@Angus_OL The 6 of us are watching this unfold...,False,support
5239,sydneysiege,Police confirm that #sydneysiege is finally ov...,@emaccaz_ omfg it is 😱😱😱😱😱,False,comment
5240,sydneysiege,Police confirm that #sydneysiege is finally ov...,@Angus_OL thank god they're all safe now. some...,False,support
5241,sydneysiege,Police confirm that #sydneysiege is finally ov...,"@Angus_OL thank god its over, they're finally ...",False,comment


In [237]:
# Display the Twitter test pairs produced by create_df.
test_tweets_df

Unnamed: 0,topic,source_tweet,reply,external_urls,category
0,afghanistan,#Breaking: Pentagon releases video of the “mot...,@TODAYshow \n\nBig expensive payload to kill 3...,False,comment
1,afghanistan,#Breaking: Pentagon releases video of the “mot...,@TODAYshow How many ISIS did it kill?,False,query
2,afghanistan,#Breaking: Pentagon releases video of the “mot...,@TODAYshow @HallieJackson The mother... give m...,False,comment
3,afghanistan,#Breaking: Pentagon releases video of the “mot...,"@TODAYshow \nBefore and after, looks like ther...",False,comment
4,afghanistan,#Breaking: Pentagon releases video of the “mot...,@TODAYshow The next one could be the Daddy of ...,False,comment
...,...,...,...,...,...
1005,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...","From @clairezillman:\n\n""Those fires took plac...",True,support
1006,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...","And @SteveKnight25, your ""YEA"" vote in favor o...",False,comment
1007,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...",@CA25UP @SteveKnight25 DAAAAAAMN. You got ownn...,False,comment
1008,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...","Btw, a one-time disaster relief package does n...",True,comment


In [238]:
# Display the Reddit training pairs produced by create_df.
training_reddit_df

Unnamed: 0,topic,source_tweet,reply,external_urls,category
0,18dmb4,Even ants won't eat aspartame!,"Wikipedia would be a good start, I think. \nh...",True,comment
1,18dmb4,Even ants won't eat aspartame!,Snopes has the basics: \nwww.snopes.com/humor...,False,comment
2,18dmb4,Even ants won't eat aspartame!,"Just about every sentence is wrong, as others ...",True,comment
3,18dmb4,Even ants won't eat aspartame!,I find the Snopes ant test fascinating but it ...,False,comment
4,18dmb4,Even ants won't eat aspartame!,TIL: Aspartame contains 10 calories per teaspo...,True,comment
...,...,...,...,...,...
651,8yktu5,Jon Sopel: Bizarre. @realDonaldTrump says he c...,Quote:\n\n&gt; I was opening Turnberry the day...,False,deny
652,8yktu5,Jon Sopel: Bizarre. @realDonaldTrump says he c...,[deleted],False,comment
653,8yktu5,Jon Sopel: Bizarre. @realDonaldTrump says he c...,He said he was opening it the day before Brexi...,False,comment
654,8yktu5,Jon Sopel: Bizarre. @realDonaldTrump says he c...,"""Well if you remember I was opening Turnberry ...",False,comment


In [239]:
# Display the Reddit test pairs produced by create_df.
test_reddit_df

Unnamed: 0,topic,source_tweet,reply,external_urls,category
0,1d7lzf,Debunk This: A friend of mine claims Red Dye c...,You can't reason someone out of a belief they ...,False,comment
1,1d7lzf,Debunk This: A friend of mine claims Red Dye c...,"From wikipedia: ""Though past research showed n...",False,comment
2,1d7lzf,Debunk This: A friend of mine claims Red Dye c...,Whats the reason behind your reasoning? I reas...,False,comment
3,1d7lzf,Debunk This: A friend of mine claims Red Dye c...,You didn't use reason to come to the conclusio...,False,comment
4,1d7lzf,Debunk This: A friend of mine claims Red Dye c...,"Hah, well I'll be damned. \n\nAnd to be fair, ...",False,comment
...,...,...,...,...,...
660,xn2bn,"I've been searching, and can't find a single c...","Lawl. I'm assuming you're a troll. But if not,...",False,comment
661,xn2bn,"I've been searching, and can't find a single c...","not saying bush was the best, or palin was the...",False,comment
662,xn2bn,"I've been searching, and can't find a single c...",^^^^^\nthey would do that if someone hadn't se...,False,comment
663,xn2bn,"I've been searching, and can't find a single c...","You are right about that, but we didn't know s...",False,comment
