In [111]:
import pandas as pd
import numpy as np
import os
import json

In [112]:
def read_json_file(path):
    """Parse the JSON document stored at ``path`` and return the decoded value.

    Args:
        path: location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, ...).

    Raises:
        OSError: the file cannot be opened.
        ValueError: the content is not valid JSON (``json.JSONDecodeError``).
    """
    # The context manager closes the handle even when parsing fails, and the
    # explicit encoding keeps decoding independent of the machine's locale.
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)

In [113]:
# Root folders of the tweet threads for each split.
training_dataset_directory = 'datasets/rumoureval-2019-training-data/twitter-english'
test_dataset_directory = 'datasets/rumoureval-2019-test-data/twitter-en-test-data'

# Label ("key") files; the training labels are spread over two of them.
training_labels_json = 'datasets/rumoureval-2019-training-data/train-key.json'
training_labels_json_2 = 'datasets/rumoureval-2019-training-data/dev-key.json'
test_labels_json = 'datasets/final-eval-key.json'

# Only the 'subtaskaenglish' section of each key file is used. The two training
# files are merged into one mapping; on a duplicate key the second file wins.
training_labels_dict = {
    **read_json_file(training_labels_json)['subtaskaenglish'],
    **read_json_file(training_labels_json_2)['subtaskaenglish'],
}
test_labels_dict = read_json_file(test_labels_json)['subtaskaenglish']

In [114]:
class Tweet:
    """One post together with the few fields this notebook keeps about it.

    ``category`` and ``user_metadata`` start as ``None``; ``category`` is filled
    in later through ``add_category``.
    """

    def __init__(self, post_content, post_id, parent_post_id=None, external_urls_count=0):
        """Record the post's text, ids and whether it carries any URL.

        Args:
            post_content: text of the post.
            post_id: identifier of the post.
            parent_post_id: identifier of the post this one is attached to, if any.
            external_urls_count: number of URLs in the post; only whether it is
                greater than zero is stored.
        """
        self.post_id = post_id
        self.parent_post_id = parent_post_id
        self.post_content = post_content

        # Only the presence of links is kept, not how many there are.
        has_links = external_urls_count > 0
        self.external_urls = has_links

        # Not known at construction time.
        self.category = None
        self.user_metadata = None

    def add_category(self, category):
        """Set this post's label to ``category``, replacing any previous value."""
        self.category = category
        
#     def __repr__(self):
#         return f"tweet {self.post_id}: {self.post_content}, category: {self.category}"

In [115]:
class SourceTweet:
    """A tweet bundled with the list of replies collected for it."""

    def __init__(self, tweet: Tweet):
        """Wrap ``tweet`` and start with no replies."""
        self.tweet = tweet
        self.replies = list()

    def add_reply(self, reply: Tweet):
        """Append ``reply`` to the end of ``self.replies``."""
        collected = self.replies
        collected.append(reply)
        
#     def __repr__(self):
#         return f"source {self.tweet}: {len(self.replies)} replies"

In [116]:
def _visible_entries(dir_path, dirs_only=False):
    """List the non-hidden entries of ``dir_path`` in name order.

    Names starting with '.' are skipped so stray files such as ``.DS_Store`` or
    ``._<id>.json`` are never parsed as tweets. Sorting makes the result
    independent of the order in which the OS happens to list the directory.

    Args:
        dir_path: directory to list.
        dirs_only: keep only sub-directories when true, only files otherwise.

    Returns:
        A list of ``os.DirEntry`` objects sorted by ``name``.
    """
    return sorted(
        (entry for entry in os.scandir(dir_path)
         if not entry.name.startswith('.')
         and (entry.is_dir() if dirs_only else entry.is_file())),
        key=lambda entry: entry.name,
    )


def read_dataset(dataset_dir_path, labels_dict):
    """Load every (source tweet, reply) pair under ``dataset_dir_path``, grouped by topic.

    Directory layout read by this function::

        <dataset_dir_path>/<topic>/<thread>/source-tweet/<one JSON file>
        <dataset_dir_path>/<topic>/<thread>/replies/<one JSON file per reply>

    Args:
        dataset_dir_path: root directory holding one sub-directory per topic.
        labels_dict: maps ``str(reply id)`` to the category stored on the reply.

    Returns:
        dict mapping each topic directory's name to a list of
        ``(SourceTweet, Tweet)`` tuples, one tuple per reply. Topics, threads
        and replies are visited in name order. A thread without a ``replies``
        directory contributes no tuples.

    Raises:
        IndexError: a thread's ``source-tweet`` directory has no visible file.
        KeyError: a reply id is missing from ``labels_dict``, or a tweet JSON
            lacks one of the fields read here.
    """
    topic_to_tuples_map = {}  # {topic_name: [(SourceTweet, Tweet), ...]}

    for topic_entry in _visible_entries(dataset_dir_path, dirs_only=True):
        tweet_pairs = []

        for thread_entry in _visible_entries(topic_entry.path, dirs_only=True):
            # The first visible file of the folder is taken as the source tweet.
            source_dir = os.path.join(thread_entry.path, 'source-tweet')
            source_tweet_json = read_json_file(_visible_entries(source_dir)[0].path)

            tweet = Tweet(source_tweet_json['text'], source_tweet_json['id'],
                          source_tweet_json['in_reply_to_status_id'],
                          len(source_tweet_json['entities']['urls']))
            source_tweet = SourceTweet(tweet)

            replies_dir = os.path.join(thread_entry.path, 'replies')
            reply_entries = _visible_entries(replies_dir) if os.path.isdir(replies_dir) else []
            for reply_entry in reply_entries:
                reply_tweet_json = read_json_file(reply_entry.path)
                # Every reply is attached to the thread's source tweet, whatever
                # tweet it answered directly.
                reply_tweet = Tweet(reply_tweet_json['text'], reply_tweet_json['id'],
                                    source_tweet.tweet.post_id,
                                    len(reply_tweet_json['entities']['urls']))
                reply_tweet.add_category(labels_dict[str(reply_tweet_json['id'])])

                source_tweet.add_reply(reply_tweet)
                tweet_pairs.append((source_tweet, reply_tweet))

        # The entry's own name is the topic; no separator-specific path
        # splitting, so this works on any operating system.
        topic_to_tuples_map[topic_entry.name] = tweet_pairs

    return topic_to_tuples_map
       

In [117]:
# Load both splits. Despite the "tweets_map" names, read_dataset returns its
# tuples map, so each value is a list of (SourceTweet, reply Tweet) pairs.
training_topic_to_tweets_map = read_dataset(training_dataset_directory, training_labels_dict)
test_topic_to_tweets_map = read_dataset(test_dataset_directory, test_labels_dict)

In [118]:
def create_df(tuples_map):
    """Flatten a topic -> pairs mapping into a DataFrame with one row per pair.

    Args:
        tuples_map: dict mapping a topic to a list of ``(source, reply)`` pairs.
            ``source.tweet.post_content`` is read from the first element and
            ``post_content``, ``external_urls`` and ``category`` from the second.

    Returns:
        ``pd.DataFrame`` with columns ``topic``, ``source_tweet``, ``reply``,
        ``external_urls`` and ``category``, rows in iteration order of the input.
    """
    column_names = ['topic', 'source_tweet', 'reply', 'external_urls', 'category']
    rows = [
        (topic_name, source.tweet.post_content, answer.post_content,
         answer.external_urls, answer.category)
        for topic_name, pairs in tuples_map.items()
        for source, answer in pairs
    ]
    return pd.DataFrame(rows, columns=column_names)

In [119]:
# One row per (source tweet, reply) pair of the training split; the bare name on
# the last line makes the notebook display the frame.
training_df = create_df(training_topic_to_tweets_map)
training_df

Unnamed: 0,topic,source_tweet,reply,external_urls,category
0,charliehebdo,France: 10 people dead after shooting at HQ of...,MT @euronews France: 10 dead after shooting at...,False,comment
1,charliehebdo,France: 10 people dead after shooting at HQ of...,@j0nathandavis They who? Stupid and partial op...,False,deny
2,charliehebdo,France: 10 people dead after shooting at HQ of...,"@nanoSpawn Socialists, Antisemites, anti zioni...",False,comment
3,charliehebdo,France: 10 people dead after shooting at HQ of...,@euronews @TradeDesk_Steve A French crime of p...,False,query
4,charliehebdo,France: 10 people dead after shooting at HQ of...,"@euronews LOL. 5 million Muslims in France, wh...",False,comment
...,...,...,...,...,...
5238,sydneysiege,Police confirm that #sydneysiege is finally ov...,@Angus_OL The 6 of us are watching this unfold...,False,support
5239,sydneysiege,Police confirm that #sydneysiege is finally ov...,@emaccaz_ omfg it is 😱😱😱😱😱,False,comment
5240,sydneysiege,Police confirm that #sydneysiege is finally ov...,@Angus_OL thank god they're all safe now. some...,False,support
5241,sydneysiege,Police confirm that #sydneysiege is finally ov...,"@Angus_OL thank god its over, they're finally ...",False,comment


In [120]:
# One row per (source tweet, reply) pair of the test split; the bare name on the
# last line makes the notebook display the frame.
test_df = create_df(test_topic_to_tweets_map)
test_df

Unnamed: 0,topic,source_tweet,reply,external_urls,category
0,afghanistan,#Breaking: Pentagon releases video of the “mot...,@TODAYshow \n\nBig expensive payload to kill 3...,False,comment
1,afghanistan,#Breaking: Pentagon releases video of the “mot...,@TODAYshow How many ISIS did it kill?,False,query
2,afghanistan,#Breaking: Pentagon releases video of the “mot...,@TODAYshow @HallieJackson The mother... give m...,False,comment
3,afghanistan,#Breaking: Pentagon releases video of the “mot...,"@TODAYshow \nBefore and after, looks like ther...",False,comment
4,afghanistan,#Breaking: Pentagon releases video of the “mot...,@TODAYshow The next one could be the Daddy of ...,False,comment
...,...,...,...,...,...
1005,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...","From @clairezillman:\n\n""Those fires took plac...",True,support
1006,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...","And @SteveKnight25, your ""YEA"" vote in favor o...",False,comment
1007,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...",@CA25UP @SteveKnight25 DAAAAAAMN. You got ownn...,False,comment
1008,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...","Btw, a one-time disaster relief package does n...",True,comment
