In [35]:
import pandas as pd
import numpy as np
import os
import json

In [36]:
def read_json_file(path):
    """Parse the JSON document stored at ``path`` and return the decoded object.

    Args:
        path: location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # ``with`` guarantees the handle is closed even when parsing fails, and the
    # explicit encoding keeps the result independent of the platform default.
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)

In [37]:
# Root directories that read_dataset walks for the training and test splits.
training_dataset_directory = 'datasets/rumoureval-2019-training-data/twitter-english'
test_dataset_directory = 'datasets/rumoureval-2019-test-data/twitter-en-test-data'

# Label files; each is read for its 'subtaskaenglish' entry.
training_labels_json = 'datasets/rumoureval-2019-training-data/train-key.json'
training_labels_json_2 = 'datasets/rumoureval-2019-training-data/dev-key.json'
test_labels_json = 'datasets/final-eval-key.json'

# Training labels are the train key merged with the dev key; on a duplicate
# id the dev-key entry wins because it is unpacked last.
training_labels_dict = {
    **read_json_file(training_labels_json)['subtaskaenglish'],
    **read_json_file(training_labels_json_2)['subtaskaenglish'],
}
test_labels_dict = read_json_file(test_labels_json)['subtaskaenglish']

In [38]:
class Tweet:
    """A single post (source tweet or reply) and the category assigned to it."""

    def __init__(self, post_content, post_id, parent_post_id=None, external_urls_count=0):
        """Store the post's text and identifiers.

        Args:
            post_content: text of the post.
            post_id: identifier of the post.
            parent_post_id: identifier of the post this one replies to, if any.
            external_urls_count: number of URLs attached to the post; only
                whether it is positive is kept.
        """
        self.post_content = post_content
        self.post_id = post_id
        self.category = None  # filled in later through add_category
        self.parent_post_id = parent_post_id
        self.presence_of_external_urls = external_urls_count > 0
        self.user_metadata = None  # placeholder; not populated by this class

    def add_category(self, category):
        """Attach (or overwrite) the category label of this post."""
        self.category = category

    def __repr__(self):
        """Readable summary so containers of tweets are inspectable in the notebook."""
        return f"tweet {self.post_id}: {self.post_content}, category: {self.category}"

In [39]:
class SourceTweet:
    """A thread-starting tweet bundled with the replies collected for it."""

    # Annotations are quoted so defining this class does not require ``Tweet``
    # to exist yet (e.g. when cells are re-run out of order).
    def __init__(self, tweet: "Tweet"):
        """Wrap ``tweet`` and start with an empty reply list."""
        self.tweet = tweet
        self.replies = []  # replies in the order they were added

    def add_reply(self, reply: "Tweet"):
        """Append ``reply`` to this source tweet's replies."""
        self.replies.append(reply)

    def __repr__(self):
        """Readable summary: the wrapped tweet plus the number of replies."""
        return f"source {self.tweet}: {len(self.replies)} replies"

In [40]:
def _sorted_entries(path, dirs_only=False):
    """Return the ``os.DirEntry`` objects under ``path`` sorted by name (optionally directories only)."""
    with os.scandir(path) as entries:
        return sorted(
            (entry for entry in entries if not dirs_only or entry.is_dir()),
            key=lambda entry: entry.name,
        )


def read_dataset(dataset_dir_path, labels_dict):
    """Load every (source tweet, reply) pair of a dataset, grouped by topic.

    Directory layout traversed:
    ``<dataset_dir_path>/<topic>/<thread>/source-tweet/<file>`` and
    ``<dataset_dir_path>/<topic>/<thread>/replies/<file>...``.

    Args:
        dataset_dir_path: directory whose sub-directories are the topics.
        labels_dict: maps a reply's id (as a string) to its category.

    Returns:
        dict ``{topic_name: [(SourceTweet, Tweet), ...]}`` with one pair per
        reply. All replies of a thread share the same ``SourceTweet`` object.
        Topics, threads and replies are visited in name order so the result
        is reproducible across file systems.

    Raises:
        KeyError: if a reply's id has no entry in ``labels_dict`` or a tweet
            JSON lacks an expected field.
        IndexError: if a thread's ``source-tweet`` directory is empty.
    """
    topic_to_tuples_map = {}  # {topic_name: [(SourceTweet, Tweet), ...]}

    for topic_entry in _sorted_entries(dataset_dir_path, dirs_only=True):
        tweet_pairs = []

        for thread_entry in _sorted_entries(topic_entry.path, dirs_only=True):
            # The first file (by name) in source-tweet/ is read as the source tweet.
            source_tweet_entry = _sorted_entries(
                os.path.join(thread_entry.path, 'source-tweet'))[0]
            source_tweet_json = read_json_file(source_tweet_entry.path)

            tweet = Tweet(source_tweet_json['text'], source_tweet_json['id'],
                          source_tweet_json['in_reply_to_status_id'],
                          len(source_tweet_json['entities']['urls']))
            source_tweet = SourceTweet(tweet)

            for reply_entry in _sorted_entries(os.path.join(thread_entry.path, 'replies')):
                reply_tweet_json = read_json_file(reply_entry.path)
                # Every reply is parented to the thread's source tweet.
                reply_tweet = Tweet(reply_tweet_json['text'], reply_tweet_json['id'],
                                    source_tweet.tweet.post_id,
                                    len(reply_tweet_json['entities']['urls']))
                reply_tweet.add_category(labels_dict[str(reply_tweet_json['id'])])

                source_tweet.add_reply(reply_tweet)
                tweet_pairs.append((source_tweet, reply_tweet))

        # DirEntry.name is the bare directory name whatever the path separator
        # is, unlike splitting the path on a backslash.
        topic_to_tuples_map[topic_entry.name] = tweet_pairs

    return topic_to_tuples_map
       

In [41]:
# Training split as returned by read_dataset: {topic_name: [(SourceTweet, Tweet), ...]}.
# Despite the variable name, the values are (source, reply) pairs, not bare tweets.
training_topic_to_tweets_map = read_dataset(training_dataset_directory, training_labels_dict)

In [42]:
# Test split as returned by read_dataset: {topic_name: [(SourceTweet, Tweet), ...]}.
# Despite the variable name, the values are (source, reply) pairs, not bare tweets.
test_topic_to_tweets_map = read_dataset(test_dataset_directory, test_labels_dict)

`map_topic_to_triples` groups the triples by topic and returns them in the form ```{topic_name: [(source_tweet_text, reply_tweet_text, category), ...]}```

In [43]:
def map_topic_to_triples(tuples_map):
    """Convert per-topic (source, reply) pairs into per-topic text triples.

    Args:
        tuples_map: mapping ``{topic: [(source_tweet, reply), ...]}`` where
            ``source_tweet.tweet.post_content``, ``reply.post_content`` and
            ``reply.category`` are readable.

    Returns:
        dict ``{topic: [(source_tweet_text, reply_tweet_text, category), ...]}``
        with topics and pairs in their original order.
    """
    return {
        topic: [
            (source.tweet.post_content, answer.post_content, answer.category)
            for source, answer in pairs
        ]
        for topic, pairs in tuples_map.items()
    }

In [44]:
# Display the per-topic triples built from the training split.
map_topic_to_triples(training_topic_to_tweets_map)

{'charliehebdo': [('France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according to witnesses http://t.co/FkYxGmuS58',
   "MT @euronews France: 10 dead after shooting at HQ of satirical weekly #CharlieHebdo. If Zionists/Jews did this they'd be nuking Israel",
   'comment'),
  ('France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according to witnesses http://t.co/FkYxGmuS58',
   '@j0nathandavis They who? Stupid and partial opinions like this one only add noise to any debate.',
   'deny'),
  ('France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according to witnesses http://t.co/FkYxGmuS58',
   '@nanoSpawn Socialists, Antisemites, anti zionists - usual suspects',
   'comment'),
  ('France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according to witnesses http://t.co/FkYxGmuS58',
   '@euronews @TradeDesk_Steve A French crime of passion or a

In [45]:
# Display the per-topic triples built from the test split.
map_topic_to_triples(test_topic_to_tweets_map)

{'afghanistan': [('#Breaking: Pentagon releases video of the “mother of all bombs” being dropped in Afghanistan https://t.co/GaXwhpWDmb',
   '@TODAYshow \n\nBig expensive payload to kill 36 people',
   'comment'),
  ('#Breaking: Pentagon releases video of the “mother of all bombs” being dropped in Afghanistan https://t.co/GaXwhpWDmb',
   '@TODAYshow How many ISIS did it kill?',
   'query'),
  ('#Breaking: Pentagon releases video of the “mother of all bombs” being dropped in Afghanistan https://t.co/GaXwhpWDmb',
   '@TODAYshow @HallieJackson The mother... give me a beak. Men must name all these things. So childish.',
   'comment'),
  ('#Breaking: Pentagon releases video of the “mother of all bombs” being dropped in Afghanistan https://t.co/GaXwhpWDmb',
   "@TODAYshow \nBefore and after, looks like there's nothing there after. Wiped out whole landscape.",
   'comment'),
  ('#Breaking: Pentagon releases video of the “mother of all bombs” being dropped in Afghanistan https://t.co/GaXwhpWDm

If we don't need to separate by topic, `make_list_of_triples` returns a single flat list of triples: ```[(source_tweet_text, reply_tweet_text, category)]```

In [46]:
def make_list_of_triples(tuples_map):
    """Flatten per-topic (source, reply) pairs into one list of text triples.

    Args:
        tuples_map: mapping ``{topic: [(source_tweet, reply), ...]}`` where
            ``source_tweet.tweet.post_content``, ``reply.post_content`` and
            ``reply.category`` are readable.

    Returns:
        list ``[(source_tweet_text, reply_tweet_text, category), ...]`` with
        topics concatenated in the mapping's iteration order.
    """
    return [
        (source.tweet.post_content, answer.post_content, answer.category)
        for pairs in tuples_map.values()
        for source, answer in pairs
    ]

In [47]:
# Display the flat list of triples built from the training split.
make_list_of_triples(training_topic_to_tweets_map)

[('France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according to witnesses http://t.co/FkYxGmuS58',
  "MT @euronews France: 10 dead after shooting at HQ of satirical weekly #CharlieHebdo. If Zionists/Jews did this they'd be nuking Israel",
  'comment'),
 ('France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according to witnesses http://t.co/FkYxGmuS58',
  '@j0nathandavis They who? Stupid and partial opinions like this one only add noise to any debate.',
  'deny'),
 ('France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according to witnesses http://t.co/FkYxGmuS58',
  '@nanoSpawn Socialists, Antisemites, anti zionists - usual suspects',
  'comment'),
 ('France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according to witnesses http://t.co/FkYxGmuS58',
  '@euronews @TradeDesk_Steve A French crime of passion or another heathen moslem atroc

In [48]:
# Display the flat list of triples built from the test split.
make_list_of_triples(test_topic_to_tweets_map)

[('#Breaking: Pentagon releases video of the “mother of all bombs” being dropped in Afghanistan https://t.co/GaXwhpWDmb',
  '@TODAYshow \n\nBig expensive payload to kill 36 people',
  'comment'),
 ('#Breaking: Pentagon releases video of the “mother of all bombs” being dropped in Afghanistan https://t.co/GaXwhpWDmb',
  '@TODAYshow How many ISIS did it kill?',
  'query'),
 ('#Breaking: Pentagon releases video of the “mother of all bombs” being dropped in Afghanistan https://t.co/GaXwhpWDmb',
  '@TODAYshow @HallieJackson The mother... give me a beak. Men must name all these things. So childish.',
  'comment'),
 ('#Breaking: Pentagon releases video of the “mother of all bombs” being dropped in Afghanistan https://t.co/GaXwhpWDmb',
  "@TODAYshow \nBefore and after, looks like there's nothing there after. Wiped out whole landscape.",
  'comment'),
 ('#Breaking: Pentagon releases video of the “mother of all bombs” being dropped in Afghanistan https://t.co/GaXwhpWDmb',
  '@TODAYshow The next o