In [134]:
import pandas as pd
import numpy as np
import os
import json
import spacy

# Data Loading


In [135]:
def read_json_file(path):
    """Load and return the parsed contents of the JSON file at ``path``.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even when parsing raises, and the
    # explicit encoding keeps decoding independent of the platform locale.
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)

In [136]:
# Dataset locations, relative to the notebook's working directory.

# Twitter threads
training_dataset_directory = 'datasets/rumoureval-2019-training-data/twitter-english'
test_dataset_directory = 'datasets/rumoureval-2019-test-data/twitter-en-test-data'

# Reddit threads
training_dataset_reddit_directory = 'datasets/rumoureval-2019-training-data/reddit-training-data'
test_dataset_reddit_directory = 'datasets/rumoureval-2019-test-data/reddit-test-data'

# Label key files; the 'subtaskaenglish' entry of each one is used below.
training_labels_json = 'datasets/rumoureval-2019-training-data/train-key.json'
training_labels_json_2 = 'datasets/rumoureval-2019-training-data/dev-key.json'
test_labels_json = 'datasets/final-eval-key.json'

# Training labels are the train key overlaid with the dev key, so a dev-key
# entry wins when an id appears in both files.
training_labels_dict = {
    **read_json_file(training_labels_json)['subtaskaenglish'],
    **read_json_file(training_labels_json_2)['subtaskaenglish'],
}
test_labels_dict = read_json_file(test_labels_json)['subtaskaenglish']

In [137]:
class Tweet:
    """A single post (thread source or reply) with an optional category label.

    Attributes:
        post_content: Text of the post, stored as given.
        post_id: Identifier of the post, stored as given.
        parent_post_id: Identifier of the post this one belongs under, or None.
        external_urls: True when ``external_urls_count`` was greater than zero.
        category: Label assigned through ``add_category``; None until then.
        user_metadata: Initialised to None by the constructor.
    """

    def __init__(self, post_content, post_id, parent_post_id=None, external_urls_count=0):
        # Identity and position in the thread.
        self.post_id = post_id
        self.parent_post_id = parent_post_id
        self.post_content = post_content

        # Only the presence of links is kept, not how many there were.
        has_external_urls = external_urls_count > 0
        self.external_urls = has_external_urls

        # Filled in later (category) or left unset (user_metadata).
        self.category = None
        self.user_metadata = None

    def add_category(self, category):
        """Set (or overwrite) the category label of this post."""
        self.category = category

In [138]:
class SourceTweet:
    """A thread: the source ``Tweet`` plus the posts attached to it.

    Attributes:
        tweet: The source post of the thread.
        replies: Posts added through ``add_reply``, in insertion order.
    """

    def __init__(self, tweet: Tweet):
        # Every thread starts empty; posts are attached via add_reply().
        self.replies = list()
        self.tweet = tweet

    def add_reply(self, reply: Tweet):
        """Append ``reply`` to the end of this thread's ``replies`` list."""
        self.replies.append(reply)

In [139]:
def read_tweets_dataset(dataset_dir_path, labels_dict):
    """Read a Twitter dataset laid out as ``<root>/<topic>/<thread>/...``.

    Every sub-directory of ``dataset_dir_path`` is treated as a topic, and
    every sub-directory of a topic as one thread. A thread directory must
    contain a ``source-tweet`` directory (its first listed file is read as the
    source tweet) and a ``replies`` directory with one JSON file per reply.

    Args:
        dataset_dir_path: Root directory of the dataset.
        labels_dict: Mapping from tweet id (as a string) to category label;
            looked up for every reply.

    Returns:
        dict: ``{topic_name: [SourceTweet, ...]}``. The first entry in each
        ``SourceTweet.replies`` is the source tweet itself, labelled
        "support"; the labelled replies follow.

    Raises:
        OSError: If an expected directory or file cannot be read.
        IndexError: If a thread's ``source-tweet`` directory is empty.
        KeyError: If a JSON file lacks an expected field or a reply id is
            missing from ``labels_dict``.
    """
    topic_to_tweets_map = {}  # {topic_name: [SourceTweet, ...]}

    topic_entries = [entry for entry in os.scandir(dataset_dir_path) if entry.is_dir()]
    for topic_entry in topic_entries:
        # DirEntry.name is the final path component on every platform, unlike
        # splitting the path on a hard-coded Windows separator.
        topic_name = topic_entry.name
        source_tweets = []

        thread_dirs = [entry.path for entry in os.scandir(topic_entry.path) if entry.is_dir()]
        for tweet_dir in thread_dirs:
            source_tweet_path = [f.path for f in os.scandir(os.path.join(tweet_dir, 'source-tweet'))][0]
            source_tweet_json = read_json_file(source_tweet_path)

            tweet = Tweet(source_tweet_json['text'], source_tweet_json['id'],
                          source_tweet_json['in_reply_to_status_id'],
                          len(source_tweet_json['entities']['urls']))
            # NOTE(review): the source tweet's category is hard-coded rather than
            # looked up in labels_dict - confirm this is intended.
            tweet.add_category("support")

            source_tweet = SourceTweet(tweet)
            source_tweets.append(source_tweet)
            # The source tweet is also stored as the first entry of its own
            # thread, so downstream code sees it alongside the replies.
            source_tweet.add_reply(tweet)

            reply_tweets_paths = [f.path for f in os.scandir(os.path.join(tweet_dir, 'replies'))]
            for reply_tweet_path in reply_tweets_paths:
                reply_tweet_json = read_json_file(reply_tweet_path)

                reply_tweet = Tweet(reply_tweet_json['text'], reply_tweet_json['id'],
                                    source_tweet.tweet.post_id,
                                    len(reply_tweet_json['entities']['urls']))
                reply_tweet.add_category(labels_dict[str(reply_tweet_json['id'])])
                source_tweet.add_reply(reply_tweet)

        topic_to_tweets_map[topic_name] = source_tweets

    return topic_to_tweets_map
       

In [140]:
def read_reddit_dataset(dataset_dir_path, labels_dict):
    """Read a Reddit dataset laid out as ``<root>/<thread>/...``.

    Every sub-directory of ``dataset_dir_path`` is one thread, keyed in the
    result by its directory name. A thread directory must contain a
    ``source-tweet`` directory (its first listed file is read as the source
    post) and a ``replies`` directory with one JSON file per reply.

    Args:
        dataset_dir_path: Root directory of the dataset.
        labels_dict: Mapping from post id (as a string) to category label;
            looked up for every reply that has a ``body``.

    Returns:
        dict: ``{topic_name: [SourceTweet]}`` with one ``SourceTweet`` per
        thread directory. The first entry in ``SourceTweet.replies`` is the
        source post itself, labelled "support"; the labelled replies follow.

    Raises:
        OSError: If an expected directory or file cannot be read.
        IndexError: If a ``source-tweet`` directory is empty or the source
            JSON has no children.
        KeyError: If a JSON file lacks an expected field or a reply id is
            missing from ``labels_dict``.
    """
    topic_to_tweets_map = {}  # {topic_name: [SourceTweet, ...]}

    topic_entries = [entry for entry in os.scandir(dataset_dir_path) if entry.is_dir()]
    for topic_entry in topic_entries:
        # DirEntry.name is the final path component on every platform, unlike
        # splitting the path on a hard-coded Windows separator.
        topic_name = topic_entry.name
        topic_dir = topic_entry.path
        source_tweets = []

        source_tweet_path = [f.path for f in os.scandir(os.path.join(topic_dir, 'source-tweet'))][0]
        source_tweet_json = read_json_file(source_tweet_path)
        source_post = source_tweet_json['data']['children'][0]['data']

        # The source text is the submission title followed by its self text;
        # occurrences of "http" stand in for a URL count.
        content = source_post['title'] + ' ' + source_post['selftext']
        tweet = Tweet(content, source_post['id'], None, content.count("http"))
        # NOTE(review): the source post's category is hard-coded rather than
        # looked up in labels_dict - confirm this is intended.
        tweet.add_category("support")

        source_tweet = SourceTweet(tweet)
        source_tweets.append(source_tweet)
        # The source post is also stored as the first entry of its own thread.
        source_tweet.add_reply(tweet)

        reply_tweets_paths = [f.path for f in os.scandir(os.path.join(topic_dir, 'replies'))]
        for reply_tweet_path in reply_tweets_paths:
            reply_data = read_json_file(reply_tweet_path)['data']

            # Replies without a 'body' field are skipped.
            if 'body' not in reply_data:
                continue

            reply_tweet = Tweet(reply_data['body'], reply_data['id'],
                                source_tweet.tweet.post_id, reply_data['body'].count('http'))
            reply_tweet.add_category(labels_dict[str(reply_tweet.post_id)])
            source_tweet.add_reply(reply_tweet)

        topic_to_tweets_map[topic_name] = source_tweets

    return topic_to_tweets_map

In [141]:
# Twitter
training_topic_to_tweets_map = read_tweets_dataset(training_dataset_directory, training_labels_dict)
test_topic_to_tweets_map = read_tweets_dataset(test_dataset_directory, test_labels_dict)

# Reddit
training_topic_to_reddit_map = read_reddit_dataset(training_dataset_reddit_directory, training_labels_dict)
test_topic_to_reddit_map = read_reddit_dataset(test_dataset_reddit_directory, test_labels_dict)

# Preprocessing

In [142]:
# Load the spaCy pipeline "en_core_web_sm" once at module level; preprocessing()
# below reads it through this global name.
nlp = spacy.load("en_core_web_sm")

In [143]:
def preprocessing(sentence):
    """Tokenize ``sentence`` with the module-level ``nlp`` pipeline and normalise it.

    Each token is handled by the first matching rule:
      * stop words are dropped;
      * tokens tagged ``NUM`` are replaced by the placeholder ``'#'``;
      * tokens tagged ``SYM`` are dropped;
      * every other token contributes its lower-cased lemma.

    Args:
        sentence: Text to process.

    Returns:
        list[str]: The normalised tokens, in document order.
    """
    lemmas = []
    for token in nlp(sentence):
        # Stop words and symbols carry no signal here and are skipped.
        if token.is_stop or token.pos_ == "SYM":
            continue
        # Numbers collapse to one placeholder so distinct values share a token.
        lemmas.append('#' if token.pos_ == "NUM" else token.lemma_.lower())
    return lemmas

In [144]:
def create_df(topic_map):
    """Flatten a ``{topic: [SourceTweet, ...]}`` map into one row per reply.

    Args:
        topic_map: Mapping from topic name to the list of threads for that
            topic; each thread exposes ``tweet`` and ``replies``.

    Returns:
        pandas.DataFrame with the columns ``topic``, ``original_source_tweet``,
        ``original_reply``, ``source_tweet``, ``reply``, ``external_urls`` and
        ``category``. ``source_tweet`` and ``reply`` hold the token lists
        produced by ``preprocessing``.
    """
    column_names = ['topic', 'original_source_tweet', 'original_reply', 'source_tweet', 'reply', 'external_urls', 'category']
    records = []
    for topic_name, threads in topic_map.items():
        for thread in threads:
            source_text = thread.tweet.post_content
            # Tokenise the source once and reuse it for every reply row.
            source_tokens = preprocessing(source_text)
            records.extend(
                (
                    topic_name,
                    source_text,
                    post.post_content,
                    source_tokens,
                    preprocessing(post.post_content),
                    post.external_urls,
                    post.category,
                )
                for post in thread.replies
            )
    return pd.DataFrame(records, columns=column_names)

## DataFrames

In [145]:
# One DataFrame per (platform, split); each row pairs a thread's source post
# with one post from that thread.
training_tweets_df = create_df(topic_map=training_topic_to_tweets_map)
test_tweets_df = create_df(topic_map=test_topic_to_tweets_map)

training_reddit_df = create_df(topic_map=training_topic_to_reddit_map)
test_reddit_df = create_df(topic_map=test_topic_to_reddit_map)

In [146]:
# Display the Twitter training DataFrame built by create_df.
training_tweets_df

Unnamed: 0,topic,original_source_tweet,original_reply,source_tweet,reply,external_urls,category
0,charliehebdo,France: 10 people dead after shooting at HQ of...,France: 10 people dead after shooting at HQ of...,"[france, :, #, people, dead, shoot, hq, satiri...","[france, :, #, people, dead, shoot, hq, satiri...",False,support
1,charliehebdo,France: 10 people dead after shooting at HQ of...,MT @euronews France: 10 dead after shooting at...,"[france, :, #, people, dead, shoot, hq, satiri...","[mt, @euronews, france, :, #, dead, shoot, hq,...",False,comment
2,charliehebdo,France: 10 people dead after shooting at HQ of...,@j0nathandavis They who? Stupid and partial op...,"[france, :, #, people, dead, shoot, hq, satiri...","[@j0nathandavis, ?, stupid, partial, opinion, ...",False,deny
3,charliehebdo,France: 10 people dead after shooting at HQ of...,"@nanoSpawn Socialists, Antisemites, anti zioni...","[france, :, #, people, dead, shoot, hq, satiri...","[@nanospawn, socialists, ,, antisemites, ,, an...",False,comment
4,charliehebdo,France: 10 people dead after shooting at HQ of...,@euronews @TradeDesk_Steve A French crime of p...,"[france, :, #, people, dead, shoot, hq, satiri...","[@euronews, @tradedesk_steve, french, crime, p...",False,query
...,...,...,...,...,...,...,...
5563,sydneysiege,Police confirm that #sydneysiege is finally ov...,@Angus_OL The 6 of us are watching this unfold...,"[police, confirm, #, sydneysiege, finally, ., ...","[@angus_ol, #, watch, unfold, ,, shake, ,, hea...",False,support
5564,sydneysiege,Police confirm that #sydneysiege is finally ov...,@emaccaz_ omfg it is 😱😱😱😱😱,"[police, confirm, #, sydneysiege, finally, ., ...","[@emaccaz, _, omfg, 😱, 😱, 😱, 😱, 😱]",False,comment
5565,sydneysiege,Police confirm that #sydneysiege is finally ov...,@Angus_OL thank god they're all safe now. some...,"[police, confirm, #, sydneysiege, finally, ., ...","[@angus_ol, thank, god, safe, ., wound, ,, saf...",False,support
5566,sydneysiege,Police confirm that #sydneysiege is finally ov...,"@Angus_OL thank god its over, they're finally ...","[police, confirm, #, sydneysiege, finally, ., ...","[@angus_ol, thank, god, ,, finally, safe, ,, w...",False,comment


In [147]:
# Display the Twitter test DataFrame built by create_df.
test_tweets_df

Unnamed: 0,topic,original_source_tweet,original_reply,source_tweet,reply,external_urls,category
0,afghanistan,#Breaking: Pentagon releases video of the “mot...,#Breaking: Pentagon releases video of the “mot...,"[#, break, :, pentagon, release, video, "", mot...","[#, break, :, pentagon, release, video, "", mot...",False,support
1,afghanistan,#Breaking: Pentagon releases video of the “mot...,@TODAYshow \n\nBig expensive payload to kill 3...,"[#, break, :, pentagon, release, video, "", mot...","[@todayshow, \n\n, big, expensive, payload, ki...",False,comment
2,afghanistan,#Breaking: Pentagon releases video of the “mot...,@TODAYshow How many ISIS did it kill?,"[#, break, :, pentagon, release, video, "", mot...","[@todayshow, isis, kill, ?]",False,query
3,afghanistan,#Breaking: Pentagon releases video of the “mot...,@TODAYshow @HallieJackson The mother... give m...,"[#, break, :, pentagon, release, video, "", mot...","[@todayshow, @halliejackson, mother, ..., beak...",False,comment
4,afghanistan,#Breaking: Pentagon releases video of the “mot...,"@TODAYshow \nBefore and after, looks like ther...","[#, break, :, pentagon, release, video, "", mot...","[@todayshow, \n, ,, look, like, ., wipe, lands...",False,comment
...,...,...,...,...,...,...,...
1061,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...","From @clairezillman:\n\n""Those fires took plac...","[@jimpuzzanghera, :, \n\n, "", house, republica...","[@clairezillman, :, \n\n, "", fire, take, place...",True,support
1062,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...","And @SteveKnight25, your ""YEA"" vote in favor o...","[@jimpuzzanghera, :, \n\n, "", house, republica...","[@steveknight25, ,, "", yea, "", vote, favor, pa...",False,comment
1063,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...",@CA25UP @SteveKnight25 DAAAAAAMN. You got ownn...,"[@jimpuzzanghera, :, \n\n, "", house, republica...","[@ca25up, @steveknight25, daaaaaamn, ., got, o...",False,comment
1064,wildfires-deduction,"From @JimPuzzanghera:\n\n""The House Republican...","Btw, a one-time disaster relief package does n...","[@jimpuzzanghera, :, \n\n, "", house, republica...","[btw, ,, -, time, disaster, relief, package, u...",True,comment


In [148]:
# Display the Reddit training DataFrame built by create_df.
training_reddit_df

Unnamed: 0,topic,original_source_tweet,original_reply,source_tweet,reply,external_urls,category
0,18dmb4,Even ants won't eat aspartame!,Even ants won't eat aspartame!,"[ant, will, eat, aspartame, !]","[ant, will, eat, aspartame, !]",False,support
1,18dmb4,Even ants won't eat aspartame!,"Wikipedia would be a good start, I think. \nh...","[ant, will, eat, aspartame, !]","[wikipedia, good, start, ,, think, ., \n, htt...",True,comment
2,18dmb4,Even ants won't eat aspartame!,Snopes has the basics: \nwww.snopes.com/humor...,"[ant, will, eat, aspartame, !]","[snope, basic, :, \n, www.snopes.com/humor/if...",False,comment
3,18dmb4,Even ants won't eat aspartame!,"Just about every sentence is wrong, as others ...","[ant, will, eat, aspartame, !]","[sentence, wrong, ,, point, ,, puzzle, go, aut...",True,comment
4,18dmb4,Even ants won't eat aspartame!,I find the Snopes ant test fascinating but it ...,"[ant, will, eat, aspartame, !]","[find, snopes, ant, test, fascinating, bring, ...",False,comment
...,...,...,...,...,...,...,...
681,8yktu5,Jon Sopel: Bizarre. @realDonaldTrump says he c...,Quote:\n\n&gt; I was opening Turnberry the day...,"[jon, sopel, :, bizarre, ., @realdonaldtrump, ...","[quote, :, \n\n, &, gt, ;, open, turnberry, da...",False,deny
682,8yktu5,Jon Sopel: Bizarre. @realDonaldTrump says he c...,[deleted],"[jon, sopel, :, bizarre, ., @realdonaldtrump, ...","[[, delete, ]]",False,comment
683,8yktu5,Jon Sopel: Bizarre. @realDonaldTrump says he c...,He said he was opening it the day before Brexi...,"[jon, sopel, :, bizarre, ., @realdonaldtrump, ...","[say, open, day, brexit, ., \n\n, charitable, ...",False,comment
684,8yktu5,Jon Sopel: Bizarre. @realDonaldTrump says he c...,"""Well if you remember I was opening Turnberry ...","[jon, sopel, :, bizarre, ., @realdonaldtrump, ...","["", remember, open, turnberry, day, brexit, "",...",False,comment


In [149]:
# Display the Reddit test DataFrame built by create_df.
test_reddit_df

Unnamed: 0,topic,original_source_tweet,original_reply,source_tweet,reply,external_urls,category
0,1d7lzf,Debunk This: A friend of mine claims Red Dye c...,Debunk This: A friend of mine claims Red Dye c...,"[debunk, :, friend, claim, red, dye, cause, ad...","[debunk, :, friend, claim, red, dye, cause, ad...",False,support
1,1d7lzf,Debunk This: A friend of mine claims Red Dye c...,You can't reason someone out of a belief they ...,"[debunk, :, friend, claim, red, dye, cause, ad...","[reason, belief, use, reason, place, .]",False,comment
2,1d7lzf,Debunk This: A friend of mine claims Red Dye c...,"From wikipedia: ""Though past research showed n...","[debunk, :, friend, claim, red, dye, cause, ad...","[wikipedia, :, "", past, research, show, correl...",False,comment
3,1d7lzf,Debunk This: A friend of mine claims Red Dye c...,Whats the reason behind your reasoning? I reas...,"[debunk, :, friend, claim, red, dye, cause, ad...","[s, reason, reasoning, ?, reason, reason, peop...",False,comment
4,1d7lzf,Debunk This: A friend of mine claims Red Dye c...,You didn't use reason to come to the conclusio...,"[debunk, :, friend, claim, red, dye, cause, ad...","[use, reason, come, conclusion, person, reason...",False,comment
...,...,...,...,...,...,...,...
685,xn2bn,"I've been searching, and can't find a single c...","Lawl. I'm assuming you're a troll. But if not,...","[search, ,, find, single, credible, source, .,...","[lawl, ., assume, troll, ., ,, vote, office, i...",False,comment
686,xn2bn,"I've been searching, and can't find a single c...","not saying bush was the best, or palin was the...","[search, ,, find, single, credible, source, .,...","[say, bush, good, ,, palin, good, ,, say, pret...",False,comment
687,xn2bn,"I've been searching, and can't find a single c...",^^^^^\nthey would do that if someone hadn't se...,"[search, ,, find, single, credible, source, .,...","[^^^^^, \n, seal, record, college]",False,comment
688,xn2bn,"I've been searching, and can't find a single c...","You are right about that, but we didn't know s...","[search, ,, find, single, credible, source, .,...","[right, ,, know, shit, palin, ,, present, ,, c...",False,comment
