This code assumes that there are three folders containing all the information for data processing. These are:
- ```post``` and ```users```: contain the JSON obtained from the Twitter API for tweets and user profiles, respectively.
- ```tree```: the structure of the propagation trees of the dataset to be processed.


In [None]:
import numpy as np
import os
import operator
import json
import matplotlib.pyplot as plt
import time
import datetime


class User:
    """Profile feature vectors for every user stored under ``<path>/users/``.

    Every file in that folder is expected to be named ``<user_id>.<ext>`` and
    to contain one user profile as JSON. After construction, ``self.user``
    maps the integer user id (taken from the file name) to the feature list
    built by ``__try``.
    """

    def __init__(self, path):
        """Load and featurise every user profile found in ``<path>/users/``.

        Args:
            path: directory that contains the ``users`` sub-folder.

        Raises:
            ValueError: if a file name does not start with an integer id or
                a ``created_at`` value does not match the expected format.
            KeyError: if a profile has no ``created_at`` field.
        """
        users = {}
        users_dir = os.path.join(path, 'users')
        for file in os.listdir(users_dir):
            # ``with`` closes each handle; the original leaked one per user.
            with open(os.path.join(users_dir, file), encoding='utf-8') as handle:
                user = json.load(handle)
            users[int(file.split(".")[0])] = self.__try(user)
        self.user = users

    def __try(self, user):
        """Turn one user-profile dict into a fixed-order numeric feature list.

        Returns:
            ``[len_description, len_username, followers, friends, statuses,
            is_verified, is_geo_enabled, has_location, listed, favourites,
            age_in_hours]``.
        """
        # ``or`` also covers fields that are present but null in the JSON;
        # ``dict.get(key, default)`` would hand back None in that case.
        len_description = len(user.get('description') or '')
        len_username = len(user.get('name') or '')
        ers = user.get('followers_count') or 0
        ings = user.get('friends_count') or 0
        statuses = user.get('statuses_count') or 0
        is_verified = 1 if user.get('verified') else 0
        is_geo_enabled = 1 if user.get('geo_enabled') else 0
        has_location = 1 if user.get('location') else 0
        listed = user.get('listed_count') or 0
        favorites = user.get('favourites_count') or 0

        # ``created_at`` carries an explicit UTC offset, so compare it with an
        # aware "now". The previous ``time.mktime`` round trip read both
        # values as local time and skewed the age by the local UTC offset.
        created = datetime.datetime.strptime(user['created_at'],
                                             "%a %b %d %H:%M:%S %z %Y")
        now = datetime.datetime.now(datetime.timezone.utc)
        age = int((now - created).total_seconds() / 3600.0)  # in hours

        return [len_description, len_username, ers,
                ings, statuses, is_verified,
                is_geo_enabled, has_location, listed,
                favorites, age]
    
class Post:
    """Raw post JSON for every file stored under ``<path>/post/``.

    Every file in that folder is expected to be named ``<post_id>.<ext>``.
    After construction, ``self.post`` maps the integer post id (taken from
    the file name) to the decoded JSON document.
    """

    def __init__(self, path):
        """Load every post found in ``<path>/post/``.

        Args:
            path: directory that contains the ``post`` sub-folder.

        Raises:
            ValueError: if a file name does not start with an integer id.
        """
        self.post = {}
        posts_dir = os.path.join(path, 'post')
        for file in os.listdir(posts_dir):
            # ``with`` closes each handle; the original leaked one per post.
            with open(os.path.join(posts_dir, file), encoding='utf-8') as handle:
                self.post[int(file.split(".")[0])] = json.load(handle)
    
class Node:
    """One post inside a propagation tree."""

    def __init__(self, tripla, retweet):
        """Store the fields of a tree node.

        tripla: indexable triple (user id, post id, time to root)
        retweet: whether the node is a retweet
        """
        self.retweet = retweet
        self.user_id, self.post_id, self.timestamp = tripla[0], tripla[1], tripla[2]
        
class Tree:
    """Time-ordered list of the posts of one propagation tree.

    Attributes:
        id: news ID the tree belongs to.
        nodes_order: ``Node`` objects sorted by ascending timestamp.
        first_time: timestamp of the earliest node.
        last_time: timestamp of the latest node.
    """

    def __init__(self, id_news, tree_str):
        """Build the ordered node list from the lines of a tree file.

        Args:
            id_news: news ID.
            tree_str: iterable of ``parent->child`` lines, where each side is
                a bracketed, comma-separated triple of quoted values
                (user id, post id, time).

        Raises:
            ValueError: if a non-blank line is not of the form
                ``parent->child`` or if no node could be extracted.
        """
        self.id = id_news
        nodes_order = []
        for line in tree_str:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing lines instead of failing on the
                # ``parent, child`` unpacking below.
                continue
            parent, child = line.split("->")
            parent = self.__node_to_list(parent)
            child = self.__node_to_list(child)
            if parent == (None, None, 0.0):
                # NOTE(review): this stores the placeholder parent (ids are
                # None), not ``child``, so the source post itself never gets
                # a node with real ids. Kept as-is to preserve existing
                # results; confirm whether ``Node(child, False)`` was meant.
                nodes_order.append(Node(parent, False))  # uid, post id, root time
            elif parent[2] <= child[2]:
                # Edges whose child is older than its parent are dropped.
                # A child that shares the parent's post id is a retweet.
                nodes_order.append(Node(child, parent[1] == child[1]))
        if not nodes_order:
            raise ValueError("propagation tree %r contains no usable nodes" % (id_news,))
        self.nodes_order = sorted(nodes_order, key=operator.attrgetter('timestamp'))
        self.first_time = self.nodes_order[0].timestamp
        self.last_time = self.nodes_order[-1].timestamp

    def __node_to_list(self, node_str):
        """Parse one bracketed triple such as ``['ROOT', 'ROOT', '0.0']``.

        Each field has its surrounding quotes removed and is converted to
        ``int`` when possible, otherwise ``float``. Anything that is not a
        plain number (for example ``ROOT`` or an empty field) becomes
        ``None``.

        The conversion deliberately avoids ``eval``: the text comes from data
        files, and evaluating it could crash on malformed fields or run
        arbitrarily expensive arithmetic.
        """
        allowed = set('0123456789. *+-/e')

        def num(text):
            # Same character filter as before, so the set of strings that map
            # to None because of foreign characters is unchanged.
            if set(text).difference(allowed):
                return None
            for cast in (int, float):
                try:
                    return cast(text)
                except ValueError:
                    continue
            return None

        return tuple(num(field.strip()[1:-1]) for field in node_str[1:-1].split(","))
    
class News:
    """A labelled news item/claim together with its propagation tree."""

    def __init__(self, id_news, tree_str, label):
        """Build the claim and its time-ordered propagation tree.

        id_news: News/claim ID (converted to int)
        tree_str: lines describing the propagation tree, passed on to Tree
        label: News label
        """
        news_id = int(id_news)
        propagation = Tree(news_id, tree_str)

        self.id = news_id
        self.label = label
        self.tree = propagation
        # Span between the earliest and the latest post of the tree.
        self.lifespan = propagation.last_time - propagation.first_time
        

def load_data(path='.'):
    """Load every propagation tree under ``path`` together with its label.

    Reads ``<path>/label.txt`` (one ``label:news_id`` entry per line) and
    every file in ``<path>/tree/`` (named ``<news_id>.<ext>``).

    Args:
        path: directory that contains ``label.txt`` and the ``tree`` folder.

    Returns:
        dict mapping the integer news id to its ``News`` object.

    Raises:
        KeyError: if a tree file has no entry in ``label.txt``.
        ValueError: if an id in ``label.txt`` or a tree file name is not an
            integer.
    """
    labels = {}
    with open(os.path.join(path, 'label.txt')) as label_file:
        for line in label_file:
            line = line.strip()
            if not line:
                continue
            # Strip whitespace instead of slicing off the last character:
            # slicing truncated the id when the final line of the file had
            # no trailing newline.
            parts = line.split(":")
            labels[int(parts[1])] = parts[0]

    news = {}
    tree_dir = os.path.join(path, 'tree')
    for file in os.listdir(tree_dir):
        id_news = file.split(".")[0]
        with open(os.path.join(tree_dir, file)) as tree_file:
            tree_lines = tree_file.readlines()
        news[int(id_news)] = News(id_news, tree_lines, labels[int(id_news)])
    return news

def load_data_users(path='.'):
    """Build the ``User`` collection from the data directory ``path``."""
    loaded_users = User(path)
    return loaded_users

def load_data_posts(path='.'):
    """Build the ``Post`` collection from the data directory ``path``."""
    loaded_posts = Post(path)
    return loaded_posts

def load_embeddings(path='.'):
    """Load every NumPy file stored in the folder ``path``.

    Args:
        path: folder holding one ``<id>.npy`` file per embedding. A trailing
            path separator is optional.

    Returns:
        dict mapping the file name up to its first ``.`` (a string) to the
        loaded array.
    """
    # os.path.join works with or without a trailing separator; plain string
    # concatenation produced paths like ".123.npy" for the default ".".
    return {
        id_.split('.')[0]: np.load(os.path.join(path, id_))
        for id_ in os.listdir(path)
    }

def create_claim_embedding(emb_folder='embeddings/'):
    """Write one propagation-embedding file per claim into ``emb_folder``.

    Relies on three module-level names being defined before the call:
    ``data_filtered`` (claim id -> claim with a ``tree``), ``users`` (object
    whose ``user`` dict maps user id -> feature list) and ``embeddings``
    (post id as string -> text embedding).

    For every node of a claim's tree, in ``nodes_order`` order, the row
    ``text embedding + user features + timestamp`` is built. Nodes whose user
    or text embedding is unknown are skipped. The rows of a claim are saved
    to ``<emb_folder>/<claim id>.npy``.

    Args:
        emb_folder: output folder; created if it does not exist. A trailing
            path separator is optional.
    """
    if emb_folder:
        os.makedirs(emb_folder, exist_ok=True)
    for claim_id, claim in data_filtered.items():
        rows = []
        for node in claim.tree.nodes_order:
            user = users.user.get(node.user_id)
            text_emb = embeddings.get(str(node.post_id))
            if user is None or text_emb is None:
                continue
            rows.append(np.append(text_emb, [*user, node.timestamp]))
        np.save(os.path.join(emb_folder, str(claim_id) + '.npy'), rows)

In [None]:
# Load user features, raw posts and labelled propagation trees using the
# loaders' default data directory.
users = load_data_users()
posts = load_data_posts()
data = load_data()

# Text embeddings: replace the path with your own embedding folder.
embeddings = load_embeddings(path='awe_spacy_en_vectors_web_lg/')

# Keep only the claims labelled 'true' or 'false'.
# Comment this statement out if you are working with four classes.
data_filtered = {
    news_id: item
    for news_id, item in data.items()
    if item.label in ['true', 'false']
}

# Create the propagation embeddings (written to the default output folder).
create_claim_embedding()