In [1]:
import os
import json
import math
from datetime import datetime as dt
from datetime import timedelta

import pandas as pd
from tqdm import tqdm
import numpy as np
import torch
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_json_normalized(path):
    """Load a JSON file and flatten it into a DataFrame.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The result of ``pd.json_normalize`` applied to the parsed JSON,
        i.e. nested objects become dotted column names.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # encoding, which would break on non-ASCII text on some systems.
    with open(path, encoding="utf-8") as data_file:
        data = json.load(data_file)
    return pd.json_normalize(data)

In [3]:
# Run on the first CUDA device when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

#### TODO: Define time splits here. Remove data not contained

### Load data

In [4]:
# Folder holding the raw input files read below (user.json, edges.csv, split.csv, labels.csv).
path = "src/Data_test/"
# Folder that receives every tensor / JSON artefact written by this notebook.
path_preprocessed = "src/Data_test/preprocessed/"
# create folder if not existing
os.makedirs(path_preprocessed, exist_ok=True)

In [5]:
# Flatten user.json into a DataFrame; nested keys become dotted columns
# (e.g. 'public_metrics.followers_count', which is read further below).
user = read_json_normalized(f"{path}user.json")

In [6]:
# Parse the creation timestamps into timezone-aware (UTC) datetimes so they can
# later be subtracted from a timezone-aware reference date.
user['created_at'] = pd.to_datetime(user['created_at'], utc=True)

In [7]:
# Map each user id to its row position in `user`; this position is the node
# index used by the masks, edges and tweet buckets built below.
uid_index = dict(zip(user['id'].values, range(len(user['id'].values))))

In [8]:
# Edge list; the columns 'source_id', 'target_id' and 'relation' are read below.
edge = pd.read_csv(f"{path}edges.csv")

In [9]:
# Per-user split assignment; the columns 'id' and 'split' are read below.
split=pd.read_csv(f"{path}split.csv")

In [10]:
# Per-user class label; the columns 'id' and 'label' are read below.
label=pd.read_csv(f"{path}labels.csv")

In [11]:
# Lookup table: user id -> label string taken from labels.csv.
uid_label = dict(zip(label['id'].values, label['label'].values))

In [12]:
# Lookup table: user id -> split name taken from split.csv.
uid_split = dict(zip(split['id'].values, split['split'].values))

### Create labels and data masks

In [13]:
# Binary class ids (0 = "human", 1 = anything else) and the node indices that
# belong to each data split, all following the order of uid_index.
labels_new, train_mask, test_mask, validation_mask = [], [], [], []

# "train" and "test" have dedicated lists; every other split name is treated
# as validation.
mask_for_split = {"train": train_mask, "test": test_mask}

for position, uid in enumerate(tqdm(uid_index.keys())):
    # Resolve both lookups first so a missing uid raises before any list grows.
    account_label, account_split = uid_label[uid], uid_split[uid]

    labels_new.append(0 if account_label == "human" else 1)
    mask_for_split.get(account_split, validation_mask).append(position)

# Every user must have landed in exactly one split.
assert (len(train_mask) + len(test_mask) + len(validation_mask)) == len(uid_index)
print(f"Train Labels: {len(train_mask)}")
print(f"Test Labels: {len(test_mask)}")
print(f"Validation Labels: {len(validation_mask)}")

100%|██████████████████████████████████████████████████████████| 68869/68869 [00:00<00:00, 908594.14it/s]

Train Labels: 50609
Test Labels: 8967
Validation Labels: 9293





In [14]:
# Persist the split indices and the class ids as int64 tensors.
for file_name, values in (
    ("train_mask.pt", train_mask),
    ("test_mask.pt", test_mask),
    ("validation_mask.pt", validation_mask),
    ("labels.pt", labels_new),
):
    torch.save(torch.tensor(values, dtype=torch.long), f"{path_preprocessed}{file_name}")

### Create Edge indices and types

In [19]:
# Inspect which relation types occur in the edge list before mapping them to ids.
edge['relation'].unique()

array(['followers', 'following'], dtype=object)

In [15]:
# Node-index pairs and integer relation ids for every usable edge.
edge_index = []
edge_type = []
#edge_relation_mapping = {'followers': 0, 'following': 1}
edge_relation_mapping = {'retweet': 0, 'following': 1}
# NOTE(review): the relation inspection cell displays 'followers'/'following',
# while this mapping only knows 'retweet'/'following'. Rows whose relation is
# not a key here are skipped - confirm the mapping matches the loaded edges.csv.

edge_rows = zip(edge['source_id'], edge['target_id'], edge['relation'])
for source_id, target_id, relation in tqdm(edge_rows, total=len(edge)):
    if relation not in edge_relation_mapping:
        continue
    try:
        endpoints = [uid_index[source_id], uid_index[target_id]]
    except KeyError:
        # At least one endpoint is not a known user: drop the edge.
        continue
    edge_index.append(endpoints)
    edge_type.append(edge_relation_mapping[relation])

assert len(edge_index) == len(edge_type)
print(f"Edge Index: {len(edge_index)}")

100%|█████████████████████████████████████████████████████████| 138774/138774 [00:01<00:00, 91187.99it/s]

Edge Index: 138774





In [16]:
# Store the edges transposed, so the two rows hold the source and target node
# indices, together with the per-edge relation ids.
torch.save(torch.tensor(edge_index, dtype=torch.long).t(), f"{path_preprocessed}edge_index_retweet.pt")
torch.save(torch.tensor(edge_type, dtype=torch.long), f"{path_preprocessed}edge_type_retweet.pt")

### Create numerical and categorical feature vector

In [17]:
# Show the first user record to see which columns are available.
# NOTE(review): the printed row contains real profile data - consider clearing
# this output before sharing the notebook.
print(user.iloc[0])

created_at                                                2021-11-28 02:14:18+00:00
description                                                           AI Researcher
id                                                             u1464779562471112705
location                                                                       None
name                                                                 Thabang Lebese
pinned_tweet_id                                                                 NaN
profile_image_url                 https://pbs.twimg.com/profile_images/146477986...
protected                                                                     False
url                                                         https://t.co/oGmKkcIvGz
username                                                                  TDLebese_
verified                                                                      False
withheld                                                                    

In [18]:
def normalize_numerical_feature(data):
    """Z-score a sequence of numbers and return it as a column vector.

    Args:
        data: Array-like of numeric values.

    Returns:
        A NumPy array of shape (len(data), 1) holding ``(data - mean) / std``.
        If the values are all identical (standard deviation of zero) a column
        of zeros is returned instead of NaN/inf, and an empty input yields an
        empty (0, 1) array.
    """
    data = np.array(data)
    if data.size == 0:
        # Nothing to normalise; avoid the mean-of-empty warning and NaN result.
        return np.zeros((0, 1))
    mean = data.mean()
    std = data.std()
    if std == 0:
        # A constant feature carries no information; dividing by zero would
        # poison the feature matrix with NaN values.
        return np.zeros((data.size, 1))
    return ((data - mean) / std).reshape(-1, 1)

In [19]:
def extract_numeric_user_property(property_name, normalize = False):
    """Collect a numeric column of the global ``user`` frame as a list.

    Missing entries (``None`` or NaN) are replaced by 0.

    Args:
        property_name: Column of ``user`` to read.
        normalize: When True, pass the values through
            ``normalize_numerical_feature`` before returning them.

    Returns:
        A plain list of values, or the normalised result when ``normalize``
        is True.
    """
    # The previous check `e is not math.isnan(e)` compared the value's identity
    # with a bool and was therefore always true, so NaN was never replaced.
    res = [0 if e is None or math.isnan(e) else e for e in user[property_name]]
    return normalize_numerical_feature(res) if normalize else res

def extract_literal_user_property(property_name):
    """Collect a text column of the global ``user`` frame as a list.

    Missing entries are replaced by the empty string. Besides ``None`` this
    also covers float NaN, so that callers can safely apply string operations
    such as ``len`` to every element.

    Args:
        property_name: Column of ``user`` to read.

    Returns:
        A list with one entry per row of ``user``.
    """
    res = []
    for e in user[property_name]:
        is_missing = e is None or (isinstance(e, float) and math.isnan(e))
        res.append("" if is_missing else e)
    return res

def extract_boolean_user_property(property_name):
    """Encode a column of the global ``user`` frame as a list of 0/1 flags.

    Args:
        property_name: Column of ``user`` to read.

    Returns:
        A list with 1 where the value compares equal to ``True`` and 0 for
        everything else.
    """
    return [1 if e == True else 0 for e in user[property_name]]

In [20]:
# Z-scored public-metric counters, one (n_users, 1) column each.
following_count = extract_numeric_user_property('public_metrics.following_count', normalize=True)
followers_count = extract_numeric_user_property('public_metrics.followers_count', normalize=True)
tweet_count = extract_numeric_user_property('public_metrics.tweet_count', normalize=True)

# Length of the display name, z-scored like the counters above.
# (The username length is deliberately not used as a feature.)
name_length = normalize_numerical_feature(
    [len(display_name) for display_name in extract_literal_user_property('name')]
)

In [21]:
# Reference date for the account age. Earlier value, kept for reference:
#start_date = dt.strptime('Tue Sep 5 00:00:00 +0000 2020 ','%a %b %d %X %z %Y ')
start_date = dt.strptime('15/03/22 00:00:00 +0000','%d/%m/%y %H:%M:%S %z') #change to last date of dataset

# Whole days between account creation and the reference date, z-scored.
active_days = normalize_numerical_feature(
    [(start_date - created).days for created in user['created_at']]
)

In [22]:
# Turn every normalised feature column into a float32 tensor.
# (username_length is not in use.)
following_count, followers_count, tweet_count, name_length, active_days = (
    torch.tensor(column, dtype=torch.float32)
    for column in (following_count, followers_count, tweet_count, name_length, active_days)
)

# Concatenate the columns side by side; the column order is part of the saved
# feature layout.
num_properties_tensor = torch.cat(
    [followers_count, active_days, name_length, following_count, tweet_count],
    dim=1,
)

In [23]:
# check for NaN values
# Sanity check: evaluates to True if any numeric feature is NaN.
pd.DataFrame(num_properties_tensor.detach().numpy()).isna().values.any()

False

In [24]:
# 0/1 flags for the 'protected' and 'verified' account attributes.
protected = extract_boolean_user_property('protected')
verified = extract_boolean_user_property('verified')

In [25]:
default_image_url = 'https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png'

# 1 when the profile image is missing (None), empty, or the stock avatar;
# 0 for every other value.
default_profile_image = [
    1 if (e is None or e == default_image_url or e == '') else 0
    for e in user['profile_image_url']
]

In [26]:
# Turn each 0/1 flag list into a float16 column vector.
protected, verified, default_profile_image = (
    torch.tensor(flags, dtype=torch.float16).reshape(-1, 1)
    for flags in (protected, verified, default_profile_image)
)

# Concatenate the flag columns side by side; the column order is part of the
# saved feature layout.
categorical_properties_tensor = torch.cat(
    [protected, verified, default_profile_image],
    dim=1,
)

In [27]:
# Persist the numeric and the categorical feature matrices.
torch.save(num_properties_tensor, f"{path_preprocessed}num_properties_tensor.pt")
torch.save(categorical_properties_tensor, f"{path_preprocessed}categorical_properties_tensor.pt")

### Extract user tweets

In [28]:
# create weekly time split
first_week = dt.strptime('20/09/21 00:00:00 +0000','%d/%m/%y %H:%M:%S %z') # Start: Week 38-2021
number_of_weeks = 26 # End: Week 10-2022
all_timestamps = []

for i in range(number_of_weeks):
    all_timestamps.append(first_week + timedelta(weeks=i))
    
id_tweet_timestamps = [{i:[] for i in range(len(uid_index.keys()))} for _ in range(len(all_timestamps))]

In [29]:
'''
user_tweets_path = "src/Data_test/users/"
with open(f"{user_tweets_path}u12/tweet.json", 'r') as tweet_file:
    tweets_ = json.load(tweet_file)
    for tweet in tweets_:
        timestamp = dt.strptime(tweet['created_at'], '%Y-%m-%d %H:%M:%S%z')
        for idx in range(1, len(all_timestamps)):
            before = all_timestamps[idx-1]
            after = all_timestamps[idx]
            if timestamp >= before and timestamp < after:
                print(timestamp)
''';

In [30]:
import bisect
from collections import defaultdict

# Number of users whose tweet file exists and contains at least one tweet.
number_of_users_with_tweets = 0
# Number of tweets assigned to each weekly window.
counts = defaultdict(int)
user_tweets_path = "src/Data_test/users/"

for username in tqdm(list(uid_index.keys())):
    tweet_path_specific = f"{user_tweets_path}{username}/tweet.json"
    u_id = uid_index[username]
    try:
        # Read as UTF-8 explicitly: tweet text contains emoji and other
        # non-ASCII characters, and the locale default is platform dependent.
        with open(tweet_path_specific, 'r', encoding='utf-8') as tweet_file:
            tweets = json.load(tweet_file)
    except FileNotFoundError:
        # Users without a tweet file simply keep empty buckets.
        continue

    if len(tweets) > 0:
        number_of_users_with_tweets += 1

    for tweet in tweets:
        text = tweet['text']
        timestamp = dt.strptime(tweet['created_at'], '%Y-%m-%d %H:%M:%S%z')
        # all_timestamps is sorted ascending, so a binary search finds the
        # window with all_timestamps[w] <= timestamp < all_timestamps[w + 1].
        boundary = bisect.bisect_right(all_timestamps, timestamp)
        if 0 < boundary < len(all_timestamps):
            window = boundary - 1
            id_tweet_timestamps[window][u_id].append(text)
            counts[window] += 1
        # Tweets before the first or at/after the last boundary are ignored.

# Write one JSON file per weekly bucket (integer keys become strings in JSON).
for idx, t in tqdm(enumerate(id_tweet_timestamps), total=len(id_tweet_timestamps)):
    with open(f"{path_preprocessed}id_tweet_{idx}.json", 'w', encoding='utf-8') as tweet_file:
        json.dump(t, tweet_file)

100%|██████████████████████████████████████████████████████████████| 68869/68869 [43:15<00:00, 26.53it/s]
26it [00:29,  1.14s/it]


In [31]:
# Display how many users had a tweet file with at least one tweet.
number_of_users_with_tweets

63802

In [32]:
# Display the number of tweets that fell into each weekly window.
counts

defaultdict(int,
            {20: 1003474,
             18: 534902,
             17: 399229,
             16: 319977,
             15: 240536,
             10: 147259,
             9: 114668,
             22: 841797,
             21: 651518,
             8: 119609,
             1: 72102,
             0: 68483,
             23: 1125186,
             19: 817105,
             12: 176174,
             7: 114822,
             6: 102911,
             5: 97503,
             4: 90940,
             2: 82451,
             14: 167701,
             3: 85157,
             13: 162539,
             11: 163205,
             24: 341283})

In [33]:
'''
user_tweets_path = "src/Data_test/users/"
for username in tqdm(uid_index.keys()):
    tweet_path_specific = f"{user_tweets_path}{username}/tweet.json"
    try:
        u_id = uid_index[username]
        with open(tweet_path_specific, 'r') as tweet_file:
            tweets = json.load(tweet_file)
        for tweet in tweets:
            text = tweet['text']
            id_tweet[u_id].append(text)
    except:
        continue
        
with open(f"{path_preprocessed}id_tweet.json", 'w') as tweet_file:
    json.dump(id_tweet, tweet_file)

''';

In [34]:
# Profile descriptions in user-row order (only consumed by the disabled
# description-embedding snippet further below).
user_descriptions = list(user['description'])

In [35]:
# uses string as key
#each_user_tweets=json.load(open("src/Data_test/preprocessed/id_tweet.json",'r')) # = id_tweet

In [36]:
# Reload the 26 weekly tweet buckets written above. After the JSON round-trip
# the node indices are string keys. Each file is opened in a `with` block so
# the handle is closed again instead of being leaked.
id_tweet_timestamps = []
for idx in range(26):
    with open(f"{path_preprocessed}id_tweet_{idx}.json", 'r') as tweet_file:
        id_tweet_timestamps.append(json.load(tweet_file))

In [37]:
# Spot-check: tweets stored for node index 45 in weekly bucket 24 (string key
# because the buckets were reloaded from JSON).
print(id_tweet_timestamps[24]['45'])

['RT @FrRonconi: This aircraft uses foldable wings to take off like a helicopter yet fly like a normal airplane 👍🏻\n\nby @pterodynamics \n#Drone…', 'RT @enilev: Researchers Build #NeuralNetworks With Actual #Neurons \nV/ @hackaday \n\n#Science #neurotech \n@CorticalLabs \n\n@andi_staub @IanLJo…', 'RT @chboursin: This egg-shaped capsule is actually a smart, self-sustaining home v/ @mashable  cc @jblefevre60 @kalydeoo @FrRonconi @andi_s…', 'RT @sebbourguignon: [#Innovation] This robotic upper limb looks so realistic via @gigadgets_ \n\n#AI #Engineering\n\n@labordeolivier @MargaretS…', 'RT @jblefevre60: 💥Top 5 future technology inventions 2022-2050!\n\n#AI #MachineLearning #Python #coding #100DaysOfCode @MikeQuindazzi\n\n@Spiro…', 'RT @pascal_bornet: Breakthrough: thanks to this machine, this paralyzed man is now able to communicate using his thoughts\n\nA brilliant proj…', 'RT @VisiveAI: This is a #robot monster right out of #WildWildWest.\n \n#Robotics #Automation #Robot #ArtificialInte

In [38]:
from collections import defaultdict

# counts[w]: number of users with at least one tweet in weekly bucket w.
counts = defaultdict(int)
# Total number of (user, week) pairs that have at least one tweet.
number_of_users_with_tweets_timestamps = 0

for week_idx in range(26):
    weekly_tweets = id_tweet_timestamps[week_idx]
    for user_position in range(len(weekly_tweets)):
        if weekly_tweets[str(user_position)]:
            counts[week_idx] += 1
            number_of_users_with_tweets_timestamps += 1

In [39]:
# Display the number of users with at least one tweet per weekly bucket.
counts

defaultdict(int,
            {0: 10873,
             1: 11266,
             2: 12266,
             3: 12326,
             4: 12781,
             5: 13229,
             6: 13971,
             7: 14574,
             8: 15004,
             9: 14799,
             10: 16761,
             11: 17902,
             12: 18490,
             13: 17579,
             14: 17034,
             15: 21159,
             16: 24638,
             17: 27798,
             18: 29015,
             19: 32734,
             20: 38767,
             21: 42058,
             22: 40329,
             23: 37622,
             24: 23332})

In [40]:
# Display the total number of (user, week) pairs that contain tweets.
number_of_users_with_tweets_timestamps

536307

### Create feature embeddings

In [41]:
# Feature-extraction pipeline based on roberta-base, placed on `device`.
# NOTE(review): padding / truncation / max_length / add_special_tokens are
# passed to the pipeline constructor here - confirm that the installed
# transformers version forwards all of them to the tokenizer; otherwise tweets
# may not actually be limited to 50 tokens.
text_extraction_pipeline = pipeline('feature-extraction', model='roberta-base', tokenizer='roberta-base', device=device, padding=True, truncation=True, max_length=50, add_special_tokens=True)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [42]:
'''
empty_counter = 0
for idx, (k, v) in tqdm(enumerate(each_user_tweets.items())):
    print(k)
    print(v)
    
    if idx == 10:
        break
    
    
print(empty_counter)
#print(empty_counter / len(each_user_tweets))
'''

'\nempty_counter = 0\nfor idx, (k, v) in tqdm(enumerate(each_user_tweets.items())):\n    print(k)\n    print(v)\n    \n    if idx == 10:\n        break\n    \n    \nprint(empty_counter)\n#print(empty_counter / len(each_user_tweets))\n'

In [43]:
#list(each_user_tweets.keys())[0:5]

In [44]:
#key = '3'
#print(type(each_user_tweets[key]))
#print(each_user_tweets[key])

In [45]:
'''
user_description_embedding = []
for desc in tqdm(user_descriptions):
    if not desc or len(desc) == 0:
        user_description_embedding.append(torch.zeros(768))
        continue
    feature = torch.tensor(text_extraction_pipeline(desc))
    mean_feature = torch.mean(feature, dim=[0,1])
    user_description_embedding.append(mean_feature)
''';

In [46]:
# torch.save(torch.stack(user_description_embedding, dim=0), f"{path_preprocessed}user_description_embedding_tensor.pt")

In [47]:
from collections import defaultdict

# Only the first max_tweets_per_user tweets of a user are embedded per week.
max_tweets_per_user = 20
# counts_enc[w]: number of users that received a non-zero embedding in week w.
counts_enc = defaultdict(int)

for idx, t_list in tqdm(enumerate(id_tweet_timestamps)):
    tweets_list = []
    for user_position in range(len(t_list)):
        tweets = t_list[str(user_position)]

        if not tweets:
            # No tweets this week: the user is represented by a zero vector
            # (768 must match the width of the pipeline's output features).
            tweets_list.append(torch.zeros(768))
            continue

        counts_enc[idx] += 1
        tweet_embeddings = []
        for tweet in tweets[:max_tweets_per_user]:
            if not tweet:
                # Empty text: contribute a zero vector instead of calling the model.
                tweet_embeddings.append(torch.zeros(768))
            else:
                token_features = torch.tensor(text_extraction_pipeline(tweet))
                # Average over the leading two dimensions to get one vector per tweet.
                tweet_embeddings.append(token_features.mean(dim=[0, 1]))

        # The user's weekly embedding is the mean over the tweet vectors.
        tweets_list.append(torch.stack(tweet_embeddings).mean(dim=0))

    torch.save(torch.stack(tweets_list), f"{path_preprocessed}user_tweets_tensor_{idx}.pt")

26it [16:00:03, 2215.52s/it]


In [48]:
#torch.mean(torch.stack(tweets_list, dim=0), dim=0)

In [49]:
#torch.save(torch.stack(tweets_list), f"{path_preprocessed}user_tweets_tensor.pt")

In [50]:
'''
max_tweets_per_user = 20
tweets_list = []
for i in tqdm(range(len(each_user_tweets))):
    tweets = each_user_tweets[str(i)]
    number_of_tweets = min(max_tweets_per_user, len(tweets))
    
    if len(tweets) == 0:
        mean_feature = torch.zeros(768)
    else:
        tweet_embeddings = []
        for j, tweet in enumerate(tweets[0:number_of_tweets]):
            if not tweet or len(tweet) == 0:
                tweet_embeddings.append(torch.zeros(768))
                continue

            each_tweet_tensor=torch.tensor(text_extraction_pipeline(tweet))
            total_word_tensor = torch.mean(each_tweet_tensor, dim=[0,1])
            tweet_embeddings.append(total_word_tensor)
            
        mean_feature = torch.mean(torch.stack(tweet_embeddings), dim=0)
    tweets_list.append(mean_feature)
''';