In [None]:
import pandas as pd
import torch
from tqdm import tqdm
import numpy as np
import datetime as dt

In [None]:
# Root directory of the dataset. Keep the trailing slash: the file names
# below are appended to this string directly.
path = "/Twibot-22/"

# Filtered user records plus the per-user label and split tables.
user = pd.read_json(f"{path}filtered_user.json")
label = pd.read_csv(f"{path}filtered_label.csv")
split = pd.read_csv(f"{path}filtered_split.csv")

In [None]:
# Column of user ids, in the row order of `user`.
user_idx = user['id']

# id -> row position, keyed by the raw array elements of the id column.
uid_index = dict(zip(user_idx.values, range(len(user_idx))))

# Row position -> id, and the inverse lookup built from that list.
user_index_to_uid = list(user['id'])
uid_to_user_index = dict(zip(user_index_to_uid, range(len(user_index_to_uid))))

LABEL AND TRAIN, TEST, VAL SPLIT

In [None]:
# Lookup tables keyed by user id: id -> label string, id -> split name.
uid_label = dict(zip(label['id'].values, label['label'].values))
uid_split = dict(zip(split['id'].values, split['split'].values))

label_new = []
train_idx, test_idx, val_idx = [], [], []

# Row positions are routed by split name; any name other than
# 'train' / 'test' falls through to the validation bucket.
split_buckets = {'train': train_idx, 'test': test_idx}

for i, uid in enumerate(tqdm(user_idx.values)):
    single_label = uid_label[uid]
    single_split = uid_split[uid]
    # 'human' is encoded as 0, every other label as 1.
    label_new.append(0 if single_label == 'human' else 1)
    split_buckets.get(single_split, val_idx).append(i)

labels = torch.LongTensor(label_new)
train_mask = torch.LongTensor(train_idx)
valid_mask = torch.LongTensor(val_idx)
test_mask = torch.LongTensor(test_idx)

# Write the three index tensors and the label tensor to the working directory.
for tensor, filename in (
    (train_mask, "filtered_train_idx.pt"),
    (valid_mask, "filtered_val_idx.pt"),
    (test_mask, "filtered_test_idx.pt"),
    (labels, 'filtered_label.pt'),
):
    torch.save(tensor, filename)

NUMERICAL PROPERTIES

In [None]:
print('extracting num_properties')


def _metric(metrics, key):
    """Return ``metrics[key]``, or 0 when the record is not a dict or the value is missing."""
    if isinstance(metrics, dict):
        value = metrics.get(key)
        if value is not None:
            return value
    return 0


def _text_length(text):
    """Return ``len(text)`` for strings and 0 for anything else (None, NaN, ...)."""
    return len(text) if isinstance(text, str) else 0


def _standardize(values):
    """Z-score ``values`` column-wise and return the result as a float32 tensor.

    ``values`` is wrapped in a DataFrame, so a flat sequence (or a
    single-column frame) of N entries yields a tensor of shape (N, 1).
    A column whose standard deviation is zero or undefined (constant data,
    or a single row) is divided by 1 instead, so it comes out as zeros
    rather than NaN.
    """
    frame = pd.DataFrame(values)
    spread = frame.std().replace(0, 1).fillna(1)
    scaled = (frame - frame.mean()) / spread
    return torch.tensor(np.array(scaled), dtype=torch.float32)


# --- raw per-user values ----------------------------------------------------
public_metrics = user['public_metrics']
following_count = [_metric(each, 'following_count') for each in public_metrics]
statues = [_metric(each, 'tweet_count') for each in public_metrics]
followers_count = [_metric(each, 'followers_count') for each in public_metrics]

num_username = [_text_length(each) for each in user['username']]
screen_name_length = [_text_length(each) for each in user['name']]

# utc=True makes every timestamp timezone-aware so it can be subtracted from
# the timezone-aware reference date below.
created_at = pd.to_datetime(user['created_at'], unit='s', utc=True)

# Reference date 2020-09-05 00:00:00 UTC. Built directly instead of parsed:
# `strptime` is a method of `dt.datetime`, not of the `datetime` module, and
# the %a/%b/%X directives are locale-dependent.
date0 = dt.datetime(2020, 9, 5, tzinfo=dt.timezone.utc)

# Whole days between account creation and the reference date; a missing
# creation time (NaT -> NaN days) is filled with 1.
active_days = [(date0 - each).days for each in created_at]
active_days = pd.DataFrame(active_days).fillna(1).astype(np.float32)

# --- standardise each feature into an (N, 1) float32 tensor -----------------
followers_count = _standardize(followers_count)
active_days = _standardize(active_days)
screen_name_length = _standardize(screen_name_length)
following_count = _standardize(following_count)
statues = _standardize(statues)

num_properties_tensor = torch.cat(
    [followers_count, active_days, screen_name_length, following_count, statues],
    dim=1,
)

# Cell output: counts of rows by their per-column NaN pattern (sanity check).
pd.DataFrame(num_properties_tensor.detach().numpy()).isna().value_counts()

CATEGORICAL PROPERTIES

In [None]:
print('extracting cat_properties')
protected = user['protected']
verified = user['verified']

# Flags become 1 when the value compares equal to True, otherwise 0.
protected_list = [1 if flag == True else 0 for flag in protected]
verified_list = [1 if flag == True else 0 for flag in verified]

# A missing URL, an empty URL, or this specific placeholder URL is encoded
# as 1; any other value is encoded as 0.
_placeholder_avatar_url = 'https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png'
default_profile_image = [
    1 if url is None or url in (_placeholder_avatar_url, '') else 0
    for url in user['profile_image_url']
]

protected_tensor = torch.tensor(protected_list, dtype=torch.float)
verified_tensor = torch.tensor(verified_list, dtype=torch.float)
default_profile_image_tensor = torch.tensor(default_profile_image, dtype=torch.float)

# Turn each flat vector into a (size, 1) column and join them side by side,
# giving one row per user and one column per categorical feature.
size = len(user)
cat_properties_tensor = torch.cat(
    [
        column.reshape([size, 1])
        for column in (protected_tensor, verified_tensor, default_profile_image_tensor)
    ],
    dim=1,
)
