In [48]:
import json
import pandas as pd
from pathlib import Path
import numpy as np

In [49]:
import tqdm
from tqdm import tqdm

In [50]:
# 1) For embedding+dimensionality reduction
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

In [109]:
import torch

### Dataset

In [51]:
# Root folder of the TwiBot-22 dump, given relative to this notebook.
DATA_DIR = Path("../Dataset/TwiBot-22")

# Fail fast when the dataset folder is missing instead of erroring later on a file read.
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Data directory {DATA_DIR} does not exist.")

def load_json_records(fname, data_dir=None):
    """Load records from a JSON file stored as one document or as JSON Lines.

    The file is first parsed as a single JSON document (typically one large
    array). If that fails with ``json.JSONDecodeError`` the file is re-read
    as JSON Lines, i.e. one JSON value per line.

    Parameters
    ----------
    fname : str or path-like
        File name, resolved relative to the data directory.
    data_dir : str or path-like, optional
        Directory containing ``fname``. Defaults to the module-level
        ``DATA_DIR`` when omitted, which is the original behaviour.

    Returns
    -------
    object
        Whatever the single-document parse yields (a list for array files),
        or a list with one parsed value per non-blank line for JSON Lines.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    json.JSONDecodeError
        If a non-blank line of the JSON Lines fallback is not valid JSON.
    """
    base_dir = DATA_DIR if data_dir is None else Path(data_dir)
    path = base_dir / fname
    with open(path, 'r', encoding='utf-8') as f:
        try:
            # Common case: the whole file is one JSON document.
            return json.load(f)
        except json.JSONDecodeError:
            # Fallback: one JSON object per line. Rewind first, because the
            # failed parse above has already consumed the stream.
            f.seek(0)
            # Blank / whitespace-only lines (e.g. a trailing empty line) are
            # skipped; json.loads would raise on them.
            return [json.loads(line) for line in f if line.strip()]

In [61]:
# Read the raw user records and tabulate them, one row per record.
user_dicts = load_json_records(fname='user.json')
users_df = pd.DataFrame(data=user_dicts)

### Preprocessing

In [62]:
# Flatten the nested public_metrics dicts into one column per metric.
pm = pd.json_normalize(users_df['public_metrics'])

# Preview the resulting column names and the first rows.
for preview in (pm.columns, pm.head()):
    print(preview)

Index(['followers_count', 'following_count', 'tweet_count', 'listed_count'], dtype='object')
   followers_count  following_count  tweet_count  listed_count
0             7316              215         3098            69
1              123             1090         1823             0
2                3               62           66             0
3              350              577          237             1
4              240              297         3713             8


In [63]:
# Swap the nested public_metrics column for its flattened per-metric columns
# (the two frames are aligned on the row index).
users_df = pd.concat(
    [users_df.drop(columns=['public_metrics']), pm],
    axis='columns',
)

In [64]:
# Flatten the nested entities dicts; nested keys become dotted column names.
ent = pd.json_normalize(users_df['entities'])

# Preview the resulting column names and the first rows.
for preview in (ent.columns, ent.head()):
    print(preview)

Index(['url.urls', 'description.urls', 'description.mentions',
       'description.hashtags', 'description.cashtags'],
      dtype='object')
                                            url.urls  \
0  [{'start': 0, 'end': 23, 'url': 'https://t.co/...   
1                                                NaN   
2                                                NaN   
3                                                NaN   
4                                                NaN   

                                    description.urls  \
0  [{'start': 41, 'end': 64, 'url': 'https://t.co...   
1                                                NaN   
2                                                NaN   
3                                                NaN   
4                                                NaN   

                               description.mentions description.hashtags  \
0                                               NaN                  NaN   
1                                

In [65]:
# Swap the nested entities column for its flattened columns
# (the two frames are aligned on the row index).
users_df = pd.concat(
    [users_df.drop(columns=['entities']), ent],
    axis='columns',
)

In [66]:
# Parse created_at into timezone-aware UTC timestamps.
created_utc = pd.to_datetime(users_df['created_at'], utc=True)
users_df['created_at'] = created_utc

# Reference instant, also tz-aware so it can be subtracted from created_at.
# NOTE(review): this is the wall-clock time of the run, so every age-derived
# feature changes between runs — consider pinning a fixed reference date.
now_utc = pd.Timestamp.now('UTC')

In [67]:
# 1) account age in whole days, measured against the `now_utc` snapshot
users_df['account_age_days'] = (now_utc - users_df['created_at']).dt.days

# 2) average tweets per day over the account's lifetime.
# An account less than one full day old has account_age_days == 0, and a plain
# division would then produce inf (or NaN for 0/0) that flows silently into
# the feature matrix. The denominator is therefore floored at one day; ages
# of one day or more are unaffected, and missing ages stay NaN.
users_df['tweets_per_day'] = (
    users_df['tweet_count'] / users_df['account_age_days'].clip(lower=1)
)

# # 3) binary flags
# users_df['is_verified'] = users_df['verified'].astype(int)
# users_df['is_protected'] = users_df['protected'].astype(int)

# 4) length of bio
# users_df['desc_len'] = users_df['description'].fillna('').str.len()


In [69]:
# Drop the raw created_at timestamp now that account_age_days has been derived
# from it. (Only created_at is removed here.)
users_df = users_df.drop(columns='created_at')

In [71]:
# Replace missing profile image URLs with an empty string.
avatar_urls = users_df['profile_image_url']
users_df['profile_image_url'] = avatar_urls.fillna('')

In [70]:
# Display the current column index as a sanity check after flattening and dropping.
users_df.columns


Index(['description', 'id', 'location', 'name', 'pinned_tweet_id',
       'profile_image_url', 'protected', 'url', 'username', 'verified',
       'withheld', 'followers_count', 'following_count', 'tweet_count',
       'listed_count', 'url.urls', 'description.urls', 'description.mentions',
       'description.hashtags', 'description.cashtags', 'account_age_days',
       'tweets_per_day'],
      dtype='object')

In [104]:
# Cast the two flag columns to integers so they can be used as numeric features.
for flag_col in ('protected', 'verified'):
    users_df[flag_col] = users_df[flag_col].astype(int)

In [None]:
# Progress banner for step 3 of 6.
print("[3/6] Concatenating text fields and embedding…")

# The profile text fields (description, name, username, location) are combined
# into one string per user; safe_str maps any non-string value to "".
def safe_str(x):
    """Return ``x`` unchanged when it is a ``str``; otherwise return ``""``."""
    if not isinstance(x, str):
        return ""
    return x
# Join the four profile text fields into one string per user, separated by two
# spaces; non-string values contribute an empty string via safe_str.
text_fields = ['description', 'name', 'username', 'location']
text_combo = users_df[text_fields[0]].apply(safe_str)
for field in text_fields[1:]:
    text_combo = text_combo + "  " + users_df[field].apply(safe_str)
users_df['text_combo'] = text_combo.fillna('')

# The individual text columns are no longer needed once combined.
users_df = users_df.drop(columns=text_fields)


[3/6] Concatenating text fields and embedding…


In [88]:
# Display (rows, columns) as a sanity check after combining the text fields.
users_df.shape

(1000000, 19)

In [None]:
# Remove the withheld column and the flattened entity columns.
unused_cols = [
    'withheld',
    'url.urls',
    'description.urls',
    'description.mentions',
    'description.hashtags',
    'description.cashtags',
]
users_df = users_df.drop(columns=unused_cols)

In [98]:
# Display the remaining columns before the text, ID and numeric feature steps.
users_df.columns

Index(['id', 'pinned_tweet_id', 'profile_image_url', 'protected', 'url',
       'verified', 'followers_count', 'following_count', 'tweet_count',
       'listed_count', 'account_age_days', 'tweets_per_day', 'text_combo'],
      dtype='object')

### Text Cols

In [100]:
# Load the pretrained SentenceTransformer checkpoint used to embed profile text.
model = SentenceTransformer('all-mpnet-base-v2')
# WHY? mpnet-base gives 768-dim embeddings aligned for sentence similarity.

# Embed in batches to avoid OOM. encode() already batches internally, so the
# batch size is passed through instead of slicing the list by hand. This also
# removes the np.vstack step, which raised on an empty list of texts.
# NOTE(review): batch composition differs from the manual slicing, so values
# may differ from a previous run at floating-point precision — confirm this is
# acceptable before comparing against cached artefacts.
batch_size = 256
texts = users_df['text_combo'].tolist()
embeddings = model.encode(
    texts,
    batch_size=batch_size,
    convert_to_numpy=True,
    show_progress_bar=True,
)
# shape = (num_users, 768)

100%|██████████| 3907/3907 [1:17:20<00:00,  1.19s/it]


In [106]:
# -----------------------------------------------------------------------------
# ——— STEP 4: PCA → 12 DIMS ————————————————————————————————
# -----------------------------------------------------------------------------
# Project the sentence embeddings onto their first 12 principal components;
# random_state is fixed so the PCA step is seeded.
print("[4/6] Reducing embeddings to 12 dims with PCA…")
pca = PCA(n_components=12, random_state=42)
text_feats_12 = pca.fit_transform(embeddings)

[4/6] Reducing embeddings to 12 dims with PCA…


### ID Cols

In [103]:
id_cols = ['id', 'pinned_tweet_id', 'profile_image_url', 'url']

# Normalise the identifier columns to plain strings; missing values in the
# three optional columns become '' before the cast.
for optional_col in ('pinned_tweet_id', 'profile_image_url', 'url'):
    users_df[optional_col] = users_df[optional_col].fillna('').astype(str)
users_df['id'] = users_df['id'].astype(str)

# Column-oriented mapping {column name: list of values in row order}, written
# as JSON next to the notebook.
meta = {name: users_df[name].tolist() for name in id_cols}
with open('node_meta.json', 'w') as f:
    json.dump(meta, f)
print("[✔] Saved ID/meta mapping to node_meta.json")


[✔] Saved ID/meta mapping to node_meta.json


### Numeric Cols

In [105]:
# Columns that enter the model as raw numeric features, in output order.
numeric_cols = [
    'protected',
    'verified',
    'followers_count',
    'following_count',
    'tweet_count',
    'listed_count',
    'account_age_days',
    'tweets_per_day',
]

# Dense float matrix with one row per user and one column per numeric feature.
# NOTE(review): values are used unscaled — confirm the downstream model expects that.
numeric_feats = users_df.loc[:, numeric_cols].to_numpy(dtype=float)
assert numeric_feats.shape[1] == 8

### Final

In [107]:
# -----------------------------------------------------------------------------
# ——— STEP 5: STACK & FORM FINAL (num_users × 20) ————————————————————
# -----------------------------------------------------------------------------
print("[5/6] Stacking numeric + text feats → (N, 20)…")

# Column-wise concatenation: 8 numeric features followed by 12 PCA text features.
final_feats = np.concatenate([numeric_feats, text_feats_12], axis=1)
assert final_feats.shape[1] == 20, "Expected 20 features!"

[5/6] Stacking numeric + text feats → (N, 20)…


In [110]:
# Copy the feature matrix into a float32 torch tensor.
prop_tensor = torch.tensor(data=final_feats, dtype=torch.float32)

# -----------------------------------------------------------------------------
# ——— STEP 6: SAVE FOR RGT ————————————————————————————————
# -----------------------------------------------------------------------------
# Serialise the tensor next to the notebook and report its shape.
out_path = 'num_properties_tensor.pt'
torch.save(prop_tensor, out_path)
print(f"[6/6] Done! Saved property tensor with shape {prop_tensor.shape} to {out_path}")

[6/6] Done! Saved property tensor with shape torch.Size([1000000, 20]) to num_properties_tensor.pt
