In [2]:
import torch
from torch_geometric.data import Data
import pandas as pd
import numpy as np
from tqdm import tqdm
import dask.dataframe as dd

# Load label (node) data
print("Loading label data...")
label_file = 'D:/Twitter-Bot-Detection-Model/ml_model/dataset/label.csv'
labels_df = pd.read_csv(label_file, dtype={'id': 'object', 'label': 'category'})

# Convert user IDs to integers more efficiently
labels_df['id'] = pd.to_numeric(labels_df['id'].str[1:], errors='coerce')

# Create a mapping from old ID to new index using a dictionary comprehension
id_to_index = {old_id: new_index for new_index, old_id in enumerate(labels_df['id'])}

# Map bot/human labels to binary (bot = 0, human = 1)
label_map = {'bot': 0, 'human': 1}
labels_df['label'] = labels_df['label'].map(label_map)

# Create node labels tensor
num_nodes = len(id_to_index)
y = torch.zeros(num_nodes, dtype=torch.long)
y[labels_df['id'].map(id_to_index)] = torch.tensor(labels_df['label'].values, dtype=torch.long)

# Load edge data using pandas for initial processing
print("Loading edge data...")
edge_file = 'D:/Twitter-Bot-Detection-Model/ml_model/dataset/edge.csv'
chunks = pd.read_csv(edge_file, dtype={'source_id': 'object', 'target_id': 'object'}, chunksize=1000000)

# Process edges
print("Processing edges...")
edge_list = []
for chunk in tqdm(chunks, desc="Processing edge chunks"):
    chunk['source_id'] = pd.to_numeric(chunk['source_id'].str[1:], errors='coerce')
    chunk['target_id'] = pd.to_numeric(chunk['target_id'].str[1:], errors='coerce')
    
    # Filter out rows with NaN values
    chunk = chunk.dropna()
    
    source_indices = chunk['source_id'].map(id_to_index)
    target_indices = chunk['target_id'].map(id_to_index)
    
    # Filter out edges that don't have corresponding nodes
    mask = source_indices.notna() & target_indices.notna()
    source_indices = source_indices[mask].astype(int)
    target_indices = target_indices[mask].astype(int)
    
    edge_list.append(np.array([source_indices, target_indices]))

# Combine edge parts
edge_index = np.concatenate(edge_list, axis=1)

# Convert to torch tensor
edge_index = torch.tensor(edge_index, dtype=torch.long)

# Create the PyTorch-Geometric data object
data = Data(edge_index=edge_index, y=y)

print(f"Graph Data Object: {data}")
print(f"Number of nodes: {data.num_nodes}")
print(f"Number of edges: {data.num_edges}")

Loading label data...
Loading edge data...
Processing edges...


Processing edge chunks: 171it [09:58,  3.50s/it]


Graph Data Object: Data(edge_index=[2, 3745300], y=[1000000])
Number of nodes: 996093
Number of edges: 3745300


