# Test the data requirement of metapath2vec
Reference: example using hetero [[Link](https://github.com/pyg-team/pytorch_geometric/blob/master/examples/hetero/metapath2vec.py)].

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Move to the project root so the relative "dbs/..." paths used below resolve.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook on another machine.
%cd D:\mobi-seg-net

D:\mobi-seg-net


In [2]:
import pandas as pd
import torch
import torch_geometric
from torch_geometric.nn import MetaPath2Vec
from torch_geometric.data import HeteroData
import workers
import sqlalchemy
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import numpy as np
from torch_geometric.datasets import AMiner
print(torch.__version__)

2.5.1+cu124


In [3]:
# Data location
# Connection settings are read from the project's key manager.
db_keys = workers.keys_manager['database']
user = db_keys['user']
password = db_keys['password']
port = db_keys['port']
db_name = db_keys['name']
# Assemble the URL from its components instead of interpolating them into a
# string: URL.create escapes characters such as '@', ':' or '/' in the
# credentials, which would otherwise corrupt the connection string.
db_url = sqlalchemy.engine.URL.create(
    drivername='postgresql',
    username=user,
    password=password,
    host='localhost',
    port=port,
    database=db_name,
    query={'gssencmode': 'disable'},
)
engine = sqlalchemy.create_engine(db_url)

## 1. Prepare a subset of individuals

In [19]:
# Load the stop records; every row later becomes one individual-hexagon edge.
df_stops = pd.read_parquet("dbs/cities/stockholm.parquet")
# Report the raw size before any filtering.
print(f"No of edges {len(df_stops)} from {df_stops['device_aid'].nunique()} unique devices")

No of edges 24609483 from 941746 unique devices


In [20]:
# Fetch the device -> group assignments and keep only the devices that also
# appear in the stop records.
group_query = """SELECT device_aid, "group" FROM device_group;"""
df_ib = pd.read_sql(group_query, con=engine)
stop_devices = df_stops['device_aid'].unique()
df_ib = df_ib[df_ib['device_aid'].isin(stop_devices)]

## 2. Get the edges

In [21]:
# Keep only the stops of devices that have a group assignment.
grouped_devices = df_ib['device_aid'].unique()
df_stops = df_stops[df_stops['device_aid'].isin(grouped_devices)]
print(f"No of edges {len(df_stops)} from {df_stops['device_aid'].nunique()} unique devices")

# Assign consecutive integer ids (in order of first appearance) to the
# remaining devices and hexagons; these ids become the graph node indices.
device_ids = df_stops['device_aid'].unique()
hexagon_ids = df_stops['h3_id'].unique()
individuals_mapping = dict(zip(device_ids, range(df_stops['device_aid'].nunique())))
h3_mapping = dict(zip(hexagon_ids, range(df_stops['h3_id'].nunique())))

No of edges 24532983 from 937937 unique devices


### 2.1 Process the H-F edges

In [22]:
# Count how often each POI kind occurs in each hexagon.
# `kind` is exploded first (presumably it holds list-like values), rows with
# missing values are dropped, and the remaining (h3_id, kind) pairs are counted.
df_stops_h = (
    df_stops[['h3_id', 'kind']]
    .explode('kind')
    .dropna()
    .groupby(['h3_id', 'kind'])
    .size()
    .rename('count')
    .reset_index()
)
pair_counts = df_stops_h['count']
print(df_stops_h.shape, pair_counts.max(), pair_counts.min())

(7100, 3) 1661358 1


In [23]:
def count_norm(data, scale=10):
    """Add a `count_n` column holding each row's share of the total count.

    The share ``count / count.sum()`` is multiplied by `scale` and rounded up,
    so every row with a positive count gets a weight of at least 1.

    Args:
        data: DataFrame with a numeric `count` column (one group when used
            with ``groupby(...).apply``).
        scale: Multiplier applied to the share before rounding up. The
            default of 10 reproduces the original hard-coded behaviour.

    Returns:
        A new DataFrame with the extra `count_n` column. The input frame is
        left untouched, because mutating the object handed to
        ``groupby.apply`` is not supported by pandas.
    """
    share = data['count'] / data['count'].sum()
    return data.assign(count_n=np.ceil(share * scale))
# Normalise the counts separately within each hexagon.
df_stops_hs = (
    df_stops_h
    .groupby('h3_id')
    .apply(count_norm, include_groups=False)
    .reset_index()
)

In [24]:
# Turn each normalised weight into that many duplicate (h3_id, kind) rows:
# build a list of `count_n` placeholders per row, explode it, then discard
# the placeholder column.
weights = df_stops_hs['count_n'].astype(int)
df_stops_hs['count_n'] = weights
df_stops_hs['count_r'] = weights.apply(lambda reps: [1] * reps)
df_stops_hs = (
    df_stops_hs[['h3_id', 'kind', 'count_r']]
    .explode('count_r')
    .drop(columns=['count_r'])
)
print(df_stops_hs.shape)

(27230, 2)


In [25]:
# Rows are not de-duplicated here: the repeated (h3_id, kind) rows are what
# later become repeated hexagon-poi edges.
# Assign consecutive integer ids to the POI kinds in order of first appearance.
poi_kinds = df_stops_hs['kind'].unique()
poi_mapping = dict(zip(poi_kinds, range(df_stops_hs['kind'].nunique())))

In [26]:
# Translate raw identifiers into integer node ids for both edge tables.
# Each entry: (frame, raw-id column, target column, lookup dictionary).
id_columns = (
    (df_stops, 'device_aid', 'src_id', individuals_mapping),
    (df_stops, 'h3_id', 'dst_id', h3_mapping),
    (df_stops_hs, 'h3_id', 'src_id', h3_mapping),
    (df_stops_hs, 'kind', 'dst_id', poi_mapping),
)
for frame, raw_col, id_col, lookup in id_columns:
    frame.loc[:, id_col] = frame.loc[:, raw_col].map(lookup)

### 2.2 Process the G-I edges

In [27]:
# Individuals are the destination of the group -> individual edges.
df_ib.loc[:, 'dst_id'] = df_ib.loc[:, 'device_aid'].map(individuals_mapping)
# Node indices must be contiguous and start at 0. Using the raw group label as
# the index leaves index 0 unused when labels start at 1; MetaPath2Vec sizes
# each node type as (max index + 1), so it would allocate an extra, edgeless
# 'group' node and also start random walks from it.
group_index = {label: idx for idx, label in enumerate(sorted(df_ib['group'].unique()))}
df_ib.loc[:, 'src_id'] = df_ib.loc[:, 'group'].map(group_index)

In [28]:
# Number of individuals assigned to each group (shows the class balance).
df_ib.groupby('group').size()

group
1    556165
2     61536
3    254354
4     65882
dtype: int64

In [29]:
# path = 'dbs/AMiner'
# dataset = AMiner(path)
# data_eg = dataset[0]

## 3. Convert to HeteroData object

In [30]:
data = HeteroData()
# Register an index tensor for every node type (no feature matrices are used).
node_indices = {
    'individual': list(individuals_mapping.values()),
    'hexagon': list(h3_mapping.values()),
    'poi': list(poi_mapping.values()),
    'group': list(df_ib['group'].unique()),
}
for node_type, indices in node_indices.items():
    data[node_type].y_index = torch.tensor(indices, dtype=torch.long)

In [31]:
# Every relation is stored in both directions so that walks can traverse it
# either way. Each entry: (edge type, source frame, [source col, target col]).
edge_specs = [
    # group includes individual / individual belongs_to group
    (('group', 'includes', 'individual'), df_ib, ['src_id', 'dst_id']),
    (('individual', 'belongs_to', 'group'), df_ib, ['dst_id', 'src_id']),
    # individual visits hexagon / hexagon visited_by individual
    (('individual', 'visits', 'hexagon'), df_stops, ['src_id', 'dst_id']),
    (('hexagon', 'visited_by', 'individual'), df_stops, ['dst_id', 'src_id']),
    # hexagon contains poi / poi located_in hexagon
    (('hexagon', 'contains', 'poi'), df_stops_hs, ['src_id', 'dst_id']),
    (('poi', 'located_in', 'hexagon'), df_stops_hs, ['dst_id', 'src_id']),
]
for edge_type, frame, columns in edge_specs:
    # Transpose the (num_edges, 2) id table into the (2, num_edges) layout.
    edge_index = torch.tensor(frame[columns].values.T, dtype=torch.long)
    data[edge_type].edge_index = edge_index

In [34]:
# Attach an integer label (the group) to every individual node.
individual_group_dict = df_ib.set_index('device_aid')['group'].to_dict()
# Iterating the mapping yields devices in node-id order, so label i belongs
# to individual node i.
individual_labels = [individual_group_dict[device] for device in individuals_mapping]
data["individual"].y = torch.tensor(individual_labels, dtype=torch.int32)
# Summarise the assembled graph and its labels.
print("Constructed HeteroData object:", data)
print("Individual labels:", data["individual"].y)

Constructed HeteroData object: HeteroData(
  individual={
    y_index=[937937],
    y=[937937],
  },
  hexagon={ y_index=[3146] },
  poi={ y_index=[5] },
  group={ y_index=[4] },
  (group, includes, individual)={ edge_index=[2, 937937] },
  (individual, belongs_to, group)={ edge_index=[2, 937937] },
  (individual, visits, hexagon)={ edge_index=[2, 24532983] },
  (hexagon, visited_by, individual)={ edge_index=[2, 24532983] },
  (hexagon, contains, poi)={ edge_index=[2, 27230] },
  (poi, located_in, hexagon)={ edge_index=[2, 27230] }
)
Individual labels: tensor([1, 1, 1,  ..., 1, 1, 1], dtype=torch.int32)


In [33]:
# Persist the assembled HeteroData object so the graph can be reloaded
# without repeating the preprocessing above.
torch.save(data, 'dbs/cities/graph_data_stockholm.pth')

## 4. Set up the model

In [53]:
# Random-walk schema: group -> individual -> hexagon -> poi and back again,
# so each walk starts and ends on a 'group' node.
metapath = [
    ('group', 'includes', 'individual'),
    ('individual', 'visits', 'hexagon'),
    ('hexagon', 'contains', 'poi'),
    ('poi', 'located_in', 'hexagon'),
    ('hexagon', 'visited_by', 'individual'),
    ('individual', 'belongs_to', 'group'),
]
torch.cuda.empty_cache()

# Device preference: CUDA first, then XPU, otherwise CPU.
if torch.cuda.is_available():
    device_name = 'cuda'
elif torch_geometric.is_xpu_available():
    device_name = 'xpu'
else:
    device_name = 'cpu'
device = torch.device(device_name)
print('Device: {}'.format(device))

# Embedding and walk hyperparameters.
walk_config = dict(
    embedding_dim=64,
    metapath=metapath,
    walk_length=100,
    context_size=7,
    walks_per_node=500,
    num_negative_samples=5,
    sparse=True,  # sparse embedding gradients, paired with SparseAdam below
)
model = MetaPath2Vec(data.edge_index_dict, **walk_config).to(device)

optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

Device: cuda


## 5. Train the model to predict the individual residential segregation level
1- Segregated towards foreign-born

2- Non-segregated

3- Segregated towards native-born

In [55]:
# Batches of positive/negative random walks produced by the model.
loader = model.loader(batch_size=16, shuffle=True, num_workers=0)
def train(epoch, log_steps=10):
    """Run one training epoch over the random-walk loader.

    Args:
        epoch: Epoch number, used only in the log messages.
        log_steps: Print the mean loss every `log_steps` batches.

    Returns:
        The mean loss over all batches of the epoch (NaN if the loader is
        empty). Earlier versions returned None, so callers that ignore the
        return value are unaffected.
    """
    model.train()
    num_steps = len(loader)
    window_loss, window_steps = 0.0, 0
    epoch_loss = 0.0

    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        step_loss = loss.item()
        window_loss += step_loss
        window_steps += 1
        epoch_loss += step_loss

        # Also flush on the last batch: otherwise the trailing partial window
        # is dropped, and an epoch with fewer than `log_steps` batches would
        # log nothing at all.
        if (i + 1) % log_steps == 0 or (i + 1) == num_steps:
            avg_loss = window_loss / window_steps
            print(f'Epoch: {epoch}, Step: {i + 1:03d}/{num_steps}, '
                  f'Loss: {avg_loss:.4f}')
            window_loss, window_steps = 0.0, 0

    return epoch_loss / num_steps if num_steps else float('nan')

@torch.no_grad()
def test(train_ratio=0.5):
    """Evaluate the individual embeddings with the model's built-in test.

    The individuals are shuffled and split into a training part
    (`train_ratio` of the nodes) and a test part; `model.test` then fits a
    classifier on the training embeddings and scores it on the rest.

    Args:
        train_ratio: Fraction of individual nodes used for fitting.

    Returns:
        The score reported by `model.test` on the held-out individuals.
    """
    model.eval()
    embeddings = model('individual', batch=data["individual"].y_index.to(device))
    labels = data["individual"].y

    # Random split of the node positions into train / test parts.
    total = embeddings.size(0)
    shuffled = torch.randperm(total)
    cut = int(total * train_ratio)
    train_idx, test_idx = shuffled[:cut], shuffled[cut:]

    # The built-in evaluation works on CPU tensors.
    return model.test(
        embeddings[train_idx].cpu(),
        labels[train_idx].cpu(),
        embeddings[test_idx].cpu(),
        labels[test_idx].cpu(),
        max_iter=50,
    )

def test_alternative(train_ratio=0.5, model_type='xgboost'):
    """
    Advanced test: we embed individuals, then train a more advanced model
    (XGBoost or Random Forest) on their labels.

    Args:
        train_ratio (float): Ratio of training data.
        model_type (str): Type of model to use ('xgboost' or 'random_forest').

    Returns:
        float: Accuracy of the fitted classifier on the held-out individuals.

    Raises:
        ValueError: If `model_type` is neither 'xgboost' nor 'random_forest'.
    """
    # Fail fast, before the embeddings are computed.
    if model_type not in ('xgboost', 'random_forest'):
        raise ValueError("Unsupported model type. Choose 'xgboost' or 'random_forest'.")

    model.eval()
    # Inference only: no autograd graph is needed for the embeddings.
    with torch.no_grad():
        z = model('individual', batch=data["individual"].y_index.to(device))
    y = data["individual"].y

    num_nodes = z.size(0)
    perm = torch.randperm(num_nodes)
    train_size = int(num_nodes * train_ratio)
    train_mask = perm[:train_size]
    test_mask = perm[train_size:]

    # Prepare data
    x_train = z[train_mask].cpu().numpy()
    x_test = z[test_mask].cpu().numpy()

    # Re-encode the labels as 0..K-1. XGBoost rejects class labels that do not
    # start at 0 (the group labels here start at 1). The encoding is a
    # bijection, so the accuracy is unaffected.
    classes, encoded = np.unique(y.cpu().numpy(), return_inverse=True)
    y_train = encoded[train_mask.numpy()]
    y_test = encoded[test_mask.numpy()]

    # Choose and train the model
    if model_type == 'xgboost':
        # Pick the objective from the number of classes instead of assuming a
        # binary problem.
        objective = 'binary:logistic' if len(classes) <= 2 else 'multi:softprob'
        clf = XGBClassifier(
            n_estimators=100,  # Number of boosting rounds
            max_depth=6,       # Maximum depth of a tree
            learning_rate=0.1, # Learning rate
            objective=objective,
            random_state=42
        )
    else:
        clf = RandomForestClassifier(
            n_estimators=100,  # Number of trees
            max_depth=10,      # Maximum depth of a tree
            random_state=42
        )

    # Train the model
    clf.fit(x_train, y_train)

    # Evaluate the model
    y_pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Test Accuracy ({model_type}): {accuracy:.4f}")
    return accuracy

In [None]:
# Run Training: after every epoch, evaluate the current embeddings.
NUM_EPOCHS = 5
for epoch in range(1, NUM_EPOCHS + 1):
    train(epoch)
    # test_alternative() can be used here instead of test().
    acc = test()
    print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')