# Test the data requirement of metapath2vec
Reference: the PyTorch Geometric heterogeneous-graph example [`metapath2vec.py`](https://github.com/pyg-team/pytorch_geometric/blob/master/examples/hetero/metapath2vec.py).

In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\mobi-seg-net

D:\mobi-seg-net


In [2]:
import pandas as pd
import torch
import torch_geometric
from torch_geometric.nn import MetaPath2Vec
from torch_geometric.data import HeteroData
import workers
import sqlalchemy
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import numpy as np
from torch_geometric.datasets import AMiner
print(torch.__version__)

2.5.1+cu124


In [3]:
# Data location
# The connection URL is assembled from its components rather than by string
# interpolation, so special characters in the credentials (e.g. '@', ':' or
# '/') are escaped instead of corrupting the connection string.
db_keys = workers.keys_manager['database']
user = db_keys['user']
password = db_keys['password']
port = db_keys['port']
db_name = db_keys['name']
db_url = sqlalchemy.engine.URL.create(
    drivername='postgresql',
    username=user,
    password=password,
    host='localhost',
    port=port,
    database=db_name,
    query={'gssencmode': 'disable'},
)
engine = sqlalchemy.create_engine(db_url)

## 1. Prepare a subset of individuals

In [4]:
# Load the stop records and keep only stops that are not flagged as home (home != 1).
df_stops = (
    pd.read_parquet("dbs/stops_pr/stops_pr_0.parquet", columns=['device_aid', 'h3_id', 'home', 'kind'])
    .loc[lambda stops: stops['home'] != 1]
)
# df_stops.drop_duplicates(subset=['device_aid', 'h3_id'], inplace=True)
print(f"No of edges {len(df_stops)} from {df_stops['device_aid'].nunique()} unique devices")

No of edges 1709633 from 33317 unique devices


In [34]:
# Home building of each device, and the attributes of every building.
df_ib = pd.read_sql("""SELECT device_aid, b_id FROM home_building;""", con=engine)
df_bd = pd.read_sql("""SELECT * FROM building_data;""", con=engine)
# Keep devices that appear in the stops, attach the building attributes and
# drop every row with a missing value.
df_ib = (
    df_ib[df_ib['device_aid'].isin(df_stops['device_aid'].unique())]
    .merge(df_bd, on='b_id', how='left')
    .dropna()
)

In [12]:
def ice(ai=None, bi=None, popi=None, share_a=0.8044332515556147, share_b=0.11067529894925136):
    """Share-adjusted contrast between group A and group B within a population.

    Each count is divided by its reference share before the groups are compared;
    everything outside A and B forms the remainder group.

    Args:
        ai: Count of group A.
        bi: Count of group B.
        popi: Total count (A + B + remainder).
        share_a: Reference share of group A.
        share_b: Reference share of group B.

    Returns:
        (A' - B') / (A' + B' + O'), where X' is the count of X divided by its
        reference share. Positive values lean towards A, negative towards B.
    """
    share_o = 1 - share_a - share_b
    scaled_a = ai / share_a
    scaled_b = bi / share_b
    scaled_o = (popi - ai - bi) / share_o
    return (scaled_a - scaled_b) / (scaled_a + scaled_b + scaled_o)

def ice_group(x):
    """Bin an ICE value into one of three ordinal groups.

    Returns:
        0 if x < -0.2, 1 if -0.2 <= x < 0.2, otherwise 2.
    """
    # Upper bounds of groups 0 and 1; anything not below either bound is group 2.
    for group, upper_bound in enumerate((-0.2, 0.2)):
        if x < upper_bound:
            return group
    return 2

### 1.1 Birth background label

In [35]:
# Population with a known birth background (sum of the four origin columns).
df_ib['pop_o'] = df_ib[['sweden', 'nordic', 'eu', 'other']].sum(axis=1)
# Shares and ICE are undefined for a zero population, so drop those rows.
df_ib = df_ib[df_ib['pop_o'] > 0].copy()
df_ib['other_share'] = df_ib['other'] / df_ib['pop_o']
# `ice` is plain arithmetic, so it is evaluated on whole columns at once
# instead of row by row through DataFrame.apply.
df_ib['ice'] = ice(ai=df_ib['sweden'], bi=df_ib['other'], popi=df_ib['pop_o'])
# workers.distr(df_ib, col_name='other_share', x_lb='Share of other', y_lb='Probability density', bin_num=100)

In [36]:
# Quartile group (1-4) of the 'other' share, and the three-level ICE group (0-2).
df_ib.loc[:, 'grp_o'] = pd.qcut(df_ib['other_share'], q=4, labels=[1, 2, 3, 4])
df_ib.loc[:, 'grp_ice_o'] = df_ib['ice'].apply(ice_group)

### 1.2 Income label

In [37]:
# Population with a known income quantile (sum of Q1..Q4).
df_ib['pop_i'] = df_ib[['Q1', 'Q2', 'Q3', 'Q4']].sum(axis=1)
# Shares and ICE are undefined for a zero population, so drop those rows.
df_ib = df_ib[df_ib['pop_i'] > 0].copy()
# Reuse pop_i instead of summing Q1..Q4 a second time.
df_ib['q1_share'] = df_ib['Q1'] / df_ib['pop_i']
# Q1 is contrasted with Q4, each with a reference share of 0.25; `ice` is plain
# arithmetic, so it is evaluated on whole columns rather than via DataFrame.apply.
df_ib['ice_i'] = ice(ai=df_ib['Q1'], bi=df_ib['Q4'], popi=df_ib['pop_i'], share_a=0.25, share_b=0.25)
# workers.distr(df_ib, col_name='q1_share', x_lb='Share of low-income quantile', y_lb='Probability density', bin_num=100)

In [38]:
# Quartile group (1-4) of the Q1 share, and the three-level income ICE group (0-2).
df_ib.loc[:, 'grp_i'] = pd.qcut(df_ib['q1_share'], q=4, labels=[1, 2, 3, 4])
df_ib.loc[:, 'grp_ice_i'] = df_ib['ice_i'].apply(ice_group)

In [39]:
# Keep only the device id and the four group labels.
df_ib = df_ib.loc[:, ['device_aid', 'grp_o', 'grp_ice_o', 'grp_i', 'grp_ice_i']]
# Row count before and after dropping incomplete rows; equal counts mean no label is missing.
n_rows = len(df_ib)
n_complete = len(df_ib.dropna())
print(n_rows, n_complete)

33163 33163


In [40]:
# Number of individuals in each birth-background ICE group.
df_ib.groupby('grp_ice_o').size()

grp_ice_o
0    11018
1    12900
2     9245
dtype: int64

## 2. Get the edges

In [22]:
# Restrict the stops to devices that are present in the label table df_ib.
df_stops = df_stops[df_stops.device_aid.isin(df_ib['device_aid'])]

In [23]:
# Contiguous integer ids (0..n-1) in order of first appearance.
# Missing values are removed before enumerating: unique() keeps NaN while
# nunique() ignores it, so zipping the two would silently drop the last real id
# whenever a NaN is present.
individuals_mapping = {aid: idx for idx, aid in enumerate(df_stops['device_aid'].dropna().unique())}
h3_mapping = {h3: idx for idx, h3 in enumerate(df_stops['h3_id'].dropna().unique())}

In [24]:
# One row per (hexagon, POI kind) pair: expand the list-valued 'kind' column
# and discard rows without a kind.
df_stops_h = df_stops[['h3_id', 'kind']].explode('kind').dropna()
# df_stops_h.drop_duplicates(['h3_id', 'kind'], inplace=True)
# Contiguous integer ids for the POI kinds, in order of first appearance.
poi_kinds = df_stops_h['kind'].unique()
poi_mapping = {kind: idx for idx, kind in enumerate(poi_kinds)}

In [25]:
# Translate raw identifiers into the integer node ids used by the graph.
# assign() returns a new frame, which avoids writing into a filtered slice
# (df_stops) or into a frame whose index has repeated labels after explode()
# (df_stops_h), and makes the cell safe to re-run.
df_stops = df_stops.assign(
    src_id=df_stops['device_aid'].map(individuals_mapping),
    dst_id=df_stops['h3_id'].map(h3_mapping),
)
df_stops_h = df_stops_h.assign(
    src_id=df_stops_h['h3_id'].map(h3_mapping),
    dst_id=df_stops_h['kind'].map(poi_mapping),
)

In [221]:
# path = 'dbs/AMiner'
# dataset = AMiner(path)
# data_eg = dataset[0]

## 3. Convert to HeteroData object

In [41]:
data = HeteroData()
# Register each node type with the integer ids assigned by its mapping.
node_id_maps = {
    'individual': individuals_mapping,
    'hexagon': h3_mapping,
    'poi': poi_mapping,
}
for node_type, id_map in node_id_maps.items():
    data[node_type].y_index = torch.tensor(list(id_map.values()), dtype=torch.long)

In [42]:
# Every relation is stored together with its reverse (source/target columns swapped):
#   individual -visits-> hexagon   /  hexagon -visited_by-> individual
#   hexagon -contains-> poi        /  poi -located_in-> hexagon
edge_specs = [
    (('individual', 'visits', 'hexagon'), df_stops, ['src_id', 'dst_id']),
    (('hexagon', 'visited_by', 'individual'), df_stops, ['dst_id', 'src_id']),
    (('hexagon', 'contains', 'poi'), df_stops_h, ['src_id', 'dst_id']),
    (('poi', 'located_in', 'hexagon'), df_stops_h, ['dst_id', 'src_id']),
]
for edge_type, frame, cols in edge_specs:
    # Shape [2, num_edges]: first row source ids, second row target ids.
    edge_index = torch.tensor(frame[cols].values.T, dtype=torch.long)
    data[edge_type].edge_index = edge_index

In [43]:
# Integer label for each individual node: its birth-background ICE group.
individual_group_dict = df_ib.set_index('device_aid')['grp_ice_o'].to_dict()
# Labels follow the insertion order of individuals_mapping, i.e. the node-id order.
individual_labels = [individual_group_dict[aid] for aid in individuals_mapping]
data["individual"].y = torch.tensor(individual_labels, dtype=torch.int32)
# NOTE: the "Author" wording in the message below is left as is so the output stays unchanged.
print("Constructed HeteroData object:", data)
print("Author labels:", data["individual"].y)

Constructed HeteroData object: HeteroData(
  individual={
    y_index=[33163],
    y=[33163],
  },
  hexagon={ y_index=[47171] },
  poi={ y_index=[5] },
  (individual, visits, hexagon)={ edge_index=[2, 1701194] },
  (hexagon, visited_by, individual)={ edge_index=[2, 1701194] },
  (hexagon, contains, poi)={ edge_index=[2, 12401609] },
  (poi, located_in, hexagon)={ edge_index=[2, 12401609] }
)
Author labels: tensor([2, 1, 1,  ..., 1, 1, 1], dtype=torch.int32)


## 4. Set up the model

In [44]:
# Metapath followed by the random walks: it starts and ends at 'hexagon',
# passing through an individual and then a POI kind, using the four edge
# types stored in `data`.
metapath = [
    ('hexagon', 'visited_by', 'individual'),
    ('individual', 'visits', 'hexagon'),
    ('hexagon', 'contains', 'poi'),
    ('poi', 'located_in', 'hexagon'),
]
torch.cuda.empty_cache()
# Device preference: CUDA first, then XPU, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch_geometric.is_xpu_available():
    device = torch.device('xpu')
else:
    device = torch.device('cpu')
print('Device: {}'.format(device))

model = MetaPath2Vec(
    data.edge_index_dict,
    embedding_dim=64,        # Size of the embedding vector learned per node
    metapath=metapath,
    walk_length=20,
    context_size=4,
    walks_per_node=2,
    num_negative_samples=2,
    sparse=True  # Use a sparse embedding for memory efficiency
).to(device)

# SparseAdam pairs with the sparse=True embedding configured above.
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

Device: cuda


## 5. Train the model to predict the individual residential segregation level
0- Segregated towards foreign-born (ICE < -0.2)

1- Non-segregated (-0.2 <= ICE < 0.2)

2- Segregated towards native-born (ICE >= 0.2)

In [47]:
# Batch loader created by the model; train() unpacks each item as a (pos_rw, neg_rw) pair.
loader = model.loader(batch_size=16, shuffle=True, num_workers=0)
def train(epoch, log_steps=1):
    """Run one pass over `loader`, optimising `model.loss` on every batch.

    Args:
        epoch: Epoch number; only used in the progress message.
        log_steps: Print the loss averaged over the last `log_steps` batches
            every `log_steps` batches.
    """
    model.train()
    running_loss = 0

    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        batch_loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

        # Only report (and reset the accumulator) on every log_steps-th batch.
        if (i + 1) % log_steps != 0:
            continue
        avg_loss = running_loss / log_steps
        print(f'Epoch: {epoch}, Step: {i + 1:03d}/{len(loader)}, '
              f'Loss: {avg_loss:.4f}')
        running_loss = 0

@torch.no_grad()
def test(train_ratio=0.5):
    """Score the individual embeddings with the model's built-in `test` routine.

    The embeddings of all 'individual' nodes are split at random; the first
    `train_ratio` fraction is used for fitting and the rest for scoring.

    Args:
        train_ratio: Fraction of individuals placed in the training split.

    Returns:
        Whatever `model.test` returns for the split (presumably an accuracy
        score; confirm against the MetaPath2Vec documentation).
    """
    model.eval()
    labels = data["individual"].y
    embeddings = model('individual', batch=data["individual"].y_index.to(device))

    # Random permutation of node positions, cut once into train and test parts.
    n_total = embeddings.size(0)
    shuffled = torch.randperm(n_total)
    n_train = int(n_total * train_ratio)
    train_idx, test_idx = shuffled[:n_train], shuffled[n_train:]

    return model.test(
        embeddings[train_idx].cpu(),
        labels[train_idx].cpu(),
        embeddings[test_idx].cpu(),
        labels[test_idx].cpu(),
        max_iter=50,
    )

# Gradients are switched off only around the embedding lookup (see below);
# the classifiers themselves do not use autograd.
def test_alternative(train_ratio=0.5, model_type='xgboost'):
    """
    Evaluate the individual embeddings with a tree-based classifier.

    The current embeddings of all 'individual' nodes are split at random into a
    training and a test part; an XGBoost or Random Forest classifier is fitted
    on the training part and scored on the test part. The split is drawn anew
    on every call, so accuracies vary between calls.

    Args:
        train_ratio (float): Ratio of training data.
        model_type (str): Type of model to use ('xgboost' or 'random_forest').

    Returns:
        float: Accuracy on the held-out individuals (also printed).

    Raises:
        ValueError: If `model_type` is neither 'xgboost' nor 'random_forest'.
    """
    model.eval()
    # Evaluation only reads the embeddings, so no autograd graph is needed.
    with torch.no_grad():
        z = model('individual', batch=data["individual"].y_index.to(device))
    y = data["individual"].y

    num_nodes = z.size(0)
    perm = torch.randperm(num_nodes)
    train_size = int(num_nodes * train_ratio)
    train_mask = perm[:train_size]
    test_mask = perm[train_size:]

    # Prepare data
    x_train = z[train_mask].detach().cpu().numpy()
    y_train = y[train_mask].detach().cpu().numpy()
    x_test = z[test_mask].detach().cpu().numpy()
    y_test = y[test_mask].detach().cpu().numpy()

    # Choose and train the model
    if model_type == 'xgboost':
        # The labels may have more than two classes (the ICE groups are 0/1/2),
        # so the objective is chosen from the number of distinct labels
        # instead of being fixed to the binary one.
        num_classes = int(y.unique().numel())
        objective = 'multi:softprob' if num_classes > 2 else 'binary:logistic'
        clf = XGBClassifier(
            n_estimators=100,  # Number of boosting rounds
            max_depth=6,       # Maximum depth of a tree
            learning_rate=0.1, # Learning rate
            objective=objective,
            random_state=42
        )
    elif model_type == 'random_forest':
        clf = RandomForestClassifier(
            n_estimators=100,  # Number of trees
            max_depth=10,       # Maximum depth of a tree
            random_state=42
        )
    else:
        raise ValueError("Unsupported model type. Choose 'xgboost' or 'random_forest'.")

    # Train the model
    clf.fit(x_train, y_train)

    # Evaluate the model
    y_pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Test Accuracy ({model_type}): {accuracy:.4f}")
    return accuracy

In [48]:
# Run Training
# Five epochs; after each one the individual embeddings are scored with the
# downstream classifier from test_alternative().
for epoch in range(1, 6):
    train(epoch)
    acc = test_alternative()
    # acc = test()
    print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')

Epoch: 1, Step: 001/2949, Loss: 0.9206
Epoch: 1, Step: 002/2949, Loss: 0.8994
Epoch: 1, Step: 003/2949, Loss: 0.8915
Epoch: 1, Step: 004/2949, Loss: 0.9089
Epoch: 1, Step: 005/2949, Loss: 0.8435
Epoch: 1, Step: 006/2949, Loss: 0.9167
Epoch: 1, Step: 007/2949, Loss: 0.9073
Epoch: 1, Step: 008/2949, Loss: 0.9114
Epoch: 1, Step: 009/2949, Loss: 0.8869
Epoch: 1, Step: 010/2949, Loss: 0.8480
Epoch: 1, Step: 011/2949, Loss: 0.9537
Epoch: 1, Step: 012/2949, Loss: 0.8660
Epoch: 1, Step: 013/2949, Loss: 0.8593
Epoch: 1, Step: 014/2949, Loss: 0.8359
Epoch: 1, Step: 015/2949, Loss: 0.8936
Epoch: 1, Step: 016/2949, Loss: 0.9069
Epoch: 1, Step: 017/2949, Loss: 0.8478
Epoch: 1, Step: 018/2949, Loss: 0.8746
Epoch: 1, Step: 019/2949, Loss: 0.8895
Epoch: 1, Step: 020/2949, Loss: 0.8255
Epoch: 1, Step: 021/2949, Loss: 0.8640
Epoch: 1, Step: 022/2949, Loss: 0.9123
Epoch: 1, Step: 023/2949, Loss: 0.8641
Epoch: 1, Step: 024/2949, Loss: 0.8651
Epoch: 1, Step: 025/2949, Loss: 0.8892
Epoch: 1, Step: 026/2949,