# Test the data requirement of metapath2vec
Reference: the PyTorch Geometric MetaPath2Vec example for heterogeneous graphs ([link](https://github.com/pyg-team/pytorch_geometric/blob/master/examples/hetero/metapath2vec.py)).

In [10]:
%load_ext autoreload
%autoreload 2
%cd D:\mobi-seg-net

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
D:\mobi-seg-net


In [11]:
import workers
import graph_workers
import sqlalchemy

In [12]:
# Data location: connection settings come from the project's key store.
db_settings = workers.keys_manager['database']
user, password, port, db_name = (
    db_settings[field] for field in ('user', 'password', 'port', 'name')
)
# NOTE(review): the credentials are interpolated into the URL unescaped; a
# password containing characters such as '@', ':' or '/' would need
# URL-encoding first - confirm the stored values are safe.
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable')

## 1. Prepare graph

In [None]:
# Build the city-data wrapper from the two Stockholm input files (paths are
# relative to the working directory set at the top of the notebook), then run
# its edge-processing step before any graph is constructed.
# NOTE(review): what edges_processing() computes is defined in graph_workers
# and is not visible here.
cdtg = graph_workers.CityDataToGraph(data_path='dbs/cities/stockholm.parquet',
                                     data_space_group='dbs/cities/stockholm_space_group.csv')
cdtg.edges_processing()

In [42]:
# Graph variant flags. Both are False here, i.e. this is NOT the "basic"
# variant; the same two values are passed to graph_workers.paths_design and
# stored in the parameter dict further down, so they must stay in sync.
# NOTE(review): the exact meaning of `basic` / `group_node` is defined in
# graph_workers - presumably they toggle POI nodes and group nodes; confirm.
basic, group_node = False, False
# Build the heterogeneous graph on cdtg; the recorded output shows the
# constructed HeteroData being printed by this step.
cdtg.hetero_graph_maker(basic=basic, group_node=group_node)
# Attach prediction targets for both individuals and spatial units; the
# recorded output shows individual and hexagon label tensors being printed.
cdtg.prediction_target(individual=True, space=True)

Constructed HeteroData object: HeteroData(
  individual={
    y_index=[937937],
    y=[937937],
  },
  hexagon={
    y_index=[3146],
    y=[3146],
  },
  poi={ y_index=[5] },
  (individual, visits, hexagon)={ edge_index=[2, 24532983] },
  (hexagon, visited_by, individual)={ edge_index=[2, 24532983] },
  (hexagon, contains, poi)={ edge_index=[2, 27230] },
  (poi, located_in, hexagon)={ edge_index=[2, 27230] }
)
Individual labels: tensor([1, 1, 1,  ..., 1, 1, 1], dtype=torch.int32)
Hexagon labels: tensor([0, 1, 1,  ..., 0, 0, 0], dtype=torch.int32)


In [45]:
# Hyper-parameters handed to the embedding model in the model-setup cell.
parameter_set = dict(
    walks_per_node=100,
    embedding_dim=64,
    walk_length=40,
    context_size=7,
    num_negative_samples=5,
)
# Independent copy of the settings, extended with the two graph-variant flags
# so the full configuration of this run is kept in one place.
para = {**parameter_set, 'basic': basic, 'group_node': group_node}

## 2. Set up the model

In [46]:
# Metapath matching the graph variant selected by the two flags.
paths = graph_workers.paths_design(basic=basic, group_node=group_node)
print(paths)

# Wrap the graph built above and configure the embedding model with the
# hyper-parameters stored in `para`.
ge = graph_workers.Graph2EmbeddingSpace(graph=cdtg.graph)
model_kwargs = {
    name: para[name]
    for name in ('walks_per_node', 'embedding_dim', 'walk_length',
                 'context_size', 'num_negative_samples')
}
ge.model_init(metapath=paths, batch_size=16, **model_kwargs)

[('individual', 'visits', 'hexagon'), ('hexagon', 'contains', 'poi'), ('poi', 'located_in', 'hexagon'), ('hexagon', 'visited_by', 'individual')]
Device: cuda


## 3. Train the model

In [None]:
# Train for at most `max_epochs` epochs; train_r's return value is treated as
# an early-stopping signal and ends the loop as soon as it is truthy.
max_epochs = 6
train_options = dict(log_steps=50, patience=5, min_delta=0.0005)
for epoch in range(max_epochs):
    should_stop = ge.train_r(epoch, **train_options)
    if should_stop:
        break

In [32]:
# Sanity check of the data loader: walk through it once and print the index of
# every batch. Each batch is unpacked as a pair (presumably positive and
# negative random walks - confirm against graph_workers). The recorded output
# shows a single index, 0.
for i, (pos_rw, neg_rw) in enumerate(ge.loader):
    print(i)

0


## 4. Prediction tasks

In [7]:
# Task 1: classify individuals from their embeddings. prediction_task returns
# three values (unpacked here as accuracy, classification report, confusion
# matrix); the recorded output shows the same three items printed by the call.
print("Predict individual labels:")
accuracy_i, c_report_i, c_matrix_i = ge.prediction_task(individual=True)

# Task 2: same call with individual=False, i.e. the hexagon-level target.
print("Predict hexagon's presence of transit stations:")
accuracy_h, c_report_h, c_matrix_h = ge.prediction_task(individual=False)

Predict individual labels:
Accuracy: 0.3255
Classification Report:
              precision    recall  f1-score   support

           1       0.64      0.45      0.53    111306
           2       0.07      0.17      0.10     12275
           3       0.28      0.05      0.09     50891
           4       0.09      0.47      0.15     13116

    accuracy                           0.33    187588
   macro avg       0.27      0.29      0.22    187588
weighted avg       0.46      0.33      0.35    187588

Confusion Matrix:
[[50056 17242  5625 38383]
 [ 4469  2106   710  4990]
 [20177  8348  2762 19604]
 [ 3843  2331   801  6141]]
Predict hexagon's presence of transit stations:
Accuracy: 0.8127
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.84       348
           1       0.82      0.75      0.78       282

    accuracy                           0.81       630
   macro avg       0.81      0.81      0.81       630
weighted av

In [22]:
# Collect the metrics of both prediction tasks in one dict for persistence.
# NOTE(review): 'basic' is hard-coded to 1 although the graph cell above sets
# `basic = False` - confirm which value this result set should record.
result = {
    'accuracy_i': accuracy_i,
    'c_report_i': c_report_i,
    'c_matrix_i': c_matrix_i,
    'accuracy_h': accuracy_h,
    'c_report_h': c_report_h,
    'c_matrix_h': c_matrix_h,
    'basic': 1,
}

In [25]:
import pickle

# Serialize the collected metrics to disk (path is relative to the working
# directory set at the top of the notebook).
result_path = 'dbs/embeddings/result_set1.pickle'
with open(result_path, 'wb') as sink:
    pickle.dump(result, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
# Load data (deserialize) the stored metrics back into `result`.
# NOTE: pickle.load can execute arbitrary code; only open files this project
# wrote itself.
result_path = 'dbs/embeddings/result_set1.pickle'
with open(result_path, 'rb') as source:
    result = pickle.load(source)

{'accuracy_i': 0.32552721922511035, 'c_report_i': '              precision    recall  f1-score   support\n\n           1       0.64      0.45      0.53    111306\n           2       0.07      0.17      0.10     12275\n           3       0.28      0.05      0.09     50891\n           4       0.09      0.47      0.15     13116\n\n    accuracy                           0.33    187588\n   macro avg       0.27      0.29      0.22    187588\nweighted avg       0.46      0.33      0.35    187588\n', 'c_matrix_i': array([[50056, 17242,  5625, 38383],
       [ 4469,  2106,   710,  4990],
       [20177,  8348,  2762, 19604],
       [ 3843,  2331,   801,  6141]], dtype=int64), 'accuracy_h': 0.8126984126984127, 'c_report_h': '              precision    recall  f1-score   support\n\n           0       0.81      0.86      0.84       348\n           1       0.82      0.75      0.78       282\n\n    accuracy                           0.81       630\n   macro avg       0.81      0.81      0.81       63

In [10]:
import pandas as pd

In [15]:
# Embeddings, node indices and labels of the individual nodes, moved to NumPy.
individual_store = ge.graph["individual"]
z = ge.model('individual', batch=individual_store.y_index.to(ge.device)).cpu().detach().numpy()
y = individual_store.y_index.cpu().detach().numpy()
g = individual_store.y.cpu().detach().numpy()

# Invert the mapping so a node index can be translated back to its original
# key (presumably the device identifier, given the target column name).
i_reverse_mapping = {
    node_idx: original_key
    for original_key, node_idx in cdtg.individuals_mapping.items()
}

# One row per node: embedding dimensions x0..x{d-1}, node index, original id
# and label.
embedding_columns = [f'x{i}' for i in range(z.shape[1])]
df_res = pd.DataFrame(z, columns=embedding_columns)
df_res.loc[:, 'y'] = y
df_res.loc[:, 'device_aid'] = df_res['y'].apply(lambda node_idx: i_reverse_mapping[node_idx])
df_res.loc[:, 'group'] = g

In [17]:
# Embeddings, node indices and labels of the hexagon nodes, moved to NumPy.
hexagon_store = ge.graph["hexagon"]
z = ge.model('hexagon', batch=hexagon_store.y_index.to(ge.device)).cpu().detach().numpy()
y = hexagon_store.y_index.cpu().detach().numpy()
g = hexagon_store.y.cpu().detach().numpy()

# Invert the mapping so a node index can be translated back to its original
# key (the recorded output shows H3 cell ids in the resulting column).
h_reverse_mapping = {
    node_idx: original_key
    for original_key, node_idx in cdtg.h3_mapping.items()
}

# One row per node: embedding dimensions x0..x{d-1}, node index, H3 id and
# label.
embedding_columns = [f'x{i}' for i in range(z.shape[1])]
df_resh = pd.DataFrame(z, columns=embedding_columns)
df_resh.loc[:, 'y'] = y
df_resh.loc[:, 'h3_id'] = df_resh['y'].apply(lambda node_idx: h_reverse_mapping[node_idx])
df_resh.loc[:, 'group'] = g

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x57,x58,x59,x60,x61,x62,x63,y,h3_id,group
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8808866e53fffff,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,8808867547fffff,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,8808866e57fffff,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,8808866e4bfffff,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,880886618dfffff,1
