This script is modified from `notebooks/monarch_gnn/3_create_dataset_pair_tensor_nd_validate.ipynb`

In [21]:
import sys
sys.path.insert(0, '/home/rahit/projects/def-mtarailo/rahit/from_scratch/modspy-data/src/modspy_data')

In [22]:
import numpy as np
import pandas as pd
import torch
import optuna
from comet_ml import Experiment
from pytorch_lightning.loggers import CometLogger
from torch_geometric.transforms import AddSelfLoops
import wandb


import dask
import dask.dataframe as dd

from models.embed import MetaPath2VecLightningModule

In [2]:
monarch = torch.load('./data/05_model_input/2024-02-monarch_heterodata_v1.pt', map_location='cpu')

print(
    f""" 
Total nodes: {monarch.num_nodes}
Total node types: {len(monarch.node_types)}

Total edges: {monarch.num_edges}
Total edge types: {len(monarch.edge_types)}                
"""
)

# # Adding self loops to avoid 1. nodes without any edge, 2. consider intragenic modifier
# transform = AddSelfLoops()
# monarch = transform(monarch)

 
Total nodes: 862115
Total node types: 88

Total edges: 11412471
Total edge types: 289                



In [3]:
dataset_df = pd.read_csv('./data/04_feature/2024-06-11-modifier_merged_dataset.tsv', sep="\t")
# dataset_df = catalog.load("modifiers")

In [4]:
dataset_df['species'] = 'Homo sapiens'
dataset_df.loc[dataset_df['datasource']=='MTG', 'species'] = 'Caenorhabditis elegans'

# Reordering important columns
desired_first_columns = ['datasource', 'species', 'modifier_gene_symbol',
                         'target_gene_symbol', 'is_modifier'] 
remaining_columns = [col for col in dataset_df.columns if col not in desired_first_columns]
new_column_order = desired_first_columns + remaining_columns
dataset_df = dataset_df[new_column_order]
dataset_df.head()


Unnamed: 0,datasource,species,modifier_gene_symbol,target_gene_symbol,is_modifier,go_n_common_ancestors_max,go_n_common_ancestors_avg,go_n_common_ancestors_bma,go_n_union_ancestors_max,go_n_union_ancestors_avg,...,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
0,JVL,Homo sapiens,ETV1,ATR,1.0,59.0,2.960317,6.109155,101.0,28.846782,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,JVL,Homo sapiens,DCLRE1C,BRCA1,1.0,22.0,4.16309,7.260417,75.0,29.659375,...,0.0,80.0,0.0,292.0,0.0,540.0,0.0,321.0,83.0,779.0
2,JVL,Homo sapiens,SMARCAL1,BRCA1,1.0,45.0,5.598765,10.268041,87.0,31.152206,...,0.0,49.0,62.0,0.0,0.0,0.0,0.0,506.0,204.0,602.0
3,JVL,Homo sapiens,TRIP13,BRCA1,1.0,43.0,4.51511,11.19802,128.0,39.033929,...,0.0,243.0,110.0,0.0,0.0,0.0,0.0,285.0,95.0,505.0
4,JVL,Homo sapiens,MUS81,BRCA2,1.0,29.0,4.599469,9.086957,88.0,28.369439,...,0.0,0.0,61.0,0.0,0.0,500.0,0.0,768.0,350.0,919.0


In [5]:
nodes_df = dd.read_parquet('./data/02_intermediate/monarch/nodes_with_type_idx')  
edges_df = dd.read_parquet('./data/02_intermediate/monarch/edges_pre_df_reduction_v2')


In [6]:
print(nodes_df.index.shape[0].compute())
display(nodes_df.head())

# Use dataset's source and target gene id to pull the `type_index` column From `nodes_df``

862115


Unnamed: 0_level_0,category,name,in_taxon,in_taxon_label,symbol,type_index
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
APO:0000017,biolink:PhenotypicFeature,,,,,0
BFO:0000001,biolink:NamedThing,entity,,,,0
BFO:0000002,biolink:NamedThing,continuant,,,,1
BFO:0000003,biolink:BiologicalProcessOrActivity,occurrent,,,,0
BFO:0000004,biolink:NamedThing,independent continuant,,,,2


In [7]:
nodes_df[(nodes_df['category']=='biolink:Gene') & (nodes_df['symbol']=='APOE') & (nodes_df['in_taxon_label']=='Homo sapiens')].compute()

Unnamed: 0_level_0,category,name,in_taxon,in_taxon_label,symbol,type_index
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HGNC:613,biolink:Gene,APOE,NCBITaxon:9606,Homo sapiens,APOE,516790


In [8]:
nodes_df['in_taxon_label'].unique().compute()

0                              <NA>
1                      Homo sapiens
2                      Mus musculus
3                     Gallus gallus
4                        Bos taurus
5                 Rattus norvegicus
0           Drosophila melanogaster
1                        Sus scrofa
2            Canis lupus familiaris
3          Dictyostelium discoideum
4         Schizosaccharomyces pombe
5    Saccharomyces cerevisiae S288C
6            Caenorhabditis elegans
7                    Xenopus laevis
8                Xenopus tropicalis
9                       Danio rerio
Name: in_taxon_label, dtype: string

### Merging Node Index from Monarch

In [9]:
nodes_pdf = nodes_df.compute()
_dataset = dataset_df.merge(nodes_pdf, how='left', left_on=['species','target_gene_symbol'], right_on=['in_taxon_label','symbol'])
_dataset.rename(columns={'type_index': 'target_type_index'}, inplace=True)
_dataset.drop(columns=['category','name','in_taxon_label','in_taxon','symbol'], inplace=True)
dataset_w_ninfo = _dataset.merge(nodes_pdf, how='left', left_on=['species','modifier_gene_symbol'], right_on=['in_taxon_label','symbol'])
dataset_w_ninfo.rename(columns={'type_index': 'modifier_type_index'}, inplace=True)
dataset_w_ninfo.drop(columns=['name','in_taxon_label','symbol'], inplace=True)
del(_dataset)
del(nodes_pdf)
dataset_w_ninfo

Unnamed: 0,datasource,species,modifier_gene_symbol,target_gene_symbol,is_modifier,go_n_common_ancestors_max,go_n_common_ancestors_avg,go_n_common_ancestors_bma,go_n_union_ancestors_max,go_n_union_ancestors_avg,...,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score,target_type_index,category,in_taxon,modifier_type_index
0,JVL,Homo sapiens,ETV1,ATR,1.0,59.0,2.960317,6.109155,101.0,28.846782,...,0.0,0.0,0.0,0.0,0.0,0.0,517533,biolink:Gene,NCBITaxon:9606,523802.0
1,JVL,Homo sapiens,DCLRE1C,BRCA1,1.0,22.0,4.163090,7.260417,75.0,29.659375,...,0.0,540.0,0.0,321.0,83.0,779.0,518104,biolink:Gene,NCBITaxon:9606,521725.0
2,JVL,Homo sapiens,SMARCAL1,BRCA1,1.0,45.0,5.598765,10.268041,87.0,31.152206,...,0.0,0.0,0.0,506.0,204.0,602.0,518104,biolink:Gene,NCBITaxon:9606,551530.0
3,JVL,Homo sapiens,TRIP13,BRCA1,1.0,43.0,4.515110,11.198020,128.0,39.033929,...,0.0,0.0,0.0,285.0,95.0,505.0,518104,biolink:Gene,NCBITaxon:9606,555869.0
4,JVL,Homo sapiens,MUS81,BRCA2,1.0,29.0,4.599469,9.086957,88.0,28.369439,...,0.0,500.0,0.0,768.0,350.0,919.0,518106,biolink:Gene,NCBITaxon:9606,537363.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5791,MTG,Caenorhabditis elegans,egl-13,zyg-1,0.0,8.0,2.379699,4.038462,162.0,31.414583,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,biolink:Gene,NCBITaxon:6239,82814.0
5792,MTG,Caenorhabditis elegans,F41C6.7,zyg-1,0.0,3.0,1.646259,1.851064,112.0,21.500000,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,biolink:Gene,NCBITaxon:6239,95948.0
5793,MTG,Caenorhabditis elegans,F48E3.8,zyg-1,0.0,5.0,1.589286,2.121212,110.0,21.241667,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,,,
5794,MTG,Caenorhabditis elegans,C39D10.7,zyg-1,0.0,3.0,1.653846,1.947368,103.0,17.625000,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,biolink:Gene,NCBITaxon:6239,94433.0


### Did we detect all IDs for the genes?

In [10]:
print(dataset_w_ninfo['target_type_index'].unique().shape == dataset_w_ninfo['target_gene_symbol'].unique().shape)
print(dataset_w_ninfo['modifier_type_index'].unique().shape == dataset_w_ninfo['modifier_gene_symbol'].unique().shape)

True
False


In [11]:
print(f"{dataset_w_ninfo['modifier_gene_symbol'].unique().shape[0] - dataset_w_ninfo['modifier_type_index'].unique().shape[0]} genes are missing/not recognized from Monarch")

254 genes are missing/not recognized from Monarch


In [12]:
print(f"Unrecognized Modifiers:")
dataset_w_ninfo[(dataset_w_ninfo['modifier_type_index'].isna() & (dataset_w_ninfo['is_modifier']==1))]

Unrecognized Modifiers:


Unnamed: 0,datasource,species,modifier_gene_symbol,target_gene_symbol,is_modifier,go_n_common_ancestors_max,go_n_common_ancestors_avg,go_n_common_ancestors_bma,go_n_union_ancestors_max,go_n_union_ancestors_avg,...,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score,target_type_index,category,in_taxon,modifier_type_index
672,MTG,Caenorhabditis elegans,mat-2,zyg-1,1.0,34.0,3.151899,7.222222,125.0,27.442857,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,,,
2317,MTG,Caenorhabditis elegans,apc-17,zyg-1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,,,


In [13]:
print(f"Missing indecies")
dataset_w_ninfo[(dataset_w_ninfo['modifier_type_index'].isna()) | (dataset_w_ninfo['target_type_index'].isna())]

Missing indecies


Unnamed: 0,datasource,species,modifier_gene_symbol,target_gene_symbol,is_modifier,go_n_common_ancestors_max,go_n_common_ancestors_avg,go_n_common_ancestors_bma,go_n_union_ancestors_max,go_n_union_ancestors_avg,...,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score,target_type_index,category,in_taxon,modifier_type_index
499,OLIDA,Homo sapiens,NOT_HGNC 4q35,SMCHD1,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,551559,,,
623,MTG,Caenorhabditis elegans,T20B6.3,zyg-1,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,,,
642,MTG,Caenorhabditis elegans,Y51A2D.7,zyg-1,0.0,8.0,2.774194,3.031250,121.0,31.575000,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,,,
672,MTG,Caenorhabditis elegans,mat-2,zyg-1,1.0,34.0,3.151899,7.222222,125.0,27.442857,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,,,
690,MTG,Caenorhabditis elegans,Y71H2AM.13,zyg-1,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5661,MTG,Caenorhabditis elegans,W08E12.3,zyg-1,0.0,3.0,1.600000,1.727273,102.0,17.250000,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,,,
5737,MTG,Caenorhabditis elegans,F09C3.2,zyg-1,0.0,9.0,2.393939,2.727273,117.0,23.912500,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,,,
5748,MTG,Caenorhabditis elegans,D1081.7,zyg-1,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,,,
5787,MTG,Caenorhabditis elegans,F59D12.1,zyg-1,0.0,10.0,2.769231,2.911111,106.0,22.370000,...,0.0,0.0,0.0,0.0,0.0,0.0,86692,,,


In [14]:
# Dropping these NA rows
dataset_w_ninfo.dropna(subset=['in_taxon','modifier_type_index'], inplace=True)

print(f"Do we now have proper indecies?")
print(f"Target genes: {dataset_w_ninfo['target_type_index'].unique().shape == dataset_w_ninfo['target_gene_symbol'].unique().shape}")
print(f"Modifier genes: {dataset_w_ninfo['modifier_type_index'].unique().shape == dataset_w_ninfo['modifier_gene_symbol'].unique().shape}")

Do we now have proper indecies?
Target genes: True
Modifier genes: True


### Adding target-modifier pairs as edges of Monarch

In [15]:
dataset_w_ninfo['modifier_type_index'] = dataset_w_ninfo['modifier_type_index'].astype(int)
dataset_w_ninfo[['target_type_index','modifier_type_index','is_modifier']]

Unnamed: 0,target_type_index,modifier_type_index,is_modifier
0,517533,523802,1.0
1,518104,521725,1.0
2,518104,551530,1.0
3,518104,555869,1.0
4,518106,537363,1.0
...,...,...,...
5790,86692,86202,0.0
5791,86692,82814,0.0
5792,86692,95948,0.0
5794,86692,94433,0.0


In [16]:
# dataset_w_ninfo.to_csv('./data/04_feature/2024-06-12-merged-dataset-nodeinfo.tsv', sep='\t', index=False)

In [49]:
# Convert DataFrame to tensor
dataset_arr = dataset_w_ninfo[['target_type_index','modifier_type_index','is_modifier']].to_numpy()
data_tensor = torch.from_numpy(dataset_arr).to(torch.int)
print(data_tensor)
print(data_tensor.shape)

tensor([[517533, 523802,      1],
        [518104, 521725,      1],
        [518104, 551530,      1],
        ...,
        [ 86692,  95948,      0],
        [ 86692,  94433,      0],
        [ 86692,  93262,      0]], dtype=torch.int32)
torch.Size([5541, 3])


In [32]:
# torch.save(data_t, './data/05_model_input/2024-06-12-merged-dataset-nodeinfo.pt')

<b>Get GNN Model<b> [Need GPU]

Loading pre-trained KG model

In [17]:
device = 'cpu' # to convert to cpu

In [18]:
# Top models
# Consolidated meptapaths: /home/rahit/projects/def-mtarailo/rahit/from_scratch/modspy-data/data/06_models/modspy-experiments/ray-reults/train_m2vec_2024-04-18_17-24-55/train_m2vec_19f34_00000_0_batch_size=512,embedding_dim=128,lr=0.0029,num_negative_samples=5,walk_length=21,walks_per_node=12_2024-04-18_17-25-05/checkpoint_000000
# entire elk: /home/rahit/projects/def-mtarailo/rahit/from_scratch/modspy-data/data/06_models/modspy-experiments/ray-reults/train_m2vec_2024-04-13_02-43-09/train_m2vec_178a4_00000_0_batch_size=256,embedding_dim=128,lr=0.0248,metapath=biolink_Gene_biolink_interacts_with_biolink_Gene_bio_2024-04-13_02-43-23/checkpoint_000000
# "/home/rahit/projects/def-mtarailo/rahit/from_scratch/modspy-data/data/06_models/modspy-experiments/ray-reults/train_m2vec_2024-04-18_17-24-55/train_m2vec_19f34_00000_0_batch_size=512,embedding_dim=128,lr=0.0029,num_negative_samples=5,walk_length=21,walks_per_node=12_2024-04-18_17-25-05/checkpoint_000000/checkpoint"
# precious_carp_5539: /home/rahit/projects/def-mtarailo/rahit/from_scratch/modspy-data/data/06_models/modspy-experiments/ray-reults/train_m2vec_2024-04-03_01-44-54/train_m2vec_4bf4d_00001_1_batch_size=128,embedding_dim=128,lr=0.0050,walk_length=12_2024-04-03_01-44-55/checkpoint_000000/checkpoint
# (active) hilarious_airway_5834: /home/rahit/projects/def-mtarailo/rahit/from_scratch/modspy-data/data/06_models/modspy-experiments/ray-reults/train_m2vec_2024-04-14_17-21-49/train_m2vec_016cc_00000_0_batch_size=32,embedding_dim=128,lr=0.0104,metapath=biolink_Gene_biolink_interacts_with_biolink_Gene_biol_2024-04-14_17-22-01/checkpoint_000000

chkpt_filepath = "/home/rahit/projects/def-mtarailo/rahit/from_scratch/modspy-data/data/06_models/modspy-experiments/ray-reults/train_m2vec_2024-04-14_17-21-49/train_m2vec_016cc_00000_0_batch_size=32,embedding_dim=128,lr=0.0104,metapath=biolink_Gene_biolink_interacts_with_biolink_Gene_biol_2024-04-14_17-22-01/checkpoint_000000/checkpoint"
# chkpt_filepath = "/home/rahit/projects/def-mtarailo/rahit/from_scratch/modspy-data/data/06_models/modspy-experiments/ray-reults/train_m2vec_2024-04-03_01-44-54/train_m2vec_4bf4d_00001_1_batch_size=128,embedding_dim=128,lr=0.0050,walk_length=12_2024-04-03_01-44-55/checkpoint_000000/checkpoint"

In [23]:
model = MetaPath2VecLightningModule.load_from_checkpoint(chkpt_filepath).to(device)
print(model.device)
print(model.eval())
graph = torch.load(model.hparams['network_filepath']).to(device)

Computing on cpu
True
1
Tesla P100-PCIE-12GB
 
        Metapath: [('biolink:Gene', 'biolink:interacts_with', 'biolink:Gene'), ('biolink:Gene', 'biolink:orthologous_to', 'biolink:Gene'), ('biolink:Gene', 'biolink:interacts_with', 'biolink:Gene')]
        Total nodes: 862115
        Total node types: 88

        Total edges: 11412471
        Total edge types: 289                
        
cpu
MetaPath2VecLightningModule(
  (model): MetaPath2Vec(559272, 128)
  (val_precision): BinaryPrecision()
  (val_recall): BinaryRecall()
)


<b>Creating dataset with node representation<b>

In [73]:
if type(data_tensor) is torch.tensor:
    data_tensor = data_tensor.detach().numpy()

X = np.concatenate((model.model.embedding(torch.tensor(data_tensor[:,0])).detach().cpu().numpy(), model.model.embedding(torch.tensor(data_tensor[:,1])).detach().cpu().numpy()), axis=1)
y = data_tensor[:,2]
assert X.shape[0] == y.shape[0]
print(X.shape)
print(dataset_w_ninfo.shape)

col_names = [f"emb_{i}" for i in range(0, X.shape[1])]
emb_df = pd.DataFrame(X.astype(np.float64), columns=col_names, dtype=np.float128)

# Droping index for alignment
dataset_w_ninfo = dataset_w_ninfo.reset_index(drop=True)
emb_df = emb_df.reset_index(drop=True)

dataset_w_emb = pd.concat([dataset_w_ninfo, emb_df], axis=1)
print(dataset_w_emb.shape)

print("Resultant array")
display(dataset_w_emb.loc[0,['emb_0','emb_1','emb_2','emb_3','emb_4']])
print("\nQuerying individual 1st node's embeding")
print(model.model.embedding(torch.tensor(data_tensor[0,0])).detach().cpu().numpy()[:5])
print("\n")

(5541, 256)
(5541, 77)
(5541, 333)
Resultant array


  X = np.concatenate((model.model.embedding(torch.tensor(data_tensor[:,0])).detach().cpu().numpy(), model.model.embedding(torch.tensor(data_tensor[:,1])).detach().cpu().numpy()), axis=1)


emb_0    0.185309
emb_1    0.150688
emb_2   -0.201403
emb_3   -0.201173
emb_4    0.227061
Name: 0, dtype: object


Querying individual 1st node's embeding
[ 0.18530875  0.15068765 -0.20140305 -0.20117337  0.22706136]




  print(model.model.embedding(torch.tensor(data_tensor[0,0])).detach().cpu().numpy()[:5])


In [74]:
# dataset_w_emb.to_csv('./data/05_model_input/2024-06-12-merged-dataset-with_node_embedding.tsv', sep='\t', index=False)

#### Creating ModifierDataset

In [26]:
from torch_geometric.data import HeteroData
from tqdm import tqdm

def verify_heterodata_construction(data: HeteroData, edges_ddf, node_ids):
    edge_type_to_chk = 'biolink:interacts_with'
    
    # Access the edge index for the relation
    edge_index = monarch['biolink:Gene', edge_type_to_chk, 'biolink:Gene'].edge_index    
    src, dest = edge_index
    
    for node_idx in tqdm(node_ids):
        graph_in_count = src[dest == node_idx].shape[0]
        graph_out_count = dest[src == node_idx].shape[0]
        
        orig_in_count = edges_df[(edges_df['object_id']==node_idx) & (edges_df['predicate']==edge_type_to_chk)]['id'].compute().shape[0]
        orig_out_count = edges_df[(edges_df['subject_id']==node_idx) & (edges_df['predicate']==edge_type_to_chk)]['id'].compute().shape[0]


        try:
            assert graph_in_count == orig_in_count
            assert graph_out_count == orig_out_count
        except AssertionError as e:
            print(f"AssertionError: {e}")
            print(f"{graph_in_count} != {orig_in_count}")
            print('or')
            print(f"{graph_out_count} != {orig_out_count}")

        # # Print results
        # print("Outgoing edges for node", node_idx, " (", outgoing_edges.shape[0] ,")" ":", outgoing_edges.tolist())
        # print("Incoming edges for node", node_idx, " (", incoming_edges.shape[0] ,")" ":", incoming_edges.tolist())
        

In [27]:
node_ids = list(set(data_t[:,0].tolist()))
print(len(node_ids))

153


In [28]:
verify_heterodata_construction(monarch, edges_df, node_ids)
print(f"Succesfully verified {len(node_ids)} genes in the network!")

  0%|          | 0/153 [00:00<?, ?it/s]

100%|██████████| 153/153 [12:20<00:00,  4.84s/it]

Succesfully verified 153 genes in the network!





In [60]:

from torch.utils.data import Dataset, DataLoader

class ModifierDataset(Dataset):
    def __init__(self, filepath: str = None):
        """
        Args:
            data (Tensor): A tensor containing node pairs and their similarity label.
                           Shape: [num_pairs, 3], where each row is (node1, node2, label).
        """
        self.data = torch.load(filepath)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        modifier, target, label = self.data[idx]
        return modifier, target, label

In [61]:
dataset = ModifierDataset("/home/rahit/projects/def-mtarailo/rahit/from_scratch/modspy-data/data/05_model_input/2024-03-31-merged-dataset.pt")
val_loader = DataLoader(dataset, batch_size=len(dataset), shuffle=False)

In [62]:
next(iter(dataset))

(tensor(516790, dtype=torch.int32),
 tensor(518908, dtype=torch.int32),
 tensor(1, dtype=torch.int32))

In [63]:
data = torch.load("/home/rahit/projects/def-mtarailo/rahit/from_scratch/modspy-data/data/05_model_input/2024-03-31-merged-dataset.pt")
np.unique(data.numpy())

array([     0,      1,  81868, ..., 558429, 558431, 558747], dtype=int32)