Take sample from KG file. 

In [6]:
import pandas as pd
import random

# Load the TSV file
df = pd.read_csv('drkg.tsv', sep='\t')

# Check if there are at least 10,000 records
if len(df) < 10000:
    print("The file has fewer than 10,000 records.")
else:
    # Randomly select 10,000 records
    df_sample = df.sample(n=10000, random_state=random.randint(1, 100))

    # Save the sample to a new TSV file
    df_sample.to_csv('sample_drkg.tsv', sep='\t', index=False)


Adding rules to KG

In [7]:
import pandas as pd

# Define your rules
rules = {
    "DRUGBANK::target::Compound:Gene": "bioarx::DrugHumGen:Compound:Gene",
    "DRUGBANK::carrier::Compound:Gene": "bioarx::DrugHumGen:Compound:Gene",
    "DRUGBANK::enzyme::Compound:Gene": "Hetionet::CbG::Compound:Gene",
    "DRUGBANK::enzyme::Compound:Gene": "bioarx::DrugHumGen:Compound:Gene",
    "Hetionet::CbG::Compound:Gene": "bioarx::DrugHumGen:Compound:Gene",
    "GNBR::V+::Gene:Gene": "GNBR::V+::Gene:Gene",
    "GNBR::I::Gene:Gene": "GNBR::I::Gene:Gene",
    "GNBR::Rg::Gene:Gene": "GNBR::Rg::Gene:Gene",
    "STRING::EXPRESSION::Gene:Gene": "STRING::EXPRESSION::Gene:Gene",
    # Add all other rules
}

# Read the triples from the file
df = pd.read_csv('sample_drkg.tsv', sep='\t', header=None, names=['head', 'relation', 'tail'])

# Apply the rules to generate new triples
new_triples = []
for idx, row in df.iterrows():
    if row['relation'] in rules:
        new_relation = rules[row['relation']]
        new_triples.append([row['head'], new_relation, row['tail']])

# Convert the list of new triples to a DataFrame and concatenate it with the original DataFrame
new_triples_df = pd.DataFrame(new_triples, columns=['head', 'relation', 'tail'])
df = pd.concat([df, new_triples_df])




TransE + Rules

In [9]:
import torch
from pykeen.models import TransE
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline

# Create a triples factory from the DataFrame
triples_factory = TriplesFactory.from_labeled_triples(df.values)

# Train the model using the stochastic local closed world assumption training approach
result = pipeline(
    model=TransE,
    training=triples_factory,
    testing=triples_factory,  # Here we're testing on the training set, but you should split your data
    model_kwargs=dict(embedding_dim=50),  # Choose this carefully
    training_kwargs=dict(num_epochs=100),  # Choose this carefully
    random_seed=1234,
    device='cuda',  # use 'cuda' for GPU
)

# After training, the model's embeddings can be accessed with
model = result.model
print(model)

INFO:pykeen.pipeline.api:Using device: cuda
Training epochs on cpu: 100%|██████████| 100/100 [00:24<00:00,  4.09epoch/s, loss=0.00604, prev_loss=0.00456]
INFO:pykeen.evaluation.evaluator:Currently automatic memory optimization only supports GPUs, but you're using a CPU. Therefore, the batch_size will be set to the default value.
INFO:pykeen.evaluation.evaluator:No evaluation batch_size provided. Setting batch_size to '32'.
Evaluating on cpu: 100%|██████████| 10.1k/10.1k [00:40<00:00, 250triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 40.37s seconds


TransE(
  (loss): MarginRankingLoss(
    (margin_activation): ReLU()
  )
  (interaction): TransEInteraction()
  (entity_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(10842, 50)
    )
  )
  (relation_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(87, 50)
    )
  )
  (weight_regularizers): ModuleList()
)


In [11]:
print(model)


TransE(
  (loss): MarginRankingLoss(
    (margin_activation): ReLU()
  )
  (interaction): TransEInteraction()
  (entity_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(10842, 50)
    )
  )
  (relation_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(87, 50)
    )
  )
  (weight_regularizers): ModuleList()
)


TransE (no rules)

In [13]:
# Read the triples from the file
df = pd.read_csv('sample_drkg.tsv', sep='\t', header=None, names=['head', 'relation', 'tail'])

# Create a triples factory from the DataFrame
triples_factory = TriplesFactory.from_labeled_triples(df.values)

# Train the model using the stochastic local closed world assumption training approach
result = pipeline(
    model=TransE,
    training=triples_factory,
    testing=triples_factory,  # Here we're testing on the training set, but you should split your data
    model_kwargs=dict(embedding_dim=50),  # Choose this carefully
    training_kwargs=dict(num_epochs=100),  # Choose this carefully
    random_seed=1234,
    device='cuda',  # use 'cuda' for GPU
)

# After training, the model's embeddings can be accessed with
model_B = result.model
print(model_B)

INFO:pykeen.pipeline.api:Using device: cuda
Training epochs on cpu: 100%|██████████| 100/100 [00:22<00:00,  4.47epoch/s, loss=0.00547, prev_loss=0.00586]
INFO:pykeen.evaluation.evaluator:Currently automatic memory optimization only supports GPUs, but you're using a CPU. Therefore, the batch_size will be set to the default value.
INFO:pykeen.evaluation.evaluator:No evaluation batch_size provided. Setting batch_size to '32'.
Evaluating on cpu: 100%|██████████| 10.0k/10.0k [00:40<00:00, 248triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 40.36s seconds


TransE(
  (loss): MarginRankingLoss(
    (margin_activation): ReLU()
  )
  (interaction): TransEInteraction()
  (entity_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(10842, 50)
    )
  )
  (relation_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(87, 50)
    )
  )
  (weight_regularizers): ModuleList()
)


In [14]:
print(model_B)

TransE(
  (loss): MarginRankingLoss(
    (margin_activation): ReLU()
  )
  (interaction): TransEInteraction()
  (entity_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(10842, 50)
    )
  )
  (relation_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(87, 50)
    )
  )
  (weight_regularizers): ModuleList()
)
