In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import tensorflow as tf
import keras
import pysmiles
import json
import networkx as nx
import random

In [None]:
import logging
# Raise the 'pysmiles' logger threshold to CRITICAL so its lower-severity messages are not emitted.
logging.getLogger('pysmiles').setLevel(logging.CRITICAL)

# Load and pre-process data

In [None]:
# Read the training parquet file through pyarrow and convert it to a pandas DataFrame.
de_data_train = pq.read_table("../data/de_train.parquet").to_pandas()
# Bare expression: the notebook renders the frame as this cell's output.
de_data_train

In [None]:
# Exploratory check: show the second distinct value (index 1) of the "cell_type" column.
de_data_train["cell_type"].unique()[1]

In [None]:
# Map each distinct cell type name to an integer code, numbered in the order
# unique() returns them. unique() is evaluated once here rather than once per
# element (and once more for the length), which is what the indexed form did.
cellNameToInt = {name: code for code, name in enumerate(de_data_train["cell_type"].unique())}

In [None]:
# Add an integer-coded copy of "cell_type" as a new column (this appends one column to the frame).
de_data_train["cell_type_int"] = de_data_train["cell_type"].map(cellNameToInt)

In [None]:
# Display the cell type -> integer mapping.
cellNameToInt

In [None]:
# Gene columns: everything after the first five columns (assumed, as the
# original slice did, to be non-gene metadata), minus the helper columns this
# notebook appends to the frame. Dropping the helpers by name instead of
# slicing with a fixed "-2" gives the same result no matter how many of those
# helper cells have run; with only "cell_type_int" present, "[5:-2]" would
# also cut off the last real gene column.
gene_names = de_data_train.columns[5:].drop(["cell_type_int", "cell_type_sm_pair"], errors="ignore")

## Divide into train and test

In [None]:
# Cell types for which every (cell_type, sm) pair is used for training
train_only_cell_types     = ["T cells CD4+", "T cells CD8+", "T regulatory cells"]
# Cell types for which only some (cell_type, sm) pairs are used for training;
# the remaining pairs of these cell types form the test set
train_and_test_cell_types = ["B cells", "Myeloid cells", "NK cells"]

In [None]:
# Create a dict mapping cell_name -> the distinct sm_name values recorded for that cell_name
# (each value is whatever unique() returns for that group)
sm_names_by_cell_type = de_data_train.groupby("cell_type")["sm_name"].unique().to_dict()
# Small molecules given for the cell types with a reduced set of (cell_type, sm) pairs.
# Taken from "B cells" only.
# NOTE(review): assumes the other train/test cell types share this molecule set — confirm.
train_and_test_sm = sm_names_by_cell_type["B cells"]

In [None]:
# For the cell types that are only partly used for training, split each cell
# type's small-molecule list at its midpoint: the first half goes to training,
# the rest to testing.
def _split_in_half(sm_names):
    """Return (count, first half, remainder) of sm_names, cut at count // 2."""
    count = len(sm_names)
    midpoint = count // 2
    return count, sm_names[:midpoint], sm_names[midpoint:]


num_b_sm, b_cell_train, b_cell_test = _split_in_half(sm_names_by_cell_type["B cells"])
num_myeloid_sm, myeloid_cell_train, myeloid_cell_test = _split_in_half(sm_names_by_cell_type["Myeloid cells"])
num_nk_sm, nk_cell_train, nk_cell_test = _split_in_half(sm_names_by_cell_type["NK cells"])

In [None]:
# Train-only cell types contribute every small molecule recorded for them.
training_combinations = {
    cell_type: sm_names_by_cell_type[cell_type]
    for cell_type in train_only_cell_types
}

In [None]:
# Add the training halves of the cell types that are split between train and test.
training_combinations.update({
    "B cells": b_cell_train,
    "Myeloid cells": myeloid_cell_train,
    "NK cells": nk_cell_train,
})

In [None]:
# Test set: the held-out halves of the split cell types.
testing_combinations = {
    "B cells": b_cell_test,
    "Myeloid cells": myeloid_cell_test,
    "NK cells": nk_cell_test,
}

In [None]:
# Display the cell type -> training small molecules mapping.
training_combinations

In [None]:
# Display the cell type -> test small molecules mapping.
testing_combinations

In [None]:
# Flatten the cell type -> small molecules mappings into sets of
# "cell_type, sm" labels.
def _to_pair_labels(combinations):
    """Return the set of "cell_type, sm" labels for a cell_type -> sm list mapping."""
    return {
        cell_type + ", " + sm
        for cell_type, sm_list in combinations.items()
        for sm in sm_list
    }


training_pairs = _to_pair_labels(training_combinations)
testing_pairs = _to_pair_labels(testing_combinations)

# Show a sample of ten training labels.
list(training_pairs)[:10]

In [None]:
# Add a "cell_type, sm_name" label column (this appends one column to the frame).
# The format matches the labels stored in training_pairs / testing_pairs.
de_data_train["cell_type_sm_pair"] = de_data_train["cell_type"]+", "+de_data_train["sm_name"]

In [None]:
# Exploratory check: take the first row matching one pair and show its values from
# position 5 up to (not including) the last two positions. In a top-to-bottom run
# those last two positions are the helper columns "cell_type_int" and
# "cell_type_sm_pair" added earlier. Raises IndexError if no row matches the label.
de_data_train[de_data_train["cell_type_sm_pair"] == "T regulatory cells, FK 866"].iloc[0][5:-2]

In [None]:
# Path to the gene sequence records. NOTE: despite the "_csv" suffix in the name,
# the path points at a JSON Lines file (".jsonl").
sequences_csv = "../data/sequences.jsonl"

In [None]:
# Lookup tables keyed by each record's "location" field:
#   gene_symbol_to_dna -> the record's sequence ("" when the record has none)
#   gene_symbol_to_id  -> zero-based position of the record in the file
gene_symbol_to_dna = {}
gene_symbol_to_id = {}

# Counters: records without a sequence, and records read overall.
missing = 0
total = 0

with open(sequences_csv, "r") as sequences_file:
    for raw_record in sequences_file:
        # One JSON document per line.
        record = json.loads(raw_record)
        seq_data = record["seq_data"]
        symbol = record["location"]

        if "seq" in seq_data:
            gene_symbol_to_dna[symbol] = seq_data["seq"]
        else:
            # Keep the key present so later lookups succeed, but count the gap.
            gene_symbol_to_dna[symbol] = ""
            missing += 1

        gene_symbol_to_id[symbol] = total
        total += 1

print(f"{missing} gene sequences missing out of {total}")

In [None]:
# Make sure every gene column has an entry in both lookup tables. Genes absent
# from the sequence file get an empty sequence and a fresh id that continues
# after the ids assigned while reading the file.
not_found = 0
idx = total

for gene_name in gene_names:
    if gene_name not in gene_symbol_to_dna:
        not_found += 1
        gene_symbol_to_dna[gene_name] = ""
        gene_symbol_to_id[gene_name] = idx
        idx += 1

# Report the share of the genes we looked up, not of the records in the
# sequence file: dividing by `total` gave a misleading percentage and raised
# ZeroDivisionError when the sequence file was empty.
num_genes = len(gene_names)
not_found_pct = (not_found / num_genes) * 100 if num_genes else 0.0

print(f"{not_found} ({not_found_pct}%) genes not found in data.")

# Create Dataset Generator

In [None]:
import GraphLayers

In [None]:
# Fixed sizes passed to GraphLayers.convertFromNetworkX and used for the shapes
# of the model inputs and of the tf.data dataset features.
MAX_NODES = 150
MAX_EDGES = 200
EMBEDDING_DIM = 120

# NOTE(review): not referenced in the visible notebook code; presumably the
# length of the longest DNA sequence — confirm.
MAX_DNA_LEN = 2473539

In [None]:
def smiles_to_graph(smiles_molecule):
    """Turn a SMILES string into the padded graph representation used by the model.

    The string is parsed by pysmiles (with explicit hydrogens) and the resulting
    graph is handed to GraphLayers.convertFromNetworkX together with the
    module-level MAX_NODES, MAX_EDGES and EMBEDDING_DIM sizes.

    Args:
        smiles_molecule: SMILES string describing the molecule.

    Returns:
        Whatever GraphLayers.convertFromNetworkX returns; the caller in this
        notebook unpacks it into six values.
    """
    molecule_graph = pysmiles.read_smiles(smiles_molecule, explicit_hydrogen=True)
    return GraphLayers.convertFromNetworkX(
        molecule_graph, MAX_NODES, MAX_EDGES, EMBEDDING_DIM
    )

In [None]:
def generate_examples(pair_names, max_genes=3):
    """Yield one (features, target) example per gene for each requested pair.

    Args:
        pair_names: Iterable of "cell_type, sm_name" labels matching the
            "cell_type_sm_pair" column of de_data_train. Items may be bytes
            (decoded before the lookup) or str.
        max_genes: How many leading entries of gene_names to emit per pair.
            Defaults to 3, the limit that used to be hard-coded; pass None to
            emit every gene.

    Yields:
        Tuples (example, differential_expression). `example` is a dict with the
        six values returned by smiles_to_graph for the pair's SMILES string
        ("mol_ver", "mol_edj", "mol_uni", "mol_am", "mol_conn", "mol_edgeAdj")
        plus "gene_id", "dna_seq" and "cell_type"; `differential_expression`
        is the pair's value in that gene's column.

    Raises:
        IndexError: If no row of de_data_train matches a pair label.
    """
    for pair_name in pair_names:
        # When driven by tf.data.Dataset.from_generator the labels arrive as
        # bytes; accept plain str too so the generator can be called directly.
        pair_label = pair_name.decode() if isinstance(pair_name, bytes) else pair_name
        pair_data = de_data_train[de_data_train["cell_type_sm_pair"] == pair_label].iloc[0]

        cell_type = pair_data["cell_type_int"]

        # The molecule graph depends only on the pair, so build it once and
        # reuse it for every gene of that pair.
        molecule_smiles = pair_data["SMILES"]
        mol_ver, mol_edj, mol_uni, mol_am, mol_conn, mol_edgeAdj = smiles_to_graph(molecule_smiles)

        for gene_name in gene_names[:max_genes]:
            gene_id = gene_symbol_to_id[gene_name]
            dna_sequence = gene_symbol_to_dna[gene_name]

            differential_expression = pair_data[gene_name]

            example = {
                "mol_ver": mol_ver,
                "mol_edj": mol_edj,
                "mol_uni": mol_uni,
                "mol_am": mol_am,
                "mol_conn": mol_conn,
                "mol_edgeAdj": mol_edgeAdj,
                "gene_id": gene_id,
                "dna_seq": dna_sequence,
                "cell_type": cell_type,
            }

            yield example, differential_expression

In [None]:
from GraphLayers import *

In [None]:
def build_model(params):
    """Assemble the Keras model described by a hyperparameter dict.

    The six molecule-graph inputs go through params["graph_layers"] rounds of
    params["pool_steps"] PoolStep layers followed by one GraphUpdate. A final
    PoolStep with only p_vu and p_eu enabled follows, and element 2 of its
    output is concatenated with the gene id and cell type inputs and passed
    through the dense head.

    Args:
        params: Dict providing "graph_layers", "pool_steps", the eight
            "step_{k}_pve" ... "step_{k}_peu" flags for every k below
            "pool_steps", "embedding_dim", "update_function_depth" and
            "num_final_layers".

    Returns:
        A Model with inputs named "mol_ver", "mol_edj", "mol_uni", "mol_am",
        "mol_conn", "mol_edgeAdj", "gene_id" and "cell_type", and a single
        Dense(1) output.
    """
    # Model inputs; shapes come from the module-level size constants.
    mol_ver = Input(shape=(MAX_NODES, EMBEDDING_DIM), name="mol_ver")
    mol_edj = Input(shape=(MAX_EDGES, EMBEDDING_DIM), name="mol_edj")
    mol_uni = Input(shape=(EMBEDDING_DIM,), name="mol_uni")
    mol_am = Input(shape=(MAX_NODES, MAX_NODES), name="mol_am")
    mol_conn = Input(shape=(MAX_NODES, MAX_EDGES), name="mol_conn")
    mol_edge_adj = Input(shape=(MAX_EDGES, MAX_EDGES), name="mol_edgeAdj")
    gene_id = Input(shape=(1,), name="gene_id")
    cell_type = Input(shape=(1,), name="cell_type")

    # Suffixes of the per-step flags, in the positional order PoolStep is called with.
    pool_flag_suffixes = ("pve", "pee", "pue", "pvv", "pev", "puv", "pvu", "peu")

    graph_state = [mol_ver, mol_edj, mol_uni, mol_am, mol_conn, mol_edge_adj]

    for _ in range(params["graph_layers"]):
        for step in range(params["pool_steps"]):
            step_flags = [params[f"step_{step}_{suffix}"] for suffix in pool_flag_suffixes]
            graph_state = PoolStep(*step_flags)(graph_state)
        dim = params["embedding_dim"]
        graph_state = GraphUpdate(dim, dim, dim, params["update_function_depth"])(graph_state)

    # Final pooling step with only the p_vu and p_eu flags switched on.
    graph_state = PoolStep(
        p_ve=False,
        p_ee=False,
        p_ue=False,
        p_vv=False,
        p_ev=False,
        p_uv=False,
        p_vu=True,
        p_eu=True,
    )(graph_state)

    # Element 2 of the graph state is combined with the two scalar inputs.
    head = Concatenate()([graph_state[2], gene_id, cell_type])

    for _ in range(params["num_final_layers"]):
        head = Dense(4, activation="relu")(head)

    prediction = Dense(1)(head)

    return Model(
        inputs=[mol_ver, mol_edj, mol_uni, mol_am, mol_conn, mol_edge_adj, gene_id, cell_type],
        outputs=prediction,
    )

In [None]:
def generate_parameter_set():
    """Draw one random hyperparameter configuration.

    Returns:
        Dict with "graph_layers", "pool_steps", "update_function_depth", eight
        boolean "step_{k}_p.." flags for every k below "pool_steps",
        "embedding_dim", "num_final_layers", "optimizer", "learning_rate"
        (fixed per optimizer) and "batch_size" (always 4).
    """
    # Dict literals evaluate their values top to bottom, so the random draws
    # happen in the order written.
    params = {
        "graph_layers": random.randint(1, 5),
        "pool_steps": random.randint(0, 5),
        "update_function_depth": random.randint(1, 5),
    }

    # One independent on/off flag per pooling direction and pooling step.
    flag_suffixes = ("pve", "pee", "pue", "pvv", "pev", "puv", "pvu", "peu")
    for step in range(params["pool_steps"]):
        for suffix in flag_suffixes:
            params[f"step_{step}_{suffix}"] = random.choice([True, False])

    # Degenerate range: the embedding dimension is currently always 100.
    params["embedding_dim"] = random.randint(100, 100)
    params["num_final_layers"] = random.randint(1, 5)

    params["optimizer"] = random.choice(["RMSProp", "Adam", "SGD"])

    # The learning rate is not sampled; each optimizer gets a fixed value.
    fixed_learning_rates = {"RMSProp": 0.001, "Adam": 0.001, "SGD": 0.01}
    params["learning_rate"] = fixed_learning_rates[params["optimizer"]]

    params["batch_size"] = 4

    return params

In [None]:
# Sample a hyperparameter set and build the matching model.
params = generate_parameter_set()

model = build_model(params)

# Map the optimizer name stored in params to its Keras optimizer class.
optimizer_classes = {
    "RMSProp": tf.keras.optimizers.RMSprop,
    "Adam": tf.keras.optimizers.Adam,
    "SGD": tf.keras.optimizers.SGD,
}
optimizer = optimizer_classes[params["optimizer"]](params["learning_rate"])

# Regression target, so train against mean squared error.
model.compile(optimizer=optimizer, loss=tf.keras.losses.MeanSquaredError())

In [None]:
# Dtype and shape of every feature generate_examples yields. Graph tensors use
# the module-level size constants; gene id, DNA sequence and cell type are
# scalars.
feature_types = {
    "mol_ver": tf.float32,
    "mol_edj": tf.float32,
    "mol_uni": tf.float32,
    "mol_am": tf.float32,
    "mol_conn": tf.float32,
    "mol_edgeAdj": tf.float32,
    "gene_id": tf.float32,
    "dna_seq": tf.string,
    "cell_type": tf.float32,
}
feature_shapes = {
    "mol_ver": (MAX_NODES, EMBEDDING_DIM,),
    "mol_edj": (MAX_EDGES, EMBEDDING_DIM,),
    "mol_uni": (EMBEDDING_DIM,),
    "mol_am": (MAX_NODES, MAX_NODES,),
    "mol_conn": (MAX_NODES, MAX_EDGES,),
    "mol_edgeAdj": (MAX_EDGES, MAX_EDGES,),
    "gene_id": (),
    "dna_seq": (),
    "cell_type": (),
}

# Stream (features, target) examples for the training pairs; the target is a
# scalar float.
ds = tf.data.Dataset.from_generator(
    generate_examples,
    args=[list(training_pairs)],
    output_types=(feature_types, tf.float32),
    output_shapes=(feature_shapes, ()),
)

# Batch with the sampled hyperparameter. The batch size used to be hard-coded
# to 10, which silently ignored params["batch_size"].
batched_ds = ds.batch(params["batch_size"])

In [None]:
# Train the compiled model on the batched dataset. No epochs or validation
# arguments are passed, so Keras' defaults for fit() apply.
model.fit(batched_ds)