Notebook to write main project TFRecords for training

In [None]:
# Directory (relative to this notebook) designated for the TFRecord output files.
record_folder = "../data/records/"

In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import tensorflow as tf
import keras
import pysmiles
import json
import networkx as nx

Load and pre-process data

In [None]:
# Read the training table from parquet via pyarrow and convert it to a pandas
# DataFrame; the trailing bare expression displays it.
de_train_path = "../data/de_train.parquet"
de_data_train = pq.read_table(de_train_path).to_pandas()
de_data_train

In [None]:
# Spot-check: show the second entry of the distinct cell type labels.
de_data_train["cell_type"].unique()[1]

In [None]:
# Give every distinct cell type an integer id equal to its position in unique().
cellNameToInt = {
    cell_name: cell_id
    for cell_id, cell_name in enumerate(de_data_train["cell_type"].unique())
}

In [None]:
# Add the integer-encoded cell type as a new column on the training frame.
cell_type_ids = de_data_train["cell_type"].map(cellNameToInt)
de_data_train["cell_type_int"] = cell_type_ids

In [None]:
# Display the cell type -> integer id mapping.
cellNameToInt

In [None]:
# Gene columns are everything after the first five (non-gene) columns, minus
# the helper columns this notebook appends to the frame.
#
# The previous positional slice `columns[5:-2]` assumed two trailing helper
# columns. On a fresh top-to-bottom run only "cell_type_int" exists at this
# point ("cell_type_sm_pair" is added in a later cell), so the last gene column
# was silently dropped. Dropping the helpers by name gives the same result
# regardless of which of them have been added so far.
_derived_columns = ["cell_type_int", "cell_type_sm_pair"]
gene_names = de_data_train.columns[5:].drop(_derived_columns, errors="ignore")

## Divide into train and test

In [None]:
# Cell types for which every (cell_type, sm) pair goes into the training set.
train_only_cell_types = [
    "T cells CD4+",
    "T cells CD8+",
    "T regulatory cells",
]
# Cell types for which only part of the (cell_type, sm) pairs are used for
# training; the remainder is held out for testing.
train_and_test_cell_types = [
    "B cells",
    "Myeloid cells",
    "NK cells",
]

In [None]:
# Map each cell type to the distinct small molecules that appear with it.
unique_sm_per_cell_type = de_data_train.groupby("cell_type")["sm_name"].unique()
sm_names_by_cell_type = unique_sm_per_cell_type.to_dict()
# Small molecules available for B cells, one of the cell types with a reduced
# set of (cell_type, sm) pairs.
train_and_test_sm = sm_names_by_cell_type["B cells"]

In [None]:
# For the partially-held-out cell types, split each cell type's small molecules
# in two: the first half (rounded down) trains, the rest tests.
b_cell_sm       = sm_names_by_cell_type["B cells"]
myeloid_cell_sm = sm_names_by_cell_type["Myeloid cells"]
nk_cell_sm      = sm_names_by_cell_type["NK cells"]

num_b_sm       = len(b_cell_sm)
num_myeloid_sm = len(myeloid_cell_sm)
num_nk_sm      = len(nk_cell_sm)

# Index at which each list is cut.
b_cut       = num_b_sm // 2
myeloid_cut = num_myeloid_sm // 2
nk_cut      = num_nk_sm // 2

b_cell_train, b_cell_test             = b_cell_sm[:b_cut], b_cell_sm[b_cut:]
myeloid_cell_train, myeloid_cell_test = myeloid_cell_sm[:myeloid_cut], myeloid_cell_sm[myeloid_cut:]
nk_cell_train, nk_cell_test           = nk_cell_sm[:nk_cut], nk_cell_sm[nk_cut:]

In [None]:
# Start the training set with every small molecule of the train-only cell types.
training_combinations = {
    cell_type: sm_names_by_cell_type[cell_type]
    for cell_type in train_only_cell_types
}

In [None]:
# Add the training half of the partially-held-out cell types.
training_combinations.update({
    "B cells": b_cell_train,
    "Myeloid cells": myeloid_cell_train,
    "NK cells": nk_cell_train,
})

In [None]:
# Held-out small molecules, keyed by cell type.
testing_combinations = {
    "B cells": b_cell_test,
    "Myeloid cells": myeloid_cell_test,
    "NK cells": nk_cell_test,
}

In [None]:
# Display the cell type -> training small molecules mapping.
training_combinations

In [None]:
# Display the cell type -> held-out small molecules mapping.
testing_combinations

In [None]:
# Flatten both mappings into sets of "<cell_type>, <sm>" strings.
training_pairs = {
    cell_type + ", " + sm
    for cell_type, sm_names in training_combinations.items()
    for sm in sm_names
}

testing_pairs = {
    cell_type + ", " + sm
    for cell_type, sm_names in testing_combinations.items()
    for sm in sm_names
}

# Show a few training pairs as a sanity check.
list(training_pairs)[:10]

In [None]:
# Build the "<cell_type>, <sm_name>" key for every row so rows can be matched
# against the training/testing pair strings.
de_data_train["cell_type_sm_pair"] = de_data_train["cell_type"].str.cat(
    de_data_train["sm_name"], sep=", "
)

In [None]:
# Inspect one example pair: take its first row and slice away the first five
# columns and the last two.
example_pair_rows = de_data_train[de_data_train["cell_type_sm_pair"] == "T regulatory cells, FK 866"]
example_pair_rows.iloc[0].iloc[5:-2]

In [None]:
# Path to the gene sequence file. NOTE: despite the variable name, the file has
# a .jsonl extension and is parsed one JSON document per line further down.
sequences_csv = "../data/sequences.jsonl"

In [None]:
# Maps each record's "location" value to its DNA sequence string, or to None
# when the record carries no "seq" entry.
gene_symbol_to_dna = {}

missing = 0  # records without a sequence
total = 0    # records read

with open(sequences_csv, "r") as sequences_file:
    # One JSON document per line.
    for total, raw_line in enumerate(sequences_file, start=1):
        record = json.loads(raw_line)
        seq_data = record["seq_data"]
        if "seq" in seq_data:
            gene_symbol_to_dna[record["location"]] = seq_data["seq"]
        else:
            gene_symbol_to_dna[record["location"]] = None
            missing += 1

print(f"{missing} gene sequences missing out of {total}")

# Write Records

In [None]:
import GraphLayers

In [None]:
# Size parameters handed to GraphLayers.convertFromNetworkX by both graph
# builders below (molecule graphs and DNA graphs share them).
# NOTE(review): presumably upper bounds / padding sizes for nodes and edges and
# the feature embedding width — confirm against GraphLayers.
MAX_NODES = 8500
MAX_EDGES = 8500
EMBEDDING_DIM = 120

In [None]:
def smiles_to_graph(smiles_molecule):
    """Convert a SMILES string into the GraphLayers graph representation.

    The string is parsed with pysmiles (hydrogens kept explicit) into a
    NetworkX graph, which is then handed to GraphLayers.convertFromNetworkX
    using the node attribute "element" and the edge attribute "order".

    Args:
        smiles_molecule: SMILES string describing the molecule.

    Returns:
        Whatever GraphLayers.convertFromNetworkX returns for the parsed graph.
    """
    molecule_graph = pysmiles.read_smiles(smiles_molecule, explicit_hydrogen=True)

    # Positional arguments after the graph: size limits, then the node
    # attribute with its lookup, then the edge attribute with its lookup.
    conversion_args = (
        MAX_NODES,
        MAX_EDGES,
        EMBEDDING_DIM,
        "element",
        GraphLayers.atomic_num_to_int,
        "order",
        GraphLayers.bond_order_to_int,
    )
    return GraphLayers.convertFromNetworkX(molecule_graph, *conversion_args)

In [None]:
def dna_to_graph(sequence):
    """Convert a DNA sequence into the GraphLayers graph representation.

    The sequence becomes a path graph: node ``k`` carries the k-th character
    in its "symbol" attribute, and consecutive nodes are joined by an edge
    whose "edge_num" attribute is 0.

    Args:
        sequence: Iterable of sequence characters (e.g. a string).

    Returns:
        Whatever GraphLayers.convertFromNetworkX returns for the path graph.
    """
    chain = nx.Graph()

    # One node per position, labelled with the character at that position.
    chain.add_nodes_from(
        (position, {"symbol": base}) for position, base in enumerate(sequence)
    )
    # Link every position to its predecessor; all edges share edge_num=0.
    chain.add_edges_from(
        ((position - 1, position) for position in range(1, chain.number_of_nodes())),
        edge_num=0,
    )

    return GraphLayers.convertFromNetworkX(
        chain,
        MAX_NODES,
        MAX_EDGES,
        EMBEDDING_DIM,
        "symbol",
        GraphLayers.dna_symbol_to_int,
        "edge_num",
        {0: 0},
    )

In [None]:
# Pre-compute the graph representation of every gene's DNA sequence.
#
# The sequence loader stores None for genes whose record has no sequence, and a
# gene may also be absent from the sequence file entirely. Passing None to
# dna_to_graph raises TypeError (it iterates the sequence), and a plain lookup
# raises KeyError, so such genes are mapped to None here instead — the same
# convention gene_symbol_to_dna uses — and their count is reported.
gene_symbol_to_graph = {}
genes_without_sequence = []

for gene_name in gene_names:
    dna_sequence = gene_symbol_to_dna.get(gene_name)
    if dna_sequence is None:
        gene_symbol_to_graph[gene_name] = None
        genes_without_sequence.append(gene_name)
        continue
    gene_symbol_to_graph[gene_name] = dna_to_graph(dna_sequence)

print(f"{len(genes_without_sequence)} of {len(gene_names)} genes have no DNA sequence; their graph is None")

In [None]:
def write_example_for_pair(pair_name, data_frame, training=True):
    """Gather the inputs for one (cell_type, sm) pair.

    NOTE: work in progress. The function currently builds the molecule graph
    and looks up each gene's DNA sequence, but writes nothing and returns None.

    Args:
        pair_name: Key of the form "<cell_type>, <sm_name>", matched against
            the "cell_type_sm_pair" column.
        data_frame: Frame holding the "cell_type_sm_pair" and "SMILES" columns.
        training: Currently unused.

    Raises:
        IndexError: If no row of ``data_frame`` matches ``pair_name``.
        KeyError: If a gene in ``gene_names`` has no entry in
            ``gene_symbol_to_dna``.
    """
    is_requested_pair = data_frame["cell_type_sm_pair"] == pair_name
    # Only the first matching row is used.
    pair_row = data_frame[is_requested_pair].iloc[0]

    # smiles_to_graph is expected to yield exactly six components.
    (mol_ver, mol_edj, mol_uni,
     mol_am, mol_conn, mol_edgeAdj) = smiles_to_graph(pair_row["SMILES"])

    for gene_name in gene_names:
        dna_sequence = gene_symbol_to_dna[gene_name]

In [None]:
# Smoke-test the function on a single known training pair.
write_example_for_pair("T regulatory cells, FK 866", de_data_train, training=True)