Notebook to write main project TFRecords for training

In [None]:
# Folder intended for the written TFRecord files (not referenced again in the visible cells).
record_folder = "../data/records/"

In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import tensorflow as tf
import keras
import pysmiles
import json
import networkx as nx

# Load and pre-process data

In [None]:
# Read the differential-expression training table from Parquet into a pandas DataFrame.
de_data_train = pq.read_table("../data/de_train.parquet").to_pandas()
# Bare last expression: renders the frame as the cell output.
de_data_train

In [None]:
# Quick look at the second distinct value of the "cell_type" column.
de_data_train["cell_type"].unique()[1]

In [None]:
# Give every distinct cell type an integer id, numbered by its position in unique().
cellNameToInt = {
    cell_name: cell_index
    for cell_index, cell_name in enumerate(de_data_train["cell_type"].unique())
}

In [None]:
# Add an integer-encoded copy of the cell type using cellNameToInt.
# NOTE: this appends a column to the frame, which the positional [5:-2] slices
# elsewhere in this notebook have to account for.
de_data_train["cell_type_int"] = de_data_train["cell_type"].map(cellNameToInt)

In [None]:
# Display the cell type -> integer id mapping.
cellNameToInt

In [None]:
# Gene columns are everything after the five leading columns, minus the helper
# columns this notebook appends to the frame ("cell_type_int" above and
# "cell_type_sm_pair" further down). The helpers are dropped by name rather
# than with a positional [5:-2] slice: on a fresh top-to-bottom run only one
# helper exists when this cell executes, so slicing off two trailing columns
# would silently discard a real gene column as well.
# NOTE(review): assumes the first five columns are non-gene metadata, as the
# original slice did — confirm against the parquet schema.
gene_names = de_data_train.columns[5:].drop(
    ["cell_type_int", "cell_type_sm_pair"], errors="ignore"
)

## Divide into train and test

In [None]:
# Every (cell_type, sm) pair of these cell types goes into the training set.
train_only_cell_types = [
    "T cells CD4+",
    "T cells CD8+",
    "T regulatory cells",
]
# Only part of the (cell_type, sm) pairs of these cell types goes into training;
# the rest is held out for testing.
train_and_test_cell_types = [
    "B cells",
    "Myeloid cells",
    "NK cells",
]

In [None]:
# Map each cell type to the array of distinct small-molecule names recorded for it.
unique_sm_per_cell_type = de_data_train.groupby("cell_type")["sm_name"].unique()
sm_names_by_cell_type = unique_sm_per_cell_type.to_dict()
# Small molecules given for B cells, one of the cell types with a reduced set of (cell_type, sm) pairs.
train_and_test_sm = sm_names_by_cell_type["B cells"]

In [None]:
# For the cell types that are only partly used for training, split their small
# molecules in two: the first half (in stored order) is used for training and
# the remainder for testing.
b_cell_sm       = sm_names_by_cell_type["B cells"]
myeloid_cell_sm = sm_names_by_cell_type["Myeloid cells"]
nk_cell_sm      = sm_names_by_cell_type["NK cells"]

num_b_sm       = len(b_cell_sm)
num_myeloid_sm = len(myeloid_cell_sm)
num_nk_sm      = len(nk_cell_sm)

b_cell_train,       b_cell_test       = b_cell_sm[:num_b_sm // 2],             b_cell_sm[num_b_sm // 2:]
myeloid_cell_train, myeloid_cell_test = myeloid_cell_sm[:num_myeloid_sm // 2], myeloid_cell_sm[num_myeloid_sm // 2:]
nk_cell_train,      nk_cell_test      = nk_cell_sm[:num_nk_sm // 2],           nk_cell_sm[num_nk_sm // 2:]

In [None]:
# Start the training combinations with every small molecule of the train-only cell types.
training_combinations = {
    cell_type: sm_names_by_cell_type[cell_type]
    for cell_type in train_only_cell_types
}

In [None]:
# Add the training share of the cell types that are split between train and test.
training_combinations.update({
    "B cells": b_cell_train,
    "Myeloid cells": myeloid_cell_train,
    "NK cells": nk_cell_train,
})

In [None]:
# Held-out small molecules for each cell type that is split between train and test.
testing_combinations = {
    "B cells": b_cell_test,
    "Myeloid cells": myeloid_cell_test,
    "NK cells": nk_cell_test,
}

In [None]:
# Display the cell type -> training small molecules mapping.
training_combinations

In [None]:
# Display the cell type -> held-out small molecules mapping.
testing_combinations

In [None]:
# Flatten both mappings into sets of "<cell_type>, <sm>" strings, the same
# format used for the "cell_type_sm_pair" column.
training_pairs = {
    cell_type + ", " + sm
    for cell_type, sm_list in training_combinations.items()
    for sm in sm_list
}

testing_pairs = {
    cell_type + ", " + sm
    for cell_type, sm_list in testing_combinations.items()
    for sm in sm_list
}

# Peek at a few of the training pairs.
list(training_pairs)[:10]

In [None]:
# Add a "<cell_type>, <sm_name>" key column, matching the strings stored in
# training_pairs / testing_pairs. This appends another column to the frame.
de_data_train["cell_type_sm_pair"] = de_data_train["cell_type"]+", "+de_data_train["sm_name"]

In [None]:
# Spot-check one pair: take its first matching row, then skip the first five
# fields and the last two (presumably leaving only the per-gene values — this
# relies on both helper columns having been appended already).
de_data_train[de_data_train["cell_type_sm_pair"] == "T regulatory cells, FK 866"].iloc[0][5:-2]

In [None]:
# Path to the gene sequence file. Despite the variable name this is a JSON-lines
# file: it is parsed one line at a time with json.loads when it is read.
sequences_csv = "../data/sequences.jsonl"

In [None]:
# Build two lookups keyed by each record's "location" field:
#   gene_symbol_to_dna -> the record's sequence string ("" when it has no "seq")
#   gene_symbol_to_id  -> zero-based position of the record in the file
gene_symbol_to_dna = {}
gene_symbol_to_id = {}

missing = 0  # records without a "seq" entry
total = 0    # records read so far

with open(sequences_csv, "r") as sequences_file:
    for record_index, raw_line in enumerate(sequences_file):
        record = json.loads(raw_line)
        seq_data = record["seq_data"]
        if "seq" in seq_data:
            gene_symbol_to_dna[record["location"]] = seq_data["seq"]
        else:
            gene_symbol_to_dna[record["location"]] = ""
            missing += 1
        gene_symbol_to_id[record["location"]] = record_index
        total = record_index + 1

print(f"{missing} gene sequences missing out of {total}")

In [None]:
# Give every gene column that has no sequence record an empty sequence and a
# fresh id, continuing the numbering after the ids assigned to sequence records.
not_found = 0
idx = total

for gene_name in gene_names:
    if gene_name not in gene_symbol_to_dna:
        not_found += 1
        gene_symbol_to_dna[gene_name] = ""
        gene_symbol_to_id[gene_name] = idx
        idx += 1

# Report the share relative to the genes that were looked up (not the number of
# sequence records, which is a different population), and avoid dividing by
# zero when there are no gene columns.
num_genes_checked = len(gene_names)
not_found_pct = (not_found / num_genes_checked) * 100 if num_genes_checked else 0.0
print(f"{not_found} ({not_found_pct}%) genes not found in data.")

# Write Records

In [None]:
import GraphLayers

In [None]:
# Limits handed to GraphLayers.convertFromNetworkX when a molecule is converted
# in smiles_to_graph.
MAX_NODES = 150
MAX_EDGES = 200
EMBEDDING_DIM = 120

# Target length for integer-encoded DNA sequences; presumably the longest
# sequence length found by the (commented-out) max-length scan — TODO confirm.
MAX_DNA_LEN = 2473539

In [None]:
def smiles_to_graph(smiles_molecule):
    """Parse a SMILES string and convert it with GraphLayers.

    The string is read by ``pysmiles.read_smiles`` with
    ``explicit_hydrogen=True`` and the resulting graph is passed to
    ``GraphLayers.convertFromNetworkX`` together with the notebook-wide
    ``MAX_NODES``, ``MAX_EDGES`` and ``EMBEDDING_DIM`` settings.

    Args:
        smiles_molecule: SMILES representation of the molecule.

    Returns:
        Whatever ``GraphLayers.convertFromNetworkX`` returns for the graph.
    """
    molecule_graph = pysmiles.read_smiles(smiles_molecule, explicit_hydrogen=True)
    converted = GraphLayers.convertFromNetworkX(
        molecule_graph, MAX_NODES, MAX_EDGES, EMBEDDING_DIM
    )
    return converted

In [None]:
def dna_to_int_seq(dna, max_len):
    """Encode a DNA string as a list of ints, zero-padded up to ``max_len``.

    Symbols are mapped C=1, A=2, T=3, G=4, N=5; 0 is the padding value.
    Sequences longer than ``max_len`` are returned in full (no truncation).

    Args:
        dna: Iterable of upper-case symbols drawn from "CATGN".
        max_len (int): Minimum length of the returned list.

    Returns:
        list[int]: The encoded sequence followed by any padding zeros.

    Raises:
        KeyError: If ``dna`` contains a symbol outside "CATGN".
    """
    symbol_codes = {"C": 1, "A": 2, "T": 3, "G": 4, "N": 5}
    encoded = [symbol_codes[base] for base in dna]

    padding_needed = max_len - len(encoded)
    if padding_needed > 0:
        encoded.extend([0] * padding_needed)

    return encoded

In [None]:
#max_dna_len = 0
#for gene in gene_symbol_to_dna:
#    max_dna_len = max(max_dna_len, len(gene_symbol_to_dna[gene]))
#print(max_dna_len)

In [None]:
#gene_symbol_to_dna_ints = {}

#for gene in gene_symbol_to_dna:
#    gene_symbol_to_dna_ints[gene] = dna_to_int_seq(gene_symbol_to_dna[gene], MAX_DNA_LEN)

In [None]:
def _bytes_feature(text):
    """Wrap a str as a single-element, UTF-8 encoded bytes feature."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[text.encode("utf-8")]))


def _float_feature(values):
    """Wrap a sequence of numbers as a float-list feature."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))


def _int64_feature(values):
    """Wrap a sequence of integers as an int64-list feature."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))


def write_example_for_pair(pair_name, data_frame, writer):
    """Write one tf.train.Example per gene for a single (cell type, molecule) pair.

    The first row of ``data_frame`` whose "cell_type_sm_pair" equals
    ``pair_name`` is used. Its SMILES string is converted with
    ``smiles_to_graph`` and each of the six returned arrays is flattened.
    For every name in the module-level ``gene_names`` an example is written
    holding the molecule features, the cell type, the gene name, its id
    (``gene_symbol_to_id``), its sequence (``gene_symbol_to_dna``) and the
    row's value in that gene's column as "differential_expression".

    Args:
        pair_name: Value to look up in the "cell_type_sm_pair" column.
        data_frame: Frame with "cell_type_sm_pair", "cell_type_int", "sm_name",
            "SMILES" and one column per entry of ``gene_names``.
        writer: Object with a ``write(bytes)`` method, e.g. a
            ``tf.io.TFRecordWriter``. It is not closed here.

    Raises:
        IndexError: If no row of ``data_frame`` matches ``pair_name``.
    """
    matching_rows = data_frame[data_frame["cell_type_sm_pair"] == pair_name]
    if matching_rows.empty:
        # Same exception type .iloc[0] would raise, but with a usable message.
        raise IndexError(f"No row found for cell type / small molecule pair {pair_name!r}")
    pair_data = matching_rows.iloc[0]

    cell_type = pair_data["cell_type_int"]

    molecule_name = pair_data["sm_name"]
    molecule_smiles = pair_data["SMILES"]
    mol_ver, mol_edj, mol_uni, mol_am, mol_conn, mol_edgeAdj = smiles_to_graph(molecule_smiles)

    # These features are identical for every gene of the pair, so they are
    # built once here instead of being re-created from the arrays on each
    # loop iteration.
    shared_features = {
        "mol_name": _bytes_feature(molecule_name),
        "mol_ver": _float_feature(mol_ver.flatten()),
        "mol_edj": _float_feature(mol_edj.flatten()),
        "mol_uni": _float_feature(mol_uni.flatten()),
        "mol_am": _float_feature(mol_am.flatten()),
        "mol_conn": _float_feature(mol_conn.flatten()),
        "mol_edgeAdj": _float_feature(mol_edgeAdj.flatten()),
        "cell_type": _int64_feature([cell_type]),
    }

    for gene_name in gene_names:
        # Shallow copy: the shared Feature messages are reused, only the
        # per-gene entries are added.
        features = dict(shared_features)
        features["gene_name"] = _bytes_feature(gene_name)
        features["gene_id"] = _int64_feature([gene_symbol_to_id[gene_name]])
        features["dna_sequence"] = _bytes_feature(gene_symbol_to_dna[gene_name])
        features["differential_expression"] = _float_feature([pair_data[gene_name]])

        example = tf.train.Example(features=tf.train.Features(feature=features))

        writer.write(example.SerializeToString())

In [None]:
# Open a TFRecord writer for a single trial file in the current directory
# (note: not under record_folder).
ex_writer = tf.io.TFRecordWriter('./example.tfrecord')

In [None]:
import time

In [None]:
# Time how long writing every per-gene example of one pair takes.
# perf_counter is a monotonic clock meant for measuring intervals.
start = time.perf_counter()
try:
    write_example_for_pair("T regulatory cells, FK 866", de_data_train, ex_writer)
    elapsed = time.perf_counter() - start
finally:
    # Close the writer so the records are flushed to disk and the file handle
    # is released. Re-run the cell that creates ex_writer before writing again.
    ex_writer.close()
print(f"One pair took {elapsed} seconds.")

In [None]:
# Hard-coded cost of writing a single pair (presumably measured with the trial
# write — TODO confirm), extrapolated to all training and testing pairs.
pair_size_gb = 11
pair_time_sec = 174.2762

total_pairs = len(training_pairs) + len(testing_pairs)
estimated_seconds = total_pairs * pair_time_sec
estimated_gb = total_pairs * pair_size_gb

print(f"All pairs will take {estimated_seconds} seconds and will take up {estimated_gb} gb.")

In [None]:
# Look up the value of the "ZXDB" column in the first row.
de_data_train.iloc[0]["ZXDB"]

In [None]:
def count(A, B):
    """Yield ``a + b`` for every ``a`` in ``A`` combined with every ``b`` in ``B``.

    ``A`` is the outer loop, so all sums for the first element of ``A`` are
    produced before moving on to the second.
    """
    for a in A:
        yield from (a + b for b in B)

In [None]:
A = [1, 2, 3, 4, 5]
B = [10, 20, 30, 40, 50]

# Wrap the count generator as a dataset of scalar int32 elements.
# output_signature replaces the deprecated output_types / output_shapes pair.
ds_counter = tf.data.Dataset.from_generator(
    count,
    args=[A, B],
    output_signature=tf.TensorSpec(shape=(), dtype=tf.int32),
)

In [None]:
# Repeat the generator dataset endlessly and print the first ten batches of ten values.
first_ten_batches = ds_counter.repeat().batch(10).take(10)
for count_batch in first_ten_batches:
    print(count_batch.numpy())