In [None]:
import json
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
# Read the training table from parquet with pyarrow and convert it to a pandas DataFrame.
de_data_train = pq.read_table("../data/de_train.parquet").to_pandas()

In [None]:
# Column labels from position 5 up to (but excluding) the last two columns.
# NOTE(review): presumably the first five columns are metadata and the rest are
# gene symbols -- confirm that the two trailing columns really are non-gene
# columns, otherwise two genes are silently dropped here.
gene_names = de_data_train.columns[5:-2]

In [None]:
# Location of the JSON-lines input (one JSON record per line) holding the gene sequences.
sequences_filepath = "../data/sequences.jsonl"

In [None]:
# Lookup tables keyed by each record's "location" field:
#   gene_symbol_to_dna -> sequence string ("" when the record carries no sequence)
#   gene_symbol_to_id  -> zero-based position of the record in the file
gene_symbol_to_dna = {}
gene_symbol_to_id = {}

missing = 0  # records whose "seq_data" has no "seq" entry
total = 0    # records read so far; also the id handed to the current record

with open(sequences_filepath, "r") as sequences_file:
    for raw_record in sequences_file:
        record = json.loads(raw_record)
        seq_data = record["seq_data"]
        symbol = record["location"]

        if "seq" in seq_data:
            gene_symbol_to_dna[symbol] = seq_data["seq"]
        else:
            # Keep the key present with an empty sequence and count the gap.
            gene_symbol_to_dna[symbol] = ""
            missing += 1

        gene_symbol_to_id[symbol] = total
        total += 1

print(f"{missing} gene sequences missing out of {total}")

In [None]:
# Register every training gene that has no record in the sequence file: it gets
# an empty sequence and a fresh integer id.
not_found = 0
idx = total  # records from the file took ids 0..total-1, so new ids start at total

for gene_name in gene_names:
    if gene_name not in gene_symbol_to_dna:
        not_found += 1
        gene_symbol_to_dna[gene_name] = ""
        gene_symbol_to_id[gene_name] = idx
        idx += 1

# The share is relative to the genes that were looked up (gene_names), not to
# the number of records in the sequence file; guard against an empty gene list.
num_genes = len(gene_names)
not_found_pct = (not_found / num_genes) * 100 if num_genes else 0.0

print(f"{not_found} ({not_found_pct}%) genes not found in data.")

In [None]:
# Length of every stored sequence (0 for genes stored with an empty string).
dna_seq_lengths = [len(dna_seq) for dna_seq in gene_symbol_to_dna.values()]

# Maximum number of symbols kept per gene when sequences are integer-encoded.
DNA_SEQ_CAP = 100000

# A sequence is only truncated when it is strictly longer than the cap, so a
# length exactly equal to the cap counts as "not truncated" (hence <=).
num_smaller = (np.array(dna_seq_lengths) <= DNA_SEQ_CAP).sum()

# Avoid dividing by zero when no sequences were collected at all.
if dna_seq_lengths:
    pct_not_truncated = (num_smaller / len(dna_seq_lengths)) * 100
else:
    pct_not_truncated = 0.0

print(f"With a cap of {DNA_SEQ_CAP}. {num_smaller} ({pct_not_truncated}%) are not truncated.")

In [None]:
def dna_to_int_seq(dna_seq, cap=None):
    """Encode a DNA string as a fixed-length list of integer codes.

    Symbols are mapped as C=1, A=2, T=3, G=4, N=5. Lookup is case-insensitive,
    so lowercase bases are encoded like their uppercase counterparts. The
    sequence is cut after ``cap`` symbols and right-padded with 0 so that the
    result always has exactly ``cap`` entries.

    Args:
        dna_seq: Sequence of single-character base symbols (normally a str).
            May be empty, in which case the result is all padding.
        cap: Output length. Defaults to the notebook-level ``DNA_SEQ_CAP``.

    Returns:
        A list of ``cap`` ints.

    Raises:
        KeyError: If a symbol within the first ``cap`` positions is not one of
            C, A, T, G, N (in either case).
        ValueError: If ``cap`` is negative.
    """
    if cap is None:
        # Resolved at call time so the notebook constant can be changed
        # without redefining this function.
        cap = DNA_SEQ_CAP
    if cap < 0:
        raise ValueError(f"cap must be non-negative, got {cap}")

    mapping = {"C": 1, "A": 2, "T": 3, "G": 4, "N": 5}

    # Slice first so symbols beyond the cap are never looked at.
    output = [mapping[symbol.upper()] for symbol in dna_seq[:cap]]

    # Pad in one step instead of appending one zero at a time.
    output.extend([0] * (cap - len(output)))

    return output

In [None]:
# Write one JSON record per gene: its symbol plus the padded integer encoding.
# The file is opened in "w" mode so that re-running this cell regenerates the
# output; append mode would add a duplicate copy of every record on each run.
with open("../data/sequences_int.jsonl", "w") as output_file:
    for gene, dna_seq in gene_symbol_to_dna.items():
        line = {"gene": gene, "seq": dna_to_int_seq(dna_seq)}
        output_file.write(json.dumps(line) + "\n")