In [20]:
from Bio import SeqIO
from Bio.Seq import Seq
import re
import os
from tqdm.notebook import tqdm
# import requests # Keep requests in case one has to use a ZFIN search in the "search engine"
import requests
import json
import pickle


In [21]:
# Read every record of the GRCz11 peptide FASTA into a list (the whole file is held in memory).
record = list(SeqIO.parse("../fastas/Danio_rerio.GRCz11.pep.all.fa", "fasta"))

In [22]:
def get_protein_name(protein):
    """Wrap a FASTA record in a single-entry dict keyed by its gene symbol.

    The symbol is taken from the ``gene_symbol:`` field of ``protein.description``;
    it ends at the first ``.``, whitespace character or end of the description and
    is lower-cased. Records without that field are keyed by ``protein.id`` (case kept).

    :param protein: object exposing ``description`` and ``id`` attributes
    :return: ``{key: protein}``
    """
    # Raw string: '\.' and '\s' are invalid escape sequences in a plain literal.
    match = re.search(r'gene_symbol:(.*?)(\.|\s|$)', protein.description)
    if match:
        return {match.group(1).lower(): protein}
    return {protein.id: protein}

def all_equal(iterator):
    """Return True when every element of *iterator* equals the first one.

    An empty iterable counts as "all equal".
    """
    remaining = iter(iterator)
    for head in remaining:
        break
    else:
        # Nothing to compare.
        return True
    for other in remaining:
        if not head == other:
            return False
    return True

# Load the ZFIN basic gene information JSON into zfin_db. In the notebook as shown it is
# only referenced by the commented-out "Version 2" DICT_GENES builder.
with open('../fastas/ZFIN_1.0.1.4_basicGeneInformation.json') as jsn_fle:
    zfin_db = json.load(jsn_fle)
    # print(type(zfin_db))

def get_uniprot_seq(protein_id):
    """Download the FASTA entry for a UniProt accession and return the bare sequence.

    :param protein_id: accession inserted into the ``<base_url><id>.fasta`` URL
    :return: the sequence lines joined into one string (header line dropped),
             or ``None`` when the server does not answer with HTTP 200
    :raises requests.RequestException: on connection problems or timeout
    """
    base_url = "https://www.uniprot.org/uniprot/"
    query_url = f"{base_url}{protein_id}.fasta"

    # Bound the wait: without a timeout a stalled connection blocks the notebook forever.
    res = requests.get(query_url, timeout=30)
    if res.status_code == 200:
        return ''.join(res.text.splitlines()[1:])
    print('Protein sequence not found!')
    return None


def get_unique_items(input_list):
    """Drop items whose ``.seq`` was already seen, keeping first occurrences in order.

    :param input_list: iterable of objects with a hashable ``seq`` attribute
    :return: list of the first item found for each distinct ``seq``
    """
    first_by_seq = {}
    for entry in input_list:
        # setdefault keeps the earliest entry for a sequence and ignores later ones.
        first_by_seq.setdefault(entry.seq, entry)
    return list(first_by_seq.values())

def get_update_sequence(q_gene, s_gene):
    """Return a fresh record for ``s_gene`` when ``q_gene`` is the entry for that gene.

    The previous version passed bare sequences to ``get_unique_items``, which reads
    ``.seq`` on each item, so every matching call raised AttributeError. At most one
    sequence is ever collected here, so no de-duplication is needed.

    :param q_gene: query gene, a single-entry dict ``{name: record}`` where
                   ``record`` has a ``seq`` attribute
    :param s_gene: searched gene name
    :return: ``{s_gene: SeqRecord}`` when the name matches and the sequence is
             longer than one residue, otherwise ``None``
    """
    if not q_gene:
        return None
    gene_name, gene_record = next(iter(q_gene.items()))
    if gene_name != s_gene:
        return None
    protein_seq = gene_record.seq
    if protein_seq is None or len(protein_seq) <= 1:
        return None
    return {s_gene: SeqIO.SeqRecord(Seq(str(protein_seq)), id=s_gene, description='')}


def dict_get(di, key):
    """Look up *key* in *di*; a missing key (or a stored None) yields None."""
    value = di.get(key)
    return None if value is None else value

def phoenix(record_dict, gene_list):
    """Collect the distinct records stored under one gene name across ``gene_list``.

    The gene name is the first key of ``record_dict``. Every dict in ``gene_list``
    is probed for that name; hits are de-duplicated by ``.seq`` via
    ``get_unique_items`` and hits of length <= 1 are dropped.

    :param record_dict: dict whose first key is the gene name to look for
    :param gene_list: list of dicts mapping gene name -> record
    :return: dict with the first distinct record under the plain name and the
             n-th one (n >= 1) under ``<name>_<n>``
    """
    gene_key = list(record_dict)[0]

    candidates = []
    for entry in gene_list:
        hit = dict_get(entry, gene_key)
        if hit is not None:
            candidates.append(hit)

    result = {}
    for index, variant in enumerate(get_unique_items(candidates)):
        if variant is None or not len(variant) > 1:
            continue
        # The index counts every distinct record, including skipped ones.
        label = gene_key if index == 0 else f'{gene_key}_{index}'
        result[label] = variant
    return result





In [23]:


# Global lookup used by the search and FASTA-export helpers; starts empty and is
# replaced by the pickled dict loaded in a later cell.
DICT_GENES = {} # This stays no matter what
# One single-entry dict ({gene symbol or protein id: record}) per parsed FASTA record.
list_genes = list(map(get_protein_name, record))

# # Version 1 of how to generate the DICT_GENES via data from a proteome fasta
# for d in tqdm(list_genes):
#     registered = False
#     turn = 1
#     while not registered:
#         local_gene_name = list((d.keys()))[0]
#         if DICT_GENES.get(local_gene_name) is None:
#             DICT_GENES.update(d)
#             registered = True
#         else:
#             if DICT_GENES.get(f'{local_gene_name}_{turn}') is None:
#                 DICT_GENES.update({f'{local_gene_name}_{turn}': d.get(local_gene_name)})
#                 registered = True
#             else:
#                 turn +=1

# # Version 2 of how to create the DICT_GENES via ZFIN DB and UniProt Pulls
# for zfin_entry in tqdm(zfin_db['data']):
#     gene_name = zfin_entry['symbol']
#     # print(zfin_entry['basicGeneticEntity']['crossReferences'])
#     multiple_entries = []
#     for uniprot_entry in zfin_entry['basicGeneticEntity']['crossReferences']:
#         if 'uniprot' in uniprot_entry['id'].lower():
#             matcha = re.match( '(UniProt.{2}):(.{6})', uniprot_entry['id'])
#             sequence = get_uniprot_seq(matcha.group(2))
#             multiple_entries.append(sequence)
#     for var_num, protein_seq in enumerate(get_unique_items(multiple_entries)):
#         if var_num == 0 and protein_seq is not None and len(protein_seq) > 1:
#             DICT_GENES.update({gene_name: SeqIO.SeqRecord(Seq(protein_seq), id=gene_name, description='')})
#         elif var_num != 0 and protein_seq is not None and len(protein_seq) > 1:
#             DICT_GENES.update({f'{gene_name}_{var_num}': SeqIO.SeqRecord(Seq(protein_seq), id=gene_name, description='')})



In [24]:


# Version 1.5 via proteome fasta- filtering extra sequences
# len_list_genes = len(list_genes)
# pre_dicts = list(map(phoenix, tqdm(list_genes), len_list_genes*[list_genes]))
#
# for pre_dict in pre_dicts:
#     DICT_GENES.update(pre_dict)
#
# print(len(list_genes))
# print(len(pre_dicts))
# print(len(DICT_GENES))
#
#
# # Dump into a pickle file
#
# # Writing to DICT_GENES.pkl
# with open("DICT_GENES.pkl", "wb") as out_jsn_fle:
#     pickle.dump(DICT_GENES, out_jsn_fle)


# Version 3 of how to create the DICT_GENES: recover the dict pickled by a previous run
# NOTE: pickle.load can execute arbitrary code; only load a DICT_GENES.pkl you created yourself.
with open('../DICT_GENES.pkl', 'rb') as in_jsn_fle:
    DICT_GENES = pickle.load(in_jsn_fle)

# The old guard compared type(DICT_GENES) with type(dict), i.e. with `type` itself,
# so it could never reject a wrong payload. Everything downstream calls DICT_GENES.get().
if not isinstance(DICT_GENES, dict):
    raise TypeError(f'DICT_GENES does not have the right type. Instead it has type {type(DICT_GENES)}')




In [25]:
# This section generates a list of canonical transcripts
#
# def check_canonical_label(transcript_ids_set):
#     transcript_ids = list(transcript_ids_set)
#     # Ensembl REST API endpoint for fetching transcript information
#     endpoint = f"https://rest.ensembl.org/lookup/id/"
#     headers={ "Content-Type" : "application/json", "Accept" : "application/json"}
#     r = requests.post(endpoint, headers=headers, data='{"ids" : ' + '[' + ', '.join(f'"{elem}"' for elem in transcript_ids) + ']' + '}') #str({ "ids" : transcript_ids })
#     # Make a GET request to the endpoint
#     # response = requests.get(endpoint, headers={ "Content-Type" : "application/json"})
#     # Check if the request was successful (status code 200)
#     if r.status_code == 200:
#         out_dict = {}
#         print(r.status_code)
#         transcript_info = r.json()
#         for key, value in transcript_info.items():
#             out_dict.update({key: value.get('canonical_transcript')})
#         # print(transcript_info)#.get('canonical_transcript'))
#         return out_dict
#     else:
#         print(r.status_code)
#         return None
#
# ensemble_names = []
# for entry in list(DICT_GENES.values()):
#     desc = entry.description
#     desc_cut_1 = desc[desc.find('gene:')+5:]
#     desc_cut_2 = desc_cut_1[:desc_cut_1.find('.')]
#     ensemble_names.append(desc_cut_2)
#
# # Create subsets
# list_of_ensmbl_subsets = []
# counter = 0
# subset = set()
# check_sum = 0
# neg_sum = 0
# for gene_name in ensemble_names:
#     if counter < 50:
#         subset.add(gene_name)
#         counter += 1
#     elif len(gene_name) != 18:
#         neg_sum +=1
#     else:
#         list_of_ensmbl_subsets.append(subset)
#         check_sum += len(subset)
#         # print(len(subset))
#         subset = {gene_name}
#         counter = 1
# if len(subset) != 0:
#     list_of_ensmbl_subsets.append(subset)
#
# list_canonical_dicts = list(map(check_canonical_label, tqdm(list_of_ensmbl_subsets)))
#
# CANONICAL_DICT = {}
# for canonical_gene in list_canonical_dicts:
#    CANONICAL_DICT.update(canonical_gene)
#
# with open("canonical_transcripts.pkl", "wb") as out_jsn_fle_canon:
#     pickle.dump(CANONICAL_DICT, out_jsn_fle_canon)

In [26]:
# Load the pickled CANONICAL_DICT. searchengine_master compares CANONICAL_DICT.get(<gene id>)
# with the transcript id parsed from a record description.
# NOTE(review): pickle.load runs arbitrary code from the file; only load files you produced.
with open("../canonical_transcripts.pkl", "rb") as in_jsn_fle_canon:
    CANONICAL_DICT = pickle.load(in_jsn_fle_canon)

In [27]:
# https://clinicaltables.nlm.nih.gov/api/ncbi_genes/v3/search?
def search_engine(search_query):
    """Interactively resolve a free-text query to keys of ``DICT_GENES``.

    1. If the (lower-cased) query is itself a key, return it together with every
       key that starts with ``<query>_`` followed by 1-3 digits.
    2. Otherwise collect keys whose record description or name contains the query,
       group them by the part before the first ``_`` and ask the user (``input``)
       to pick one of the group names or to enter a new query.

    :param search_query: gene name or description fragment
    :return: list of ``DICT_GENES`` keys, or ``[None]`` when nothing was found
             or the user gave up. Always a list, so callers can iterate over it.
    """
    search_query = search_query.lower()
    while True:
        if DICT_GENES.get(search_query) is not None:
            # re.escape keeps regex metacharacters in a gene name literal.
            variant_pattern = re.compile(re.escape(search_query) + r'_\d{1,3}')
            hit_list = [search_query]
            for some_gene_name in DICT_GENES.keys():
                if variant_pattern.match(some_gene_name) is not None:
                    hit_list.append(some_gene_name)
            return hit_list

        response_selection = []
        for gene_name, gene_seqio in DICT_GENES.items():
            description = gene_seqio.description.lower()
            if search_query in description:
                # Description hits only count for names without a "_<n>" suffix.
                if re.search(r'(.*)(_\d*)', gene_name) is None:
                    response_selection.append(gene_name)
            elif search_query.replace(" ", "") in description.replace(" ", ""):
                response_selection.append(gene_name)
            elif search_query in gene_name.lower():
                response_selection.append(gene_name)

        # Group candidates by the text before the first underscore.
        variants_dict = {}
        for variant in response_selection:
            does_match = re.match(r'(.*?)(_\d*)', variant)
            if does_match:
                prefix_name = does_match.group(1)
                variants_dict.setdefault(prefix_name, []).append(variant)
            else:
                variants_dict.update({variant: [variant]})

        if not variants_dict:
            print(f'No result for {search_query} was found, thus its not considered')
            return [None]

        precise_name = input(f'Search query: {search_query}\n If your gene is in this list type its exact name:{list(variants_dict.keys())} \n If not, press "ok".')

        if precise_name == '':
            search_query = input('Please enter an alternative search query or give up by pressing "Ok"')
            if search_query == '':
                # Giving up used to fall out of the loop and return None,
                # which crashed the caller's iteration.
                return [None]
        elif DICT_GENES.get(precise_name) is not None:
            # A valid key that is not one of the offered group names (e.g. a
            # "<name>_<n>" variant) is returned on its own instead of None.
            return variants_dict.get(precise_name, [precise_name])
        else:
            search_query = input(f'The name was not found in the databank. Perhaps a typo in {precise_name}?')



def searchengine_master(candidate_list, canonical):
    """Resolve every candidate through ``search_engine`` and filter the hits.

    :param candidate_list: iterable of search queries (gene names)
    :param canonical: when True keep only hits whose transcript id (parsed from the
                      record description) equals ``CANONICAL_DICT[<gene id>]``;
                      when False keep every hit. Previously False kept nothing.
    :return: list of ``DICT_GENES`` keys
    """
    clean_candidate_list = []
    for candidate in candidate_list:
        # `or []` tolerates a search that ends without any result list.
        for ses_result in search_engine(candidate) or []:
            if ses_result is None:
                continue
            if not canonical:
                clean_candidate_list.append(ses_result)
                continue
            se_desc = DICT_GENES.get(ses_result).description
            # Gene id: text after "gene:" up to the first "." (version suffix dropped).
            desc_cut_1 = se_desc[se_desc.find('gene:')+5:]
            gene_name = desc_cut_1[:desc_cut_1.find('.')]
            # Transcript id: text after "transcript:" up to the next space.
            se_tr_cut1 = se_desc[se_desc.find('transcript:')+11:]
            transcript_name = se_tr_cut1[:se_tr_cut1.find(' ')]
            if CANONICAL_DICT.get(gene_name) == transcript_name:
                clean_candidate_list.append(ses_result)

    return clean_candidate_list

def af_computetime(len_seq):
    """Estimate the run time for a sequence of ``len_seq`` residues.

    The length is looked up in a table of ``(low, high]`` bins; the bin value is
    multiplied by 5 * 3, 15 per residue is added for the MSA, and 10 % tolerance
    is put on top. ``af_statistics`` feeds the result to ``seconds_to_hms``, so it
    is treated as seconds there.
    NOTE(review): the meaning of the 5 * 3 factor is not visible here, and the
    original comment says 0.2 min per residue while the code uses 15 — confirm.

    Bins include their upper bound, so lengths of exactly 100, 200, ... no longer
    fall between two bins and return 0.

    :param len_seq: total number of residues
    :return: estimated run time, or 0 when the length is <= 0 or above 12000
    """
    range_to_m = {
        (0, 100): 4.9,
        (100, 200): 7.7,
        (200, 300): 13,
        (300, 400): 18,
        (400, 500): 29,
        (500, 600): 39,
        (600, 700): 53,
        (700, 800): 60,
        (800, 900): 91,
        (900, 1000): 96,
        (1000, 1100): 140,
        (1100, 1500): 280,
        (1500, 2000): 450,
        (2000, 2500): 969,
        (2500, 3000): 1240,
        (3000, 3500): 2465,
        (3500, 4000): 5660,
        (4000, 4500): 12475,
        (4500, 5000): 18824,
        (5000, 12000): 50000
    }

    for (low, high), value in range_to_m.items():
        if low < len_seq <= high:
            len_prediction = value * 5 * 3
            msa_prediction = 15 * len_seq
            # predicted time + 10 % tolerance
            return (len_prediction + msa_prediction) * 1.1

    return 0



def target_system(target_gene, missile_gene, separation='entry'):
    """Build one pair entry ``"<missile>-<target>"`` from two ``DICT_GENES`` keys.

    :param target_gene: key of the second sequence
    :param missile_gene: key of the first sequence
    :param separation: ``':'`` joins both sequences into one string separated by
                       a colon; any other value yields a list of two SeqRecords
    :return: single-entry dict, or an empty dict when either key is missing from
             ``DICT_GENES`` (so callers can pass the result to ``dict.update``)
    """
    try:
        missile_sequence = str(DICT_GENES.get(missile_gene).seq)
        target_sequence = str(DICT_GENES.get(target_gene).seq)
    except AttributeError:
        # .get() returned None (unknown key) or the entry has no .seq
        print(f'Problem with mseq: {missile_gene}; tseq: {target_gene}')
        return {}

    pair_name = ''.join([missile_gene, '-', target_gene])
    if separation == ':':
        return {pair_name: ''.join([missile_sequence, ':', target_sequence])}
    missile_object = SeqIO.SeqRecord(Seq(missile_sequence), id=missile_gene, description='')
    target_object = SeqIO.SeqRecord(Seq(target_sequence), id=target_gene, description='')
    return {pair_name: [missile_object, target_object]}


def seconds_to_hms(secs):
    """Format a number of seconds as ``HH:MM:SS`` (hours are not capped at 24)."""
    hrs, leftover = divmod(secs, 3600)
    mins, sec_part = divmod(leftover, 60)
    fields = (int(hrs), int(mins), int(sec_part))
    return ":".join(f"{field:02d}" for field in fields)


def af_statistics(seq_dict):
    """Sum the estimated run time of all entries and add a 12 % buffer.

    Removed: the unused summary strings, one of which divided by
    ``len(seq_dict)`` and therefore raised ZeroDivisionError on an empty dict.

    :param seq_dict: dict mapping a name to either a list of sized records
                     (their lengths are summed) or a single sized value, whose
                     length minus one is used (presumably to discount the ':'
                     separator of a joined pair — TODO confirm)
    :return: ``[formatted_runtime, runtime]`` where ``formatted_runtime`` is
             ``seconds_to_hms(runtime)``
    """
    total_runtime = 0
    for sequence in seq_dict.values():
        if isinstance(sequence, list):
            tmp_len = sum(len(sequ) for sequ in sequence)
        else:
            tmp_len = len(sequence) - 1
        total_runtime += af_computetime(tmp_len)
    total_runtime += total_runtime * 0.12  # Plus 12% buffer in time since a few jobs timed out
    return [seconds_to_hms(total_runtime), total_runtime]


def create_bash_script(experiment_name, comb_name, run_time, path, partition = "gpua100", user = "jwegner1@uni-muenster.de", nodes = 1, cores = 10, gres = 1, memory = 60, precomputed_msas = False, remote_root = "/scratch/tmp/jwegner1"):
    """Write the SLURM job script ``job_<comb_name>.sh`` for one AlphaFold multimer run.

    :param experiment_name: experiment directory name below ``remote_root``
    :param comb_name: pair name; ``<comb_name>.fasta`` is the job's input file
    :param run_time: value for ``#SBATCH --time``
    :param path: local directory the script is written to
    :param partition: value for ``#SBATCH --partition``
    :param user: value for ``#SBATCH --mail-user``
    :param nodes: value for ``#SBATCH --nodes``
    :param cores: value for ``#SBATCH --cpus-per-task``
    :param gres: number of GPUs (``#SBATCH --gres=gpu:<gres>``)
    :param memory: memory in GB (``#SBATCH --mem=<memory>G``)
    :param precomputed_msas: forwarded as lower-case ``true``/``false`` to
                             ``--use_precomputed_msas``
    :param remote_root: directory on the cluster that holds the experiment
                        folders; the default reproduces the previous paths
    :return: path of the script on the cluster,
             ``<remote_root>/<experiment_name>/bash_scripts/job_<comb_name>.sh``.
             It used to hard-code the ``af2_control_net`` experiment, so every
             other experiment submitted scripts from the wrong folder.
    """

    job_name = f"job_{comb_name}"
    pre_msas = str(precomputed_msas).lower()
    experiment_dir = f"{remote_root}/{experiment_name}"

    bash_script_text = f"""#!/bin/bash
#SBATCH --partition={partition}
#SBATCH --nodes={nodes}
#SBATCH --gres=gpu:{gres}
#SBATCH --cpus-per-task={cores}
#SBATCH --mem={memory}G
#SBATCH --time={run_time}
#SBATCH --job-name={job_name}
#SBATCH --account=uni
#SBATCH --mail-type=ALL
#SBATCH --mail-user={user}

module load palma/2021a
module load foss/2021a
module load AlphaFold/2.1.2
wait
export ALPHAFOLD_DATA_DIR=/Applic.HPC/data/alphafold

alphafold \\
    --fasta_paths={experiment_dir}/fasta/{comb_name}.fasta \\
    --model_preset=multimer \\
    --output_dir={experiment_dir}/xprt \\
    --max_template_date=2021-11-25 \\
    --use_precomputed_msas={pre_msas} \\
    --is_prokaryote_list=false \\
    --db_preset=reduced_dbs \\
    --data_dir=/Applic.HPC/data/alphafold
"""

    with open(os.path.join(path, f'{job_name}.sh'), 'w') as bsh:
        bsh.write(bash_script_text)
    return f'{experiment_dir}/bash_scripts/{job_name}.sh'


In [28]:
def export_fastas(pre_fasta_dict, output_directory):
    """Write FASTA files, per-pair SLURM scripts and the ``dealer.sh`` submit script.

    Creates ``fasta/``, ``bash_scripts/`` and ``xprt/`` below ``output_directory``.
    Every entry is written as ``fasta/<name>.fasta``, gets a job script via
    ``create_bash_script`` with a requested time of at least 5500 s, and
    contributes one ``sbatch`` line to ``dealer.sh``.

    :param pre_fasta_dict: dict mapping pair name to either a sequence string or
                           a list of SeqRecords
    :param output_directory: local experiment directory; its last path component
                             is passed on as the experiment name
    """
    min_runtime = 5500  # lower bound for the requested job time

    fasta_path = os.path.join(output_directory, 'fasta')
    bash_path = os.path.join(output_directory, 'bash_scripts')
    xprt_path = os.path.join(output_directory, 'xprt')
    # exist_ok per directory: previously an existing bash_scripts/ raised
    # FileExistsError before xprt/ was created, so xprt/ could stay missing.
    for directory in (fasta_path, bash_path, xprt_path):
        os.makedirs(directory, exist_ok=True)

    for fa_name, fa_sequence in pre_fasta_dict.items():
        fa_path = os.path.join(fasta_path, f'{fa_name}.fasta')
        if isinstance(fa_sequence, str):
            fa_record = SeqIO.SeqRecord(Seq(fa_sequence), id=fa_name, description='')
            SeqIO.write(fa_record, fa_path, 'fasta-2line')
        elif isinstance(fa_sequence, list):
            SeqIO.write(fa_sequence, fa_path, 'fasta-2line')

    # normpath drops a trailing separator, which would make basename() return ''.
    experiment_name = os.path.basename(os.path.normpath(output_directory))

    master_bash_lines = ['#!/bin/bash \n']
    # Quick and dirty run-time assessment
    tot_runtime = 0
    for k, v in pre_fasta_dict.items():
        msg = af_statistics({k: v})
        if msg[1] < min_runtime:
            print(f'----- {k} underbet t_min with : \t {msg[0]}')
            msg = [seconds_to_hms(min_runtime), min_runtime]
        print(f'{k} needs: \t {msg[0]}')
        sh_pth = create_bash_script(experiment_name, k, msg[0], bash_path)
        master_bash_lines.append('sbatch ' + sh_pth + ' \n')
        tot_runtime += msg[1]

    with open(os.path.join(output_directory, 'dealer.sh'), 'w') as bsh:
        bsh.writelines(master_bash_lines)

    for x in range(3):
        print('')
    print(f'The job will take approximately {seconds_to_hms(tot_runtime)}')
    print(f'Folding {len(pre_fasta_dict)} pairs')



    # with open(os.path.join(output_directory,'computing_prediction.txt'), "w") as cp_file:
    #     print(msg)
    #     cp_file.write(msg)
    #     cp_file.close()

def some_vs_some(output_directory: str, genes_group1: list, genes_group2: list, only_canonical: bool, seq_mode='entry'):
    """
    Generate fasta files to feed into alphafold 2: every gene of group 1 paired
    with every gene of group 2.

    :param output_directory: directory handed to ``export_fastas``
    :param genes_group1: search queries for the first member of each pair
    :param genes_group2: search queries for the second member of each pair
    :param only_canonical: forwarded to ``searchengine_master``
    :param seq_mode: forwarded to ``target_system`` as ``separation``
                     (it was accepted but ignored before)
    :return: None
    """
    clean_gene_group1 = searchengine_master(genes_group1, only_canonical)
    clean_gene_group2 = searchengine_master(genes_group2, only_canonical)

    # For each gene of group 1 generate all pairs with group 2
    pre_fasta_dict = {}
    for gene1 in clean_gene_group1:
        for gene2 in clean_gene_group2:
            combi_dict = target_system(gene2, gene1, separation=seq_mode)
            # Skip failed pairs (None or empty) instead of crashing in update().
            if combi_dict:
                pre_fasta_dict.update(combi_dict)

    export_fastas(pre_fasta_dict, output_directory)


def all_vs_all(output_directory: str, gene_group: list, only_canonical: bool):
    """
    Generate fasta files for every ordered pair within one gene group, including
    each gene with itself and both orders of every pair.

    :param output_directory: directory handed to ``export_fastas``
    :param gene_group: search queries resolved through ``searchengine_master``
    :param only_canonical: forwarded to ``searchengine_master``
    :return: None
    """
    clean_gene_group = searchengine_master(gene_group, only_canonical)
    print(clean_gene_group)

    # For each gene in this list generate all interacting pairs
    pre_fasta_dict = {}
    for gene1 in clean_gene_group:
        for gene2 in clean_gene_group:
            combi_dict = target_system(gene2, gene1)
            # Skip failed pairs (None or empty) instead of crashing in update().
            if combi_dict:
                pre_fasta_dict.update(combi_dict)

    export_fastas(pre_fasta_dict, output_directory)





In [29]:
# Search queries (gene symbols) used as one group in the all_vs_all / some_vs_some calls below.
zf_npc_components = ['aaas', 'gle1', 'ndc1', 'nup35', 'nup37', 'nup42', 'nup43', 'nup50', 'nup54', 'nup58', 'nup62l', 'nup85', 'nup88', 'nup93', 'nup98', 'nup107', 'nup133', 'nup153', 'nup160', 'nup188', 'nup205', 'nup210', 'nup214', 'nup358', 'pom121', 'rae1', 'sec13', 'seh1l', 'tpra', 'tprb']

# Second group of search queries (gene symbols) for the calls below.
zf_germ_cell_components = ['dnd1', 'nanos3', 'piwil1', 'buc', 'gra', 'ddx4', 'tdrd7a', 'tdrd6', 'tia1', 'dazl', 'dazap1', 'dynll2a', 'hook2', 'ranbp9', ]

In [30]:
# Control run: all ordered pairs of eight genes, canonical transcripts only.
# NOTE(review): absolute, machine-specific output path; adjust before re-running elsewhere.
all_vs_all('/Users/izbuser/Desktop/af2_control_net', ['dnd1', 'nanos3', 'cxcr4b', 'cxcl12a', 'ddx4', 'actb1', 'pou5f3', 'h2bc1'], True)

No result for h2bc1 was found, thus its not considered
['dnd1', 'nanos3', 'cxcr4b', 'cxcl12a', 'ddx4_5', 'actb1_3', 'pou5f3']
dnd1-dnd1 needs: 	 04:41:12
dnd1-nanos3 needs: 	 03:07:34
dnd1-cxcr4b needs: 	 04:13:47
dnd1-cxcl12a needs: 	 02:49:05
dnd1-ddx4_5 needs: 	 07:13:21
dnd1-actb1_3 needs: 	 04:20:34
dnd1-pou5f3 needs: 	 04:59:59
nanos3-dnd1 needs: 	 03:07:34
nanos3-nanos3 needs: 	 01:43:29
nanos3-cxcr4b needs: 	 02:49:42
----- nanos3-cxcl12a underbet t_min with : 	 01:23:28
nanos3-cxcl12a needs: 	 01:31:40
nanos3-ddx4_5 needs: 	 04:57:31
nanos3-actb1_3 needs: 	 02:56:29
nanos3-pou5f3 needs: 	 03:30:40
cxcr4b-dnd1 needs: 	 04:13:47
cxcr4b-nanos3 needs: 	 02:49:42
cxcr4b-cxcr4b needs: 	 03:55:55
cxcr4b-cxcl12a needs: 	 02:28:08
cxcr4b-ddx4_5 needs: 	 06:12:22
cxcr4b-actb1_3 needs: 	 04:02:42
cxcr4b-pou5f3 needs: 	 04:42:07
cxcl12a-dnd1 needs: 	 02:49:05
----- cxcl12a-nanos3 underbet t_min with : 	 01:23:28
cxcl12a-nanos3 needs: 	 01:31:40
cxcl12a-cxcr4b needs: 	 02:28:08
----- cxcl1

In [None]:
# All ordered pairs within zf_npc_components, written below ./npc_interactome (relative path).
all_vs_all('npc_interactome',zf_npc_components, True)

In [1]:

# Pair tdrd7a with every entry of zf_npc_components. The saved output is a NameError:
# this cell was executed (In [1]) before the cell that defines some_vs_some had been run.
some_vs_some('/Volumes/MartyMcDrve/experiments/tdrd7a_interactome_small', ['tdrd7a'], zf_npc_components, True)

NameError: name 'some_vs_some' is not defined

In [None]:
# for k, v in DICT_GENES.items():
#     if 'seh1' in k.lower():
#         print(k, v)

In [None]:
# Every entry of zf_germ_cell_components paired with every entry of zf_npc_components.
some_vs_some('granules_vs_npc_interactome', zf_germ_cell_components, zf_npc_components, True)

In [None]:
# All ordered pairs within zf_germ_cell_components.
all_vs_all('granule_interactome', zf_germ_cell_components, True)

In [4]:
# Scratch cell: a bare list literal with the same symbols as zf_germ_cell_components; nothing is assigned.
["dnd1", 'nanos3', 'piwil1', 'buc', 'gra', 'ddx4', 'tdrd7a', 'tdrd6', 'tia1', 'dazl', 'dazap1', 'dynll2a', 'hook2', 'ranbp9', ]

"['dnd1', 'nanos3', 'piwil1', 'buc', 'gra', 'ddx4', 'tdrd7a', 'tdrd6', 'tia1', 'dazl', 'dazap1', 'dynll2a', 'hook2', 'ranbp9']"

In [None]:

# Build this into the search engine to create mutations
# Format: "<gene>|<from><position><to>|<from><position><to>|", e.g. Y72D = Y at 72 becomes D.
search_stuff = "dnd1_3|Y72D|X123Y|"
# Lookarounds leave the '|' delimiters unconsumed. The old pattern "\|\w\d*\w\|"
# swallowed the closing '|' that is also the next mutation's opening '|', so
# only every other mutation was found.
matches = re.findall(r"(?<=\|)([A-Za-z])(\d+)([A-Za-z])(?=\|)", search_stuff)

for before_mut, position, after_mut in matches:
    num_mut = int(position)

    print(before_mut, after_mut, num_mut)



In [3]:
import pandas as pd