In [1]:
# Env
import os
import pandas as pd
import numpy as np
from pathlib import Path


# Root folder holding the barcode tables and one sub-folder per run.
# The PROBE_DATASET_DIR environment variable overrides the default location so
# the notebook can run on machines where the data is mounted elsewhere.
DATASET_DIR = Path(os.environ.get('PROBE_DATASET_DIR', '/mnt/f/spatial_data/probe'))
RUNID = 'example_dataset'
workdir = DATASET_DIR / RUNID  # per-run folder: gene list, binding sites, probe output
PANEL = 'PRISM'  # barcode panel; the cells below branch on 'PRISM' / 'SPRINTseq'
organism = 'mouse'  # gene-name casing: 'mouse' -> Capitalized, 'human' -> UPPER

In [None]:
# Load the gene panel of this run and derive a normalised `gene` column.
gene_info = pd.read_excel(workdir / "gene_list.xlsx")

# Strip stray whitespace *before* changing case: `str.capitalize` only
# upper-cases the very first character, so " actb" would otherwise end up as
# "actb" once stripped. Non-string cells are passed through untouched here.
gene_info['gene'] = gene_info['gene_name'].map(
    lambda name: name.strip() if isinstance(name, str) else name
)
if organism == 'mouse':
    gene_info['gene'] = gene_info['gene'].str.capitalize()
elif organism == 'human':
    gene_info['gene'] = gene_info['gene'].str.upper()

# The `.str` accessor turns non-string cells into NaN, so a plain `!= 0` test
# would let them through and `.strip()` would fail on a float. Keep only real,
# non-empty strings instead (this also drops the 0 placeholders).
gene_list = [
    name for name in gene_info['gene'].unique()
    if isinstance(name, str) and name
]
print(len(gene_list))
gene_info.head()

# Binding sites

In [None]:
# Read every candidate binding site of this run.
binding_path = DATASET_DIR / RUNID / "gene_binding_site.xlsx"
binding_df = pd.read_excel(binding_path)

# Bring gene symbols to the organism's casing convention; any other organism
# leaves the names exactly as they appear in the sheet.
_case_method = {'mouse': 'capitalize', 'human': 'upper'}.get(organism)
if _case_method is not None:
    binding_df['gene_name'] = getattr(binding_df['gene_name'].str, _case_method)()

# Optional: restrict the table to the genes of the panel.
# binding_df = binding_df[binding_df['gene_name'].isin(gene_list)]
print(len(binding_df))
binding_df.head()

# Barcode_df

In [None]:
# Start from an empty probe table whatever the panel is, so the stitching
# cells below never depend on which branch ran here.
probe_df = pd.DataFrame()

# Load the barcode table of the selected panel, indexed by the sheet's first column.
if PANEL == 'PRISM':
    barcode_df = pd.read_excel(DATASET_DIR / "PRISM_31plex_barcode.xlsx", index_col=0)[['Sequence']]
elif PANEL == 'SPRINTseq':
    # `.copy()` gives an independent frame, so adding a column below does not
    # write into a slice of the sheet that was read.
    barcode_df = pd.read_excel(DATASET_DIR / "SPRINTSEQ_369_barcode.xlsx", index_col=0)[['Barcode sequence']].copy()
    primer_l = 'TCCCTACACGACGCTCTTCCGATCT'
    primer_r = 'CATTCCTGCTGAACCGCTCTTCCGA'
    # Full barcode = left primer + barcode + right primer + the same barcode again.
    barcode_df['Barcode(70bp)'] = primer_l + barcode_df['Barcode sequence'] + primer_r + barcode_df['Barcode sequence']
else:
    # Fail with a clear message instead of a NameError on `barcode_df` below.
    raise ValueError(f"Unsupported PANEL {PANEL!r}; expected 'PRISM' or 'SPRINTseq'.")
barcode_df.head()

# Probe stitching

## Directly combine all binding sites

In [None]:
# One probe per PRISM barcode: pair the leading binding-site rows with the
# barcodes listed in `prism_list` (zip stops at the shorter of the two).
# prism_list = [_ for _ in range(1, 31) if _ not in [1,5,9,]]
prism_list = list(range(1, 31))

# NOTE(review): this cell reads the `gene`, `binding`, `binding_left` and
# `binding_right` columns, whereas the other cells use `gene_name` / `bds` —
# confirm the binding-site sheet still provides them.
probe_records = []
for num, (prism, gene) in enumerate(zip(prism_list, binding_df["gene"].tolist())):
    # `num` is a row position, so index positionally; label-based `.loc[num]`
    # only works while the frame still has its default 0..n-1 index.
    binding = binding_df["binding"].iloc[num]
    binding_l = binding_df["binding_left"].iloc[num]
    binding_r = binding_df["binding_right"].iloc[num]

    # Alternative when only the joined 40 bp site is available:
    # assert len(binding) == 40, f"binding site length is not 40bp: {binding}"
    # binding_l = binding[:20].lower()
    # binding_r = binding[20:].lower()

    # The barcode-loading cell keeps only the `Sequence` column for PRISM, so a
    # "Barcode (82bp)" lookup can never succeed. The row label follows the
    # 3-binding-sites cell, which addresses barcodes by their integer number.
    # NOTE(review): confirm the index of PRISM_31plex_barcode.xlsx.
    barcode = barcode_df.loc[prism, "Sequence"]
    # Binding arms in lower case, barcode in upper case.
    probe = binding_r.lower() + barcode.upper() + binding_l.lower()

    probe_records.append({
        "PRISM": f"PRISM_{prism}",
        "gene": f'{gene}',
        "probe": probe,
        "barcode": barcode,
        "binding": binding,
    })

# Build the table once from the collected rows: cheaper than concatenating in
# the loop, and re-running the cell no longer appends to a previous `probe_df`.
probe_df = pd.DataFrame(
    probe_records, columns=["PRISM", "gene", "probe", "barcode", "binding"]
).set_index('PRISM')
probe_df.head()

## Select the middle binding site for genes with multiple binding sites

In [None]:
# Keep one representative binding site per gene: the middle row of each gene's
# group (position len // 2, i.e. the upper-middle row for even-sized groups).
# Groups are visited in order of first appearance (sort=False).
middle_parts = [
    group.iloc[[len(group) // 2]]  # double brackets keep a one-row DataFrame
    for _, group in binding_df.groupby('gene_name', sort=False)
]

# Concatenate once instead of growing a frame inside the loop; with no binding
# sites at all, fall back to an empty frame that still has the input columns.
middle_rows = pd.concat(middle_parts) if middle_parts else binding_df.iloc[0:0]

# NOTE: this overwrites `binding_df`; any later cell that needs several sites
# per gene has to start again from a freshly loaded `binding_df`.
binding_df = middle_rows.copy()
print(len(binding_df))
binding_df.head()

In [None]:
# Assemble one probe per row of `binding_df`:
#   right half of the 40 bp binding site + barcode + left half,
# with the binding arms lower-cased and the barcode upper-cased.
# prism_list = [_ for _ in range(1, 31) if _ not in [1,5,9,]]
# prism_list = [_ for _ in range(1, 31)]

# Per-panel settings: which barcode column to read and how the output is laid out.
if PANEL == 'PRISM':
    # The barcode-loading cell keeps only `Sequence` for PRISM, so the former
    # "Barcode (82bp)" lookup could not succeed. Rows are addressed by integer
    # barcode number, as in the 3-binding-sites cell.
    # NOTE(review): confirm the index of PRISM_31plex_barcode.xlsx.
    barcode_column = "Sequence"
    probe_columns = ["PRISM", "gene", "probe_name", "probe_seq", "barcode_seq", "binding_seq"]
elif PANEL == 'SPRINTseq':
    barcode_column = "Barcode(70bp)"
    probe_columns = ["SPRINTseq", "gene", "probe_name", "probe", "barcode", "binding"]
else:
    # Fail early with a clear message instead of a NameError inside the loop.
    raise ValueError(f"Unsupported PANEL {PANEL!r}; expected 'PRISM' or 'SPRINTseq'.")

# Barcode numbers are handed out consecutively, starting right after this offset.
# NOTE(review): 167 only fits a barcode table with enough rows — adjust per panel.
SEQ_ID_OFFSET = 167
seq_list = [SEQ_ID_OFFSET + pos + 1 for pos in range(len(binding_df))]
# seq_list = [_+1 for _ in range(30)]

probe_records = []
# `seq_id` rather than `id`, to avoid shadowing the builtin.
for num, (seq_id, gene) in enumerate(zip(seq_list, binding_df["gene_name"].tolist())):
    binding = binding_df["bds"].iloc[num]
    assert len(binding) == 40, f"binding site length is not 40bp: {binding}"
    binding_l = binding[:20]
    binding_r = binding[20:]
    # binding_l = binding_df["binding_left"].loc[num]
    # binding_r = binding_df["binding_right"].loc[num]
    barcode = barcode_df.loc[seq_id, barcode_column]
    probe = binding_r.lower() + barcode.upper() + binding_l.lower()

    if PANEL == 'PRISM':
        probe_records.append({
            "PRISM": f"PRISM_{seq_id}",
            "gene": f'{gene.upper()}',
            "probe_name": f'PRISM_{seq_id}_{gene}',
            "probe_seq": probe,
            "barcode_seq": barcode,
            "binding_seq": binding,
        })
    else:  # 'SPRINTseq' (validated above)
        probe_records.append({
            "SPRINTseq": f"SPRINTseq_{seq_id}",
            "gene": f'{gene.upper()}',
            "probe_name": f'Seq_{seq_id}_{gene}',
            "probe": probe,
            "barcode": barcode,
            "binding": binding,
        })

# Build the table once from the collected rows; the first column is the index.
probe_df = pd.DataFrame(probe_records, columns=probe_columns).set_index(probe_columns[0])
probe_df.head()

## Select up to 3 binding sites for each gene

In [None]:
# Give every gene its own barcode and keep at most `max_cont` binding sites per
# gene. A "gene" is a run of consecutive rows with the same `gene_name`, so the
# table is expected to be grouped by gene.
max_cont = 3
prism_pos_list = [_+1 for _ in range(15)]  # barcode numbers available, one per gene

# Pick the barcode column that the barcode-loading cell created for this panel.
if PANEL == 'PRISM':
    barcode_column = "Sequence"
    probe_columns = ["PRISM", "gene", "probe_name", "probe_seq", "barcode_seq", "binding_seq"]
elif PANEL == 'SPRINTseq':
    barcode_column = "Barcode(70bp)"
    probe_columns = ["SPRINTseq", "gene", "probe_name", "probe", "barcode", "binding"]
else:
    # Fail early with a clear message instead of a NameError inside the loop.
    raise ValueError(f"Unsupported PANEL {PANEL!r}; expected 'PRISM' or 'SPRINTseq'.")

gene_names = binding_df["gene_name"].tolist()

cont = 0        # probes already emitted for the current gene
prism_pos = 0   # position of the current gene's barcode in `prism_pos_list`
prism = prism_pos_list[prism_pos]
# Guarded so that an empty binding table yields an empty probe table.
pre_gene_name = gene_names[0] if gene_names else None

probe_records = []
for num, gene in enumerate(gene_names):
    if pre_gene_name != gene:
        # New gene: reset the per-gene counter and move on to the next barcode.
        pre_gene_name = gene
        cont = 0
        prism_pos += 1
        if prism_pos >= len(prism_pos_list):
            raise IndexError(
                f"binding_df has more genes than the {len(prism_pos_list)} available "
                f"barcodes; none left for gene {gene!r} at pos {num}."
            )
        prism = prism_pos_list[prism_pos]
    elif cont == max_cont:
        # This gene already has its quota of probes.
        continue
    # print(num, gene, prism)
    cont += 1
    binding = binding_df["bds"].iloc[num]
    assert len(binding) == 40, f"binding site at pos {num} length is not 40bp: {binding}, {len(binding)} instead."

    binding_l = binding[:20].lower()
    binding_r = binding[20:].lower()
    barcode = barcode_df.loc[prism, barcode_column]
    # NOTE(review): the other stitching cells upper-case the barcode here;
    # the barcode is used exactly as stored in the sheet.
    probe = binding_r + barcode + binding_l

    if PANEL == 'PRISM':
        probe_records.append({
            "PRISM": f"PRISM_{prism}",
            "gene": f'{gene}',
            "probe_name": f'PR_{prism}_{gene}_{cont}',
            "probe_seq": probe,
            "barcode_seq": barcode,
            "binding_seq": binding,
        })
    else:  # 'SPRINTseq' (validated above)
        probe_records.append({
            "SPRINTseq": f"SPRINTseq_{prism}",
            "gene": f'{gene}',
            "probe_name": f'Seq_{prism}_{gene}_{cont}',
            "probe": probe,
            "barcode": barcode,
            "binding": binding,
        })

# Build the table once from the collected rows (fresh 0..n-1 index).
probe_df = pd.DataFrame(probe_records, columns=probe_columns)
probe_df

In [None]:
# Show each gene present in the probe table once, one row per gene.
unique_genes = probe_df['gene'].unique()
pd.DataFrame(unique_genes)

# Save probe

In [None]:
# Write the assembled probe table into the run folder, named after the panel.
probe_path = DATASET_DIR / RUNID / f'{PANEL}_probe.xlsx'
probe_df.to_excel(probe_path)
print(len(probe_df))
probe_df.head()