# **Supplementary Code 1**
This notebook was used to design the library for pegRNA abundance screening. For more details, please refer to the Methods and Supplementary Information.  
For detailed information about the completed library, please refer to Supplementary Table 1.

Lead contact: Hyoungbum Henry Kim (hkim1@gmail.com)

Technical contact: Goosang Yu (gsyu93@gmail.com), Yusang Jung (ys.jung@yuhs.ac)

## Directory tree

📦Working directory  
 ┣ 📂input  
 ┃ ┗ 📜ABL1_lib_design_input_exon4.csv  
 ┃ ┗ 📜ABL1_lib_design_input_exon5.csv  
 ┃ ┗ 📜...  
 ┗ 📂output  
 ┃ ┗ 📂ABL1_exon4  
 ┃ ┃ ┗ 📜ABL1_ex4_pos1T_A_pe2max_293T.parquet  
 ┃ ┃ ┗ 📜ABL1_ex4_pos2T_A_pe2max_293T.parquet  
 ┃ ┃ ┗ 📜...  
 ┗ 📜SuppleCode1.ipynb (this file)  

### Step 1: Load reference sequence for input of DeepPrime
The pegRNA sequences for creating SNVs in the CDS of each exon of ABL1 and the predicted prime editing efficiency of each pegRNA were generated using DeepPrime. 

The reference sequences for each input used in DeepPrime are organized in the input directory (example: ABL1_lib_design_input_exon4.csv).

In [2]:
import os
import pandas as pd

# ABL1 exon to process; it selects both the input CSV and the output sub-directory below.
exon_num = 4

# Input: reference sequences for DeepPrime. Output: directory for the per-variant prediction files.
deepprime_input = f'input/ABL1_lib_design_input_exon{exon_num}.csv'
output_path     = f'output/ABL1_exon{exon_num}'
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs(output_path, exist_ok=True)

# Load reference sequence DataFrame
# The displayed table has the columns ID, seq, pos, frame, cds_start and cds_end.
# NOTE(review): it also shows an 'Unnamed: 0' column, which suggests the CSV was saved
# together with its index - presumably harmless, confirm.
df_input = pd.read_csv(deepprime_input)

# Bare last expression: renders the DataFrame as the cell output.
df_input

Unnamed: 0,ID,seq,pos,frame,cds_start,cds_end
0,ABL1_ex4_pos1T_A,GCTCACGTGAGCTCTTTGAGCTTGCCTGTCTCTGTGGGCTGAAGGC...,1,1,60,121
1,ABL1_ex4_pos2T_A,CTCACGTGAGCTCTTTGAGCTTGCCTGTCTCTGTGGGCTGAAGGCT...,2,2,59,121
2,ABL1_ex4_pos3C_A,TCACGTGAGCTCTTTGAGCTTGCCTGTCTCTGTGGGCTGAAGGCTG...,3,0,58,121
3,ABL1_ex4_pos5G_A,ACGTGAGCTCTTTGAGCTTGCCTGTCTCTGTGGGCTGAAGGCTGTT...,5,2,56,121
4,ABL1_ex4_pos6C_A,CGTGAGCTCTTTGAGCTTGCCTGTCTCTGTGGGCTGAAGGCTGTTC...,6,0,55,121
...,...,...,...,...,...,...
844,ABL1_ex4_pos278G_T,GGAGGTGTACGAGGGCGTGTGGAAGAAATACAGCCTGACGGTGGCC...,278,2,0,65
845,ABL1_ex4_pos279G_T,GAGGTGTACGAGGGCGTGTGGAAGAAATACAGCCTGACGGTGGCCG...,279,0,0,64
846,ABL1_ex4_pos281A_T,GGTGTACGAGGGCGTGTGGAAGAAATACAGCCTGACGGTGGCCGTG...,281,2,0,62
847,ABL1_ex4_pos282G_T,GTGTACGAGGGCGTGTGGAAGAAATACAGCCTGACGGTGGCCGTGA...,282,0,0,61


In [3]:
# Every row carries a variant ID and a DeepPrime input sequence in which the
# edit is written inline as (WT/Edited), e.g. ...(T/A)...
example_data = df_input.loc[0]

print(f'Sample ID: {example_data.ID}')
print(f'Input seq: {example_data.seq}')

Sample ID: ABL1_ex4_pos1T_A
Input seq: GCTCACGTGAGCTCTTTGAGCTTGCCTGTCTCTGTGGGCTGAAGGCTGTTCCCTGTTTCC(T/A)TTCAGCTCTACGTCTCCTCCGAGAGCCGCTTCAACACCCTGGCCGAGTTGGTTCATCATCA


### Step 2: Design pegRNAs and get DeepPrime score
Using the DeepPrime-FT model (Yu et al., Cell, 2023), we designed all possible pegRNAs and obtained the predicted prime editing score of each pegRNA for every SNV, as shown below.  

To run DeepPrime, we used the Python package 'GenET' (version 0.14.x or higher). 

In [6]:
from genet.predict import DeepPrime
from tqdm import tqdm

# Design pegRNAs and predict their efficiency for every variant in df_input,
# writing one parquet file per variant into output_path.
#
# The ID / seq columns are iterated directly instead of looking rows up with
# `.iloc[label]`: mixing index labels with positional access only works while
# df_input carries a default 0..n-1 index and breaks (or silently picks the
# wrong row) as soon as the frame is filtered or re-indexed.
for sample_id, sequence in tqdm(zip(df_input['ID'], df_input['seq']),
                                total=len(df_input),
                                desc=f'DeepPrime-ABL1 exon{exon_num}',
                                ncols=100, ascii = ' =', leave=True):

    # pbs_min / pbs_max are passed through to DeepPrime as given
    # (presumably the PBS length range to enumerate - see the GenET documentation).
    pegrna = DeepPrime(sequence, name=sample_id, pbs_min=6, pbs_max=17)
    pe2max = pegrna.predict(pe_system='PE2max', cell_type='HEK293T')

    # One result table per variant, named after the variant ID.
    pe2max.to_parquet(f'{output_path}/{sample_id}_pe2max_293T.parquet', index=False)

DeepPrime-ABL1 exon4:   0%|                                                 | 0/849 [00:00<?, ?it/s]

DeepPrime-ABL1 exon4:   1%|                                      | 10/849 [01:05<1:31:48,  6.57s/it]


### Step 3: Select best pegRNAs for screening
Based on the DeepPrime-FT score, we selected 6 pegRNAs for each variant as shown below. Afterwards, we chose only 3 pegRNAs per variant for the final library list.

In [None]:
from glob import glob


def reverse_complement(sSeq):
    """Return the reverse complement of a nucleotide sequence.

    Upper- and lower-case A/C/G/T are complemented with their case preserved.
    'N', 'n', 'U', '.' and '*' are passed through unchanged, so the result
    always has the same length as the input.

    Args:
        sSeq: Sequence to convert (a string or any iterable of single characters).

    Returns:
        str: The complemented sequence in reversed order.

    Raises:
        KeyError: If the sequence contains a character that is not in the
            mapping table.
    """
    # 'n' maps to 'n' (it was previously mapped to '', which silently deleted
    # lower-case n and shortened the sequence). 'U' is kept as a pass-through,
    # exactly as in the original table.
    dict_sBases = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'U': 'U', 'n': 'n',
                   '.': '.', '*': '*', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}
    # Complement base by base, then reverse the joined string.
    return ''.join(dict_sBases[sBase] for sBase in sSeq)[::-1]

# def END: reverse_complement

def make_top_pegrna_list(file_dir, pe_sys, n_targets=3, n_per_target=2):
    """Select the best-scoring pegRNAs from every DeepPrime result file.

    Each parquet file matching ``<file_dir>/<pe_sys>/results/*.parquet`` is
    processed independently:

    1. Rows are sorted by ``DeepPrime_score`` (highest first).
    2. Rows with a BsmBI recognition site (GAGACG / CGTCTC, case-insensitive)
       in ``Edited74_On`` are dropped, as are rows with a site in ``gN19``,
       i.e. characters 5-23 of ``WT74_On`` (described in the original notebook
       as the spacer region). ``gN19`` is kept as a column in the output.
    3. The first ``n_targets`` distinct ``WT74_On`` values, in score order, are
       taken and the ``n_per_target`` best rows of each are kept. With the
       defaults this gives up to 3 x 2 = 6 pegRNAs per file.
    4. A ``rank`` column ranks the kept rows of that file by score
       (1 = best; ties share an averaged rank).

    Args:
        file_dir: Directory that contains the ``<pe_sys>/results`` folder.
        pe_sys: Name of the prime editor sub-directory (e.g. ``'PE2max'``).
        n_targets: Number of distinct ``WT74_On`` targets to keep per file.
        n_per_target: Number of pegRNAs to keep for each kept target.

    Returns:
        pandas.DataFrame: The selected rows of all files, concatenated in
        sorted file-name order. Empty when no file matched or every pegRNA
        was filtered out.

    NOTE(review): Step 2 of this notebook writes its parquet files directly to
    ``output/ABL1_exon<N>/``, not to ``<file_dir>/<pe_sys>/results/`` - confirm
    the files are placed where this function looks for them.
    """
    import warnings

    bsmbi_sites = 'GAGACG|CGTCTC'  # BsmBI recognition site on either strand

    ## Gather all parquet files containing DeepPrime scores in the target directory.
    ## Sorted so that the row order of the result is reproducible.
    pattern = '%s/%s/results/*.parquet' % (file_dir, pe_sys)
    file_list = sorted(glob(pattern))

    if not file_list:
        # Without this, a wrong path would only show up as a silently empty output.
        warnings.warn('No parquet files found for pattern: %s' % pattern)

    list_df_selected = []

    for f in file_list:

        ## 1. Best-scoring pegRNAs first.
        df = pd.read_parquet(f).sort_values('DeepPrime_score', ascending=False)

        ## 2a. Remove entries with BsmBI sites in Edited74_On.
        site_in_edited = df['Edited74_On'].str.contains(bsmbi_sites, na=False, case=False)
        df = df[~site_in_edited]

        ## 2b. Remove entries with BsmBI sites in the spacer region.
        ## `.str[5:24]` slices every string; a plain `[5:24]` would slice ROWS of
        ## the Series and leave almost all pegRNAs unchecked.
        df = df.assign(gN19=df['WT74_On'].str[5:24])
        site_in_spacer = df['gN19'].str.contains(bsmbi_sites, na=False, case=False)
        df = df[~site_in_spacer]

        if df.empty:
            # Everything was filtered out; pd.concat([]) would raise ValueError.
            continue

        ## 3. unique() keeps first-appearance order, so these are the targets
        ## holding the highest-scoring pegRNAs.
        top_targets = df['WT74_On'].unique()[:n_targets]
        list_df_temp = [df[df['WT74_On'] == trgt].head(n_per_target) for trgt in top_targets]

        ## 4. Rank the selected pegRNAs of this file by score.
        df_temp = pd.concat(list_df_temp)
        df_temp['rank'] = df_temp['DeepPrime_score'].rank(ascending=False)

        list_df_selected.append(df_temp)

    if not list_df_selected:
        return pd.DataFrame()

    # Single concat at the end instead of growing a DataFrame inside the loop.
    return pd.concat(list_df_selected)


In [None]:
### Run ###

# Sub-directories of <working directory>/output to scan, and the prime editor
# result folders to read inside each of them.
# NOTE(review): Step 2 of this notebook writes to output/ABL1_exon<N>/, whereas
# the paths built here point to output/ABL1_ex<N>/<PE system>/results/ -
# confirm the parquet files are in place before running this cell.
target_list = ['ABL1_ex4', 'ABL1_ex5', 'ABL1_ex6', 'ABL1_ex7', 'ABL1_ex8', 'ABL1_ex9']
pe_sys = ['PE2max']
base_dir = os.getcwd()

for pe in pe_sys:
    # One table of selected pegRNAs per exon, in target_list order.
    list_out = [make_top_pegrna_list(f'{base_dir}/output/{gene}', pe) for gene in target_list]

    # Merge all exons and write a single CSV per prime editor system.
    df = pd.concat(list_out)
    df.to_csv(f'{base_dir}/ABL1_top_pegRNAs_{pe}_220802_BsmBI_filtered_Top6.csv', index=False)
