# **Supplementary Code 1**
This notebook was used to design the library for pegRNA abundance screening. For more details, please read the Methods and Supplementary Information. 

Lead contact: Hyoungbum Henry Kim (hkim1@gmail.com)

Technical contact: Goosang Yu (gsyu93@gmail.com), Yusang Jung (ys.jung@yuhs.ac)

## Directory tree

📦Working directory  
 ┣ 📂input  
 ┃ ┗ 📜ABL1_ex4.csv  
 ┃ ┗ 📜ABL1_ex5.csv  
 ┃ ┗ 📜...  
 ┗ 📂output  
 ┃ ┗ 📂ABL1_ex4  
 ┃ ┃ ┗ 📜ABL1_ex4_pos1T_A_output.parquet  
 ┃ ┃ ┗ 📜ABL1_ex4_pos1T_C_output.parquet  
 ┃ ┃ ┗ 📜...  
 ┃ ┗ 📂ABL1_ex5  
 ┃ ┃ ┗ 📜ABL1_ex5_pos1T_A_output.parquet  
 ┃ ┃ ┗ 📜ABL1_ex5_pos1T_C_output.parquet  
 ┃ ┃ ┗ 📜...  
 ┗ 📜SuppleCode1.ipynb (this file)  

### Step 1: Make reference sequence pair
Each reference sequence pair is composed of the unedited and the edited (SNV-containing) sequence for a variant. This file was generated in Excel. For an example, please check Supplementary Table XX.

### Step 2: Design pegRNAs and get DeepPrime score
Using the DeepPrime-FT model (Yu et al., Cell, 2023), we designed all available pegRNAs and obtained predicted prime editing scores for each SNV as shown below.  

To run DeepPrime, we used the Python package 'GenET', version 0.8x or higher. 

In [None]:
import pandas as pd
import os, sys
from glob import glob
from genet import predict as prd

# Set input / output file name
sample_tag = 'ABL1_ex4'

OUT_DIR    = './output/%s' % sample_tag
# makedirs(exist_ok=True) also creates a missing './output' parent and does not
# fail when this cell is re-run for a sample whose directory already exists.
os.makedirs(OUT_DIR, exist_ok=True)

# load reference sequence pair file described in 'Step 1'
# The loop below reads the columns 'ID', 'Ref' and 'Edit' from this table.
df_input = pd.read_csv('./input/%s.csv' % sample_tag)

# Edit type handed to prd.pe_score for every variant; it is the same for all
# rows, so it is set once instead of on each iteration.
alt_type = 'sub1'

# iterrows() yields (index label, row) pairs, so the row lookup no longer
# depends on the index labels also being valid positional offsets.
for idx, info in df_input.iterrows():

    print('Index: %s' % idx)

    sID    = info.ID
    seq_wt = info.Ref
    seq_ed = info.Edit

    print('Variant ID: ', sID)
    df_pe_score = prd.pe_score(seq_wt, seq_ed, alt_type, sID=sID, pbs_min=6, pbs_max=17, pe_system='PE2max', cell_type='DLD1')

    # One parquet file per variant: <OUT_DIR>/<variant ID>_output.parquet
    df_pe_score.to_parquet('%s/%s_output.parquet' % (OUT_DIR, sID))

### Step 3: Select best pegRNAs for screening
Based on the DeepPrime-FT score, we selected 6 pegRNAs for each variant as shown below. We then chose only 3 pegRNAs per variant for the final library list.

In [None]:
def reverse_complement(sSeq):
    """Return the reverse complement of a nucleotide sequence.

    Each symbol of ``sSeq`` is replaced through a fixed lookup table and the
    resulting string is then reversed. Upper- and lower-case A/C/G/T are
    swapped with their partner base in the same case; 'N', 'U', '.' and '*'
    map to themselves; lower-case 'n' maps to the empty string and is
    therefore dropped from the output.

    Raises:
        KeyError: if ``sSeq`` contains a symbol that is not in the table.
    """
    # NOTE(review): 'n' -> '' (base removed) and 'U' -> 'U' (not 'A') look
    # unintentional next to 'N' -> 'N'; kept as-is to preserve behaviour.
    complement_of = {
        'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
        'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
        'N': 'N', 'n': '', 'U': 'U',
        '.': '.', '*': '*',
    }
    # Complement in forward order first, then flip the joined string.
    complemented = ''.join(map(complement_of.__getitem__, sSeq))
    return complemented[::-1]

# def END: reverse_complement

def make_top_pegrna_list(file_dir, pe_sys):
    """Pick the best-scoring, BsmBI-free pegRNAs from every result file.

    For each parquet file the rows are sorted by 'DeepPrime_score'
    (descending), candidates containing a BsmBI recognition site are removed,
    and then the 2 best rows of each of the 3 best-scoring distinct
    'WT74_On' targets are kept (at most 6 rows per file).

    Args:
        file_dir: Directory holding the DeepPrime output of one sample.
            Files are looked up in '<file_dir>/<pe_sys>/results/*.parquet';
            if nothing is found there, '<file_dir>/*.parquet' is used.
        pe_sys: Name of the prime-editing system sub-directory (e.g. 'PE2max').

    Returns:
        pandas.DataFrame with the selected rows of all files. Besides the
        input columns it has 'gN19' (characters 5..23 of 'WT74_On') and
        'rank' (rank of 'DeepPrime_score' within the rows selected from the
        same file, 1 = best). An empty DataFrame is returned when no file is
        found or no candidate survives the filters.
    """
    # BsmBI recognition site and its reverse complement.
    bsmbi_sites = ('GAGACG', 'CGTCTC')
    n_targets = 3       # number of distinct targets kept per file
    n_per_target = 2    # number of pegRNAs kept per target

    ## Collect every parquet file of the target directory. Sorted so that the
    ## row order of the output does not depend on the file-system order.
    file_list = sorted(glob('%s/%s/results/*.parquet' % (file_dir, pe_sys)))
    if not file_list:
        # Step 2 of this notebook writes its parquet files directly into the
        # sample directory, so fall back to that layout.
        file_list = sorted(glob('%s/*.parquet' % file_dir))

    list_df_out = []

    for f in file_list:

        ## 1. Sort by DeepPrime score, best first.
        df = pd.read_parquet(f)
        if df.empty:
            continue
        df = df.sort_values('DeepPrime_score', ascending=False)

        ## Spacer part of the WT target as its own column. `.str[5:24]`
        ## slices every string; a plain `[5:24]` on the Series would slice
        ## rows 5..23 instead and leave the spacer filter ineffective.
        df['gN19'] = df['WT74_On'].str[5:24]

        ## Remove pegRNAs with a BsmBI site in the RT-PBS (searched in the
        ## edited sequence, in both orientations) or in the spacer.
        keep = pd.Series(True, index=df.index)
        for site in bsmbi_sites:
            keep &= ~df['Edited74_On'].str.contains(site, na=False, case=False)
            keep &= ~df['gN19'].str.contains(site, na=False, case=False)
        df = df[keep]

        ## 2. Distinct targets in order of first appearance; because the rows
        ## are sorted, the target with the best-scoring pegRNA comes first.
        top_targets = df['WT74_On'].dropna().unique()[:n_targets]
        if len(top_targets) == 0:
            # Every candidate was filtered out; pd.concat([]) would raise.
            continue

        ## 3./4. For each of the top targets keep its best pegRNAs.
        grouped_df = df.groupby('WT74_On', sort=False)
        df_temp = pd.concat(
            [grouped_df.get_group(trgt).head(n_per_target) for trgt in top_targets]
        )

        ## 5. Rank the selected pegRNAs of this file by score.
        df_temp['rank'] = df_temp['DeepPrime_score'].rank(ascending=False)
        list_df_out.append(df_temp)

    if not list_df_out:
        return pd.DataFrame()

    # Single concat at the end instead of growing a DataFrame inside the loop.
    return pd.concat(list_df_out)


In [None]:
### Run ###

# Samples whose Step 2 output is processed; each has a directory
# '<working directory>/output/<sample>'.
target_list = ['ABL1_ex4', 'ABL1_ex5', 'ABL1_ex6', 'ABL1_ex7', 'ABL1_ex8', 'ABL1_ex9']
pe_sys = ['PE2max']
base_dir = os.getcwd()


for pe in pe_sys:
    # One table of selected pegRNAs per sample, in target_list order.
    list_out = [
        make_top_pegrna_list('%s/output/%s' % (base_dir, gene), pe)
        for gene in target_list
    ]

    # Stack all samples and write a single CSV per prime-editing system.
    df = pd.concat(list_out)
    out_csv = '%s/ABL1_top_pegRNAs_%s_220802_BsmBI_filtered_Top6.csv' % (base_dir, pe)
    df.to_csv(out_csv, index=False)
