# **Supplementary Code 4**
This notebook was used to analyze NGS reads containing both the intended prime edit and the synonymous mutation marker. For more details, please see the Methods and Supplementary Information.

Lead contact: Hyoungbum Henry Kim (hkim1@gmail.com)

Technical contact: Goosang Yu (gsyu93@gmail.com), Yusang Jung (ys.jung@yuhs.ac)

## Directory tree

📦Working directory  
 ┣ 📂data  
 ┃ ┣ 📂NGS_FASTQ_files  
 ┃ ┣ 📂NGS_frequency_table  
 ┃ ┃ ┣ 📜C4Bosutinib791.txt  
 ┃ ┃ ┣ 📜C4Control797.txt  
 ┃ ┃ ┗ 📜...  
 ┃ ┣ 📂read_counts  
 ┃ ┣ 📂statistics  
 ┃  
 ┣ 📂src  
 ┃ ┣ 📜Alignment.py  
 ┃ ┣ 📜VarCalling.py  
 ┃  
 ┣ 📂variants_info  
 ┃ ┣ 📜ex4_info.csv  
 ┃ ┣ 📜ex5_info.csv  
 ┃ ┣ 📜ex6_info.csv  
 ┃ ┣ 📜ex7_info.csv  
 ┃ ┣ 📜ex8_info.csv  
 ┃ ┣ 📜ex9_info.csv  
 ┃ ┣ 📜invivo_ex4_info.csv  
 ┃ ┗ 📜invivo_ex9_info.csv  
 ┃  
 ┗ 📜SuppleCode4.ipynb (this file)  

## Requirements
- CRISPResso2 (>= 2.x.x)
- pandas
- tqdm

## Variant calling and read count file generation
After running CRISPResso2, generate a read count file for each sample from its frequency table. These read count files are the foundation for all downstream analyses.

In [1]:
import os
import re
from glob import glob

import pandas as pd
from tqdm import tqdm

from src.Alignment import ABL1VUS
from src.VarCalling import make_count_file, read_statistics, combine_data, VariantFilter, VariantScore, Normalizer

In [3]:
# Make count files from frequency table
#
# For every HTS2 frequency table, pick the variant reference table of the exon
# named in the file name and write one read-count CSV per sample.
# NOTE(review): the directory tree at the top of this notebook lists
# `data/NGS_frequency_table`, while this cell reads `data/frequency_table` —
# confirm which folder name is current.

# Make sure the output folder exists; to_csv does not create missing folders.
os.makedirs('data/read_counts', exist_ok=True)

# sorted() gives a reproducible processing order; glob() does not guarantee one.
freq_tables = sorted(glob('data/frequency_table/*HTS2*.txt'))

for f in freq_tables:

    n_sample = os.path.basename(f).replace('.txt', '')

    # Capture the whole exon number so that e.g. "Exon10" maps to
    # ex10_info.csv instead of being truncated to ex1_info.csv.
    exon_match = re.search(r'Exon(\d+)', n_sample)
    if exon_match is None:
        raise ValueError(f'Cannot find an exon number ("Exon<N>") in sample name: {n_sample}')

    exon_num = exon_match.group(1)
    ref_info = f'variants_info/ex{exon_num}_info.csv'

    df_cnt = make_count_file(f, ref_info)
    df_cnt.to_csv(f'data/read_counts/Count_{n_sample}.csv', index=False)

[Info] Read counting: K562PE4K_HTS2Dose0.5xDay10_Exon5_Rep1_Asciminib: 100%|██████████| 139161/139161 [00:13<00:00, 10209.21it/s]
[Info] Read counting: K562PE4K_HTS2Dose0.5xDay10_Exon5_Rep1_Imatinib: 100%|██████████| 143326/143326 [00:14<00:00, 10022.73it/s]
[Info] Read counting: K562PE4K_HTS2Dose0.5xDay10_Exon5_Rep2_Asciminib: 100%|██████████| 210654/210654 [00:20<00:00, 10350.57it/s]
[Info] Read counting: K562PE4K_HTS2Dose0.5xDay10_Exon5_Rep2_Imatinib: 100%|██████████| 174170/174170 [00:16<00:00, 10468.49it/s]
[Info] Read counting: K562PE4K_HTS2Dose1xDay10_Exon5_Rep1_Asciminib: 100%|██████████| 130389/130389 [00:13<00:00, 10009.01it/s]
[Info] Read counting: K562PE4K_HTS2Dose1xDay10_Exon5_Rep1_Bosutinib: 100%|██████████| 145079/145079 [00:14<00:00, 10106.44it/s]
[Info] Read counting: K562PE4K_HTS2Dose1xDay10_Exon5_Rep1_Dasatinib: 100%|██████████| 146242/146242 [00:14<00:00, 10209.32it/s]
[Info] Read counting: K562PE4K_HTS2Dose1xDay10_Exon5_Rep1_Imatinib: 100%|██████████| 153268/153268

## Significance analysis
Compare the prime-edited sample with the unedited sample to calculate the odds ratio and Fisher's exact test p-value for each variant.

In [2]:
# Calculate per-variant statistics from the read counts of each sample

# DMSO-treated samples that are compared against the unedited background.
list_HTS23 = [
    # HTS2
    'K562PE4K_HTS2DoseControlDay10_Exon5_Rep1_DMSO',
    'K562PE4K_HTS2DoseControlDay10_Exon5_Rep2_DMSO',

    # HTS3
    'K562PE4K_HTS3DoseControlDay4_Exon5_Rep1_DMSO',
    'K562PE4K_HTS3DoseControlDay4_Exon5_Rep2_DMSO',
    'K562PE4K_HTS3DoseControlDay6_Exon5_Rep1_DMSO',
    'K562PE4K_HTS3DoseControlDay6_Exon5_Rep2_DMSO',
    'K562PE4K_HTS3DoseControlDay10_Exon5_Rep1_DMSO',
    'K562PE4K_HTS3DoseControlDay10_Exon5_Rep2_DMSO',
]

# Every sample is tested against the same unedited exon 5 read counts.
background_file = 'data/read_counts/Count_K562PE4K_unedit_Exon5.csv'

for sample in list_HTS23:
    test_file = f'data/read_counts/Count_{sample}.csv'
    df_stats = read_statistics(test_file, background_file)
    df_stats.to_csv(f'data/statistics/Stat_{sample}.csv', index=False)

Analysis: Count_K562PE4K_HTS2DoseControlDay10_Exon5_Rep1_DMSO
Analysis: Count_K562PE4K_HTS2DoseControlDay10_Exon5_Rep2_DMSO
Analysis: Count_K562PE4K_HTS3DoseControlDay4_Exon5_Rep1_DMSO
Analysis: Count_K562PE4K_HTS3DoseControlDay4_Exon5_Rep2_DMSO
Analysis: Count_K562PE4K_HTS3DoseControlDay6_Exon5_Rep1_DMSO
Analysis: Count_K562PE4K_HTS3DoseControlDay6_Exon5_Rep2_DMSO
Analysis: Count_K562PE4K_HTS3DoseControlDay10_Exon5_Rep1_DMSO
Analysis: Count_K562PE4K_HTS3DoseControlDay10_Exon5_Rep2_DMSO


## DMSO vs TKI response analysis
Analysis for resistance to drugs

- Test: Variants were generated by prime editing for 20 days, followed by 10 days of TKI treatment.
- Control: Variants were generated by prime editing for 20 days, followed by 10 days of DMSO treatment.

In [2]:
from data.sample_pair import dict_HTS2, dict_HTS3

In [3]:
# For HTS2 sample
#
# For each test/control sample pair: filter the test read counts against the
# control statistics, LOWESS-normalize both replicates, then compute the
# adjusted LFC (var_type='SNV') and the resistance score (var_type='AA').

# Variant filter cutoffs and the LOWESS smoothing fraction used for every pair.
OR_cutoff  = 2
p_cutoff   = 0.05
rpm_cutoff = 10
lws_frac   = 0.15

# The result folders are not listed in the notebook's directory tree, and
# DataFrame.to_csv does not create missing folders, so create them up front.
os.makedirs('data/adjusted_LFC', exist_ok=True)
os.makedirs('data/resistance_score', exist_ok=True)

for sample, data in dict_HTS2.items():

    test_1, test_2 = data['test']
    cont_1, cont_2 = data['cont']

    vf = VariantFilter(
        test_r1    = f'data/read_counts/Count_{test_1}.csv',
        test_r2    = f'data/read_counts/Count_{test_2}.csv',
        control_r1 = f'data/statistics/Stat_{cont_1}.csv',
        control_r2 = f'data/statistics/Stat_{cont_2}.csv',
    )

    df_rep1, df_rep2 = vf.filter(OR_cutoff=OR_cutoff, p_cutoff=p_cutoff, rpm_cutoff=rpm_cutoff)

    normal = Normalizer()

    # LOWESS regression normalization
    df_nor1 = normal.lowess(df_rep1, frac=lws_frac)
    df_nor2 = normal.lowess(df_rep2, frac=lws_frac)

    # The normalized replicates are saved first because the scoring step
    # below is given their file paths rather than the DataFrames.
    normalized_rep_1 = f'data/statistics/Filtered_{test_1}.csv'
    normalized_rep_2 = f'data/statistics/Filtered_{test_2}.csv'

    df_nor1.to_csv(normalized_rep_1, index=False)
    df_nor2.to_csv(normalized_rep_2, index=False)

    # Create the score calculator
    score = VariantScore()

    adjus_LFC = score.calculate(normalized_rep_1, normalized_rep_2, var_type='SNV')
    res_score = score.calculate(normalized_rep_1, normalized_rep_2, var_type='AA')

    # Both replicates share one output file, so drop the replicate tag from the name.
    n_sample = test_1.replace('Rep1_', '')

    adjus_LFC.to_csv(f'data/adjusted_LFC/AdjustedLFC_{n_sample}.csv')
    res_score.to_csv(f'data/resistance_score/ResistanceScore_{n_sample}.csv')

In [4]:
# For HTS3 sample
#
# For each test/control sample pair: filter the test read counts against the
# control statistics, LOWESS-normalize both replicates, then compute the
# adjusted LFC (var_type='SNV') and the resistance score (var_type='AA').

# Variant filter cutoffs and the LOWESS smoothing fraction used for every pair.
OR_cutoff  = 2
p_cutoff   = 0.05
rpm_cutoff = 10
lws_frac   = 0.15

# The result folders are not listed in the notebook's directory tree, and
# DataFrame.to_csv does not create missing folders, so create them up front.
os.makedirs('data/adjusted_LFC', exist_ok=True)
os.makedirs('data/resistance_score', exist_ok=True)

for sample, data in dict_HTS3.items():

    test_1, test_2 = data['test']
    cont_1, cont_2 = data['cont']

    vf = VariantFilter(
        test_r1    = f'data/read_counts/Count_{test_1}.csv',
        test_r2    = f'data/read_counts/Count_{test_2}.csv',
        control_r1 = f'data/statistics/Stat_{cont_1}.csv',
        control_r2 = f'data/statistics/Stat_{cont_2}.csv',
    )

    df_rep1, df_rep2 = vf.filter(OR_cutoff=OR_cutoff, p_cutoff=p_cutoff, rpm_cutoff=rpm_cutoff)

    normal = Normalizer()

    # LOWESS regression normalization
    df_nor1 = normal.lowess(df_rep1, frac=lws_frac)
    df_nor2 = normal.lowess(df_rep2, frac=lws_frac)

    # The normalized replicates are saved first because the scoring step
    # below is given their file paths rather than the DataFrames.
    normalized_rep_1 = f'data/statistics/Filtered_{test_1}.csv'
    normalized_rep_2 = f'data/statistics/Filtered_{test_2}.csv'

    df_nor1.to_csv(normalized_rep_1, index=False)
    df_nor2.to_csv(normalized_rep_2, index=False)

    # Create the score calculator
    score = VariantScore()

    adjus_LFC = score.calculate(normalized_rep_1, normalized_rep_2, var_type='SNV')
    res_score = score.calculate(normalized_rep_1, normalized_rep_2, var_type='AA')

    # Both replicates share one output file, so drop the replicate tag from the name.
    n_sample = test_1.replace('Rep1_', '')

    adjus_LFC.to_csv(f'data/adjusted_LFC/AdjustedLFC_{n_sample}.csv')
    res_score.to_csv(f'data/resistance_score/ResistanceScore_{n_sample}.csv')