# SupplementaryCode2
#### pegRNA_abundance_screening_analysis

CML VUS screening 결과 중, pegRNA abundance screening의 NGS data를 분석하기 위해 수행한 preprocessing용 Python script. For more details, please see the Methods and Supplementary Information. 

Lead contact: Hyoungbum Henry Kim (hkim1@gmail.com)

Technical contact: Goosang Yu (gsyu93@gmail.com), Yusang Jung (ys.jung@yuhs.ac)

### Requirements
아래의 pipeline은 linux 환경에서 사용하는 것을 전제로 제작되었음. Ubuntu 22.04 LTS 또는 CentOS7에서 테스트 확인되었음.  
아래의 pipeline을 수행하기 위해서, 다음의 software들이 설치되어 있어야 함. 
- python (>= 3.8)
- genet (>= 0.14)
- cutadapt
- seqkit
- MAGeCK

## Directory tree
아래의 pipeline은 다음의 directory 구조로 구성되어 있음을 전제로 제작되었음.

📦Working directory  
 ┣ 📂0_barcode_info  
 ┃ ┗ 📜epegRNA_lib_reference.csv  
 ┣ 📂1_raw_data  
 ┃ ┣ 📜epeg_DMSO_A_merged_2.fq.gz  
 ┃ ┣ 📜epeg_DMSO_B_merged_2.fq.gz  
 ┃ ┣ 📜epeg_Imatinib_A_merged_2.fq.gz  
 ┃ ┗ 📜epeg_Imatinib_B_merged_2.fq.gz  
 ┣ 📂2_processed  
 ┣ 📂3_results  
 ┣ 📂4_mageck  
 ┣ 📂src  
 ┃ ┗ 📜preprocessing.py  
 ┗ 📜SuppleCodeXX.ipynb (this file)  
 

## Step 0: Import packages

In [None]:
import subprocess, sys, os, time, glob, gzip
import pandas as pd
import numpy as np

from tqdm import tqdm
from Bio import SeqIO
from datetime import datetime
from genet.analysis import ReadDeduplicator
from src.preprocessing import Preprocess, make_df_umi, MAGeCKanalyzer

## Step 1: Preprocessing NGS data

In [None]:
# Raw merged FASTQ inputs: two conditions (DMSO, Imatinib) x two replicates
# (A, B), listed condition-first.
# NOTE(review): this is an absolute, machine-specific directory — adjust
# raw_data_dir when running the notebook elsewhere.
raw_data_dir = "/extdata2/CML_vus/10_NBT_revision/1_raw_data"

list_files = [
    f"{raw_data_dir}/epeg_{condition}_{replicate}_merged_2.fq.gz"
    for condition in ("DMSO", "Imatinib")
    for replicate in ("A", "B")
]

def current_time():
    """Return the current local time as a 'YYYY-MM-DD HH:MM:SS' string."""
    now = datetime.now()
    return f'{now:%Y-%m-%d %H:%M:%S}'

def worker(seq_path:str):
    """Run the preprocessing steps on one sequencing file and log progress.

    Progress is printed to stdout and appended to ``preprocess.log`` in the
    current working directory. Every log entry is written as one timestamped
    line (the original Step 0 entry lacked its newline and ran into the
    Step 1 entry).

    The steps, in order, call ``Preprocess.to_fasta``, ``trim``, ``revcom``,
    ``trim`` and ``finalize``; the final output is saved under
    ``2_processed``.

    Args:
        seq_path: Path of the input read file, handed to ``Preprocess`` with
            ``data_format='fq.gz'``.
    """

    with open('preprocess.log', 'a+') as log_file:

        def log(message):
            # Single place that formats log entries, so none can miss the
            # timestamp prefix or the terminating newline.
            log_file.write(f'{current_time()} {message}\n')

        def run_step(title, action):
            # Announce the step, run it, then report the path it returned.
            print(title)
            log(title)
            path = action()
            print(f'[Done] save - {path}\n')
            log(f'[Done] save - {path}')

        print(current_time())
        print('Step 0: Load data and initiation\n')
        log('Step 0: Load data and initiation')
        data_pp = Preprocess(data_path=seq_path, data_format='fq.gz')

        run_step('Step 1: to_fasta',
                 lambda: data_pp.to_fasta(gzip=True))

        # Exact match required (error=0) for this finder sequence.
        run_step('Step 2: trim tevopreQ1 seq',
                 lambda: data_pp.trim(finder='AAAAAATTCTAG', error=0))

        run_step('Step 3: revcom seq',
                 lambda: data_pp.revcom())

        # error=1 here — presumably one mismatch is tolerated; confirm
        # against Preprocess.trim.
        run_step('Step 4: trim RP binding seq',
                 lambda: data_pp.trim(finder='CTACTCTACCACTTGT', error=1))

        run_step('Step 5: Finalize',
                 lambda: data_pp.finalize(save_path='2_processed'))

# Preprocess the input files one after another, in list order.
for fastq_path in list_files:
    worker(fastq_path)


## Step 2: UMI counting

In [None]:

# Sample IDs of the preprocessed files; each is read from
# 2_processed/<sample>.fa.gz.
list_sample_id = [
    'pp_epeg_DMSO_A_merged_2',
    'pp_epeg_DMSO_B_merged_2',
    'pp_epeg_Imatinib_A_merged_2',
    'pp_epeg_Imatinib_B_merged_2',
]

# NOTE(review): this table is read from the current working directory —
# confirm where the file is expected to live.
df_barcode = pd.read_csv('CML_VUS_Barcode_SNV_final.csv')

# Output layout: one row per (barcode, first base of the deduplicated UMI).
umi_start_bases = ('A', 'C', 'G', 'T')
dedup_columns = ['Barcode', 'UMI_dedup', 'count']


for sample in list_sample_id:

    fa = f'2_processed/{sample}.fa.gz'

    print('Parsing sequencing data...')
    df_umi = make_df_umi(list_barcode=list(df_barcode['Barcode for sorting (18nt)']),
                         data_path=fa,
                         len_umi=8,
                        )

    # Table before deduplication (written with the DataFrame index, as before).
    df_umi.to_csv(f'3_results/{sample}_UMI_duplicated.csv')

    # Group on a scalar key and iterate the groups directly: get_group() with
    # a scalar key on a length-1 list grouper is deprecated in recent pandas.
    # sort=False keeps barcodes in order of first appearance, matching the
    # order df_umi['Barcode'].unique() gave.
    umi_group = df_umi.groupby(by='Barcode', sort=False)

    rows = []

    for bc, umis_dupple in tqdm(umi_group, total=umi_group.ngroups,
                                desc='UMI deduplication',
                                ncols=70, ascii=' =', leave=True
                                ):

        # ReadDeduplicator comes from genet.analysis; threshold=1 is
        # presumably the UMI distance allowed when collapsing — confirm
        # against the genet documentation.
        dedup = ReadDeduplicator()
        final_umis, umi_counts = dedup(umis_dupple, threshold=1)

        # Sum the deduplicated counts by the first base of each UMI; UMIs
        # starting with anything other than A/C/G/T are left out.
        count_by_base = dict.fromkeys(umi_start_bases, 0)
        for umi, cnt in zip(final_umis, umi_counts):
            start_umi = umi[0]
            if start_umi in count_by_base:
                count_by_base[start_umi] += cnt

        rows.extend(
            (bc, f'start{base}', count_by_base[base])
            for base in umi_start_bases
        )

    # One DataFrame built from all rows; explicit columns keep the header
    # even when no barcode was found (pd.concat on an empty list would raise).
    df_out = pd.DataFrame(rows, columns=dedup_columns)

    df_out.to_csv(f'3_results/{sample}_UMI_dedup_ATGC_subgroup.csv',
                  index=False)
    


## Step 3: MAGeCK analysis

In [None]:
# Library reference table passed to MAGeCKanalyzer.setup
lib_reference = '0_barcode_info/epegRNA_lib_reference.csv'

# Step 1: for each replicate, combine the DMSO and Imatinib UMI tables
# produced by the UMI counting step into one MAGeCK input table.
print('Step1: Make mageck input file for each replicate')
for rep in ['A', 'B']:

    mageck = MAGeCKanalyzer()

    dmso_umi_path = f'3_results/pp_epeg_DMSO_{rep}_merged_2_UMI_dedup_ATGC_subgroup.csv'
    tki_umi_path  = f'3_results/pp_epeg_Imatinib_{rep}_merged_2_UMI_dedup_ATGC_subgroup.csv'

    df_count = mageck.setup(
        lib_reference=lib_reference,
        dmso_umi_path=dmso_umi_path,
        tki_umi_path=tki_umi_path,
    )

    df_count.to_csv(f'4_mageck/mageck_count_Imatinib_{rep}_AA_var.csv')

# Step 2: average the 'control' and 'test' columns of the two replicates.
# Both tables are indexed by 'Barcode-UMI', so the sums align on that index.
print('Step2: Make average RPM files')
df_a = pd.read_csv('4_mageck/mageck_count_Imatinib_A_AA_var.csv', index_col='Barcode-UMI')
df_b = pd.read_csv('4_mageck/mageck_count_Imatinib_B_AA_var.csv', index_col='Barcode-UMI')

# Start from replicate A so all its other columns carry over unchanged.
df_merge = df_a.copy()
for column in ('control', 'test'):
    df_merge[column] = (df_a[column] + df_b[column]) / 2

merged_count_path = '4_mageck/mageck_count_Imatinib_merge_AA_var.csv'
df_merge.to_csv(merged_count_path)


# Step 3: run MAGeCK on the replicate-averaged table.
print('Step3: Run MAGeCK')
mageck = MAGeCKanalyzer()

result = mageck.mageck(
    input_file=merged_count_path,
    name='Imatinib_revision',
    save_path='4_mageck',
)

