# Read patterns analysis

Screening NGS 결과를 보면, SynPrime이 정확히 일어난 read뿐만 아니라 전혀 다른 read pattern이 나타나는 경우도 있다. 이 문서는 이러한 read pattern을 분석할 때 사용한 코드를 정리한 것이다.

In [1]:
import os
import pandas as pd
import pygwalker as pyg
from src.VarCalling import ReadPatternAnalyzer
from tqdm import tqdm
from glob import glob

In [2]:
# Run the ReadPatternAnalyzer on every KCL frequency table and save each
# result as '<sample>_patterns.csv'.
rpa = ReadPatternAnalyzer()

# Sorted so the processing order is reproducible; glob() itself makes no
# ordering guarantee.
files = sorted(glob('data/frequency_table/*KCL*.txt'))

# Create the output directory up front so to_csv() cannot fail on a missing
# directory after rpa.run() has already done its work.
out_dir = 'data/read_patterns'
os.makedirs(out_dir, exist_ok=True)

for freq_table in tqdm(files,
                       total = len(files),
                       desc = 'Read Pattern Analyzer', ## text printed in front of the progress bar
                       ncols = 70,                     ## width of the progress output
                       ascii = ' =',                   ## bar characters; the first one must be a space to work
                      ):

    # Strip only the trailing extension; str.replace() would also remove a
    # '.txt' that happens to occur in the middle of the name.
    sample   = os.path.splitext(os.path.basename(freq_table))[0]
    # NOTE(review): only the first character after 'Exon' is used, so a
    # two-digit exon number would be truncated -- confirm exons are single-digit.
    exon_num = sample.split('Exon')[1][0]
    ref_info = f'variants_info/ex{exon_num}_info.csv'

    df_out = rpa.run(freq_table, ref_info)
    df_out.to_csv(f'{out_dir}/{sample}_patterns.csv', index=False)



In [2]:
# Load the pattern table of a single sample and display it for inspection.
df = pd.read_csv('data/read_patterns/K562PE4K_HTS_Exon5_Rep1_DMSO_patterns.csv')
df

Unnamed: 0,#Reads,%Reads,mut_type,mut_class
0,5394215,47.116323,WT_refseq,WT_refseq
1,530543,4.634082,Intended_only,Single_edit
2,157719,1.377613,sub1,sub1
3,149326,1.304303,SynPE,SynPE
4,127368,1.112509,SynPE,SynPE
...,...,...,...,...
243508,1,0.000009,sub5,sub5more
243509,1,0.000009,sub4,sub4
243510,1,0.000009,sub5,sub5more
243511,1,0.000009,sub4,sub4


In [4]:
# Pass the loaded pattern table to pygwalker's walk() for interactive exploration.
gwalger = pyg.walk(df)

Box(children=(HTML(value='<div id="ifr-pyg-000618104e77eb84tuIOWxy4JED1Tcwl" style="height: auto">\n    <head>…

In [2]:
# Build the summary table: one row per sample, one column per mut_class,
# each cell holding the summed `read_type` value of that class.

read_type = '%Reads' # '#Reads' or '%Reads'

# Sorted so the row order of the summary is reproducible; glob() itself
# makes no ordering guarantee.
files = sorted(glob('data/read_patterns/*unedit*'))
dict_summary = {}

for pattern_file in tqdm(files, total = len(files),
                       desc = 'Read Pattern Summary',  ## text printed in front of the progress bar
                       ncols = 100,                    ## width of the progress output
                       ascii = ' =',                   ## bar characters; the first one must be a space to work
                      ):

    df_pattern  = pd.read_csv(pattern_file)
    # Strip only the trailing extension from the file name.
    sample_name = os.path.splitext(os.path.basename(pattern_file))[0]

    # Per-class sum done by pandas instead of a row-by-row loop with a bare
    # `except:` (which would also swallow KeyboardInterrupt).
    # sort=False keeps the classes in order of first appearance;
    # dropna=False keeps rows whose mut_class is missing in the totals.
    class_totals = df_pattern.groupby('mut_class', sort=False, dropna=False)[read_type].sum()

    dict_summary[sample_name] = class_totals.to_dict()

df_summary = pd.DataFrame.from_dict(dict_summary, orient='index')
df_summary

Read Pattern Summary:   0%|                                                  | 0/26 [00:00<?, ?it/s]



Unnamed: 0,WT_refseq,sub1,Single_edit,ins,del,SynPE,sub2,sub5more,sub3,sub4,Complex,sub0
K562PE2_unedit_Exon5_patterns,18840658.0,255687,602032.0,14236,11871,138810.0,31295,9646,13367,13035,37.0,
K562PE4K_unedit_Exon4_patterns,16650729.0,351284,2412249.0,5106,57398,1814.0,431956,212753,150058,85691,33.0,
K562PE4K_unedit_Exon5_patterns,18798667.0,220120,553037.0,14412,13609,131098.0,24341,10101,11088,9448,38.0,
K562PE4K_unedit_Exon6_patterns,22405288.0,371982,2350896.0,2774,19153,91018.0,320240,132442,94178,50908,197.0,
K562PE4K_unedit_Exon7_patterns,17886596.0,248499,1903222.0,2469,26801,13396.0,246514,162258,79436,47835,142.0,
K562PE4K_unedit_Exon8_patterns,18545897.0,310245,1624535.0,1700,14522,5050.0,212515,154886,80240,50615,11.0,
K562PE4K_unedit_Exon9_patterns,12589496.0,148423,1822776.0,488,7818,4214.0,92713,3059,8667,2965,3.0,
K562PE4K_unedit_miniseq_Exon5_patterns,291969.0,3229,19452.0,199,751,990.0,2916,1244,1119,664,,
K562PE4K_unedit_miniseq_Exon6_patterns,225628.0,6100,40106.0,33,862,3064.0,9866,5986,4255,2366,8.0,
K562PE4K_unedit_miniseq_Exon7_patterns,193485.0,5560,51860.0,27,1123,246.0,17184,15540,8231,4974,12.0,


In [4]:
# Write the summary table (index included) to a CSV in the working directory.
# NOTE(review): the file name says 'percent' and 'KCLonly', but what is written
# depends on `read_type` and the glob pattern ('*unedit*') of the summary cell
# that produced df_summary -- confirm they match before saving.
df_summary.to_csv('reads_patterns_summary_percent_KCLonly.csv')

## Analysis of K562 WT cells (not expressing PE)

In [2]:
# Run the ReadPatternAnalyzer on every K562 WT (unedited) frequency table and
# save each result as '<sample>_patterns.csv'.
rpa = ReadPatternAnalyzer()

# Sorted so the processing order is reproducible; glob() itself makes no
# ordering guarantee.
files = sorted(glob('data/frequency_table_K562WT_unedit/*.txt'))

# Create the output directory up front so to_csv() cannot fail on a missing
# directory after rpa.run() has already done its work.
out_dir = 'data/read_patterns'
os.makedirs(out_dir, exist_ok=True)

for freq_table in tqdm(files,
                       total = len(files),
                       desc = 'Read Pattern Analyzer', ## text printed in front of the progress bar
                       ncols = 100,                    ## width of the progress output
                       ascii = ' =',                   ## bar characters; the first one must be a space to work
                      ):

    # Strip only the trailing extension; str.replace() would also remove a
    # '.txt' that happens to occur in the middle of the name.
    sample   = os.path.splitext(os.path.basename(freq_table))[0]
    # NOTE(review): only the first character after 'Exon' is used, so a
    # two-digit exon number would be truncated -- confirm exons are single-digit.
    exon_num = sample.split('Exon')[1][0]
    ref_info = f'variants_info/ex{exon_num}_info.csv'

    df_out = rpa.run(freq_table, ref_info)
    df_out.to_csv(f'{out_dir}/{sample}_patterns.csv', index=False)



In [3]:
# Build the summary table for the K562WT samples: one row per sample, one
# column per mut_class, each cell holding the summed '%Reads' of that class.

# Sorted so the row order of the summary is reproducible; glob() itself
# makes no ordering guarantee.
files = sorted(glob('data/read_patterns/K562WT*'))
dict_summary = {}

for pattern_file in tqdm(files, total = len(files),
                       desc = 'Read Pattern Summary',  ## text printed in front of the progress bar
                       ncols = 100,                    ## width of the progress output
                       ascii = ' =',                   ## bar characters; the first one must be a space to work
                      ):

    df_pattern  = pd.read_csv(pattern_file)
    # Strip only the trailing extension from the file name.
    sample_name = os.path.splitext(os.path.basename(pattern_file))[0]

    # Per-class sum done by pandas instead of a row-by-row loop with a bare
    # `except:` (which would also swallow KeyboardInterrupt).
    # sort=False keeps the classes in order of first appearance;
    # dropna=False keeps rows whose mut_class is missing in the totals.
    class_totals = df_pattern.groupby('mut_class', sort=False, dropna=False)['%Reads'].sum()

    dict_summary[sample_name] = class_totals.to_dict()

df_summary = pd.DataFrame.from_dict(dict_summary, orient='index')
df_summary



Unnamed: 0,WT_refseq,Single_edit,sub1,del,SynPE,ins,sub2,sub3,sub5more,sub4,Complex
K562WT_Exon5_unedit_patterns,92.064492,5.484306,1.098682,0.211824,0.381633,0.044115,0.483868,0.123943,0.036763,0.070024,0.00035
K562WT_Exon6_unedit_patterns,77.995248,12.85762,2.015104,0.265515,0.192157,0.006928,3.187609,1.29212,1.449024,0.738674,
K562WT_Exon7_unedit_patterns,78.008136,12.116159,1.761963,0.334452,0.066281,0.006792,3.222497,1.462408,2.112341,0.90897,
K562WT_Exon8_unedit_patterns,83.746087,9.364901,1.740722,0.222471,0.028917,0.005488,2.065775,0.946453,1.317942,0.561244,
K562WT_Exon9_unedit_patterns,78.844152,17.617041,1.438342,0.182988,0.017145,0.001247,1.577376,0.229125,0.027433,0.065152,
K562WT_unedit_Exon5_patterns,91.722892,5.440721,1.157977,0.095526,0.369475,0.061343,0.664063,0.218619,0.147657,0.12167,5.5e-05
K562WT_unedit_Exon6_patterns,83.282429,10.690078,1.818367,0.089361,0.211066,0.009905,1.947898,0.691292,0.876027,0.382956,0.000621
K562WT_unedit_Exon7_patterns,84.416692,9.474025,1.066441,0.13931,0.077045,0.011212,1.790215,0.82905,1.66825,0.527484,0.000276
K562WT_unedit_Exon8_patterns,84.419236,8.816763,1.244912,0.085084,0.02501,0.00708,1.95076,0.983005,1.831175,0.63696,1.5e-05
K562WT_unedit_Exon9_patterns,81.284878,16.567132,1.013227,0.046149,0.006112,0.002494,0.924194,0.09141,0.032625,0.031747,3.2e-05


In [4]:
# Write the K562WT summary table (index included) to a CSV in the working directory.
df_summary.to_csv('K562WT_ReadPatterns.csv')

## PCR cycle에 따른 분석

In [2]:
# Run the ReadPatternAnalyzer on every PCR-cycle frequency table and save each
# result as '<sample>_patterns.csv'.
rpa = ReadPatternAnalyzer()

# Sorted so the processing order is reproducible; glob() itself makes no
# ordering guarantee.
files = sorted(glob('data/frequency_table/PCR*.txt'))

# Create the output directory up front so to_csv() cannot fail on a missing
# directory after rpa.run() has already done its work.
out_dir = 'data/read_patterns'
os.makedirs(out_dir, exist_ok=True)

for freq_table in tqdm(files,
                       total = len(files),
                       desc = 'Read Pattern Analyzer', ## text printed in front of the progress bar
                       ncols = 100,                    ## width of the progress output
                       ascii = ' =',                   ## bar characters; the first one must be a space to work
                      ):

    # Strip only the trailing extension; str.replace() would also remove a
    # '.txt' that happens to occur in the middle of the name.
    sample   = os.path.splitext(os.path.basename(freq_table))[0]
    # NOTE(review): only the first character after 'Exon' is used, so a
    # two-digit exon number would be truncated -- confirm exons are single-digit.
    exon_num = sample.split('Exon')[1][0]
    ref_info = f'variants_info/ex{exon_num}_info.csv'

    df_out = rpa.run(freq_table, ref_info)
    df_out.to_csv(f'{out_dir}/{sample}_patterns.csv', index=False)



In [3]:
# Build the summary table for the PCR-cycle samples: one row per sample, one
# column per mut_class, each cell holding the summed '%Reads' of that class.

# Sorted so the row order of the summary is reproducible; glob() itself
# makes no ordering guarantee.
files = sorted(glob('data/read_patterns/PCR*'))
dict_summary = {}

for pattern_file in tqdm(files, total = len(files),
                       desc = 'Read Pattern Summary',  ## text printed in front of the progress bar
                       ncols = 100,                    ## width of the progress output
                       ascii = ' =',                   ## bar characters; the first one must be a space to work
                      ):

    df_pattern  = pd.read_csv(pattern_file)
    # Strip only the trailing extension from the file name.
    sample_name = os.path.splitext(os.path.basename(pattern_file))[0]

    # Per-class sum done by pandas instead of a row-by-row loop with a bare
    # `except:` (which would also swallow KeyboardInterrupt).
    # sort=False keeps the classes in order of first appearance;
    # dropna=False keeps rows whose mut_class is missing in the totals.
    class_totals = df_pattern.groupby('mut_class', sort=False, dropna=False)['%Reads'].sum()

    dict_summary[sample_name] = class_totals.to_dict()

df_summary = pd.DataFrame.from_dict(dict_summary, orient='index')
df_summary



Unnamed: 0,WT_refseq,SynPE,Single_edit,sub1,sub2,ins,del,sub3,sub4,sub5more,Complex
PCR17cycle_K562PE4K_HTS_Exon5_Rep1_DMSO_patterns,55.408465,32.359899,3.321113,0.626887,1.474589,0.224939,0.206941,2.095405,3.851653,0.426247,0.003862
PCR17cycle_K562PE4K_HTS_Exon5_Rep2_DMSO_patterns,55.494765,32.22533,3.42869,0.681515,1.538212,0.227054,0.212147,2.111057,3.688163,0.389406,0.00366
PCR20cycle_K562PE4K_HTS_Exon5_Rep1_DMSO_patterns,56.339116,31.756827,3.355387,0.679239,1.431946,0.21071,0.207698,2.039951,3.610037,0.365694,0.003394
PCR20cycle_K562PE4K_HTS_Exon5_Rep2_DMSO_patterns,54.3772,32.716031,3.505689,0.654098,1.579866,0.230889,0.222521,2.241886,4.017279,0.450233,0.004308
PCR23cycle_K562PE4K_HTS_Exon5_Rep1_DMSO_patterns,55.26985,31.843589,3.717663,0.720033,1.464259,0.211959,0.216743,2.290381,3.839781,0.422056,0.003686
PCR23cycle_K562PE4K_HTS_Exon5_Rep2_DMSO_patterns,53.700806,33.243758,3.514127,0.67225,1.594168,0.217793,0.23275,2.284571,4.092037,0.442868,0.004871


In [4]:
# Write the PCR-cycle summary table (index included) to a CSV in the working directory.
df_summary.to_csv('PCR_cycles_ReadPatterns.csv')

In [5]:
# Run the ReadPatternAnalyzer on every K562PE4K_HTS2DoseControl frequency
# table and save each result as '<sample>_patterns.csv'.
rpa = ReadPatternAnalyzer()

# Sorted so the processing order is reproducible; glob() itself makes no
# ordering guarantee.
files = sorted(glob('data/frequency_table/K562PE4K_HTS2DoseControl*.txt'))

# Create the output directory up front so to_csv() cannot fail on a missing
# directory after rpa.run() has already done its work.
out_dir = 'data/read_patterns'
os.makedirs(out_dir, exist_ok=True)

for freq_table in tqdm(files,
                       total = len(files),
                       desc = 'Read Pattern Analyzer', ## text printed in front of the progress bar
                       ncols = 100,                    ## width of the progress output
                       ascii = ' =',                   ## bar characters; the first one must be a space to work
                      ):

    # Strip only the trailing extension; str.replace() would also remove a
    # '.txt' that happens to occur in the middle of the name.
    sample   = os.path.splitext(os.path.basename(freq_table))[0]
    # NOTE(review): only the first character after 'Exon' is used, so a
    # two-digit exon number would be truncated -- confirm exons are single-digit.
    exon_num = sample.split('Exon')[1][0]
    ref_info = f'variants_info/ex{exon_num}_info.csv'

    df_out = rpa.run(freq_table, ref_info)
    df_out.to_csv(f'{out_dir}/{sample}_patterns.csv', index=False)



In [6]:
# Build the summary table for the K562PE4K_HTS2DoseControl samples: one row
# per sample, one column per mut_class, each cell holding the summed '%Reads'
# of that class.

# Sorted so the row order of the summary is reproducible; glob() itself
# makes no ordering guarantee.
files = sorted(glob('data/read_patterns/K562PE4K_HTS2DoseControl*'))
dict_summary = {}

for pattern_file in tqdm(files, total = len(files),
                       desc = 'Read Pattern Summary',  ## text printed in front of the progress bar
                       ncols = 100,                    ## width of the progress output
                       ascii = ' =',                   ## bar characters; the first one must be a space to work
                      ):

    df_pattern  = pd.read_csv(pattern_file)
    # Strip only the trailing extension from the file name.
    sample_name = os.path.splitext(os.path.basename(pattern_file))[0]

    # Per-class sum done by pandas instead of a row-by-row loop with a bare
    # `except:` (which would also swallow KeyboardInterrupt).
    # sort=False keeps the classes in order of first appearance;
    # dropna=False keeps rows whose mut_class is missing in the totals.
    class_totals = df_pattern.groupby('mut_class', sort=False, dropna=False)['%Reads'].sum()

    dict_summary[sample_name] = class_totals.to_dict()

df_summary = pd.DataFrame.from_dict(dict_summary, orient='index')
df_summary



Unnamed: 0,WT_refseq,SynPE,Single_edit,sub1,sub2,del,ins,sub3,sub4,sub5more,Complex
K562PE4K_HTS2DoseControlDay10_Exon5_Rep1_DMSO_patterns,53.632505,32.902781,3.730768,0.672526,1.528786,0.234672,0.21031,2.428246,4.18624,0.468829,0.004339
K562PE4K_HTS2DoseControlDay10_Exon5_Rep2_DMSO_patterns,53.031354,33.123071,3.767399,0.6876,1.61226,0.243695,0.21757,2.501433,4.311158,0.499078,0.005383


In [7]:
# Write the summary table (index included) to a CSV in the working directory.
# NOTE(review): df_summary here comes from the K562PE4K_HTS2DoseControl files,
# yet the file name follows the PCR-cycle naming -- presumably these samples
# are the 26-cycle data point; confirm.
df_summary.to_csv('PCR_cycles_ReadPatterns_26cycle.csv')