In [216]:
import pandas as pd
from Bio import SeqIO
import re

### Get position of T stretches

In [217]:
fname_ref_hsv = '../../resources/deletion_analysis/resources/references/hsv.fasta'

fasta_sequences = SeqIO.parse(open(fname_ref_hsv),'fasta')
for item in fasta_sequences:
    ref_hsv = str(item.seq)

In [252]:
list_T_stretch_start_positions = [i.start() for i in re.finditer("TTTTTT", ref_hsv)]

In [255]:
def length_of_T_stretch(start_position):
    length_of_T_stretch=0
    for base in ref_hsv[start_position:]: 
        if base =='T':
            length_of_T_stretch+=1
        else: 
            return length_of_T_stretch

In [256]:
range_T_stretches = [(pos, pos+length_of_T_stretch(pos)) for pos in list_T_stretch_start_positions]

In [259]:
list_all_T_stretches_pos = []

for T_range in range_T_stretches:
    list_all_T_stretches_pos+=list(range(T_range[0], T_range[1]))

In [262]:
list_all_T_stretches_pos_one_based = [i +1 for i in list_all_T_stretches_pos]

### Load deletions

In [220]:
# load deletions of HSV sample 
fname_mutations = '../../resources/deletion_analysis/results/deletion_analysis_hsv/all_mutations.csv'

In [221]:
df = pd.read_csv(fname_mutations)

df['n_reads_var'] = df['Rvar'] + df['Fvar']
df['coverage'] = df['Rtot'] + df['Ftot']
df['frequency'] = df['n_reads_var'] / df['coverage']
df['position'] = df['Pos']

In [222]:
df[(df['Var']=='-') & (df['Ref']=='T')].to_csv("hsv_T_deletions.csv")

### Filter for postions of T strechtes

In [263]:
# filter for position for TTTTTT
df[df['Pos'].isin(list_all_T_stretches_pos_one_based)]

Unnamed: 0.1,Unnamed: 0,Chromosome,Pos,Ref,Var,Frq1,Frq2,Frq3,Pst1,Pst2,...,Rtot,Pval,Qval,sample,patient,date,n_reads_var,coverage,frequency,position
3850,3850,MK855052.1,70730,T,G,0.0022,-,0.0067,0.9046,-,...,390,0.069203,1.0,results_vpipe,munoz2021,hsv2_wt,5,781,0.006402,70730
5095,5095,MK855052.1,91321,T,C,0.0013,0.0013,-,0.9189,0.9240,...,997,1.0,1.0,results_vpipe,munoz2021,hsv2_wt,2,1630,0.001227,91321
6134,6134,MK855052.1,117039,T,A,-,0.0025,0.0021,-,0.9251,...,375,1.0,1.0,results_vpipe,munoz2021,hsv2_wt,1,951,0.001052,117039
6135,6135,MK855052.1,117040,T,C,-,0.0025,0.0021,-,0.9251,...,373,1.0,1.0,results_vpipe,munoz2021,hsv2_wt,1,945,0.001058,117040


In [264]:
# --> no deletions in those positions

### Check coverage in those positions

In [310]:
# check coverage in those positions

fname_coverage = "coverage.tsv"
df_cov = pd.read_csv(fname_coverage, sep="\t")

In [311]:
df_cov.shape

(155975, 3)

In [312]:
len(ref_hsv)

155975

In [313]:
df_cov['ref_base'] = list(ref_hsv)
df_cov['pos_one_based'] = df_cov['pos']+1 

In [314]:
# some stats
print('mean: ', df_cov[df_cov['pos'].isin(list_all_T_stretches_pos)]['munoz2021-hsv2_wt'].mean())
print('median: ', df_cov[df_cov['pos'].isin(list_all_T_stretches_pos)]['munoz2021-hsv2_wt'].median())
print('std: ', df_cov[df_cov['pos'].isin(list_all_T_stretches_pos)]['munoz2021-hsv2_wt'].std())
print('max: ', df_cov[df_cov['pos'].isin(list_all_T_stretches_pos)]['munoz2021-hsv2_wt'].max())
print('min: ', df_cov[df_cov['pos'].isin(list_all_T_stretches_pos)]['munoz2021-hsv2_wt'].min())

mean:  607.110655737705
median:  557.0
std:  435.29330735158635
max:  1645
min:  21


### Formatting table for manuscript

In [315]:
df_cov = df_cov.rename(columns={'ref': 'Reference Genome',
                                'pos_one_based': 'T stretch position',
                                'munoz2021-hsv2_wt': 'Coverage'})
df_cov = df_cov[df_cov['pos'].isin(list_T_stretch_start_positions)]
df_cov = df_cov[['Reference Genome', 'T stretch position', 'Coverage']]

df_cov['T stretch length'] = df_cov['T stretch position'].apply(length_of_T_stretch)
df_cov['T stretch length'] = df_cov['T stretch length']+1

df_cov['Deletion Frequency'] = '0%'

In [316]:
df_cov

Unnamed: 0,Reference Genome,T stretch position,Coverage,T stretch length,Deletion Frequency
9708,MK855052.1,9709,996,6,0%
18500,MK855052.1,18501,887,6,0%
25012,MK855052.1,25013,81,10,0%
25917,MK855052.1,25918,742,6,0%
29432,MK855052.1,29433,1382,6,0%
35075,MK855052.1,35076,1056,6,0%
35392,MK855052.1,35393,311,10,0%
49044,MK855052.1,49045,252,6,0%
49111,MK855052.1,49112,41,12,0%
49117,MK855052.1,49118,24,6,0%


In [317]:
df_cov.to_csv('Supplementary_Table_3.csv')