In [1]:
import pandas, gumpy

# Deciphering the Illumina Ampliseq manifest

This has two key blocks `targets` and `probes`. The latter has the primer sequences -- let's look at that first (I've split the `manifest` file into these blocks)

In [2]:
probes = pandas.read_csv('SARS-CoV-2.dna_manifest.20200408.probes.txt', sep='\t')
probes[:3]

Unnamed: 0,Target ID,ULSO Sequence,DLSO Sequence
0,r1_1.1.363702,GTCACACCCGGACGAAACCTAGATG,AAGTTGGTTGGTTTGTTACCTGGGA
1,r1_1.1.80142,TGTTTTCTCGTTGAAACCAGGGACA,AAGAGATCGAAAGTTGGTTGGTTTG
2,r1_1.1.876634,ATGAGGTGCAGTTCGAGCATCCGAA,TTCGGTCACACCCGGACGAAACCTA


Since there is no length of the primers recorded anywhere, let's see how long all these primers are

In [3]:
def length_sequences(row):
    return(pandas.Series([len(row['ULSO Sequence']), len(row['DLSO Sequence'])]))

probes[['left', 'right']] = probes.apply(length_sequences, axis=1)

probes[:3]

Unnamed: 0,Target ID,ULSO Sequence,DLSO Sequence,left,right
0,r1_1.1.363702,GTCACACCCGGACGAAACCTAGATG,AAGTTGGTTGGTTTGTTACCTGGGA,25,25
1,r1_1.1.80142,TGTTTTCTCGTTGAAACCAGGGACA,AAGAGATCGAAAGTTGGTTGGTTTG,25,25
2,r1_1.1.876634,ATGAGGTGCAGTTCGAGCATCCGAA,TTCGGTCACACCCGGACGAAACCTA,25,25


In [4]:
probes.left.value_counts(dropna=False)

25    242
Name: left, dtype: int64

In [5]:
probes.right.value_counts(dropna=False)

25    242
Name: right, dtype: int64

Turns out they are ALL 25 bases long which makes life easier.

Now let's turn our attention to the `targets` block

In [6]:
df = pandas.read_csv('SARS-CoV-2.dna_manifest.20200408.targets.txt', sep='\t')
df[:3]

Unnamed: 0,TargetA,TargetB,Target Number,Chromosome,Start Position,End Position,Probe Strand,Sequence
0,r1_1.1.363702,r1_1.1.363702,1,MN908947v3,18,254,+,TCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTG...
1,r1_1.1.80142,r1_1.1.80142,1,MN908947v3,28,301,+,GTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCAC...
2,r1_1.1.876634,r1_1.1.876634,1,MN908947v3,234,508,+,AGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACAC...


For whatever reason, there are five targets that are NOT SARS-CoV-2, so let's remove them and measure how long the given `Sequence` is

In [7]:
df.Chromosome.value_counts(dropna=False)

MN908947v3    237
NM_000190       1
NM_003194       1
NM_002467       1
NM_000889       1
NM_002332       1
Name: Chromosome, dtype: int64

In [8]:
df = df[df.Chromosome == 'MN908947v3']

def count_sequence(row):
    return(len(row.Sequence))

df['length'] = df.apply(count_sequence, axis=1)

df[:3]

Unnamed: 0,TargetA,TargetB,Target Number,Chromosome,Start Position,End Position,Probe Strand,Sequence,length
0,r1_1.1.363702,r1_1.1.363702,1,MN908947v3,18,254,+,TCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTG...,187
1,r1_1.1.80142,r1_1.1.80142,1,MN908947v3,28,301,+,GTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCAC...,224
2,r1_1.1.876634,r1_1.1.876634,1,MN908947v3,234,508,+,AGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACAC...,225


Now to double check exactly what the `Start Position` and `End Position` are (0/1-based etc), let's extract the sequence from the reference using `gumpy` and compare

In [10]:
reference = gumpy.Genome('../MN908947.3.gbk')
reference

MN908947
MN908947.3
Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
29903 bases
attaaa...aaaaaa
metadata for all genes/loci have been included

`gumpy` exposes the sequence and its (1-based) numbering via two `numpy` arrays

In [11]:
reference.nucleotide_sequence

array(['a', 't', 't', ..., 'a', 'a', 'a'], dtype='<U1')

In [12]:
reference.nucleotide_index

array([    1,     2,     3, ..., 29901, 29902, 29903])

Hence, we can extract what we think should be the sequence given the start/end positions and compare to the given sequence

In [13]:
def check_sequence(row):
    
    # the primers are all 25 bases long so
    start = row['Start Position']+25
    end = row['End Position']-25
    
    mask = (reference.nucleotide_index>=start) & (reference.nucleotide_index<=end)
    tmp = reference.nucleotide_sequence[mask]
    inferred_sequence = ''.join(i.upper() for i in tmp)
    
    # insist they are identical
    assert row.Sequence == inferred_sequence

df.apply(check_sequence, axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
237    None
238    None
239    None
240    None
241    None
Length: 237, dtype: object

Ok, so that works - `Start Position` and `End Position` are 1-based and inclusive.

We know everything we need now to create a `CSV` file in the format to create synthetic reads

In [14]:
def find_tips(row):

    start_left, end_right = row['Start Position'], row['End Position']

    end_left = start_left + 25 - 1
    start_right = end_right - 25 + 1
    
    length = end_right - start_left + 1
    
    start_amplicon = end_left + 1
    end_amplicon = start_right - 1
    
    results = [start_left, end_left, start_right, end_right, start_left, end_right, 
               start_amplicon, end_amplicon]
    
    return(pandas.Series(results))
    
df[['start_left', 'end_left', 'start_right', 'end_right', 'start', 'end', 'start_amplicon', 'end_amplicon']] = df.apply(find_tips, axis=1)

df[:3]

Unnamed: 0,TargetA,TargetB,Target Number,Chromosome,Start Position,End Position,Probe Strand,Sequence,length,start_left,end_left,start_right,end_right,start,end,start_amplicon,end_amplicon
0,r1_1.1.363702,r1_1.1.363702,1,MN908947v3,18,254,+,TCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTG...,187,18,42,230,254,18,254,43,229
1,r1_1.1.80142,r1_1.1.80142,1,MN908947v3,28,301,+,GTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCAC...,224,28,52,277,301,28,301,53,276
2,r1_1.1.876634,r1_1.1.876634,1,MN908947v3,234,508,+,AGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACAC...,225,234,258,484,508,234,508,259,483


Let's make up a name for each amplicon to be consistent with ARTIC etc

In [15]:
def define_name(row):
    return ('SARS-CoV-2_' + str(row.number))

df['number'] = df.index + 1

df['name'] = df.apply(define_name, axis=1)

df[:3]

Unnamed: 0,TargetA,TargetB,Target Number,Chromosome,Start Position,End Position,Probe Strand,Sequence,length,start_left,end_left,start_right,end_right,start,end,start_amplicon,end_amplicon,number,name
0,r1_1.1.363702,r1_1.1.363702,1,MN908947v3,18,254,+,TCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTG...,187,18,42,230,254,18,254,43,229,1,SARS-CoV-2_1
1,r1_1.1.80142,r1_1.1.80142,1,MN908947v3,28,301,+,GTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCAC...,224,28,52,277,301,28,301,53,276,2,SARS-CoV-2_2
2,r1_1.1.876634,r1_1.1.876634,1,MN908947v3,234,508,+,AGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACAC...,225,234,258,484,508,234,508,259,483,3,SARS-CoV-2_3


Let's subset down to the columns used for the other schemes

In [16]:
df = df[['name', 'number', 'start_left', 'end_left', 'start_right', 'end_right',
  'start', 'end', 'start_amplicon', 'end_amplicon']]

df[:3]

Unnamed: 0,name,number,start_left,end_left,start_right,end_right,start,end,start_amplicon,end_amplicon
0,SARS-CoV-2_1,1,18,42,230,254,18,254,43,229
1,SARS-CoV-2_2,2,28,52,277,301,28,301,53,276
2,SARS-CoV-2_3,3,234,258,484,508,234,508,259,483


AFAIK, there is only a single version but let's add a version number just in case

In [17]:
df.to_csv('../covid-ampliseq-v1.amplicons.csv')