In [1]:
import pandas, copy

from tqdm import tqdm

# show (up to 999) columns when a wide DataFrame is displayed instead of eliding the middle ones
pandas.set_option('display.max_columns',999)

In [6]:
# Load the COG-UK dataset snapshot produced by the sibling create-coguk-dataset directory.
# pandas infers the gzip compression from the .gz suffix.
cog=pandas.read_csv('../create-coguk-dataset/cog_dataset-20210624.csv.gz')

# Exploring the dataset

In [7]:
# peek at the first three rows
cog.head(3)

Unnamed: 0,cogid,sequence_name,cog_country,adm1,is_pillar_2,sample_date,epi_week,lineage,lineages_version,lineage_conflict,lineage_ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,t1001i,n501y,d614g,del_21765_6,y453f,del_1605_3,a222v,n439k,mutations,p681h,q27stop,e484k,p323l,study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,instrument_platform,ena_country,submitted_ftp
0,PHWC-PYFPIR,Wales/PHWC-PYFPIR/2021,UK,UK-WLS,N,2021-05-19,73,,PANGO-v1.2.13,,,,,,I,Y,G,del,Y,ref,A,N,synSNP:C913T|synSNP:C1288T|synSNP:C3037T|orf1a...,H,*,E,L,PRJEB37886,SAMEA9132940,ERS6847341,ERX5658438,ERR6019078,ILLUMINA,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR601/ERR6019078/P...
1,HSLL-160F062,England/HSLL-160F062/2021,UK,UK-ENG,Y,2021-06-03,75,B.1.617.2,PLEARN-v1.2.13,,,Delta (B.1.617.2-like),1.0,0.0,T,N,G,ref,Y,ref,A,N,synSNP:C3037T|orf1ab:A1306S|orf1ab:P2046L|orf1...,R,Q,E,L,PRJEB37886,SAMEA9148402,ERS6873131,ERX5686029,ERR6047046,ILLUMINA,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR604/ERR6047046/H...
2,QEUH-160DE00,Scotland/QEUH-160DE00/2021,UK,UK-SCT,Y,2021-06-03,75,B.1.617.2,PLEARN-v1.2.13,,,Delta (B.1.617.2-like),1.0,0.0,T,N,G,X,Y,ref,A,N,orf1ab:S443F|synSNP:C3037T|orf1ab:A1306S|synSN...,R,Q,E,L,PRJEB37886,SAMEA9202513,ERS6927129,ERX5695457,ERR6056590,ILLUMINA,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR605/ERR6056590/Q...


In [8]:
# total number of rows in the table
print(f"This gives us {len(cog)} COGUK samples with an ENA accession number")

# rows whose lineage / scorpio_call entries are missing (NaN)
print(f"There are {cog.lineage.isna().sum()} samples with no PANGO lineage and {cog.scorpio_call.isna().sum()} samples with no SCORPIO lineage")

This gives us 455352 COGUK samples with an ENA accession number
There are 0 samples with no PANGO lineage and 169437 samples with no SCORPIO lineage


Let's look at the `pango` lineages and `scorpio` constellations

In [11]:
# number of samples per pango lineage, keeping a row for missing values
cog['lineage'].value_counts(dropna=False)

B.1.1.7       259517
B.1.177        44385
B.1.617.2      25225
B.1.1          15033
B.1             7655
               ...  
B.1.590            1
B.1.177.46         1
B.1.36.18          1
B.1.375            1
B.1.623            1
Name: lineage, Length: 673, dtype: int64

In [12]:
# number of samples per scorpio constellation, keeping a row for missing values
cog['scorpio_call'].value_counts(dropna=False)

Alpha (B.1.1.7-like)             257726
NaN                              169437
Delta (B.1.617.2-like)            25226
Beta (B.1.351-like)                 931
Eta (B.1.525-like)                  477
B.1.617.1-like                      427
B.1.1.318-like                      288
B.1.1.7-like+E484K                  214
Gamma (P.1-like)                    164
AV.1-like                           145
A.23.1-like                         106
A.23.1-like+E484K                    63
Zeta (P.2-like)                      55
Delta (B.1.617.2-like) +K417N        43
Iota (B.1.526-like)                  17
Epsilon (B.1.427/429-like)           12
B.1.617.3-like                       12
Theta (P.3-like)                      7
B.1.324.1-like                        2
Name: scorpio_call, dtype: int64

What about the P.1 lineage?

In [13]:
# scorpio calls for every sample whose pango lineage name begins with 'P'
cog.loc[cog['lineage'].str.get(0)=='P','scorpio_call'].value_counts()

Gamma (P.1-like)    164
Zeta (P.2-like)      55
Theta (P.3-like)      7
Name: scorpio_call, dtype: int64

And the newer Iota and Epsilon VUIs?

In [14]:
# pango lineages assigned to the samples that scorpio calls Iota
cog.loc[cog['scorpio_call']=='Iota (B.1.526-like)','lineage'].value_counts()

B.1.526    17
Name: lineage, dtype: int64

In [15]:
# pango lineages assigned to the samples that scorpio calls Epsilon
cog.loc[cog['scorpio_call']=='Epsilon (B.1.427/429-like)','lineage'].value_counts()

B.1.427/429    12
Name: lineage, dtype: int64

# Logic for creating testset

Let's arbitrarily choose the top 42 lineages, then add the others we want to sample and pick two samples from each.

I've picked 48 lineages in total (the 42 above plus the 6 added by hand) since that will fit onto a single `VM.Standard2.24` compute shape in OCI

This is all very arbitrary and can be changed!

In [29]:
# samples per lineage; value_counts returns them ordered from most to least common
a=cog['lineage'].value_counts()

# the 42 most common lineages among those with more than 100 samples ...
lineages_for_testset=a[a>100].head(42).index.tolist()

# ... plus the lineages we explicitly want represented in the testset
lineages_for_testset.extend(['P.1','P.2','P.3','B.1.526','B.1.427/429','None'])

print(lineages_for_testset)
len(lineages_for_testset)


['B.1.1.7', 'B.1.177', 'B.1.617.2', 'B.1.1', 'B.1', 'B.1.177.7', 'B.1.1.37', 'B.1.177.4', 'AD.2', 'B.1.177.57', 'B.1.1.311', 'B.1.258', 'B.1.1.369', 'B.1.160', 'B.40', 'B.1.36.17', 'B', 'B.1.1.1', 'B.1.177.15', 'B.1.177.65', 'B.1.177.16', 'B.1.177.54', 'B.1.177.56', 'B.1.177.8', 'B.1.177.87', 'B.1.1.372', 'B.1.177.17', 'B.1.177.10', 'B.1.177.6', 'B.1.93', 'B.1.1.279', 'B.1.1.301', 'B.1.177.5', 'B.1.177.18', 'B.1.391', 'B.1.351', 'W.4', 'B.1.177.9', 'B.1.389', 'B.1.177.58', 'B.1.177.19', 'B.1.177.20', 'P.1', 'P.2', 'P.3', 'B.1.526', 'B.1.427/429', 'None']


48

## Algorithm

1. for a given lineage, randomly pick `samples_to_pick` samples (50 by default), sampling with replacement to deal with cases where there are fewer samples than that in the lineage
2. for each, make a set out of its mutations ($A_i$)
3. for the samples stored to date, make a single set of all their collective mutations ($B$)
4. by considering $A_i-B$ in turn, choose the sample which has the most mutations not already in $B$
5. add it to the testset and repeat for the next lineage

In [30]:
def _mutation_set(row):
    """Return the set of mutation labels carried by one row of the COG-UK table."""

    mutations=set()

    # The two tracked deletions live in their own columns rather than in the
    # `mutations` string. 'del' is recorded under the column name, 'X' under a
    # separate '<column>_X' label, and any other value (e.g. 'ref') adds nothing.
    for column in ('del_1605_3','del_21765_6'):
        call=row[column]
        if call=='X':
            mutations.add(column+'_X')
        elif call=='del':
            mutations.add(column)

    # `mutations` is a '|'-separated string; a missing value (NaN) is not a str and is skipped
    if isinstance(row['mutations'],str):
        mutations.update(row['mutations'].split('|'))

    return mutations


def create_diverse_testset(coguk_df,required_lineages,random_seed=42,samples_to_pick=50):
    """Greedily pick one sample per lineage so that the testset covers many distinct mutations.

    For each lineage in `required_lineages` (in order), `samples_to_pick` rows of that
    lineage are drawn at random with replacement, and the drawn row that contributes the
    most mutations not yet present in the testset is kept. If no drawn row adds anything
    new, the first drawn row is kept.

    Parameters
    ----------
    coguk_df : pandas.DataFrame
        Candidate samples. Must have the columns `lineage`, `scorpio_call`, `mutations`,
        `del_1605_3` and `del_21765_6`. The index is assumed to be unique, since rows are
        looked up one at a time by label.
    required_lineages : iterable
        Values of the `lineage` column to cover. Lineages with no rows in `coguk_df`
        are reported with a printed message and skipped.
    random_seed : int
        Seed for the first lineage's draw; it is incremented by one after each lineage
        that is successfully sampled.
    samples_to_pick : int
        Number of rows drawn (with replacement) per lineage.

    Returns
    -------
    dict_keys
        Index labels of the chosen rows, in the order they were chosen.
    """

    testset={}

    # running union of the mutations of every sample chosen so far
    existing_mutations=set()

    # iterate through the list of pango lineages we wish to cover in the testset
    for lineage in tqdm(required_lineages):

        candidates=coguk_df.loc[coguk_df.lineage==lineage]

        # test for "nothing to sample" explicitly so that genuine errors from
        # sample() are no longer swallowed and misreported
        if candidates.empty:
            print("No samples available for lineage "+str(lineage))
            continue

        random_samples=list(candidates.sample(n=samples_to_pick,random_state=random_seed,replace=True).index)
        random_seed+=1

        sample_set={}
        additional_mutations=0
        next_sample=random_samples[0]

        # sampling with replacement yields repeats; dict.fromkeys drops them while
        # keeping the draw order, so the first-best candidate still wins ties
        for idx in dict.fromkeys(random_samples):

            row=candidates.loc[idx]

            sample_set[idx]={'mutations':_mutation_set(row),
                             'pango_lineage':row['lineage'],
                             'scorpio_lineage':row['scorpio_call']}

            gain=len(sample_set[idx]['mutations']-existing_mutations)
            if gain>additional_mutations:
                next_sample=idx
                additional_mutations=gain

        testset[next_sample]=sample_set[next_sample]
        existing_mutations|=sample_set[next_sample]['mutations']

    print("There are "+str(len(testset))+" samples in the testset with "+str(len(existing_mutations))+" unique mutations")

    return(testset.keys())




In [49]:
# restrict the candidate pool to rows whose instrument_platform is OXFORD_NANOPORE
df=cog[cog['instrument_platform']=='OXFORD_NANOPORE'].copy(deep=True)

samples=create_diverse_testset(df,lineages_for_testset)

nanopore_testset=cog.loc[samples]

# full metadata of the chosen samples; written before the bucket column is added
nanopore_testset.to_csv(f'cog-testset-genetically-diverse-{len(nanopore_testset)}samples-nanopore-v0.1.csv.gz')

nanopore_testset['bucket']='ena'

# headerless two-column list: bucket, run_accession
nanopore_testset[['bucket','run_accession']].to_csv(f'sp3-testset-genetically-diverse-{len(nanopore_testset)}samples-nanopore-v0.1.csv',index=False,header=False)

100%|██████████| 48/48 [00:03<00:00, 15.24it/s]

No samples available for lineage P.3
There are 47 samples in the testset with 622 unique mutations





In [48]:
# restrict the candidate pool to rows whose instrument_platform is ILLUMINA
df=cog[cog['instrument_platform']=='ILLUMINA'].copy(deep=True)

samples=create_diverse_testset(df,lineages_for_testset)

illumina_testset=cog.loc[samples]

# full metadata of the chosen samples; written before the bucket column is added
illumina_testset.to_csv(f'cog-testset-genetically-diverse-{len(illumina_testset)}samples-illumina-v0.1.csv.gz')

illumina_testset['bucket']='ena'

# headerless two-column list: bucket, run_accession
illumina_testset[['bucket','run_accession']].to_csv(
    f'sp3-testset-genetically-diverse-{len(illumina_testset)}samples-illumina-v0.1.csv',
    index=False,
    header=False,
)

100%|██████████| 48/48 [00:04<00:00, 10.46it/s]

There are 48 samples in the testset with 701 unique mutations





In [46]:
# every P.3 sample in the full table, regardless of sequencing platform
cog[cog['lineage']=='P.3']

Unnamed: 0,cogid,sequence_name,cog_country,adm1,is_pillar_2,sample_date,epi_week,lineage,lineages_version,lineage_conflict,lineage_ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,t1001i,n501y,d614g,del_21765_6,y453f,del_1605_3,a222v,n439k,mutations,p681h,q27stop,e484k,p323l,study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,instrument_platform,ena_country,submitted_ftp
16474,MILK-15BDBBE,England/MILK-15BDBBE/2021,UK,UK-ENG,Y,2021-05-25,74,P.3,PLEARN-v1.2.13,,,Theta (P.3-like),0.9412,0.0,T,Y,G,ref,Y,ref,A,N,synSNP:C1093T|orf1ab:Y496C|orf1ab:A599V|synSNP...,H,Q,K,L,PRJEB37886,SAMEA9094797,ERS6820938,ERX5639827,ERR5999072,ILLUMINA,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR599/ERR5999072/M...
48040,MILK-154EAF1,England/MILK-154EAF1/2021,UK,UK-ENG,Y,2021-04-23,69,P.3,PLEARN-v1.2.13,,,Theta (P.3-like),0.8824,0.0,T,Y,G,ref,Y,ref,X,N,orf1ab:Y496C|orf1ab:A599V|synSNP:C3037T|synSNP...,H,X,K,X,PRJEB37886,SAMEA8721023,ERS6405465,ERX5545810,ERR5904669,ILLUMINA,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR590/ERR5904669/M...
142950,CAMC-139B57E,England/CAMC-139B57E/2021,UK,UK-ENG,Y,2021-03-05,62,P.3,PLEARN-v1.2.13,,,Theta (P.3-like),0.9412,0.0,T,Y,G,ref,Y,ref,A,N,synSNP:C3037T|orf1ab:D1554G|synSNP:C6472T|synS...,H,Q,K,L,PRJEB37886,SAMEA8387314,ERS6072814,ERX5300275,ERR5519727,ILLUMINA,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR551/ERR5519727/C...
151561,CAMC-134E3F8,England/CAMC-134E3F8/2021,UK,UK-ENG,Y,2021-03-01,62,P.3,PLEARN-v1.2.13,,,Theta (P.3-like),0.9412,0.0,T,Y,G,ref,Y,ref,A,N,synSNP:C3037T|orf1ab:D1554G|synSNP:C7564A|orf1...,H,Q,K,L,PRJEB37886,SAMEA8249507,ERS5935763,ERX5250933,ERR5469328,ILLUMINA,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR546/ERR5469328/C...
151700,CAMC-134E61D,England/CAMC-134E61D/2021,UK,UK-ENG,Y,2021-03-01,62,P.3,PLEARN-v1.2.13,,,Theta (P.3-like),0.9412,0.0,T,Y,G,ref,Y,ref,A,N,synSNP:C3037T|orf1ab:D1554G|synSNP:C7564A|orf1...,H,Q,K,L,PRJEB37886,SAMEA8249541,ERS5935797,ERX5250967,ERR5469362,ILLUMINA,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR546/ERR5469362/C...
155976,CAMC-137AA67,England/CAMC-137AA67/2021,UK,UK-ENG,Y,2021-03-03,62,P.3,PLEARN-v1.2.13,,,Theta (P.3-like),0.9412,0.0,T,Y,G,ref,Y,ref,A,N,synSNP:C3037T|orf1ab:D1554G|synSNP:C7564A|orf1...,H,Q,K,X,PRJEB37886,SAMEA8408562,ERS6094028,ERX5317074,ERR5536668,ILLUMINA,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR553/ERR5536668/C...
156010,CAMC-137AA76,England/CAMC-137AA76/2021,UK,UK-ENG,Y,2021-03-03,62,P.3,PLEARN-v1.2.13,,,Theta (P.3-like),0.9412,0.0,T,Y,G,ref,Y,ref,A,N,synSNP:C3037T|orf1ab:D1554G|synSNP:C7564A|orf1...,H,Q,K,L,PRJEB37886,SAMEA8247520,ERS5933779,ERX5248949,ERR5467344,ILLUMINA,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR546/ERR5467344/C...


In [47]:
# first five rows of the nanopore testset
nanopore_testset.head(5)

Unnamed: 0,cogid,sequence_name,cog_country,adm1,is_pillar_2,sample_date,epi_week,lineage,lineages_version,lineage_conflict,lineage_ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,t1001i,n501y,d614g,del_21765_6,y453f,del_1605_3,a222v,n439k,mutations,p681h,q27stop,e484k,p323l,study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,instrument_platform,ena_country,submitted_ftp,bucket
54371,SHEF-10DBE7A,England/SHEF-10DBE7A/2021,UK,UK-ENG,Y,2021-04-16,68,B.1.1.7,PLEARN-v1.2.13,,,Alpha (B.1.1.7-like),0.7826,0.1304,I,Y,G,del,Y,ref,A,N,synSNP:C913T|synSNP:C1684T|synSNP:C3037T|orf1a...,H,*,E,L,PRJEB37886,SAMEA8685487,SRS8946229,ERX5518574,ERR5875043,OXFORD_NANOPORE,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR587/ERR5875043/S...,ena
101420,BRIS-26478B,England/BRIS-26478B/2021,UK,UK-ENG,N,2021-03-15,64,B.1.177,PLEARN-v1.2.13,,,,,,T,N,G,ref,Y,ref,V,N,synSNP:T445C|synSNP:C3037T|orf1ab:T1310I|orf1a...,H,Q,E,L,PRJEB37886,SAMEA8559204,ERS6243890,ERX5421054,ERR5705673,OXFORD_NANOPORE,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR570/ERR5705673/B...,ena
35808,CAMB-AA61A5,England/CAMB-AA61A5/2021,UK,UK-ENG,N,2021-05-10,72,B.1.617.2,PLEARN-v1.2.13,,,Delta (B.1.617.2-like),1.0,0.0,T,N,G,ref,Y,ref,A,N,orf1ab:K261N|orf1ab:P309L|synSNP:C1684T|orf1ab...,R,Q,E,L,PRJEB37886,SAMEA8776345,ERS6460718,ERX5582734,ERR5942234,OXFORD_NANOPORE,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR594/ERR5942234/C...,ena
140682,CAMB-AA4286,England/CAMB-AA4286/2021,UK,UK-ENG,N,2021-03-10,63,B.1.1,PLEARN-v1.2.13,,,,,,T,N,G,ref,Y,ref,A,N,synSNP:C1648T|synSNP:C3037T|synSNP:A5695G|synS...,H,Q,E,L,PRJEB37886,SAMEA8468238,ERS6153077,ERX5350447,ERR5633925,OXFORD_NANOPORE,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR563/ERR5633925/C...,ena
388977,CVR4501,Scotland/CVR4501/2020,UK,UK-SCT,N,2020-09-21,39,B.1,PANGO-v1.2.13,,,,,,T,N,G,ref,Y,ref,A,N,orf1ab:E87D|orf1ab:E217G|orf1ab:A903V|synSNP:C...,P,Q,E,L,PRJEB37886,SAMEA7459597,ERS5217778,ERX4614703,ERR4693621,OXFORD_NANOPORE,United Kingdom,ftp.sra.ebi.ac.uk/vol1/run/ERR469/ERR4693621/C...,ena
