# Get data

In [451]:
import pandas as pd
import json


df_mutations = pd.read_csv('./sample_mutations_list.csv', index_col=0)

with open('./barcode_ref.json', 'r') as f:
    barcode_ref = json.load(f)
    
df_barcode_ref = pd.DataFrame(barcode_ref).T
df_barcode_ref = df_barcode_ref.reset_index()
df_barcode_ref.columns = ['barcode_position', 'gene', 'amino_acid_number']

with open('./unknown_sample_mutations.txt', 'r') as f:
    unknown_sample_mutations = f.readlines()

In [452]:
df_mutations

Unnamed: 0_level_0,mutations,Month
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
sample_1,synSNP:C913T|synSNP:C3037T|orf1ab:T1001I|orf1a...,Jan
sample_2,synSNP:T445C|synSNP:C1057T|synSNP:C3037T|orf1a...,Jan
sample_3,synSNP:C913T|synSNP:A1699G|orf1ab:L730F|synSNP...,Jan
sample_4,synSNP:C913T|orf1ab:P254L|synSNP:C3037T|orf1ab...,Jan
sample_5,synSNP:C913T|orf1ab:L730F|synSNP:C3037T|orf1ab...,Jan
...,...,...
sample_196,orf1ab:E87D|synSNP:G598T|synSNP:C3037T|orf1ab:...,Aug
sample_197,synSNP:C3037T|orf1ab:A1306S|synSNP:T4237C|orf1...,Aug
sample_198,synSNP:C3037T|orf1ab:A1306S|orf1ab:P2046L|orf1...,Aug
sample_199,synSNP:C3037T|orf1ab:A1306S|orf1ab:P2046L|orf1...,Aug


In [453]:
df_barcode_ref

Unnamed: 0,barcode_position,gene,amino_acid_number
0,1,N,203
1,2,M,82
2,3,N,204
3,4,ORF3a,26
4,5,S,681
...,...,...,...
95,96,S,1162
96,97,orf1ab,2105
97,98,orf1ab,3571
98,99,S,262


# Making dictionary by mutations dataframe

In [454]:
import re


def get_dict(mutations: list()):
    
    genes_dict = dict()
    
    for mutation in mutations:
        mutation_spl = mutation.split(':')
        if mutation_spl[0] not in genes_dict:
            genes_dict[mutation_spl[0]] = dict()
            
        substitutions = re.findall(r"([A-Za-z*]+)(\d+)([A-Za-z*]+)", mutation_spl[1])[0]
        
        amino_acid_number = substitutions[1]
        substitutions_str = f"{substitutions[0]}:{substitutions[2]}"
            
        genes_dict[mutation_spl[0]][int(amino_acid_number)] = substitutions_str
        
    return genes_dict
    

samples_dict = dict()

for sample, row in df_mutations.iterrows():
    
    genes_dict = get_dict(mutations=row['mutations'].split('|'))
    
    samples_dict[sample] = {'genes_dict': genes_dict, 'month': row['Month']}

## Dictionary format

In [455]:
samples_dict['sample_1']

{'genes_dict': {'synSNP': {913: 'C:T',
   3037: 'C:T',
   5986: 'C:T',
   12970: 'C:T',
   14676: 'C:T',
   15279: 'C:T',
   16176: 'T:C',
   28312: 'C:T'},
  'orf1ab': {1001: 'T:I', 1708: 'A:D', 2230: 'I:T', 4715: 'P:L'},
  'S': {501: 'N:Y',
   570: 'A:D',
   614: 'D:G',
   681: 'P:H',
   716: 'T:I',
   982: 'S:A',
   1118: 'D:H'},
  'ORF3a': {44: 'G:R', 254: 'G:V'},
  'ORF8': {27: 'Q:*', 52: 'R:I', 73: 'Y:C'},
  'N': {3: 'D:L', 203: 'R:K', 204: 'G:R', 235: 'S:F'}},
 'month': 'Jan'}

# Making list of dataframes by this dictionary

In [456]:
df_lst = list()
for sample, genes_data in samples_dict.items():
    genes_dict = genes_data['genes_dict']

    df_gene = pd.DataFrame()
    for gene, number_substitutions in genes_dict.items():
        amino_acid_numbers = number_substitutions.keys()
        df_gene = df_barcode_ref[df_barcode_ref['gene'] == gene].copy()
        
        if df_gene.shape[0] != 0:
            df_gene['substitutions'] = df_gene['amino_acid_number'].apply(
                lambda x: number_substitutions[x] if x in number_substitutions else None
            )
            df_gene['R/A'] = df_gene['substitutions'].apply(
                lambda x: 'A' if x is not None else 'R'
            )
            
            df_gene['sample'] = sample
            df_gene['month'] = genes_data['month']
            
            df_lst.append(
                df_gene[['sample', 'gene', 'barcode_position', 'R/A', 'substitutions', 'amino_acid_number', 'month']]
            )

## Concatenated dataframes

In [457]:
df_sample_genes = pd.concat(df_lst, axis=0, ignore_index=True)
df_sample_genes

Unnamed: 0,sample,gene,barcode_position,R/A,substitutions,amino_acid_number,month
0,sample_1,orf1ab,10,R,,5401,Jan
1,sample_1,orf1ab,11,A,T:I,1001,Jan
2,sample_1,orf1ab,14,R,,5063,Jan
3,sample_1,orf1ab,20,R,,3646,Jan
4,sample_1,orf1ab,21,A,A:D,1708,Jan
...,...,...,...,...,...,...,...
18111,sample_200,N,34,A,G:C,215,Aug
18112,sample_200,N,39,R,,220,Aug
18113,sample_200,N,52,R,,327,Aug
18114,sample_200,N,66,R,,9,Aug


## Check

In [458]:
t1 = df_sample_genes[df_sample_genes['sample'] == 'sample_197']
t1[t1['amino_acid_number'] == 1306]

Unnamed: 0,sample,gene,barcode_position,R/A,substitutions,amino_acid_number,month
17745,sample_197,orf1ab,24,A,A:S,1306,Aug


In [459]:
t2 = df_barcode_ref[df_barcode_ref['gene'] == 'orf1ab']
t2[t2['amino_acid_number'] == 1306]

Unnamed: 0,barcode_position,gene,amino_acid_number
23,24,orf1ab,1306


# Samples with unknown month

In [460]:
unknown_sample_mutations[0]

'synSNP:C913T|synSNP:C3037T|orf1ab:T1001I|orf1ab:A1708D|synSNP:C5986T|orf1ab:I2230T|orf1ab:P4715L|synSNP:C14676T|synSNP:C15279T|orf1ab:G5166S|synSNP:T16176C|synSNP:C21697A|S:N501Y|S:A570D|S:D614G|S:P681H|S:T716I|S:S982A|S:D1118H|ORF8:Q27*|ORF8:R52I|ORF8:Y73C|N:D3L|N:R203K|N:G204R|N:S235F'

In [461]:
unknown_m_genes_dict = get_dict(mutations=unknown_sample_mutations[0].split('|'))
unknown_m_genes_dict

{'synSNP': {913: 'C:T',
  3037: 'C:T',
  5986: 'C:T',
  14676: 'C:T',
  15279: 'C:T',
  16176: 'T:C',
  21697: 'C:A'},
 'orf1ab': {1001: 'T:I', 1708: 'A:D', 2230: 'I:T', 4715: 'P:L', 5166: 'G:S'},
 'S': {501: 'N:Y',
  570: 'A:D',
  614: 'D:G',
  681: 'P:H',
  716: 'T:I',
  982: 'S:A',
  1118: 'D:H'},
 'ORF8': {27: 'Q:*', 52: 'R:I', 73: 'Y:C'},
 'N': {3: 'D:L', 203: 'R:K', 204: 'G:R', 235: 'S:F'}}

In [462]:
df_lst_unknown = list()
for gene, number_substitutions in unknown_m_genes_dict.items():
    amino_acid_numbers = number_substitutions.keys()
    df_gene = df_barcode_ref[df_barcode_ref['gene'] == gene].copy()
    
    if df_gene.shape[0] != 0:
        df_gene['substitutions'] = df_gene['amino_acid_number'].apply(
            lambda x: number_substitutions[x] if x in number_substitutions else None
        )
        df_gene['R/A'] = df_gene['substitutions'].apply(
            lambda x: 'A' if x is not None else 'R'
        )
        
        df_lst_unknown.append(
            df_gene[['gene', 'barcode_position', 'R/A', 'substitutions', 'amino_acid_number']]
        )
        
df_sample_genes_unknown = pd.concat(df_lst_unknown, axis=0, ignore_index=True)

In [463]:
for gene in set(df_sample_genes_unknown['gene'].values):
    df_subset_unknown = df_sample_genes_unknown[df_sample_genes_unknown['gene'] == gene].copy()
    df_subset = df_sample_genes[df_sample_genes['gene'] == gene].copy()
    print(gene, df_subset_unknown.shape, df_subset.shape)
    print(set(df_subset['month'].values))
    
    barcode_month = dict(df_subset[['barcode_position', 'month']].values)
    df_subset_unknown['month'] = df_subset_unknown['barcode_position'].apply(
        lambda x: barcode_month[x] if x in barcode_month else None
    )
    
    print(set(df_subset_unknown['month'].values))
    
    print(df_subset_unknown)

S (17, 5) (3400, 7)
{'Jan', 'Aug'}
{'Aug'}
   gene barcode_position R/A substitutions amino_acid_number month
53    S                5   A           P:H               681   Aug
54    S                6   A           P:H               681   Aug
55    S                7   A           T:I               716   Aug
56    S                9   A           S:A               982   Aug
57    S               12   R          None                19   Aug
58    S               13   A           A:D               570   Aug
59    S               15   A           D:H              1118   Aug
60    S               31   A           N:Y               501   Aug
61    S               38   R          None               478   Aug
62    S               40   R          None               452   Aug
63    S               41   R          None                18   Aug
64    S               42   R          None               950   Aug
65    S               48   R          None                95   Aug
66    S            

## The sample from “unknown_sample_mutations.txt” is more likely to have been sampled in August