In [14]:
import pandas as pd
import os
import numpy as np

print(os.getcwd())
os.chdir("/home/lferraz/lentideep-lucas")

/home/lferraz/lentideep-lucas


### Load original csv

In [15]:
## Ignores the first 6 lines
df = pd.read_csv('./datasets/aav_packaging_all.csv', header=None, skiprows=6)
df.columns = df.iloc[0]
df = df[1:]  
df["count_average_plasmid_CMV"] = df.iloc[:, 11:14].mean(axis=1)
df["count_average_virus_CMV"] = df.iloc[:, 14:20].mean(axis=1)
df["count_average_plasmid_Rep"] = df.iloc[:, 20:25].mean(axis=1)
df["count_average_virus_Rep"] = df.iloc[:, 25:31].mean(axis=1)

df = df.drop(df.columns[11:31],axis = 1)
  

  df = pd.read_csv('./datasets/aav_packaging_all.csv', header=None, skiprows=6)


In [16]:
print(df.columns)

Index(['abs_pos', 'tile_num', 'aa', 'is_wt_aa', 'is_wt_codon', 'wt_bc',
       'lib_type', 'enzyme', 'codon', 'aa-codon', 'barcode',
       'count_average_plasmid_CMV', 'count_average_virus_CMV',
       'count_average_plasmid_Rep', 'count_average_virus_Rep'],
      dtype='object', name=0)


### Save generated csv

In [17]:
df.to_csv("./datasets/1st_order_AAVcapsid_CMV-Rep_basis.csv", index=False)

### Adding new columns

Add a 1 new column (mutant) before column abs position and generate labels to identify individual mutants (XXX_X_Y_A_CCC_ZZZ)



In [18]:
def generate_label(row):
    if pd.isna(row['barcode']):
        return 'guide'
    is_wt_codon = int(row['is_wt_codon'])
    is_wt_aa = int(row['is_wt_aa'])
    if is_wt_aa == 1 and is_wt_codon == 1:
        wt_code = "WT2"
    elif is_wt_aa == 1 and is_wt_codon == 0:
        wt_code = "WT1"
    else:
        wt_code = "MTT"
    row_name = str(row["abs_pos"]).replace(".", "_")
    
    while len(row_name) < 5:
        row_name = "0" + row_name
    full_code = row_name+"_"+row["lib_type"]+"_"+row["aa"]+"_"+row["codon"]+"_"+wt_code
    
    return full_code

df['label'] = df.apply(generate_label, axis=1)
counts = {}
label_keywords = ['guide','MTT', 'WT1', 'WT2']

for keyword in label_keywords:
    counts[keyword] = df['label'].str.contains(keyword).sum()
    print(f"Number of rows with '{keyword}' in the label: {counts[keyword]}")
print(df["label"][:20])

Number of rows with 'guide' in the label: 714
Number of rows with 'MTT' in the label: 206050
Number of rows with 'WT1' in the label: 0
Number of rows with 'WT2' in the label: 404
1     001_0_sub_*_TAA_MTT
2     001_0_sub_*_TAA_MTT
3     001_0_sub_*_TAG_MTT
4     001_0_sub_*_TAG_MTT
5     001_0_sub_*_TGA_MTT
6     001_0_sub_*_TGA_MTT
7     001_0_del_-_---_MTT
8     001_0_del_-_---_MTT
9     001_0_del_-_---_MTT
10    001_0_del_-_---_MTT
11    001_0_sub_A_GCA_MTT
12    001_0_sub_A_GCA_MTT
13    001_0_sub_A_GCC_MTT
14    001_0_sub_A_GCC_MTT
15    001_0_sub_A_GCG_MTT
16    001_0_sub_A_GCG_MTT
17    001_0_sub_A_GCT_MTT
18    001_0_sub_A_GCT_MTT
19    001_0_sub_C_TGC_MTT
20    001_0_sub_C_TGC_MTT
Name: label, dtype: object


Add 4 new columns (f_CMV_plasmid, f_CMV_virus, f_Rep_plasmid, f_Rep_virus ) to host the frequency of each mutant in the virus pool (fv) or the plasmid pool

In [19]:

sum_CMV_plasmid = df['count_average_plasmid_CMV'].sum()
sum_CMV_virus = df['count_average_virus_CMV'].sum()
sum_Rep_plasmid = df['count_average_plasmid_Rep'].sum()
sum_Rep_virus = df['count_average_virus_Rep'].sum()

def calculate_frequency(row, *args):
    return row[args[0]]/args[1]

df["f_CMV_plasmid"] = df.apply(calculate_frequency, axis=1, args=['count_average_plasmid_CMV', sum_CMV_plasmid])
df["f_CMV_virus"] = df.apply(calculate_frequency, axis=1, args=['count_average_virus_CMV', sum_CMV_virus])
df["f_Rep_plasmid"] = df.apply(calculate_frequency, axis=1, args=['count_average_plasmid_Rep', sum_Rep_plasmid ])
df["f_Rep_virus"] = df.apply(calculate_frequency, axis=1, args=['count_average_virus_Rep', sum_Rep_virus])

print(df[:1])

0 abs_pos tile_num aa is_wt_aa is_wt_codon wt_bc lib_type enzyme codon  \
1     1.0      0.0  *        0           0     0      sub   bbsi   TAA   

0 aa-codon               barcode  count_average_plasmid_CMV  \
1    *-TAA  CACTGTCACACACTGACACT                 563.333333   

0  count_average_virus_CMV  count_average_plasmid_Rep  \
1               215.333333                     494.75   

0  count_average_virus_Rep                label  f_CMV_plasmid  f_CMV_virus  \
1                    272.4  001_0_sub_*_TAA_MTT       0.000013     0.000007   

0  f_Rep_plasmid  f_Rep_virus  
1       0.000011     0.000007  


Add 2 new columns (s_CMV,  s_Rep) to host the selection of each mutant in the virus pool

In [20]:
def calculate_selection(row, *args):
    return row[args[0]]/row[args[1]]

df["s_CMV"] = df.apply(calculate_selection, axis=1, args=["f_CMV_virus", "f_CMV_plasmid"])
df["s_Rep"] = df.apply(calculate_selection, axis=1, args=["f_Rep_virus", "f_Rep_plasmid"])

print(df[:1])
    

0 abs_pos tile_num aa is_wt_aa is_wt_codon wt_bc lib_type enzyme codon  \
1     1.0      0.0  *        0           0     0      sub   bbsi   TAA   

0 aa-codon  ... count_average_virus_CMV  count_average_plasmid_Rep  \
1    *-TAA  ...              215.333333                     494.75   

0  count_average_virus_Rep                label  f_CMV_plasmid f_CMV_virus  \
1                    272.4  001_0_sub_*_TAA_MTT       0.000013    0.000007   

0  f_Rep_plasmid  f_Rep_virus     s_CMV     s_Rep  
1       0.000011     0.000007  0.521311  0.700489  

[1 rows x 22 columns]


In [21]:
def calculate_normalized_selection(row, *args):
    return row[args[0]]/args[1]


only_wt2 = df[df['label'].str.contains('WT2')]
print(len(only_wt2))
swt_CMV = only_wt2["s_CMV"].median()
swt_Rep = only_wt2["s_Rep"].median()
print(swt_CMV)
print(swt_Rep)
df["sNorm_CMV"] = df.apply(calculate_normalized_selection, axis=1, args=["s_CMV", swt_CMV])
df["sNorm_Rep"] = df.apply(calculate_normalized_selection, axis=1, args=["s_Rep", swt_Rep])

print(df[:5])

404
2.8139214807521746
3.167028086740208
0 abs_pos tile_num aa is_wt_aa is_wt_codon wt_bc lib_type enzyme codon  \
1     1.0      0.0  *        0           0     0      sub   bbsi   TAA   
2     1.0      0.0  *        0           0     0      sub   bbsi   TAA   
3     1.0      0.0  *        0           0     0      sub   bbsi   TAG   
4     1.0      0.0  *        0           0     0      sub   bbsi   TAG   
5     1.0      0.0  *        0           0     0      sub   bbsi   TGA   

0 aa-codon  ... count_average_virus_Rep                label  f_CMV_plasmid  \
1    *-TAA  ...                  272.40  001_0_sub_*_TAA_MTT       0.000013   
2    *-TAA  ...                  471.75  001_0_sub_*_TAA_MTT       0.000014   
3    *-TAG  ...                   41.00  001_0_sub_*_TAG_MTT       0.000002   
4    *-TAG  ...                   87.50  001_0_sub_*_TAG_MTT       0.000003   
5    *-TGA  ...                  537.25  001_0_sub_*_TGA_MTT       0.000023   

0  f_CMV_virus  f_Rep_plasmid f_Rep_vir

Add 1 new column (viability) as label for viability (binary), defined based on a normalized selection value cutoff (e.g.≥1); alternatively, use the full scale for fitness quantification

In [22]:
def calculate_viability_CMV(row):
    return int(row["sNorm_CMV"] >= 1)

def calculate_viability_Rep(row):
    return int(row["sNorm_Rep"] >= 1)

df["viability_CMV"] = df.apply(calculate_viability_CMV, axis=1)
df["viability_Rep"] = df.apply(calculate_viability_Rep, axis=1)
print(df[:3])

0 abs_pos tile_num aa is_wt_aa is_wt_codon wt_bc lib_type enzyme codon  \
1     1.0      0.0  *        0           0     0      sub   bbsi   TAA   
2     1.0      0.0  *        0           0     0      sub   bbsi   TAA   
3     1.0      0.0  *        0           0     0      sub   bbsi   TAG   

0 aa-codon  ... f_CMV_plasmid  f_CMV_virus  f_Rep_plasmid  f_Rep_virus  \
1    *-TAA  ...      0.000013     0.000007       0.000011     0.000007   
2    *-TAA  ...      0.000014     0.000006       0.000012     0.000013   
3    *-TAG  ...      0.000002     0.000005       0.000002     0.000001   

0     s_CMV     s_Rep  sNorm_CMV  sNorm_Rep  viability_CMV  viability_Rep  
1  0.521311  0.700489   0.185261   0.221182              0              0  
2  0.441718  1.056681   0.156976   0.333651              0              0  
3  2.722455  0.498374   0.967495   0.157363              0              0  

[3 rows x 26 columns]


Get original sequence

In [23]:

double_wt_df = df.query("(is_wt_aa == '1' or is_wt_aa == 1) and (is_wt_codon == '1' or is_wt_codon == 1)")

seen_pos = []
seq = ""

for index, row in double_wt_df.iterrows():
    pos = row["abs_pos"]
    if not float(pos).is_integer():
        print(pos)
    if not pos in seen_pos:
        seq += row["aa"]
        seen_pos.append(pos)
print(seq)
#print("".join(original_sequence)[::-1])

MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLVLPGYKYLGPFNGLDKGEPVNEADAAALEHDKAYDRQLDSGDNPYLKYNHADAEFQERLKEDTSFGGNLGRAVFQAKKRVLEPLGLVEEPVKTAPGKKRPVEHSPVEPDSSSGTGKAGQQPARKRLNFGQTGDADSVPDPQPLGQPPAAPSGLGTNTMATGSGAPMADNNEGADGVGNSSGNWHCDSTWMGDRVITTSTRTWALPTYNNHLYKQISSQSGASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNGSQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLSRTNTPSGTTTQSRLQFSQAGASDIRDQSRNWLPGPCYRQQRVSKTSADNNNSEYSWTGATKYHLNGRDSLVNPGPAMASHKDDEEKFFPQSGVLIFGKQGSEKTNVDIEKVMITDEEEIRTTNPVATEQYGSVSTNLQRGNRQAATADVNTQGVLPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPSTTFSAAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL


Add 1 new column (mutant_sequence) and use the rules in the labels (or respective columns ) to generate the full sequence that shall be placed in a new column

In [24]:

def insertion(aa, pos):
    return seq[:pos] + aa + seq[pos:]
def deletion(pos):
    return seq[:pos]+seq[pos+1:]

def substitution(aa, pos):
    return seq[:pos] + aa + seq[pos+1:]

print("INSERTION")
print("012345")
print(insertion("X", 0))
print("012345")
print(insertion("X", 3))
print("012345")    
print(insertion("X", 5))
print()

print("DELETION")
print("012345")
print(deletion(0))
print("012345")
print(deletion(3))
print("012345")    
print(deletion(5))
print()

print("SUBSTITUTION")
print("012345")
print(substitution("X", 0))
print("012345")
print(substitution("X", 3))
print("012345")    
print(substitution("X", 5))
print()

INSERTION
012345
XMAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLVLPGYKYLGPFNGLDKGEPVNEADAAALEHDKAYDRQLDSGDNPYLKYNHADAEFQERLKEDTSFGGNLGRAVFQAKKRVLEPLGLVEEPVKTAPGKKRPVEHSPVEPDSSSGTGKAGQQPARKRLNFGQTGDADSVPDPQPLGQPPAAPSGLGTNTMATGSGAPMADNNEGADGVGNSSGNWHCDSTWMGDRVITTSTRTWALPTYNNHLYKQISSQSGASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNGSQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLSRTNTPSGTTTQSRLQFSQAGASDIRDQSRNWLPGPCYRQQRVSKTSADNNNSEYSWTGATKYHLNGRDSLVNPGPAMASHKDDEEKFFPQSGVLIFGKQGSEKTNVDIEKVMITDEEEIRTTNPVATEQYGSVSTNLQRGNRQAATADVNTQGVLPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPSTTFSAAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL
012345
MAAXDGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLVLPGYKYLGPFNGLDKGEPVNEADAAALEHDKAYDRQLDSGDNPYLKYNHADAEFQERLKEDTSFGGNLGRAVFQAKKRVLEPLGLVEEPVKTAPGKKRPVEHSPVEPDSSSGTGKAGQQPARKRLNFGQTGDADSVPDPQPLGQPPAAPSGLGTNTMATGSGAPMADNNEGADGVGNSSGNWHCDSTWMGDR

In [25]:


def reconstruct(row):
    
    label = row["label"]
    
    if label != "guide":
        pos, frac, operation, aa, codon, wt = label.split("_")
        pos = int(pos)
        if operation == "ins":
        
            mutant_sequence = insertion(aa, pos)
        elif operation == "sub":
            mutant_sequence = substitution(aa, pos)
        elif operation == "del":
            mutant_sequence = deletion(pos)
        return mutant_sequence

df["sequence"] = df.apply(reconstruct, axis=1)

# 8894 has nan

r = df.dropna(subset=['barcode'])


#print(math.isnan(df["barcode"][8894]))
#print(df[:10]["barcode"])

Make a new dataset where, for each mutant, we have the label (1st column), the sequence (2nd column) and the median viability


In [26]:
df = df[["label", "sequence", "viability_CMV", "viability_Rep"]]
print(df[:5])


df.to_csv("./outputs/reconstructed_sequences_viability_l.csv", index=False)

0                label                                           sequence  \
1  001_0_sub_*_TAA_MTT  M*ADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV...   
2  001_0_sub_*_TAA_MTT  M*ADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV...   
3  001_0_sub_*_TAG_MTT  M*ADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV...   
4  001_0_sub_*_TAG_MTT  M*ADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV...   
5  001_0_sub_*_TGA_MTT  M*ADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV...   

0  viability_CMV  viability_Rep  
1              0              0  
2              0              0  
3              0              0  
4              0              0  
5              0              0  
