# Notes

- Data taken from here: https://github.com/churchlab/Deep_diversification_AAV/tree/main/Data
- Just sequence (AA) data exists as well here: https://www.dropbox.com/sh/lmd8wmgibz24r2h/AADmgvc_0Q5mylwgGf97cTC-a/ML_and_Analysis/model_and_dataset_analysis/allseqs_20191230.csv.zip?dl=0
- [This notebook](https://github.com/churchlab/Deep_diversification_AAV/blob/main/Parsing_pipeline/Step3_compute_selection_scores.ipynb) suggest that the "target" column is `GAS1_virus_S` in the CSV file
- [This repository](https://github.com/google-research/google-research/tree/master/aav/model_training) suggest that the wild type AA sequence is 28 residues and is `DEEEIRTTNPVATEQYGSVSTNLQRGNR`
- The UniProt sequence is here: https://www.uniprot.org/uniprot/P03135
- In UniProt notation (sequence starts at 1), the region on P03135 where the sequence mutates is `[561, 588]`

In [1]:
!pip install biopython
!pip install pandas



In [2]:
import pandas
import math

from pathlib import Path
from pandas import read_csv, DataFrame

from src.helpers import read_fasta

from Bio import Align

In [3]:
pandas.set_option('display.max_rows', None)

In [4]:
data_path = Path('') / '..' / 'data' / 'avv'
avv_data_path = data_path / 'library_w_selection_scores.csv'

In [5]:
avv_data = read_csv(avv_data_path, index_col=0)

P03135 = read_fasta(data_path / 'P03135.fasta')[0]
region = (561,588)
reference_region = "DEEEIRTTNPVATEQYGSVSTNLQRGNR"

In [6]:
avv_data.columns

Index(['nt_seq', 'EK266_GAS1_p1_rep1a_plasmid', 'EK266_GAS1_p1_rep1b_plasmid',
       'EK269_GAS1_p1_rep1a_plasmid_x', 'EK269_GAS1_p1_rep1b_plasmid_x',
       'EK269_GAS1_p1_rep1c_plasmid_x', 'EK269_GAS1_p1_rep1d_plasmid_x',
       'EK269_GAS1_v3_rep1a_virus_x', 'EK269_GAS1_v3_rep1b_virus_x',
       'EK269_GAS1_v3_rep1c_virus_x', 'EK269_GAS1_v3_rep1d_virus_x',
       'EK269_GAS1_v4_rep2a_virus_x', 'EK269_GAS1_v4_rep2b_virus_x',
       'EK269_GAS1_v4_rep2c_virus_x', 'EK269_GAS1_v4_rep2d_virus_x',
       'EK269_GAS1_v5_rep3a_virus_x', 'EK269_GAS1_v5_rep3b_virus_x',
       'EK269_GAS1_v5_rep3c_virus', 'EK269_GAS1_v5_rep3d_virus', 'aa_x',
       'category_x', 'chip_x', 'control_x', 'is_wt_aa_x', 'is_wt_nt_x',
       'mask_x', 'mut_x', 'rep_i', 'rep_original_x', 'rep_total_x',
       'EK269_GAS1_p1_rep1a_plasmid_y', 'EK269_GAS1_p1_rep1b_plasmid_y',
       'EK269_GAS1_p1_rep1c_plasmid_y', 'EK269_GAS1_p1_rep1d_plasmid_y',
       'EK269_GAS1_v3_rep1a_virus_y', 'EK269_GAS1_v3_rep1b_virus_y',
  

In [7]:
avv_data['category_x'].unique()

array(['rnn_designed_plus_rand_train_walked',
       'rnn_designed_plus_rand_train_seed',
       'rnn_rand_doubles_plus_singles_walked',
       'rnn_rand_doubles_plus_singles_seed', 'rnn_standard_walked',
       'rnn_standard_seed', 'cnn_designed_plus_rand_train_walked',
       'cnn_designed_plus_rand_train_seed',
       'cnn_rand_doubles_plus_single_walked',
       'cnn_rand_doubles_plus_single_seed', 'cnn_standard_walked',
       'cnn_standard_seed', 'lr_designed_plus_rand_train_walked',
       'lr_designed_plus_rand_train_seed',
       'lr_rand_doubles_plus_single_walked',
       'lr_rand_doubles_plus_single_seed', 'lr_standard_walked',
       'lr_standard_seed', 'previous_chip_viable',
       'previous_chip_nonviable', 'stop', 'singles', 'wild_type',
       'random_doubles'], dtype=object)

In [8]:
# Get the wild type sequence from the data

wt = avv_data[avv_data['is_wt_nt_x'] == 1].to_dict(orient='records')[0]

In [9]:
# Make sure that all assumptions on the wild type are correct

assert(wt['aa_x'] == reference_region)

# Important: region[0] -1 because UniProt notation! (0 in Python == 1 in UniProt)
assert(wt['aa_x'] == str(P03135[region[0]-1 : region[1]].seq))

# Make sure chopping the sequence works as expected:

assert(
    str(P03135.seq) ==
    str(P03135[:region[0]-1].seq) + str(P03135[region[0]-1 : region[1]].seq) + str(P03135[region[1]:].seq)
)

In [10]:
protein_data = avv_data[['aa_x', 'mask_x', 'GAS1_virus_S', ]].copy()

In [11]:
def assign_category(category):
    if category == "wild_type":
        return "wild_type"
    else:
        return "designed"

protein_data['category'] = avv_data['category_x'].apply(assign_category)

In [12]:
protein_data[protein_data['aa_x'] == reference_region]

Unnamed: 0,aa_x,mask_x,GAS1_virus_S,category
218232,DEEEIRTTNPVATEQYGSVSTNLQRGNR,____________________________,-0.921691,wild_type
218233,DEEEIRTTNPVATEQYGSVSTNLQRGNR,____________________________,-0.125303,wild_type
218234,DEEEIRTTNPVATEQYGSVSTNLQRGNR,____________________________,0.155396,wild_type
218235,DEEEIRTTNPVATEQYGSVSTNLQRGNR,____________________________,0.947577,wild_type
218236,DEEEIRTTNPVATEQYGSVSTNLQRGNR,____________________________,-1.063887,wild_type
218237,DEEEIRTTNPVATEQYGSVSTNLQRGNR,____________________________,-0.281096,wild_type
218238,DEEEIRTTNPVATEQYGSVSTNLQRGNR,____________________________,-2.40736,wild_type
218239,DEEEIRTTNPVATEQYGSVSTNLQRGNR,____________________________,-0.267336,wild_type
218240,DEEEIRTTNPVATEQYGSVSTNLQRGNR,____________________________,-0.882356,wild_type
218241,DEEEIRTTNPVATEQYGSVSTNLQRGNR,____________________________,-0.429814,wild_type


In [13]:
protein_data[protein_data['aa_x'] == '*EEEIRTTNPVATEQYGSVSTNLQRGNR']

Unnamed: 0,aa_x,mask_x,GAS1_virus_S,category
214111,*EEEIRTTNPVATEQYGSVSTNLQRGNR,*___________________________,-inf,designed
214112,*EEEIRTTNPVATEQYGSVSTNLQRGNR,*___________________________,-3.414035,designed
214113,*EEEIRTTNPVATEQYGSVSTNLQRGNR,*___________________________,-6.615314,designed


In [14]:
# Remove data where GAS1_virus_S is +/- inf
protein_data = protein_data[protein_data['GAS1_virus_S'] != math.inf]
protein_data = protein_data[protein_data['GAS1_virus_S'] != -math.inf]

In [15]:
def region_cut(new_sequence):
    filtered_sequence = new_sequence.upper().replace('*','')
    return str(P03135[:region[0]-1].seq) + filtered_sequence + str(P03135[region[1]:].seq)

protein_data['full_aa_sequence'] = protein_data['aa_x'].apply(region_cut)

In [16]:
protein_data[:3]['full_aa_sequence'][2]

'MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLVLPGYKYLGPFNGLDKGEPVNEADAAALEHDKAYDRQLDSGDNPYLKYNHADAEFQERLKEDTSFGGNLGRAVFQAKKRVLEPLGLVEEPVKTAPGKKRPVEHSPVEPDSSSGTGKAGQQPARKRLNFGQTGDADSVPDPQPLGQPPAAPSGLGTNTMATGSGAPMADNNEGADGVGNSSGNWHCDSTWMGDRVITTSTRTWALPTYNNHLYKQISSQSGASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNGSQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLSRTNTPSGTTTQSRLQFSQAGASDIRDQSRNWLPGPCYRQQRVSKTSADNNNSEYSWTGATKYHLNGRDSLVNPGPAMASHKDDEEKFFPQSGVLIFGKQGSEKTNVDIEKVMITDEEEIATTNPVATEQYGSVSTNLQHDGDERQAATADVNTQGVLPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPSTTFSAAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL'

In [17]:
# We need to aggregate synonymous variants if operating on AA sequences
# Take the score and devide by number of synonymous WT AA seuqneces!

def aggregate_rows(dataframe_slice):
    
    first_element = dataframe_slice.iloc[0]
    
    return {
        'mask': first_element.mask_x,
        'reference_region': reference_region,
        'mutated_region': first_element.aa_x,
        'full_aa_sequence': first_element.full_aa_sequence,
        'score': dataframe_slice.GAS1_virus_S.mean()
    }

grouped_protein_data = protein_data[:100].groupby(['aa_x']).apply(aggregate_rows).values

In [18]:
grouped_protein_data = DataFrame.from_records(grouped_protein_data)

In [19]:
grouped_protein_data[:3]

Unnamed: 0,mask,reference_region,mutated_region,full_aa_sequence,score
0,A_D________________________gDe,DEEEIRTTNPVATEQYGSVSTNLQRGNR,AEDEIRTTNPVATEQYGSVSTNLQRGNgDe,MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV...,1.605882
1,__A__A___________T______N_D_,DEEEIRTTNPVATEQYGSVSTNLQRGNR,DEAEIATTNPVATEQYGTVSTNLQNGDR,MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV...,0.604704
2,__A___________H_________Lg__E,DEEEIRTTNPVATEQYGSVSTNLQRGNR,DEAEIRTTNPVATEHYGSVSTNLQLgGNE,MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLV...,0.903011


In [20]:
grouped_protein_data.to_csv(data_path / "processed_data.csv", index=False)