In [1]:
import pandas as pd
import numpy as np
from ViennaRNA import RNA
from cai import calc_cai
from text_processing import input_preprocessing

In [2]:
DESIGN_PATH = "./designs/proteins/nuclease_wildtype.fasta+design.csv"
BASELINE_PATH = "./designs/proteins/nuclease_wildtype_RNA.fasta"

In [3]:
def parse_fasta(text: str):
    """Return ``text`` trimmed of surrounding whitespace and free of ``>`` characters.

    Whitespace is stripped first; afterwards every ``>`` in the remaining
    text is removed (not only a leading FASTA header marker).
    """
    return text.strip().replace(">", "")

# input_preprocessing is a project helper; its result is used below as the path of a
# readable text file (presumably a normalised copy of BASELINE_PATH — confirm in text_processing).
preprocessed_input_path = input_preprocessing(BASELINE_PATH, mode="RNA")

with open(preprocessed_input_path, "r") as f:
    # Skip blank lines (e.g. a trailing newline at EOF) so the remaining lines stay
    # paired as header / sequence when reshaped into two columns.
    baseline_fields = [parse_fasta(line) for line in f if line.strip()]

# The two-column reshape needs an even number of fields; fail with a message that
# names the file instead of a bare numpy reshape error.
if len(baseline_fields) % 2 != 0:
    raise ValueError(
        f"{preprocessed_input_path}: expected alternating header/sequence lines, "
        f"got {len(baseline_fields)} non-blank lines"
    )

# Row i is (header i, sequence i): consecutive lines are paired in file order.
baselines = pd.DataFrame(
    np.array(baseline_fields).reshape(-1, 2),
    columns=["Name", "mRNA sequence"],
)

In [4]:
def get_mfe(rna_sequence):
    """Fold ``rna_sequence`` with ``RNA.fold`` and return its two results.

    Returns:
        A ``(structure, energy)`` tuple, in the order ``RNA.fold`` yields them.
    """
    structure, energy = RNA.fold(rna_sequence)
    return structure, energy


# Fold each baseline sequence once, then transpose the per-row (structure, energy)
# pairs into one tuple of structures and one tuple of energies.
fold_results = baselines["mRNA sequence"].map(get_mfe)
baselines["mRNA structure"], baselines["MFE (kcal/mol)"] = zip(*fold_results)


In [5]:
# Display the baselines table to inspect its current columns.
baselines

Unnamed: 0,Name,mRNA sequence,mRNA structure,MFE (kcal/mol)
0,52962_Lenti_Cas9_BSD [2886-7526],AUGGACAAGAAGUACAGCAUCGGCCUGGACAUCGGCACCAACUCUG...,((((...((((((((....((((((((..(..((((.(((.(((.....,-1580.400024
1,pLenti-IscB-P2A-BSD_KJY [2886-4955],augaaacggacagccgacggaagcgaguucgagucaccaaagaaga...,..........((((....((..(((...(((.(((((((..........,-733.799988
2,pLenti-TnpB-P2A-BSD_KJY [2886-4691],augaaacggacagccgacggaagcgaguucgagucaccaaagaaga...,.......(((((((((((((((((...((((..((.........))...,-633.900024


In [6]:
# Score every baseline sequence with calc_cai (passed directly; no wrapper needed).
baselines["CAI"] = baselines["mRNA sequence"].apply(calc_cai)
# Baselines get a missing (NaN) lambda and are flagged as wildtype rows.
baselines["lambda"] = np.nan
baselines["wildtype"] = True

In [7]:
# Display the baselines table again to check the current set of columns.
baselines

Unnamed: 0,Name,mRNA sequence,mRNA structure,MFE (kcal/mol),CAI,lambda,wildtype
0,52962_Lenti_Cas9_BSD [2886-7526],AUGGACAAGAAGUACAGCAUCGGCCUGGACAUCGGCACCAACUCUG...,((((...((((((((....((((((((..(..((((.(((.(((.....,-1580.400024,0.931062,,True
1,pLenti-IscB-P2A-BSD_KJY [2886-4955],augaaacggacagccgacggaagcgaguucgagucaccaaagaaga...,..........((((....((..(((...(((.(((((((..........,-733.799988,0.875812,,True
2,pLenti-TnpB-P2A-BSD_KJY [2886-4691],augaaacggacagccgacggaagcgaguucgagucaccaaagaaga...,.......(((((((((((((((((...((((..((.........))...,-633.900024,0.870357,,True


In [19]:
# Load the designed sequences from the CSV at DESIGN_PATH.
designs = pd.read_csv(DESIGN_PATH)
# Remove every literal "translation " occurrence from the names. The vectorised
# string accessor with regex=False does a plain substring replacement and passes
# missing names through instead of raising like a per-row str.replace call would.
designs["Name"] = designs["Name"].str.replace("translation ", "", regex=False)
# Flag these rows as non-wildtype.
designs["wildtype"] = False
designs

Unnamed: 0,Name,mRNA sequence,mRNA structure,MFE (kcal/mol),CAI,lambda,wildtype
0,pLenti-TnpB-P2A-BSD_KJY [2886-4691],AUGAAGCGCACUGCCGACGGGUCCGAGUUUGAGAGCCCGAAGAAGA...,(((((((((((((((((.(((((((.(((((.(((((((..........,-1268.4,0.751,0.0,False
1,52962_Lenti_Cas9_BSD [2886-7526],AUGGACAAGAAGUACUCCAUCGGCCUUGACAUUGGCACUAAUUCCG...,...............((((((((((((((((.((((..((((((((...,-2901.8,0.759,0.0,False
2,pLenti-IscB-P2A-BSD_KJY [2886-4955],AUGAAGCGUACCGCCGACGGGAGCGAGUUCGAGAGUCCCAAGAAGA...,.(.(((((((((.((((((((((.(((((((.(((((((((.(......,-1427.8,0.74,0.0,False
3,pLenti-TnpB-P2A-BSD_KJY [2886-4691],AUGAAGCGCACUGCCGACGGGUCCGAGUUUGAGAGCCCGAAGAAGA...,(((((((((((((((((.(((((((.(((((.(((((((..........,-1268.4,0.756,0.1,False
4,52962_Lenti_Cas9_BSD [2886-7526],AUGGACAAGAAGUACUCCAUCGGCCUUGACAUUGGCACUAAUUCCG...,...............((((((((((((((((.((((..((((((((...,-2901.4,0.768,0.1,False
5,pLenti-IscB-P2A-BSD_KJY [2886-4955],AUGAAGCGUACCGCGGACGGGAGCGAGUUCGAGAGUCCCAAGAAGA...,.(.(((((((((.((((((((((.(((((((.(((((((((.(......,-1427.6,0.757,0.1,False
6,pLenti-TnpB-P2A-BSD_KJY [2886-4691],AUGAAGCGCACUGCCGACGGGUCCGAGUUUGAGAGCCCGAAGAAGA...,(((((((((((((((((.(((((((.(((((.(((((((..........,-1268.4,0.756,0.2,False
7,52962_Lenti_Cas9_BSD [2886-7526],AUGGACAAGAAGUACUCCAUCGGCCUUGACAUUGGCACUAAUUCCG...,...............((((((((((((((((.((((..((((((((...,-2898.3,0.777,0.2,False
8,pLenti-IscB-P2A-BSD_KJY [2886-4955],AUGAAGCGUACCGCGGACGGCAGCGAGUUCGAGAGUCCCAAGAAGA...,.(.(((((((((.((((((((((.(((((((.(((((((((.(......,-1427.1,0.76,0.2,False
9,pLenti-TnpB-P2A-BSD_KJY [2886-4691],AUGAAGCGCACCGCGGACGGCUCUGAGUUCGAGUCACCUAAGAAGA...,.....((((((((((((.(((.((...((((((((((((..........,-1259.9,0.794,0.5,False


In [20]:
# Stack the baseline rows above the design rows and renumber the index from 0.
comparison = pd.concat((baselines, designs), axis=0, ignore_index=True)
comparison

Unnamed: 0,Name,mRNA sequence,mRNA structure,MFE (kcal/mol),CAI,lambda,wildtype
0,52962_Lenti_Cas9_BSD [2886-7526],AUGGACAAGAAGUACAGCAUCGGCCUGGACAUCGGCACCAACUCUG...,((((...((((((((....((((((((..(..((((.(((.(((.....,-1580.400024,0.931062,,True
1,pLenti-IscB-P2A-BSD_KJY [2886-4955],augaaacggacagccgacggaagcgaguucgagucaccaaagaaga...,..........((((....((..(((...(((.(((((((..........,-733.799988,0.875812,,True
2,pLenti-TnpB-P2A-BSD_KJY [2886-4691],augaaacggacagccgacggaagcgaguucgagucaccaaagaaga...,.......(((((((((((((((((...((((..((.........))...,-633.900024,0.870357,,True
3,pLenti-TnpB-P2A-BSD_KJY [2886-4691],AUGAAGCGCACUGCCGACGGGUCCGAGUUUGAGAGCCCGAAGAAGA...,(((((((((((((((((.(((((((.(((((.(((((((..........,-1268.4,0.751,0.0,False
4,52962_Lenti_Cas9_BSD [2886-7526],AUGGACAAGAAGUACUCCAUCGGCCUUGACAUUGGCACUAAUUCCG...,...............((((((((((((((((.((((..((((((((...,-2901.8,0.759,0.0,False
5,pLenti-IscB-P2A-BSD_KJY [2886-4955],AUGAAGCGUACCGCCGACGGGAGCGAGUUCGAGAGUCCCAAGAAGA...,.(.(((((((((.((((((((((.(((((((.(((((((((.(......,-1427.8,0.74,0.0,False
6,pLenti-TnpB-P2A-BSD_KJY [2886-4691],AUGAAGCGCACUGCCGACGGGUCCGAGUUUGAGAGCCCGAAGAAGA...,(((((((((((((((((.(((((((.(((((.(((((((..........,-1268.4,0.756,0.1,False
7,52962_Lenti_Cas9_BSD [2886-7526],AUGGACAAGAAGUACUCCAUCGGCCUUGACAUUGGCACUAAUUCCG...,...............((((((((((((((((.((((..((((((((...,-2901.4,0.768,0.1,False
8,pLenti-IscB-P2A-BSD_KJY [2886-4955],AUGAAGCGUACCGCGGACGGGAGCGAGUUCGAGAGUCCCAAGAAGA...,.(.(((((((((.((((((((((.(((((((.(((((((((.(......,-1427.6,0.757,0.1,False
9,pLenti-TnpB-P2A-BSD_KJY [2886-4691],AUGAAGCGCACUGCCGACGGGUCCGAGUUUGAGAGCCCGAAGAAGA...,(((((((((((((((((.(((((((.(((((.(((((((..........,-1268.4,0.756,0.2,False


In [21]:
# Group the combined table by the "Name" column: one group per distinct name.
protein_group = comparison.groupby(by="Name")
protein_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x14e82b070>

In [22]:
import plotly.express as px

In [23]:
def vis(group):
    """Scatter-plot MFE against CAI for one group and save the figure under ./plots/.

    Args:
        group: DataFrame with the columns "MFE (kcal/mol)", "CAI", "wildtype",
            "Name" and "lambda". The "Name" of its first row is used as the
            plot title and as the output file stem, so it must have at least
            one row.

    Side effects:
        Shows the figure, creates ./plots if it does not exist, and writes
        ./plots/<Name>.html and ./plots/<Name>.png. The PNG export relies on
        plotly's static image export support being installed.
    """
    import os

    # Look the name up once; it labels the figure and both output files.
    protein_name = str(group.iloc[0]["Name"])

    fig = px.scatter(
        group,
        x="MFE (kcal/mol)",
        y="CAI",
        color="wildtype",
        hover_data=["Name", "lambda"],
        title=protein_name,
    )
    # Enlarge the markers and outline them.
    fig.update_traces(
        marker=dict(size=12, line=dict(width=2, color='DarkSlateGrey')),
        selector=dict(mode='markers'),
    )
    fig.show()

    # Writing fails if the target directory is missing, so create it up front.
    os.makedirs("./plots", exist_ok=True)
    fig.write_html(f"./plots/{protein_name}.html")
    fig.write_image(f"./plots/{protein_name}.png")

# Render and save one figure per "Name" group; the group key itself is not used
# here because vis reads the name from the rows it receives.
for name, group in protein_group:
    vis(group)