In [1]:
from src.simulation import ProteinSinthesisProcess, save_data

RESULTS_PATH = 'data/human_genoma_results.pkl'
SIM_TIME = 100

Simulation:

In [2]:
protein_synthesis_process = ProteinSinthesisProcess(data='human_genoma', verbose=False)
protein_synthesis_process.run(simulation_time=SIM_TIME) # run the simulation
save_data(protein_synthesis_process.dna_sequences_df, RESULTS_PATH, verbose=False)

Results:

In [3]:
results_df = protein_synthesis_process.dna_sequences_df

In [4]:
print('Total number of dna sequences processed:', results_df[results_df['protein_synthesized'].notnull()].shape[0])
print('Total number of proteins synthesized:', results_df['protein_synthesized'].sum())
print('Number of true positives: ', results_df['protein_synthesized'].sum()-results_df[
    results_df['peptides_cardinality']==0]['peptides_cardinality'].size)
print('Number of false positives: ', results_df[results_df['peptides_cardinality']==0]['peptides_cardinality'].size)

Total number of dna sequences processed: 169
Total number of proteins synthesized: 88
Number of true positives:  74
Number of false positives:  14


In [5]:
results_df[results_df['protein_synthesized'] & results_df['peptides_cardinality']>0][
    ['polypeptides_chain_synthetized', 'polypeptides_chain_extended','peptides_cardinality']]

Unnamed: 0,polypeptides_chain_synthetized,polypeptides_chain_extended,peptides_cardinality
823,H2N-FPRSEITLDTTSALTSRLYTRVVGTSLST-COOH,H2N-Phe-Pro-Arg-Ser-Glu-Ile-Thr-Leu-Asp-Thr-Th...,29
5034,H2N-TMLPR-COOH,H2N-Thr-Met-Leu-Pro-Arg-COOH,5
5733,H2N-SKYLLKRTTSFFFTKSIWVIGPDKYRQPS-COOH,H2N-Ser-Lys-Tyr-Leu-Leu-Lys-Arg-Thr-Thr-Ser-Ph...,29
823,H2N-FPRSEITLDTTSALTSRLYTRVVGTSLST-COOH,H2N-Phe-Pro-Arg-Ser-Glu-Ile-Thr-Leu-Asp-Thr-Th...,29
5034,H2N-TMLPR-COOH,H2N-Thr-Met-Leu-Pro-Arg-COOH,5
5733,H2N-SKYLLKRTTSFFFTKSIWVIGPDKYRQPS-COOH,H2N-Ser-Lys-Tyr-Leu-Leu-Lys-Arg-Thr-Thr-Ser-Ph...,29
823,H2N-FPRSEITLDTTSALTSRLYTRVVGTSLST-COOH,H2N-Phe-Pro-Arg-Ser-Glu-Ile-Thr-Leu-Asp-Thr-Th...,29
5034,H2N-TMLPR-COOH,H2N-Thr-Met-Leu-Pro-Arg-COOH,5
5733,H2N-SKYLLKRTTSFFFTKSIWVIGPDKYRQPS-COOH,H2N-Ser-Lys-Tyr-Leu-Leu-Lys-Arg-Thr-Thr-Ser-Ph...,29
823,H2N-FPRSEITLDTTSALTSRLYTRVVGTSLST-COOH,H2N-Phe-Pro-Arg-Ser-Glu-Ile-Thr-Leu-Asp-Thr-Th...,29


Explore protein and compare with dataset results:

In [6]:
# see if the proteins are synthesized correctly
results_df[results_df['protein_synthesized'] & results_df['peptides_cardinality']>0]['polypeptides_chain_synthetized'].unique()

array(['H2N-FPRSEITLDTTSALTSRLYTRVVGTSLST-COOH', 'H2N-TMLPR-COOH',
       'H2N-SKYLLKRTTSFFFTKSIWVIGPDKYRQPS-COOH',
       'H2N-DPLPRSGYRGHVCSSFNSAPSRTSWMP-COOH', 'H2N-LILTKLTLKRIVN-COOH'],
      dtype=object)

In [7]:
chain_list = results_df[results_df['protein_synthesized'] & results_df[
    'peptides_cardinality']>0]['polypeptides_chain_synthetized'].unique()
protein_list = []
for p in chain_list:
    p = p.replace('H2N-', '')
    p = p.replace('-COOH', '')
    protein_list.append(p)
protein_list

['FPRSEITLDTTSALTSRLYTRVVGTSLST',
 'TMLPR',
 'SKYLLKRTTSFFFTKSIWVIGPDKYRQPS',
 'DPLPRSGYRGHVCSSFNSAPSRTSWMP',
 'LILTKLTLKRIVN']

In [8]:
results_df.columns

Index(['id', 'sequence', 'mrna_sequence', 'polypeptides_chain_synthetized',
       'polypeptides_chain_extended', 'protein_synthesized',
       'peptides_cardinality'],
      dtype='object')

In [10]:
results_df[results_df['protein_synthesized'] & results_df['peptides_cardinality']>0]

Unnamed: 0,id,sequence,mrna_sequence,polypeptides_chain_synthetized,polypeptides_chain_extended,protein_synthesized,peptides_cardinality
823,NR_046638.1,TAACAGATCCACCCACTTTGGAGACCAGTCCTGGCAGCTGCTACAA...,CH3GPPP-CGGGUUUCUUUUUGCUCGAUCUAAGAGACUUAACACCG...,H2N-FPRSEITLDTTSALTSRLYTRVVGTSLST-COOH,H2N-Phe-Pro-Arg-Ser-Glu-Ile-Thr-Leu-Asp-Thr-Th...,True,29
5034,NR_108083.1,GTTAGAGTTATTGCTGCCTGAGTTATAACAGCTGCCCTGGGAATCT...,CH3GPPP-GGAACCUGGUUCGAUAGUCGGCGAAACUCUCACUCCUG...,H2N-TMLPR-COOH,H2N-Thr-Met-Leu-Pro-Arg-COOH,True,5
5733,NR_029478.1,GGGTGAGGTAGTAGGTTGTATAGTTTGGGGCTCTGCCCTGCTATGG...,CH3GPPP-AGGUAAGGAGAAUUUUAUAUUCUGGAGACCGUACUUAC...,H2N-SKYLLKRTTSFFFTKSIWVIGPDKYRQPS-COOH,H2N-Ser-Lys-Tyr-Leu-Leu-Lys-Arg-Thr-Thr-Ser-Ph...,True,29
823,NM_001421698.1,ATCCCCGAACCCCGCTTTCCGGCCCGCGGCGACCGCCGGCAACTGT...,CH3GPPP-CGGGUUUCUUUUUGCUCGAUCUAAGAGACUUAACACCG...,H2N-FPRSEITLDTTSALTSRLYTRVVGTSLST-COOH,H2N-Phe-Pro-Arg-Ser-Glu-Ile-Thr-Leu-Asp-Thr-Th...,True,29
5034,NR_186221.1,ACAGGAGCCCCAGCCCCACACAGCAACCACCCGGGCGAGAAAGAAA...,CH3GPPP-GGAACCUGGUUCGAUAGUCGGCGAAACUCUCACUCCUG...,H2N-TMLPR-COOH,H2N-Thr-Met-Leu-Pro-Arg-COOH,True,5
5733,NM_001407662.1,GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATA...,CH3GPPP-AGGUAAGGAGAAUUUUAUAUUCUGGAGACCGUACUUAC...,H2N-SKYLLKRTTSFFFTKSIWVIGPDKYRQPS-COOH,H2N-Ser-Lys-Tyr-Leu-Leu-Lys-Arg-Thr-Thr-Ser-Ph...,True,29
823,NM_001308217.1,AGAAAAATCAATTCAGATTACTTTGATGACAGTGACTTCCAGTCTT...,CH3GPPP-CGGGUUUCUUUUUGCUCGAUCUAAGAGACUUAACACCG...,H2N-FPRSEITLDTTSALTSRLYTRVVGTSLST-COOH,H2N-Phe-Pro-Arg-Ser-Glu-Ile-Thr-Leu-Asp-Thr-Th...,True,29
5034,NM_022153.2,AGTCGCGGGAGGCTTCCCCGCGCCGGCCGCGTCCCGCCCGCTCCCC...,CH3GPPP-GGAACCUGGUUCGAUAGUCGGCGAAACUCUCACUCCUG...,H2N-TMLPR-COOH,H2N-Thr-Met-Leu-Pro-Arg-COOH,True,5
5733,NM_019009.4,GCGGCTGTCAGCTGACTGTGGCGGCGGCGGCCTCGAGGTGACAACT...,CH3GPPP-AGGUAAGGAGAAUUUUAUAUUCUGGAGACCGUACUUAC...,H2N-SKYLLKRTTSFFFTKSIWVIGPDKYRQPS-COOH,H2N-Ser-Lys-Tyr-Leu-Leu-Lys-Arg-Thr-Thr-Ser-Ph...,True,29
823,NM_001370161.1,GAGGGTGAGAAGGGCCGGCTGCCGGAGCTGGGTTGCGATCTTCCCG...,CH3GPPP-CGGGUUUCUUUUUGCUCGAUCUAAGAGACUUAACACCG...,H2N-FPRSEITLDTTSALTSRLYTRVVGTSLST-COOH,H2N-Phe-Pro-Arg-Ser-Glu-Ile-Thr-Leu-Asp-Thr-Th...,True,29


### EDA

In [11]:
nucleus = protein_synthesis_process.eucaryotes_cell.nucleus
promoter_found  = 0

for rna in results_df['sequence']:
    p = nucleus.find_promoter(rna)
    if p is not None and len(p) > 0:
        promoter_found +=1
print('promoter found:', promoter_found, 'out of', results_df.shape[0], 
    'sequences processed (', promoter_found/results_df.shape[0]*100, '%)')

promoter found: 138608 out of 260510 sequences processed ( 53.20640282522744 %)


In [12]:
t_count = 0 # thymine
u_count = 0 # uracil

for rna in results_df['sequence']:
    t_count += rna.count('T')
    u_count += rna.count('U')
print('Thymine count:', t_count, 'Uracil count:', u_count)

Thymine count: 269845417 Uracil count: 0


In [13]:
results_df['sequence']

0        AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...
1        AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...
2        AGTCCCAGGGAGGAGACCGCGGGAGAGGCGGCGGGACCAGGGTCCC...
3        GCACACCTGGCTCACGGCGAGTGCGGAGCAGAAAGCACTACTGGCG...
4        GCTACACTTAGTGACTCTGAGGGACATGCAACCCTCCCCGCATGCT...
                               ...                        
63840    AGTAACTGAGACCTCACCGCGTTGACCCAGCACCAGGGCTCGCGGG...
63841    AGTAACTGAGACCTCACCGCGTTGACCCAGCACCAGGGCTCGCGGG...
63842    CGTTGCCTTGGCTACACCGTCTGTTAGGGCCGCGCACGAGATCAGT...
63843    AGTAACTGAGACCTCACCGCGTTGACCCAGCACCAGGGCTCGCGGG...
63844    AGTAACTGAGACCTCACCGCGTTGACCCAGCACCAGGGCTCGCGGG...
Name: sequence, Length: 260510, dtype: object

In [14]:
BASE_COMPLEMENT_DNA2RNA = {
    'A': 'U', 
    'T': 'A', 
    'C': 'G', 
    'G': 'C'
}

t_count = 0 # thymine
u_count = 0 # uracil
error_list = []

for dna_sequence_to_transcript in results_df['sequence']:
    try:
        mrna = ''.join([BASE_COMPLEMENT_DNA2RNA[base] for base in dna_sequence_to_transcript])
        t_count += mrna.count('T')
        u_count += mrna.count('U')
    except:
        error_list.append(dna_sequence_to_transcript)
print('Thymine count:', t_count, 'Uracil count:', u_count)

Thymine count: 0 Uracil count: 282303474


In [15]:
len(error_list)

15

In [16]:
import pandas as pd

error_df = pd.DataFrame(columns=['sequence', 'N_count', 'S_count', 'Y_count', 'total_errors'])
error_df['sequence'] = error_list
error_df['N_count'] = 0
error_df['S_count'] = 0
error_df['Y_count'] = 0
error_df['total_errors'] = 0    

def count_errors(row):
    for base in row['sequence']:
        if base not in BASE_COMPLEMENT_DNA2RNA.keys():
            if base == 'N':
                row['N_count'] += 1
            elif base == 'S':
                row['S_count'] += 1
            elif base == 'Y':
                row['Y_count'] += 1
    row['total_errors'] = row['N_count'] + row['S_count'] + row['Y_count']
    return row

error_df = error_df.apply(count_errors, axis=1)

In [17]:
error_df

Unnamed: 0,sequence,N_count,S_count,Y_count,total_errors
0,GCTGACACGCTGTCCTCTGGCGACCTGTCGCTGGAGAGGTTGGGCC...,1,0,0,1
1,CGCGACCTCAGATCAGACGTGGCGACCCGCTGAATTTAAGCATATT...,1,0,0,1
2,CAGGGAGCTGTGAGGCAGTSCTGTGYGGTTCCTGCCGTCCGGACTC...,0,1,1,2
3,AGGCCGCCCTTTCCCTGCGGGTGGGAACTCCAGATGGACGAGGGCC...,2,0,0,2
4,GACGGGAGCTAGAGCCCATGTCTTGTGGCGTCAGATGCGGCTCTTT...,2,0,0,2
5,TGAATCGGGAAATGGCCGCTGTGTGGTTGCAACGGAGATAAATTCC...,1,0,0,1
6,GCAGTAACCATTAGTGCTGTGCTGGGGCGTGTTTGTCAGGAGGTTG...,0,0,0,0
7,ACAAGGGCGGCTGGCTGGCTGGCTGGCTGGCTGTCCGGGCAGGCCT...,2,0,0,2
8,ACAAGGGCGGCTGGCTGGCTGGCTGGCTGGCTGTCCGGGCAGGCCT...,2,0,0,2
9,AGCTTGAAATGAATTTTAAAGGATGACTGATGGTCCCTGGAAGAGA...,4,0,0,4


- "N" represents any nucleotide (A, T, C, or G).
- "S" represents either C (cytosine) or G (guanine).
- "Y" represents either T (thymine) or C (cytosine).

- Guanine. adenine, thymine, cytosine: G,A,T,C
- Purine (adenine or guanine): R
- Pyrimidine (thymine or cytosine): Y
- Adenine or thymine: W
- Guanine or cytosine: S
- Adenine or cytosine: M
- Guanine or thymine: K
- Adenine or thymine or cytosine: H
- Guanine or cytosine or thymine: B
- Guanine or adenine or cytosine: V
- Guanine or adenine or thymine: D
- Guanine or adenine or thymine or cytosine: N