In [18]:
from src.simulation import ProteinSinthesisProcess, save_data

RESULTS_PATH = 'data/human_genoma_results.pkl'
SIM_TIME = 100

Simulation:

In [19]:
protein_synthesis_process = ProteinSinthesisProcess(data='human_genoma', verbose=False)
protein_synthesis_process.run(simulation_time=SIM_TIME) # run the simulation
save_data(protein_synthesis_process.dna_sequences_df, RESULTS_PATH, verbose=False)

Results:

In [20]:
results_df = protein_synthesis_process.dna_sequences_df

In [21]:
print('Total number of dna sequences processed:', results_df[results_df['protein_synthesized'].notnull()].shape[0])
print('Total number of proteins synthesized:', results_df['protein_synthesized'].sum())
print('Number of true positives: ', results_df['protein_synthesized'].sum()-results_df[
    results_df['peptides_cardinality']==0]['peptides_cardinality'].size)
print('Number of false positives: ', results_df[results_df['peptides_cardinality']==0]['peptides_cardinality'].size)

Total number of dna sequences processed: 3120
Total number of proteins synthesized: 1992
Number of true positives:  1848
Number of false positives:  144


In [22]:
results_df[results_df['protein_synthesized'] & results_df['peptides_cardinality']>0][
    ['polypeptides_chain_synthetized', 'polypeptides_chain_extended','peptides_cardinality']]

Unnamed: 0,polypeptides_chain_synthetized,polypeptides_chain_extended,peptides_cardinality
100,H2N-THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR-COOH,H2N-Thr-His-Thr-His-Cys-Arg-Arg-Leu-Phe-Gln-Gl...,37
100,H2N-THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR-COOH,H2N-Thr-His-Thr-His-Cys-Arg-Arg-Leu-Phe-Gln-Gl...,37
100,H2N-THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR-COOH,H2N-Thr-His-Thr-His-Cys-Arg-Arg-Leu-Phe-Gln-Gl...,37
100,H2N-THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR-COOH,H2N-Thr-His-Thr-His-Cys-Arg-Arg-Leu-Phe-Gln-Gl...,37
100,H2N-THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR-COOH,H2N-Thr-His-Thr-His-Cys-Arg-Arg-Leu-Phe-Gln-Gl...,37
...,...,...,...
3182,H2N-YFSDYVVSHLSQTLYRKLK-COOH,H2N-Tyr-Phe-Ser-Asp-Tyr-Val-Val-Ser-His-Leu-Se...,19
3182,H2N-YFSDYVVSHLSQTLYRKLK-COOH,H2N-Tyr-Phe-Ser-Asp-Tyr-Val-Val-Ser-His-Leu-Se...,19
3182,H2N-YFSDYVVSHLSQTLYRKLK-COOH,H2N-Tyr-Phe-Ser-Asp-Tyr-Val-Val-Ser-His-Leu-Se...,19
3182,H2N-YFSDYVVSHLSQTLYRKLK-COOH,H2N-Tyr-Phe-Ser-Asp-Tyr-Val-Val-Ser-His-Leu-Se...,19


Explore protein and compare with dataset results:

In [23]:
# see if the proteins are synthesized correctly
results_df[results_df['protein_synthesized'] & results_df['peptides_cardinality']>0]['polypeptides_chain_synthetized'].unique()

array(['H2N-THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR-COOH',
       'H2N-KPFVTHSKYLESFFDILDEMVYSDDSFTYFMVYHILMYNYVTV-COOH',
       'H2N-VYGLKPYLTSWILMR-COOH', 'H2N-FTVPSKIKFISVSLFPTCF-COOH',
       'H2N-YFSDYVVSHLSQTLYRKLK-COOH'], dtype=object)

In [24]:
chain_list = results_df[results_df['protein_synthesized'] & results_df[
    'peptides_cardinality']>0]['polypeptides_chain_synthetized'].unique()
protein_list = []
for p in chain_list:
    p = p.replace('H2N-', '')
    p = p.replace('-COOH', '')
    protein_list.append(p)
protein_list

['THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR',
 'KPFVTHSKYLESFFDILDEMVYSDDSFTYFMVYHILMYNYVTV',
 'VYGLKPYLTSWILMR',
 'FTVPSKIKFISVSLFPTCF',
 'YFSDYVVSHLSQTLYRKLK']

In [25]:
results_df.columns

Index(['rna_id', 'sequence', 'protein_id', 'protein', 'mrna_sequence',
       'polypeptides_chain_synthetized', 'polypeptides_chain_extended',
       'protein_synthesized', 'peptides_cardinality'],
      dtype='object')

In [26]:
results_df[results_df['polypeptides_chain_synthetized'] == 'H2N-K-COOH']['protein']

Series([], Name: protein, dtype: object)

In [27]:
results_df[results_df['protein_synthesized'] & results_df['peptides_cardinality']>0]

Unnamed: 0,rna_id,sequence,protein_id,protein,mrna_sequence,polypeptides_chain_synthetized,polypeptides_chain_extended,protein_synthesized,peptides_cardinality
100,NR_148390.2,AGCCCCGCCCCGAAGTTTGAGGGGTGTGGACGGTTTGTGACCCCCT...,NP_001156386.1,MAPKKKRGPSAGSQPGGAAAAGAEQPLSERAQYLQREHALLSEQLD...,CH3GPPP-UGUAAGAGUUGUUCCAUACUUUUGAACCCACUCUUACU...,H2N-THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR-COOH,H2N-Thr-His-Thr-His-Cys-Arg-Arg-Leu-Phe-Gln-Gl...,True,37
100,NR_148390.2,AGCCCCGCCCCGAAGTTTGAGGGGTGTGGACGGTTTGTGACCCCCT...,NP_001393960.1,MGSAVMDTKKKKDVSSPGGSGGKKNASQKRRSLRVHIPDLSSFAMP...,CH3GPPP-UGUAAGAGUUGUUCCAUACUUUUGAACCCACUCUUACU...,H2N-THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR-COOH,H2N-Thr-His-Thr-His-Cys-Arg-Arg-Leu-Phe-Gln-Gl...,True,37
100,NR_148390.2,AGCCCCGCCCCGAAGTTTGAGGGGTGTGGACGGTTTGTGACCCCCT...,NP_001307912.1,MEFFKNENNELPKLQWYKDCKPLLLDNIHFSGVKDRLIVMNVAEKH...,CH3GPPP-UGUAAGAGUUGUUCCAUACUUUUGAACCCACUCUUACU...,H2N-THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR-COOH,H2N-Thr-His-Thr-His-Cys-Arg-Arg-Leu-Phe-Gln-Gl...,True,37
100,NR_148390.2,AGCCCCGCCCCGAAGTTTGAGGGGTGTGGACGGTTTGTGACCCCCT...,NP_001829.1,MDLGKPMKSVLVVALLVIFQVCLCQDEVTDDYIGDNTTVDYTLFES...,CH3GPPP-UGUAAGAGUUGUUCCAUACUUUUGAACCCACUCUUACU...,H2N-THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR-COOH,H2N-Thr-His-Thr-His-Cys-Arg-Arg-Leu-Phe-Gln-Gl...,True,37
100,NR_148390.2,AGCCCCGCCCCGAAGTTTGAGGGGTGTGGACGGTTTGTGACCCCCT...,NP_002967.2,MLASPEPKGLVPFTKESFELIKQHIAKTHNEDHEEEDLKPTPDLEV...,CH3GPPP-UGUAAGAGUUGUUCCAUACUUUUGAACCCACUCUUACU...,H2N-THTHCRRLFQQPFYSSKRVHTTTEDWLRTSSFGNTHR-COOH,H2N-Thr-His-Thr-His-Cys-Arg-Arg-Leu-Phe-Gln-Gl...,True,37
...,...,...,...,...,...,...,...,...,...
3182,XM_054317358.1,AAACTGAAGCCGCGGCCGAAAACGCCAAGAGATTGATGCTGTAGCT...,XP_006721403.1,MTPQSLLQTTLFLLSLLFLVQASASSGAHGRGHREDFRFCSQRNQT...,CH3GPPP-AUGUACUUUUCCGACUACGUCGUUAGCCACCUUAGCCA...,H2N-YFSDYVVSHLSQTLYRKLK-COOH,H2N-Tyr-Phe-Ser-Asp-Tyr-Val-Val-Ser-His-Leu-Se...,True,19
3182,XM_054317358.1,AAACTGAAGCCGCGGCCGAAAACGCCAAGAGATTGATGCTGTAGCT...,XP_006712117.1,MVTGGGAAPPGTVTEPLPSVIVLSAGRKMAAAAAAASGPGCSSAAG...,CH3GPPP-AUGUACUUUUCCGACUACGUCGUUAGCCACCUUAGCCA...,H2N-YFSDYVVSHLSQTLYRKLK-COOH,H2N-Tyr-Phe-Ser-Asp-Tyr-Val-Val-Ser-His-Leu-Se...,True,19
3182,XM_054317358.1,AAACTGAAGCCGCGGCCGAAAACGCCAAGAGATTGATGCTGTAGCT...,XP_011532753.1,MAPWRKADKERHGVAIYNFQGSGAPQLSLQIGDVVRIQETCGDWYR...,CH3GPPP-AUGUACUUUUCCGACUACGUCGUUAGCCACCUUAGCCA...,H2N-YFSDYVVSHLSQTLYRKLK-COOH,H2N-Tyr-Phe-Ser-Asp-Tyr-Val-Val-Ser-His-Leu-Se...,True,19
3182,XM_054317358.1,AAACTGAAGCCGCGGCCGAAAACGCCAAGAGATTGATGCTGTAGCT...,XP_047279965.1,MLEAIDKNRALHAAERLQTKLRERGDVANEDKLSLLKSVLQSPLFS...,CH3GPPP-AUGUACUUUUCCGACUACGUCGUUAGCCACCUUAGCCA...,H2N-YFSDYVVSHLSQTLYRKLK-COOH,H2N-Tyr-Phe-Ser-Asp-Tyr-Val-Val-Ser-His-Leu-Se...,True,19


### EDA

In [28]:
nucleus = protein_synthesis_process.eucaryotes_cell.nucleus
promoter_found  = 0

for rna in results_df['sequence']:
    p = nucleus.find_promoter(rna)
    if p is not None and len(p) > 0:
        promoter_found +=1
print('promoter found:', promoter_found, 'out of', results_df.shape[0], 
    'sequences processed (', promoter_found/results_df.shape[0]*100, '%)')

promoter found: 395780 out of 746567 sequences processed ( 53.01332633239883 %)


In [29]:
t_count = 0 # thymine
u_count = 0 # uracil

for rna in results_df['sequence']:
    t_count += rna.count('T')
    u_count += rna.count('U')
print('Thymine count:', t_count, 'Uracil count:', u_count)

Thymine count: 745283566 Uracil count: 0


In [30]:
results_df['sequence']

0       AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...
0       AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...
0       AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...
0       AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...
0       AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...
                              ...                        
5247    ATCCTCCCAGGGCAACTTGAAAGTAACCGCACCTTCCAAAGGGCAC...
5247    ATCCTCCCAGGGCAACTTGAAAGTAACCGCACCTTCCAAAGGGCAC...
5247    ATCCTCCCAGGGCAACTTGAAAGTAACCGCACCTTCCAAAGGGCAC...
5247    ATCCTCCCAGGGCAACTTGAAAGTAACCGCACCTTCCAAAGGGCAC...
5247    ATCCTCCCAGGGCAACTTGAAAGTAACCGCACCTTCCAAAGGGCAC...
Name: sequence, Length: 746567, dtype: object

In [31]:
BASE_COMPLEMENT_DNA2RNA = {
    'A': 'U', 
    'T': 'A', 
    'C': 'G', 
    'G': 'C'
}

t_count = 0 # thymine
u_count = 0 # uracil
error_list = []

for dna_sequence_to_transcript in results_df['sequence']:
    try:
        mrna = ''.join([BASE_COMPLEMENT_DNA2RNA[base] for base in dna_sequence_to_transcript])
        t_count += mrna.count('T')
        u_count += mrna.count('U')
    except:
        error_list.append(dna_sequence_to_transcript)
print('Thymine count:', t_count, 'Uracil count:', u_count)

Thymine count: 0 Uracil count: 784341526


In [32]:
len(error_list)

59

In [33]:
import pandas as pd

error_df = pd.DataFrame(columns=['sequence', 'N_count', 'S_count', 'Y_count', 'total_errors'])
error_df['sequence'] = error_list
error_df['N_count'] = 0
error_df['S_count'] = 0
error_df['Y_count'] = 0
error_df['total_errors'] = 0    

def count_errors(row):
    for base in row['sequence']:
        if base not in BASE_COMPLEMENT_DNA2RNA.keys():
            if base == 'N':
                row['N_count'] += 1
            elif base == 'S':
                row['S_count'] += 1
            elif base == 'Y':
                row['Y_count'] += 1
    row['total_errors'] = row['N_count'] + row['S_count'] + row['Y_count']
    return row

error_df = error_df.apply(count_errors, axis=1)

In [34]:
error_df

Unnamed: 0,sequence,N_count,S_count,Y_count,total_errors
0,ATGGTGAAGCTCTCTATTGTCCTGACCCCACAGTTCCTGTCCCATG...,4,0,0,4
1,ATGGTGAAGCTCTCTATTGTCCTGACCCCACAGTTCCTGTCCCATG...,4,0,0,4
2,ATGGTGAAGCTCTCTATTGTCCTGACCCCACAGTTCCTGTCCCATG...,4,0,0,4
3,ATGGTGAAGCTCTCTATTGTCCTGACCCCACAGTTCCTGTCCCATG...,4,0,0,4
4,ATGGTGAAGCTCTCTATTGTCCTGACCCCACAGTTCCTGTCCCATG...,4,0,0,4
5,ATGGTGAAGCTCTCTATTGTCCTGACCCCACAGTTCCTGTCCCATG...,4,0,0,4
6,ATGGTGAAGCTCTCTATTGTCCTGACCCCACAGTTCCTGTCCCATG...,4,0,0,4
7,ATGGTGAAGCTCTCTATTGTCCTGACCCCACAGTTCCTGTCCCATG...,4,0,0,4
8,ATGGTGAAGCTCTCTATTGTCCTGACCCCACAGTTCCTGTCCCATG...,4,0,0,4
9,ATGGTGAAGCTCTCTATTGTCCTGACCCCACAGTTCCTGTCCCATG...,4,0,0,4


- "N" represents any nucleotide (A, T, C, or G).
- "S" represents either C (cytosine) or G (guanine).
- "Y" represents either T (thymine) or C (cytosine).

- Guanine. adenine, thymine, cytosine: G,A,T,C
- Purine (adenine or guanine): R
- Pyrimidine (thymine or cytosine): Y
- Adenine or thymine: W
- Guanine or cytosine: S
- Adenine or cytosine: M
- Guanine or thymine: K
- Adenine or thymine or cytosine: H
- Guanine or cytosine or thymine: B
- Guanine or adenine or cytosine: V
- Guanine or adenine or thymine: D
- Guanine or adenine or thymine or cytosine: N