In [1]:
import pickle
import pandas as pd

### Load Data

In [2]:
# Path of the pickled input file (relative, so it is resolved against the current working directory).
DATA_PATH = 'data/data.pkl'

In [3]:
# Read the pickled mapping from DATA_PATH.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only use it on files from a trusted source.
with open(DATA_PATH, 'rb') as pickle_file:
    data = pickle.load(pickle_file)

# Put the mapping's values into a one-column DataFrame and preview the first rows.
sequences = list(data.values())
df = pd.DataFrame(sequences, columns=['sequence'])
df.head()

Unnamed: 0,sequence
0,ATGGCTCAGACAAGATATACACAAAATAGATGGAGAAATGAAGCTT...
1,ATGGAACTTCGAGCATTAGAAGCGGATCTGAATTTCCTATCTGTCA...
2,ATGGAAGAATTTATTGCCCAGAAAATTCCATTCTGCTATCTGATTC...
3,ATGCGCTGTCCCAAATCCGCTGTTACTATGAGAAATGAAGAGCTGC...
4,ATGGCGGGTGACCTTGAAGACTGGCGTCAGCAGGCAAGATCAGCTC...


In [4]:
# Total number of elements (rows x columns); df has only the 'sequence' column at this point.
df.size

311

### Polypeptide Chains

In [5]:
from src.protein_synthesis import EucaryotesCell
# Number of characters in the leading 'H2N-' marker (amino group) of a chain string,
# as seen in the synthesized chains displayed in this notebook.
LENGTH_AMIO_GROUP = 4
# Number of characters in the trailing '-COOH' marker (carboxyl group) of a chain string.
LENGTH_CARBOXYL_GROUP = 5

In [6]:
# Single shared cell instance; synthesize_protein() reuses it for every sequence.
cell = EucaryotesCell()

In [7]:
def synthesize_protein(row):
    """Synthesize a protein from one row's sequence and record the outcome on the row.

    Uses the module-level ``cell`` object: the sequence is passed to
    ``cell.synthesize_protein`` and the results are read back with
    ``cell.get_protein`` and ``cell.get_extended_protein_name``.

    Args:
        row: A mapping-like row (e.g. a pandas Series from
            ``DataFrame.apply(..., axis=1)``) with a ``'sequence'`` entry.

    Returns:
        The same ``row`` object with four entries added:
        ``'polypeptides_chain_synthetized'``, ``'polypeptides_chain_extended'``,
        ``'protein_synthesized'`` (bool) and ``'peptides_cardinality'``
        (length of the chain without its leading and trailing group markers,
        or ``None`` when no chain was produced).
    """
    cell.synthesize_protein(row['sequence'])

    chain = cell.get_protein()
    chain_extended = cell.get_extended_protein_name()
    row['polypeptides_chain_synthetized'] = chain
    row['polypeptides_chain_extended'] = chain_extended

    # A falsy chain (presumably None or an empty string; verify against
    # EucaryotesCell) is treated as "nothing synthesized".
    synthesized = bool(chain)
    row['protein_synthesized'] = synthesized
    # Count only the characters between the leading and trailing group markers.
    row['peptides_cardinality'] = (
        len(chain[LENGTH_AMIO_GROUP:-LENGTH_CARBOXYL_GROUP]) if synthesized else None
    )

    return row

In [8]:
# Run the synthesis once per row (axis=1); the rows returned by synthesize_protein
# carry the added result columns, so df is replaced by the enriched frame.
df = df.apply(synthesize_protein, axis=1)

### Visualize results

In [9]:
# Rows flagged as synthesized, and rows whose chain has nothing between its group markers.
n_synthesized = df['protein_synthesized'].sum()
n_empty_chains = (df['peptides_cardinality'] == 0).sum()

print('Number of proteins synthesized: ', n_synthesized)
print('Number of true positives: ', n_synthesized - n_empty_chains)
print('Number of false positives: ', n_empty_chains)

Number of proteins synthesized:  47
Number of true positives:  37
Number of false positives:  10


In [10]:
# Show the rows where a protein was synthesized and the chain is non-empty.
# The comparison must be parenthesised: `&` binds tighter than `>`, so without
# the parentheses the mask is evaluated as `(synthesized & cardinality) > 0`
# and only works through implicit bool coercion of the float column.
has_peptides = df['protein_synthesized'] & (df['peptides_cardinality'] > 0)
df.loc[has_peptides, ['polypeptides_chain_synthetized', 'polypeptides_chain_extended',
                      'peptides_cardinality']]

Unnamed: 0,polypeptides_chain_synthetized,polypeptides_chain_extended,peptides_cardinality
7,H2N-WCNVYV-COOH,H2N-Trp-Cys-Asn-Val-Tyr-Val-COOH,6.0
8,H2N-SVNTFRSR-COOH,H2N-Ser-Val-Asn-Thr-Phe-Arg-Ser-Arg-COOH,8.0
9,H2N-PTKTGLF-COOH,H2N-Pro-Thr-Lys-Thr-Gly-Leu-Phe-COOH,7.0
13,H2N-YFFPNYLPNYFIVIYPL-COOH,H2N-Tyr-Phe-Phe-Pro-Asn-Tyr-Leu-Pro-Asn-Tyr-Ph...,17.0
16,H2N-ERQL-COOH,H2N-Glu-Arg-Gln-Leu-COOH,4.0
19,H2N-SD-COOH,H2N-Ser-As-COOH,2.0
20,H2N-SKSSQPKLVRARFGTPSFEIQETNLYLILIFVTLTSLQIGGA...,H2N-Ser-Lys-Ser-Ser-Gln-Pro-Lys-Leu-Val-Arg-Al...,75.0
24,H2N-SVETHRQCIVWSAPKLSVAYRLSRVSIRFCLGKTLTKSPL-COOH,H2N-Ser-Val-Glu-Thr-His-Arg-Gln-Cys-Ile-Val-Tr...,40.0
47,H2N-LVAPTSYSVPLRTLAPRNSG-COOH,H2N-Leu-Val-Ala-Pro-Thr-Ser-Tyr-Ser-Val-Pro-Le...,20.0
49,H2N-GQVTVPSYVVPRRSETLLIEFSPTLYTLLSRLVLNIV-COOH,H2N-Gly-Gln-Val-Thr-Val-Pro-Ser-Tyr-Val-Val-Pr...,37.0


In [11]:
# Inspect the full synthesized chain of the row labelled 273 with a single label-based lookup.
df.loc[273, 'polypeptides_chain_synthetized']

'H2N-SNNNLLSHLKLIMWISVEISPVGIKFTVVLVRFISSYRISLLPRRNVLDETLSSGTQCCTLDNVYKW-COOH'