In [1]:
import itertools
import pandas as pd
import random

polymers = ["[*]CC([*])C", "[*]CC([*])CC","[*]CC([*])CCC","[*]CC([*])CC(C)C","[*]CCCCCCCC([*])C","[*]CC([*])O","[*]CC([*])OC(=O)CCCCCCCCCCC","[*]CC([*])OC(=O)CCCCCCCCCCCCCCC","[*]CC([*])OC(=O)CCCCCCCCCCCCCCCCC","[*]CC([*])(C)C#N"]
enzymes = ["PETase", "LCC", "TfH","ThC_Cut1","ThC_Cut2","HiC","FsC","PET2","PET5","Cut190"]

# https://docs.python.org/3/library/itertools.html

pr = itertools.product(polymers, enzymes)
xc = [44.47, 34.04, 20.01, 21.64, 31.47, 39.089205, 12.02, 33.35, 37.02, 8.0]
my_list = []

for pol, enz in pr:
    xc_value = xc[polymers.index(pol)]  # Get the initial xc value for the corresponding polymer
    for time in range(0, 500, 24):
        if time != 0:
            xc_value += random.uniform(0.5, 2.5)  # Increase xc with a random int between 2 and 5
        my_list.append({"time/h": time, "polymer": pol, "enzyme": enz, "crystallinity xc": xc_value})
df = pd.DataFrame(my_list)

df.to_pickle('my_dataset.pkl')

print(df)   

      time/h           polymer  enzyme  crystallinity xc
0          0       [*]CC([*])C  PETase         44.470000
1         24       [*]CC([*])C  PETase         45.208088
2         48       [*]CC([*])C  PETase         46.242378
3         72       [*]CC([*])C  PETase         46.837852
4         96       [*]CC([*])C  PETase         49.167938
...      ...               ...     ...               ...
2095     384  [*]CC([*])(C)C#N  Cut190         33.172276
2096     408  [*]CC([*])(C)C#N  Cut190         34.653355
2097     432  [*]CC([*])(C)C#N  Cut190         35.451463
2098     456  [*]CC([*])(C)C#N  Cut190         36.308215
2099     480  [*]CC([*])(C)C#N  Cut190         38.534018

[2100 rows x 4 columns]


In [3]:
!pip install rdkit-pypi

Defaulting to user installation because normal site-packages is not writeable
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [4]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd

In [9]:
data = {
    'polymer_smiles': ["[*]CC([*])C", "[*]CC([*])CC","[*]CC([*])CCC","[*]CC([*])CC(C)C","[*]CCCCCCCC([*])C","[*]CC([*])O","[*]CC([*])OC(=O)CCCCCCCCCCC","[*]CC([*])OC(=O)CCCCCCCCCCCCCCC","[*]CC([*])OC(=O)CCCCCCCCCCCCCCCCC","[*]CC([*])(C)C#N"]
}

df_polymer = pd.DataFrame(data)

# Function to generate Morgan fingerprints from SMILES
def generate_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2).ToBitString()
        return fingerprint
    else:
        return None

# Apply the function to the 'polymer_smiles' column
df_polymer['fingerprint'] = df['polymer_smiles'].apply(generate_fingerprint)

# Display the DataFrame with fingerprints
print(df_polymer)

                      polymer_smiles  \
0                        [*]CC([*])C   
1                       [*]CC([*])CC   
2                      [*]CC([*])CCC   
3                   [*]CC([*])CC(C)C   
4                  [*]CCCCCCCC([*])C   
5                        [*]CC([*])O   
6        [*]CC([*])OC(=O)CCCCCCCCCCC   
7    [*]CC([*])OC(=O)CCCCCCCCCCCCCCC   
8  [*]CC([*])OC(=O)CCCCCCCCCCCCCCCCC   
9                   [*]CC([*])(C)C#N   

                                         fingerprint  
0  0100000000000000000000000000000000000000000000...  
1  1100000000000000000000000000000000000000000000...  
2  0100000000000000000000000000000000000000000000...  
3  0100000000000000000000000000000000000000000000...  
4  0100000000000000000000000000000000000000000000...  
5  0100000000000000000000000000000000000000000000...  
6  0100000000000000000000000000000000000000000000...  
7  0100000000000000000000000000000000000000000000...  
8  0100000000000000000000000000000000000000000000...  
9  000000

In [6]:
pip install biopython

Defaulting to user installation because normal site-packages is not writeable
Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: biopython
Successfully installed biopython-1.81
Note: you may need to restart the kernel to use updated packages.


In [7]:
data = {
    'enzyme_name': ['PETase', 'LCC', 'TfH', 'ThC_Cut1', 'ThC_Cut2', 'HiC', 'FsC', 'PET2', 'PET5', 'Cut190'],
    'enzyme_sequence': [
        'MKKVFAVTTVVLSVAGVVSAAAEVPEV',
        'MGGGFGLLLGILAGVAAGCVFSYFAATKPSPLAAGF',
        'MAHFKSPAKPVTYPQVAQNYNVLHAWM',
        'MKKTSLLLVATLVSLSASAGVSKKTLGKTLKSLLVFLLLSLGGVSALGGAA',
        'MKKTSLLLVATLVSLSASAGVSKKTLGKTLKSLLVFLLLSLGGVSALGGAA',
        'MKLFVALVLTIVIASTHGRTIAAAEMNKRRKSNKRKTRVHQR',
        'MSTGTFSMILVVLSVAGIFGIEYKLGIRVGLI',
        'MKKITFILVLLLSCISGKNSSSNSYAGTTFLHKRQVLTGR',
        'MKIKIFFLVMIFVLCISYNFISFFNSTIEMHETYSKSG',
        'MKTIVFLAIIFTSSSIVTGGAHSLSKKIEIEEGSETRWVVEYKAQMKFLP'
    ]
}

df_enzymes = pd.DataFrame(data)

# Function to generate enzyme fingerprints
def generate_enzyme_fingerprint(sequence):
    # Calculate amino acid composition
    amino_acid_composition = {aa: sequence.count(aa) / len(sequence) for aa in set(sequence)}
    
    # Sort amino acids to ensure consistent order
    sorted_aa = sorted(amino_acid_composition.keys())
    
    # Create a fingerprint as a list of amino acid frequencies
    fingerprint = [amino_acid_composition[aa] for aa in sorted_aa]
    
    return fingerprint

# Apply the function to the 'enzyme_sequence' column
df_enzymes['fingerprint'] = df_enzymes['enzyme_sequence'].apply(generate_enzyme_fingerprint)

# Display the DataFrame with enzyme fingerprints
print(df_enzymes[['enzyme_name', 'fingerprint']])

  enzyme_name                                        fingerprint
0      PETase  [0.18518518518518517, 0.07407407407407407, 0.0...
1         LCC  [0.19444444444444445, 0.027777777777777776, 0....
2         TfH  [0.14814814814814814, 0.037037037037037035, 0....
3    ThC_Cut1  [0.11764705882352941, 0.0196078431372549, 0.11...
4    ThC_Cut2  [0.11764705882352941, 0.0196078431372549, 0.11...
5         HiC  [0.11904761904761904, 0.023809523809523808, 0....
6         FsC  [0.03125, 0.03125, 0.0625, 0.15625, 0.15625, 0...
7        PET2  [0.025, 0.025, 0.05, 0.075, 0.025, 0.075, 0.1,...
8        PET5  [0.02631578947368421, 0.05263157894736842, 0.1...
9      Cut190  [0.06, 0.1, 0.06, 0.06, 0.02, 0.12, 0.1, 0.06,...


In [1]:
'hello world'

'hello world'