In [None]:
import pandas as pd
import numpy as np

# Load the complete PAMPA assay table; low_memory=False makes pandas read the
# file in one pass instead of inferring column dtypes piecewise.
df = pd.read_csv("CycPeptMPDB_Peptide_Assay_PAMPA.csv", low_memory=False)

# Keep a single-column DataFrame (not a Series) holding only the SMILES strings.
smiles_df = df.loc[:, ['SMILES']]

In [None]:
# Show the SMILES-only frame as this cell's output.
smiles_df

Unnamed: 0,SMILES
0,C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC...
1,CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)NC(=O)[C@@H](...
2,CC(C)C[C@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@...
3,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc...
4,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C...
...,...
6936,N=C(N)NCCC[C@@H]1NC(=O)[C@H](Cc2c[nH]c3ccccc23...
6937,C[C@@H]1NC(=O)CSC[C@@H](C(N)=O)NC(=O)[C@H](CCC...
6938,N=C(N)NCCC[C@@H]1NC(=O)[C@H](Cc2c[nH]c3ccccc23...
6939,N=C(N)NCCC[C@@H]1NC(=O)[C@H](Cc2c[nH]c3ccccc23...


In [None]:
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
import os
import glob
from google.colab import files

# Function to process chunks and save results
def process_and_save_chunks(smiles_list, chunk_size, output_prefix):
    """Compute Mordred descriptors for SMILES strings, one CSV file per chunk.

    The input is split into consecutive slices of at most ``chunk_size``
    entries. For each slice the 2D descriptors are calculated and written to
    ``{output_prefix}_chunk_{i}.csv`` (``i`` is the zero-based chunk index),
    with an extra ``SMILES`` column identifying each row.

    Args:
        smiles_list: Sequence of SMILES strings (supports ``len`` and slicing).
        chunk_size: Maximum number of entries per chunk; must be positive.
        output_prefix: Prefix used to build each chunk's CSV file name.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size!r}')

    total = len(smiles_list)
    if total == 0:
        # Nothing to compute; avoid building the (expensive) calculator.
        return

    # Ceiling division: an exact multiple of chunk_size must not yield an
    # extra, empty trailing chunk (which would also skew the "i/N" progress).
    num_chunks = (total + chunk_size - 1) // chunk_size
    calc = Calculator(descriptors, ignore_3D=True)

    for i in range(num_chunks):
        start_idx = i * chunk_size
        chunk = smiles_list[start_idx:start_idx + chunk_size]

        # Chem.MolFromSmiles returns None for strings it cannot parse, and
        # missing CSV values arrive as non-string NaN. Neither can be passed
        # to the descriptor calculator, so skip them and report which ones.
        valid_smiles = []
        mols = []
        for smile in chunk:
            mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None
            if mol is None:
                print(f'Skipping unparsable SMILES in chunk {i+1}/{num_chunks}: {smile!r}')
                continue
            valid_smiles.append(smile)
            mols.append(mol)

        if not mols:
            print(f'Chunk {i+1}/{num_chunks} contains no valid molecules; nothing saved')
            continue

        # Calculate descriptors for the chunk
        descriptors_df = calc.pandas(mols)
        # Keep the SMILES next to its descriptors so rows stay identifiable
        # even when invalid entries were dropped.
        descriptors_df['SMILES'] = valid_smiles

        # Save the descriptors to CSV
        chunk_filename = f'{output_prefix}_chunk_{i}.csv'
        descriptors_df.to_csv(chunk_filename, index=False)
        print(f'Processed and saved chunk {i+1}/{num_chunks} to {chunk_filename}')


smiles_list = smiles_df['SMILES'].tolist()

# Process and save chunks
chunk_size = 200
process_and_save_chunks(smiles_list, chunk_size, 'descriptor')


def _chunk_suffix(path):
    """Return the text after the last underscore of a chunk file's stem."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return stem.rsplit('_', 1)[-1]


# Get a list of all chunk files. glob returns paths in arbitrary order, so
# sort by the numeric chunk index (a plain string sort would place chunk_10
# before chunk_2) to keep merged rows in the same order as the input SMILES.
chunk_files = sorted(
    (path for path in glob.glob('descriptor_chunk_*.csv') if _chunk_suffix(path).isdigit()),
    key=lambda path: int(_chunk_suffix(path)),
)

# Read and concatenate all chunk files into a single dataframe; ignore_index
# gives the merged frame one continuous index instead of repeating 0..N-1.
all_chunks = [pd.read_csv(file) for file in chunk_files]
merged_df = pd.concat(all_chunks, axis=0, ignore_index=True)

# Save the merged dataframe to a final CSV file
merged_df.to_csv('mordred_colab.csv', index=False)

# Download the final file
files.download('mordred_colab.csv')

  2%|▏         | 3/200 [00:01<01:39,  1.98it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 11%|█         | 22/200 [00:12<02:14,  1.32it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [03:24<00:00,  1.02s/it]


Processed and saved chunk 1/35 to descriptor_chunk_0.csv


  0%|          | 1/200 [00:03<12:23,  3.73s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [03:46<00:00,  1.13s/it]


Processed and saved chunk 2/35 to descriptor_chunk_1.csv


  0%|          | 1/200 [00:03<12:34,  3.79s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [03:33<00:00,  1.07s/it]


Processed and saved chunk 3/35 to descriptor_chunk_2.csv


  0%|          | 1/200 [00:02<09:13,  2.78s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [02:56<00:00,  1.13it/s]


Processed and saved chunk 4/35 to descriptor_chunk_3.csv


  0%|          | 1/200 [00:02<08:00,  2.42s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [02:59<00:00,  1.11it/s]


Processed and saved chunk 5/35 to descriptor_chunk_4.csv


  5%|▌         | 10/200 [00:04<01:16,  2.48it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:25<00:00,  2.33it/s]


Processed and saved chunk 6/35 to descriptor_chunk_5.csv




  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  4%|▍         | 8/200 [00:05<01:33,  2.06it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:43<00:00,  1.92it/s]


Processed and saved chunk 7/35 to descriptor_chunk_6.csv


  0%|          | 1/200 [00:01<04:00,  1.21s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  2%|▏         | 4/200 [00:02<01:31,  2.13it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:39<00:00,  2.02it/s]


Processed and saved chunk 8/35 to descriptor_chunk_7.csv


  8%|▊         | 16/200 [00:06<01:05,  2.81it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:37<00:00,  2.06it/s]


Processed and saved chunk 9/35 to descriptor_chunk_8.csv


  8%|▊         | 17/200 [00:06<01:25,  2.15it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 15%|█▌        | 30/200 [00:14<01:28,  1.93it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:36<00:00,  2.07it/s]


Processed and saved chunk 10/35 to descriptor_chunk_9.csv


 21%|██        | 42/200 [00:17<01:01,  2.56it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 22%|██▏       | 43/200 [00:17<01:04,  2.44it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:35<00:00,  2.10it/s]


Processed and saved chunk 11/35 to descriptor_chunk_10.csv


 31%|███       | 62/200 [00:29<01:01,  2.23it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 32%|███▎      | 65/200 [00:30<00:49,  2.73it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:29<00:00,  2.24it/s]


Processed and saved chunk 12/35 to descriptor_chunk_11.csv


 12%|█▎        | 25/200 [00:11<01:38,  1.78it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 60%|██████    | 121/200 [00:52<00:33,  2.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:24<00:00,  2.37it/s]


Processed and saved chunk 13/35 to descriptor_chunk_12.csv


  8%|▊         | 15/200 [00:08<02:04,  1.49it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 56%|█████▋    | 113/200 [00:48<00:28,  3.04it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:26<00:00,  2.31it/s]


Processed and saved chunk 14/35 to descriptor_chunk_13.csv


  8%|▊         | 15/200 [00:06<01:16,  2.42it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:26<00:00,  2.32it/s]


Processed and saved chunk 15/35 to descriptor_chunk_14.csv


 36%|███▋      | 73/200 [00:32<00:48,  2.60it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 49%|████▉     | 98/200 [00:46<00:48,  2.09it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:39<00:00,  2.02it/s]


Processed and saved chunk 16/35 to descriptor_chunk_15.csv


  2%|▏         | 3/200 [00:03<03:26,  1.05s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  2%|▎         | 5/200 [00:05<03:06,  1.04it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:47<00:00,  1.86it/s]


Processed and saved chunk 17/35 to descriptor_chunk_16.csv


 20%|██        | 41/200 [00:20<01:14,  2.12it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 28%|██▊       | 55/200 [00:29<01:11,  2.04it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:45<00:00,  1.90it/s]


Processed and saved chunk 18/35 to descriptor_chunk_17.csv


 19%|█▉        | 38/200 [00:19<01:17,  2.08it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 23%|██▎       | 46/200 [00:23<00:58,  2.65it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:44<00:00,  1.92it/s]


Processed and saved chunk 19/35 to descriptor_chunk_18.csv


 20%|█▉        | 39/200 [00:20<01:30,  1.79it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 28%|██▊       | 56/200 [00:30<01:01,  2.33it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:47<00:00,  1.85it/s]


Processed and saved chunk 20/35 to descriptor_chunk_19.csv


  9%|▉         | 18/200 [00:07<01:10,  2.57it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 14%|█▍        | 29/200 [00:15<02:36,  1.09it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:42<00:00,  1.94it/s]


Processed and saved chunk 21/35 to descriptor_chunk_20.csv


  0%|          | 1/200 [00:01<03:36,  1.09s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  3%|▎         | 6/200 [00:02<01:19,  2.45it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:49<00:00,  1.82it/s]


Processed and saved chunk 22/35 to descriptor_chunk_21.csv


  0%|          | 1/200 [00:01<03:40,  1.11s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [02:11<00:00,  1.52it/s]


Processed and saved chunk 23/35 to descriptor_chunk_22.csv


  4%|▎         | 7/200 [00:03<01:28,  2.18it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  5%|▌         | 10/200 [00:04<01:13,  2.58it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:59<00:00,  1.67it/s]


Processed and saved chunk 24/35 to descriptor_chunk_23.csv


  0%|          | 1/200 [00:01<03:27,  1.04s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  2%|▎         | 5/200 [00:02<01:37,  2.00it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:57<00:00,  1.70it/s]


Processed and saved chunk 25/35 to descriptor_chunk_24.csv


  0%|          | 1/200 [00:01<03:19,  1.00s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  2%|▎         | 5/200 [00:03<01:47,  1.81it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [01:41<00:00,  1.96it/s]


Processed and saved chunk 26/35 to descriptor_chunk_25.csv


  0%|          | 1/200 [00:01<06:09,  1.86s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [02:41<00:00,  1.24it/s]


Processed and saved chunk 27/35 to descriptor_chunk_26.csv


  0%|          | 1/200 [00:01<05:29,  1.66s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [02:37<00:00,  1.27it/s]


Processed and saved chunk 28/35 to descriptor_chunk_27.csv


  1%|          | 2/200 [00:01<02:19,  1.42it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [02:41<00:00,  1.24it/s]


Processed and saved chunk 29/35 to descriptor_chunk_28.csv


  0%|          | 1/200 [00:01<04:54,  1.48s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [02:38<00:00,  1.26it/s]


Processed and saved chunk 30/35 to descriptor_chunk_29.csv


  0%|          | 1/200 [00:02<07:23,  2.23s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [02:43<00:00,  1.22it/s]


Processed and saved chunk 31/35 to descriptor_chunk_30.csv


  0%|          | 1/200 [00:01<06:08,  1.85s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [02:38<00:00,  1.26it/s]


Processed and saved chunk 32/35 to descriptor_chunk_31.csv


  1%|          | 2/200 [00:02<02:51,  1.15it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [02:47<00:00,  1.20it/s]


Processed and saved chunk 33/35 to descriptor_chunk_32.csv


  0%|          | 1/200 [00:01<04:48,  1.45s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 200/200 [02:43<00:00,  1.23it/s]


Processed and saved chunk 34/35 to descriptor_chunk_33.csv


  1%|          | 1/141 [00:02<05:14,  2.25s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 141/141 [01:41<00:00,  1.40it/s]


Processed and saved chunk 35/35 to descriptor_chunk_34.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>