This little script creates the index CSV (`pdb_summary.csv`) needed by the `process_pdbs.py` script.

In [None]:
import csv
from pathlib import Path

def collect_pdb_data(folder_path, output_csv="pdb_summary.csv"):
    """Write an index CSV with one row per ``*.pdb`` file found below ``folder_path``.

    The folder is searched recursively. Each row holds ``pdb_id`` (the lower-cased
    file stem), ``pdb_path`` (the path as a string) and the placeholder columns
    ``chain1`` ("A"), ``chain2`` ("B_C"), ``lig_code``, ``lig_smiles`` and
    ``lig_resi`` (all empty).

    Rows are written in sorted path order. ``Path.rglob`` yields entries in an
    arbitrary, filesystem-dependent order, so without sorting the same folder
    could produce differently ordered CSV files on different runs or machines.

    Args:
        folder_path: Directory (str or Path) that is searched for ``*.pdb`` files.
        output_csv: Destination path of the CSV file; it is overwritten if present.

    Returns:
        None. The CSV is written to ``output_csv`` and a confirmation is printed.
    """
    fieldnames = ["pdb_id", "pdb_path", "chain1", "chain2", "lig_code", "lig_smiles", "lig_resi"]
    folder_path = Path(folder_path)

    # Keep regular files only (a directory named "*.pdb" is not a structure file)
    # and sort so the row order is deterministic.
    pdb_files = sorted(p for p in folder_path.rglob("*.pdb") if p.is_file())

    pdb_entries = [
        {
            "pdb_id": pdb_file.stem.lower(),
            "pdb_path": str(pdb_file),
            # Placeholder fields – update this logic if you want to extract real data
            "chain1": "A",
            "chain2": "B_C",
            "lig_code": "",
            "lig_smiles": "",
            "lig_resi": "",
        }
        for pdb_file in pdb_files
    ]

    # newline="" lets the csv module control line endings; the explicit encoding
    # keeps the output independent of the platform's default locale.
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(pdb_entries)

    print(f"CSV file written to {output_csv}")

# Example usage: build the index for every PDB file below the complexes folder.
# The CSV lands in the current working directory under the default file name.
if __name__ == "__main__":
    collect_pdb_data(
        folder_path="/home/sascha/data/Projects/affinity_project/affinity/src/submodules/dna_positioning/complexes_pdbs",
    )


CSV file written to pdb_summary.csv


Subsequently, we need to run `process_pdbs.py` in the terminal:
```
python data/process_pdbs.py \
    --data_index_file /home/sascha/data/Projects/affinity_project/affinity/src/submodules/ATOMICA/trial/process_complexes/pdb_summary.csv \
    --out_path /home/sascha/data/Projects/affinity_project/affinity/src/submodules/ATOMICA/trial/process_complexes/processed_pdbs.pkl
```
and then add the labels to the resulting `.pkl` file.  
For now, I'll create dummy labels instead.

In [1]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split

# === CONFIGURATION ===
input_pkl = "/home/sascha/data/Projects/affinity_project/affinity/src/submodules/ATOMICA/trial/process_complexes/processed_pdbs.pkl"
output_train_pkl = "/home/sascha/data/Projects/affinity_project/affinity/src/submodules/ATOMICA/trial/process_complexes/train_labelled_processed_pdbs.pkl"
output_test_pkl = "/home/sascha/data/Projects/affinity_project/affinity/src/submodules/ATOMICA/trial/process_complexes/test_labelled_processed_pdbs.pkl"

seed = 42  # one seed for both the fake labels and the train/test split
mu = 0.0
sigma = 125.0  # raw draws ~ N(mu, sigma); after abs() about 95% of the magnitudes are below 250
min_affinity_value = 1e-5  # to avoid log(0) in neglog_aff computation

# Dedicated, seeded generator: without it the labels change on every run even
# though the split itself is seeded, so the saved datasets are not reproducible.
rng = np.random.default_rng(seed)

# === LOAD ===
# NOTE(security): pickle.load can execute arbitrary code – only load files you created yourself.
with open(input_pkl, 'rb') as f:
    data = pickle.load(f)

print(f"Loaded {len(data)} items from {input_pkl}")

# === GENERATE FAKE AFFINITIES ===
# Each item is modified in place: it receives a float 'label' and an
# 'affinity' dict holding the same value under 'neglog_aff'.
for item in data:
    raw_affinity = rng.normal(loc=mu, scale=sigma)
    # Use the magnitude (the log needs a positive argument) and clamp it away from zero.
    affinity_for_log = max(abs(raw_affinity), min_affinity_value)
    neglog_aff = -np.log(affinity_for_log)

    item['label'] = float(neglog_aff)
    item['affinity'] = {'neglog_aff': float(neglog_aff)}

# === TRAIN-TEST SPLIT ===
train_data, test_data = train_test_split(data, test_size=0.2, random_state=seed)

print(f"Train size: {len(train_data)}")
print(f"Test size: {len(test_data)}")

# === SAVE SPLITS ===
with open(output_train_pkl, 'wb') as f:
    pickle.dump(train_data, f)
with open(output_test_pkl, 'wb') as f:
    pickle.dump(test_data, f)

print(f"[DONE] Saved train set to {output_train_pkl}")
print(f"[DONE] Saved test set to {output_test_pkl}")


Loaded 1000 items from /home/sascha/data/Projects/affinity_project/affinity/src/submodules/ATOMICA/trial/process_complexes/processed_pdbs.pkl
Train size: 800
Test size: 200
[DONE] Saved train set to /home/sascha/data/Projects/affinity_project/affinity/src/submodules/ATOMICA/trial/process_complexes/train_labelled_processed_pdbs.pkl
[DONE] Saved test set to /home/sascha/data/Projects/affinity_project/affinity/src/submodules/ATOMICA/trial/process_complexes/test_labelled_processed_pdbs.pkl


In [None]:
import pickle
import numpy as np

# Reload the labelled training split from disk for a quick sanity check.
# NOTE: the variable is named `test`, but it holds the *train* split.
labelled_train_pkl = "/home/sascha/data/Projects/affinity_project/affinity/src/submodules/ATOMICA/trial/process_complexes/train_labelled_processed_pdbs.pkl"
with open(labelled_train_pkl, mode='rb') as f:
    test = pickle.load(f)


TypeError: list indices must be integers or slices, not str

In [24]:
# Gather the stored -log(affinity) value of every loaded item into a plain list.
neglog_aff = [record["affinity"]["neglog_aff"] for record in test]

In [26]:
import math
# Undo the -log transform: exp(-(-log(a))) gives back the affinity magnitude a.
neglog_values = np.array(neglog_aff)
aff = np.exp(-neglog_values)

In [27]:
# Bare last expression: shows the recovered affinity magnitudes as the cell output.
aff

array([9.10226600e+01, 5.41718581e+01, 7.09359721e+01, 1.01733382e+01,
       7.15976304e+01, 1.88131763e+01, 8.54637483e+01, 1.91153325e+02,
       4.34943402e+01, 5.93843100e+01, 1.77857847e+02, 6.37446812e+01,
       8.65197978e+01, 1.28161972e+02, 6.60846978e+01, 1.43412405e+01,
       5.69186776e+01, 2.27322620e+01, 1.05953750e+02, 7.13052657e+01,
       2.36796421e+02, 1.35223616e+02, 9.03597929e+01, 1.77358878e+02,
       9.23756073e+01, 9.16291948e+01, 3.91432127e+01, 6.77674965e+01,
       1.55466631e+02, 9.07802226e+01, 9.79889020e-01, 6.65728452e+01,
       2.90283298e+01, 5.95242002e+01, 1.49487589e+02, 1.59685749e+01,
       5.35372602e+01, 1.30924077e+02, 6.91280253e+01, 2.00837854e+01,
       2.40521375e+02, 9.54742770e+01, 2.78258281e+02, 4.27954339e+01,
       1.72521144e+02, 1.95868034e+01, 7.39428616e+01, 1.99785904e+02,
       3.44748040e+01, 1.50823356e+02, 3.96799011e+01, 8.36528547e+01,
       1.04487761e+02, 4.89559484e+01, 3.76260501e+01, 7.87936414e+01,
      

Now I can try to run `train.py` with this dataset to check whether it trains at all.  
