In [1]:
import pandas as pd
import tqdm
import glob
import os

from utils.alphafold import download_alphafold, check_atom_coherence


In [2]:
DL_ALPHAFOLD_PDB = True
UPDATE_PATHS = True
CHECK_COHERENCE = True
ONLY_DDG = False
WITH_MUTATED_STRUCTURE = True

DATASET_DIR = "./data/main_dataset_creation/outputs/merged/"
DATASET_INPUT_PATH = f"{DATASET_DIR}dataset_only_infos.csv"
DATASET_OUTPUT_PATH = f"{DATASET_DIR}dataset_with_3D_paths_2.csv"


In [3]:
df = pd.read_csv(DATASET_INPUT_PATH)


In [4]:
if DL_ALPHAFOLD_PDB:
    # download alphafold prediction
    for alphafold_id in tqdm.tqdm(df.AlphaFoldDB.unique()):
        download_alphafold(alphafold_id)

    # some entries in the dataset have no linked alphafold id, we try our luck with the uniprot id instead
    # already download structures will not be redownloaded (check in download_alphafold)

    for uniprot_id in tqdm.tqdm(df.uniprot.unique()):
        download_alphafold(uniprot_id)


100%|██████████| 407/407 [00:00<00:00, 4790.75it/s]


exception raised for alphafold_id: nan:
HTTP Error 404: Not Found


 12%|█▏        | 58/478 [00:00<00:01, 302.13it/s]

exception raised for alphafold_id: Q23551:
HTTP Error 404: Not Found


 25%|██▍       | 118/478 [00:00<00:00, 375.55it/s]

exception raised for alphafold_id: Q10466:
HTTP Error 404: Not Found
exception raised for alphafold_id: Q9H782:
HTTP Error 404: Not Found
exception raised for alphafold_id: P01625:
HTTP Error 404: Not Found


 63%|██████▎   | 301/478 [00:00<00:00, 808.58it/s]

exception raised for alphafold_id: P11532:
HTTP Error 404: Not Found
exception raised for alphafold_id: P12823:
HTTP Error 404: Not Found
exception raised for alphafold_id: P01607:
HTTP Error 404: Not Found


100%|██████████| 478/478 [00:00<00:00, 722.33it/s]

exception raised for alphafold_id: P08519:
HTTP Error 404: Not Found





In [5]:
if UPDATE_PATHS:
    # we now add the path to each record of the dataframe

    df["alphafold_path"] = ""
    df["relaxed_wild_3D_path"] = ""
    df["relaxed_mutated_3D_path"] = ""

    all_alphafold_paths = glob.glob(
        f"./data/main_dataset_creation/3D_structures/alphafold/*.pdb")
    all_relaxed_paths = glob.glob(
        f"./compute_mutated_structures/relaxed_pdb/**/*_relaxed.pdb")

    def find_alphafold_3D_path(row):
        alphafold_id = row["AlphaFoldDB"]
        path = f"./data/main_dataset_creation/3D_structures/alphafold/{alphafold_id}.pdb"
        if path not in all_alphafold_paths:
            alphafold_id = row["uniprot"]
            path = f"./data/main_dataset_creation/3D_structures/alphafold/{alphafold_id}.pdb"

        row["alphafold_path"] = path if path in all_alphafold_paths else ""
        return row

    def find_wild_3D_path(row):
        name, _ = os.path.splitext(row["alphafold_path"].split("/")[-1])
        path = f"./compute_mutated_structures/relaxed_pdb/{name}_relaxed/{name}_relaxed.pdb"
        
        row["relaxed_wild_3D_path"] = path if path in all_relaxed_paths else ""
        return row

    def find_mutation_3D_path(row):
        name, _ = os.path.splitext(row["alphafold_path"].split("/")[-1])
        w_aa, m_aa = row["wild_aa"], row["mutated_aa"]
        pos  = int(row["mutation_position"])+1
        path = (f"./compute_mutated_structures/relaxed_pdb/{name}_relaxed/" +
                f"{name}_relaxed_{w_aa}{pos}{m_aa}_relaxed.pdb")

        row["relaxed_mutated_3D_path"] = path if path in all_relaxed_paths else ""
        return row

    df = df.apply(find_alphafold_3D_path, axis=1)
    df = df.apply(find_wild_3D_path, axis=1)
    df = df.apply(find_mutation_3D_path, axis=1)

    # if WITH_MUTATED_STRUCTURE:
    #     df = df[~(df.relaxed_mutated_3D_path.eq(""))]


In [6]:
df.to_csv(DATASET_OUTPUT_PATH, index=False)


In [7]:
df = pd.read_csv(DATASET_OUTPUT_PATH)
print(f"the current 3D structures folder contains {len(glob.glob('./data/main_dataset_creation/3D_structures/alphafold/*.pdb'))} files")
print(
    f"there are {len(df[pd.isna(df.alphafold_path)])} records without 3D structures path")


the current 3D structures folder contains 539 files
there are 23 records without 3D structures path


In [8]:
print(len(df))
if ONLY_DDG:
    # we drop rows without ddG
    df = df[~pd.isna(df.ddG)]
# we drop rows without alphafold_path
df = df[~pd.isna(df.alphafold_path)]
if CHECK_COHERENCE:
    print(len(df))
    df = check_atom_coherence(df)
    print(len(df))
print(len(df))
print(len(df.uniprot.unique()))
print(len(df.alphafold_path.unique()))


11113
11090
checking coherence between 470 pairs of sequence-atom(pdb) files


326it [00:08, 32.41it/s]

error for ./data/main_dataset_creation/3D_structures/alphafold/P28335.pdb at position 22: C instead of S


339it [00:09, 37.04it/s]

error for ./data/main_dataset_creation/3D_structures/alphafold/P00749.pdb at position 140: P instead of L


470it [00:13, 33.73it/s]

found 2 non coherent sequence-atom(pdb) pairs
11058
11058
471
468





In [9]:
# save df
df.to_csv(DATASET_OUTPUT_PATH, index=False)


In [10]:
# stop run all
assert False

AssertionError: 

In [None]:
from utils.file_utils import write_json
no_3d_struct = df[pd.isna(df.alphafold_path)]
no_3d_struct = no_3d_struct[["uniprot", "sequence"]]
no_3d_struct.drop_duplicates(subset=["sequence"], inplace=True)
# put into latch bio template for running it online
# see https://console.latch.bio/workflows/82788/parameters
no_3d_struct.rename(columns={"sequence": "aa_sequence", "uniprot": "run_name"}, inplace=True)
def fasta_format(row):
    row["aa_sequence"] = f"> {row['run_name']}\n"+row["aa_sequence"]
    return row

no_3d_struct = no_3d_struct.apply(fasta_format, axis=1)
no_3d_struct = no_3d_struct[["aa_sequence", "run_name"]]
no_3d_struct.to_csv("./data/main_dataset_creation/3D_structures/no_3d_struct_latchbio.csv", index=False)

# alphafold's shared notebook on google colab:
# https://colab.research.google.com/github/deepmind/alphafold/blob/main/notebooks/AlphaFold.ipynb#scrollTo=woIxeCPygt7K

In [None]:
from utils.file_utils import write_json
list_missing_uniprot = no_3d_struct.run_name.to_dict()
write_json("./data/main_dataset_creation/3D_structures/no_3d_struct_latchbio_ids.json",
           list_missing_uniprot)


In [11]:
import pandas as pd
df = pd.read_csv(DATASET_OUTPUT_PATH)
df.isna().sum()

uniprot                       0
wild_aa                       0
mutated_chain                 0
mutation_position             0
mutated_aa                    0
pH                            0
sequence                      0
length                        0
chain_start                   0
chain_end                     0
AlphaFoldDB                1528
Tm                         7225
ddG                        2417
dTm                        5949
dataset_source                0
infos_found                   0
alphafold_path                0
relaxed_wild_3D_path          7
relaxed_mutated_3D_path     471
dtype: int64