In [10]:
import pandas as pd
import tqdm
import glob
from utils.alphafold import download_alphafold, check_atom_coherence


In [11]:
DL_ALPHAFOLD_PDB = True
UPDATE_PATHS = True
DATASET_INPUT_PATH = "./data/main_dataset_creation/outputs/all_v2_2/dataset_only_infos.csv"
DATASET_OUTPUT_PATH = "./data/main_dataset_creation/outputs/all_v2_2/dataset_with_alphafold_paths.csv"


In [12]:
df = pd.read_csv(DATASET_INPUT_PATH)


In [13]:
if DL_ALPHAFOLD_PDB:
    # download alphafold prediction
    for alphafold_id in tqdm.tqdm(df.AlphaFoldDB.unique()):
        download_alphafold(alphafold_id)

    # some entries in the dataset have no linked alphafold id, we try our luck with the uniprot id instead
    # already download structures will not be redownloaded (check in download_alphafold)

    for uniprot_id in tqdm.tqdm(df.uniprot.unique()):
        download_alphafold(uniprot_id)


100%|██████████| 417/417 [00:00<00:00, 3759.02it/s]


exception raised for alphafold_id: nan:
HTTP Error 404: Not Found


  0%|          | 0/484 [00:00<?, ?it/s]

exception raised for alphafold_id: Q23551:
HTTP Error 404: Not Found
exception raised for alphafold_id: Q10466:
HTTP Error 404: Not Found


 26%|██▌       | 126/484 [00:00<00:00, 658.23it/s]

exception raised for alphafold_id: Q9H782:
HTTP Error 404: Not Found


 40%|███▉      | 192/484 [00:00<00:00, 472.48it/s]

exception raised for alphafold_id: P01625:
HTTP Error 404: Not Found
exception raised for alphafold_id: P11532:
HTTP Error 404: Not Found


100%|██████████| 484/484 [00:00<00:00, 653.37it/s]

exception raised for alphafold_id: P12823:
HTTP Error 404: Not Found
exception raised for alphafold_id: P01607:
HTTP Error 404: Not Found





In [14]:
if UPDATE_PATHS:
    # we now add the path to each record of the dataframe
    df["alphafold_path"] = ""

    def find_alphafold_path(row):
        alphafold_id = row["AlphaFoldDB"]
        path = glob.glob(f"./data/main_dataset_creation/3D_structures/alphafold/{alphafold_id}.pdb")
        if not path:
            alphafold_id = row["uniprot"]
            path = glob.glob(
                f"./data/main_dataset_creation/3D_structures/alphafold/{alphafold_id}.pdb")
        
        row["alphafold_path"] = path[0] if path else ""
        return row

    df = df.apply(find_alphafold_path, axis=1)

In [15]:
df.to_csv(DATASET_OUTPUT_PATH, index=False)


In [16]:
df = pd.read_csv(DATASET_OUTPUT_PATH)
print(f"the current 3D structures folder contains {len(glob.glob('./data/main_dataset_creation/3D_structures/alphafold/*.pdb'))} files")
print(
    f"there are {len(df[pd.isna(df.alphafold_path)])} records without 3D structures path")


the current 3D structures folder contains 522 files
there are 22 records without 3D structures path


In [17]:
print(len(df))
# we drop rows without ddG
df = df[~pd.isna(df.ddG)]
# we drop rows without alphafold_path
df = df[~pd.isna(df.alphafold_path)]
print(len(df))
print(len(df.uniprot.unique()))
print(len(df.alphafold_path.unique()))

11316
6712
297
294


#### Check Coherence between pdb and sequence

In [None]:
df = pd.read_csv(DATASET_OUTPUT_PATH)
print(len(df))
df = check_atom_coherence(df)
print(len(df))


In [18]:
# stop run all
assert False

AssertionError: 

In [None]:
from utils.file_utils import write_json
no_3d_struct = df[pd.isna(df.alphafold_path)]
no_3d_struct = no_3d_struct[["uniprot", "sequence"]]
no_3d_struct.drop_duplicates(subset=["sequence"], inplace=True)
# put into latch bio template for running it online
# see https://console.latch.bio/workflows/82788/parameters
no_3d_struct.rename(columns={"sequence": "aa_sequence", "uniprot": "run_name"}, inplace=True)
def fasta_format(row):
    row["aa_sequence"] = f"> {row['run_name']}\n"+row["aa_sequence"]
    return row

no_3d_struct = no_3d_struct.apply(fasta_format, axis=1)
no_3d_struct = no_3d_struct[["aa_sequence", "run_name"]]
no_3d_struct.to_csv("./data/main_dataset_creation/3D_structures/no_3d_struct_latchbio.csv", index=False)

# alphafold's shared notebook on google colab:
# https://colab.research.google.com/github/deepmind/alphafold/blob/main/notebooks/AlphaFold.ipynb#scrollTo=woIxeCPygt7K

In [None]:
from utils.file_utils import write_json
list_missing_uniprot = no_3d_struct.run_name.to_dict()
write_json("./data/main_dataset_creation/3D_structures/no_3d_struct_latchbio_ids.json",
           list_missing_uniprot)
