In [3]:
import pandas as pd
import tqdm
import glob
from utils.alphafold import download_alphafold

In [4]:
# Notebook switches.
# UPDATE_PATHS: when True, the "alphafold_path" column of the dataframe is rebuilt.
UPDATE_PATHS = True
# DL_ALPHAFOLD_PDB: when True, the AlphaFold download loops are run.
DL_ALPHAFOLD_PDB = False

In [5]:
# Load the main dataset; every later cell works on this dataframe.
df = pd.read_csv(filepath_or_buffer="./data/main_dataset/main.csv")

In [6]:
if DL_ALPHAFOLD_PDB:
    # Download the AlphaFold prediction for every distinct AlphaFoldDB id.
    # Series.unique() keeps NaN, so missing ids are dropped first to avoid
    # calling download_alphafold with a non-id value.
    for alphafold_id in tqdm.tqdm(df.AlphaFoldDB.dropna().unique()):
        download_alphafold(alphafold_id)

    # Some entries in the dataset have no linked AlphaFold id; we try our luck
    # with the UniProt id instead.
    # Already downloaded structures will not be redownloaded
    # (see the check in download_alphafold).
    for uniprot_id in tqdm.tqdm(df.uniprot.dropna().unique()):
        download_alphafold(uniprot_id)



In [7]:
if UPDATE_PATHS:
    # We now add the path of the local structure file to each record of the dataframe.
    df["alphafold_path"] = ""

    def find_alphafold_path(row):
        """Fill ``row["alphafold_path"]`` with the local .pdb file for this record.

        The file is looked up by the record's ``AlphaFoldDB`` id first and by
        its ``uniprot`` id second; the first existing file wins.

        Args:
            row: one dataframe record with ``AlphaFoldDB`` and ``uniprot`` entries.

        Returns:
            The same row, with ``alphafold_path`` set to the matching file path,
            or to ``""`` when no file exists for either id.
        """
        row["alphafold_path"] = ""
        for id_column in ("AlphaFoldDB", "uniprot"):
            structure_id = row[id_column]
            # A missing id would otherwise be formatted as the literal "nan".
            if pd.isna(structure_id):
                continue
            # Escape the id so characters such as "[", "*" or "?" are matched
            # literally instead of being interpreted as glob wildcards.
            matches = glob.glob(
                f"./data/main_dataset/3D_structures/alphafold/{glob.escape(str(structure_id))}.pdb"
            )
            if matches:
                row["alphafold_path"] = matches[0]
                break
        return row

    df = df.apply(find_alphafold_path, axis=1)

In [8]:
# Persist the dataframe back to the main dataset file (row index is not written).
df.to_csv(path_or_buf="./data/main_dataset/main.csv", index=False)

In [9]:
# Read the dataset back from disk so the summary reflects what was saved.
# read_csv parses empty fields as NaN by default, so records saved with an
# empty alphafold_path are the ones counted as missing below.
df = pd.read_csv("./data/main_dataset/main.csv")

structure_files = glob.glob('./data/main_dataset/3D_structures/alphafold/*.pdb')
print(f"the current 3D structures folder contains {len(structure_files)} files")

records_without_path = df[df.alphafold_path.isna()]
print(f"there are {len(records_without_path)} records without 3D structures path")


the current 3D structures folder contains 426 files
there are 113 records without 3D structures path


In [10]:
from utils.file_utils import write_json
# Collect the records that have no local 3D structure, keeping one record per
# distinct sequence, and export them for an online structure-prediction run.
# Built as a single non-mutating chain so the cell can be re-run safely and
# no inplace operation is applied to a slice of df.
no_3d_struct = (
    df.loc[pd.isna(df.alphafold_path), ["uniprot", "sequence"]]
    .drop_duplicates(subset=["sequence"])
    # put into latch bio template for running it online
    # see https://console.latch.bio/workflows/82788/parameters
    .rename(columns={"sequence": "aa_sequence", "uniprot": "run_name"})
)


def fasta_format(row):
    """Prefix the row's ``aa_sequence`` with a header line built from ``run_name``.

    Args:
        row: one record with ``run_name`` and ``aa_sequence`` entries.

    Returns:
        The same row, with ``aa_sequence`` replaced by the header line,
        a newline, and the original sequence.
    """
    row["aa_sequence"] = f"> {row['run_name']}\n"+row["aa_sequence"]
    return row


no_3d_struct = no_3d_struct.apply(fasta_format, axis=1)
# Column order expected in the exported file: sequence first, then run name.
no_3d_struct = no_3d_struct[["aa_sequence", "run_name"]]
no_3d_struct.to_csv("./data/main_dataset/3D_structures/no_3d_struct_latchbio.csv", index=False)

# alphafold's shared notebook on google colab:
# https://colab.research.google.com/github/deepmind/alphafold/blob/main/notebooks/AlphaFold.ipynb#scrollTo=woIxeCPygt7K