In [None]:
import pandas as pd
import tqdm
import glob
from utils.alphafold import download_alphafold

In [None]:
# Read the main dataset into df and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="./data/main_dataset/main.csv")
df.head(n=2)

In [None]:
# Download the AlphaFold prediction for every distinct AlphaFoldDB id in the dataset.
# Records without a linked id (presumably read back as NaN) are dropped first so
# that download_alphafold is only ever called with a real id.
for alphafold_id in tqdm.tqdm(df.AlphaFoldDB.dropna().unique()):
    download_alphafold(alphafold_id)

In [None]:
# Some entries in the dataset have no linked AlphaFold id, so we try our luck
# with the UniProt id instead. Already downloaded structures are not downloaded
# again (see the check in download_alphafold).
# Missing UniProt ids are dropped so that NaN is never passed to download_alphafold.
for uniprot_id in tqdm.tqdm(df.uniprot.dropna().unique()):
    download_alphafold(uniprot_id)

In [None]:
# Add an "alphafold_path" column, initialised to the empty string for every record;
# it is filled in per row by find_alphafold_path below.
df["alphafold_path"] = ""

def find_alphafold_path(row):
    """Fill in ``alphafold_path`` for one dataset record.

    The structure file is looked up as
    ``./data/main_dataset/3D_structures/alphafold/<id>.pdb``, trying the
    record's ``AlphaFoldDB`` id first and falling back to its ``uniprot`` id.

    Args:
        row: A mapping-like record (e.g. the ``pandas.Series`` handed over by
            ``DataFrame.apply(..., axis=1)``) with ``AlphaFoldDB`` and
            ``uniprot`` entries.

    Returns:
        The same ``row`` object, with ``alphafold_path`` set to the path of the
        matching file, or to ``""`` when no file exists for either id.
    """
    found_path = ""
    for id_column in ("AlphaFoldDB", "uniprot"):
        structure_id = row[id_column]
        # A missing id would otherwise be formatted as "nan"/"None" and looked
        # up as nan.pdb / None.pdb.
        if pd.isna(structure_id):
            continue
        # Escape the id so glob metacharacters ("*", "?", "[") are matched
        # literally instead of acting as wildcards.
        matches = glob.glob(
            "./data/main_dataset/3D_structures/alphafold/"
            f"{glob.escape(str(structure_id))}.pdb"
        )
        if matches:
            found_path = matches[0]
            break

    row["alphafold_path"] = found_path
    return row

# Resolve the structure path of every record (one call per row), then write the
# augmented table back over the dataset file, without the index column.
df = df.apply(find_alphafold_path, axis="columns")
df.to_csv(path_or_buf="./data/main_dataset/main.csv", index=False)

In [None]:
# Reload the saved dataset, then report how many .pdb files are on disk and how
# many records have a missing (NaN) alphafold_path after the reload.
df = pd.read_csv(filepath_or_buffer="./data/main_dataset/main.csv")
print(
    "the current 3D structures folder contains "
    f"{len(glob.glob('./data/main_dataset/3D_structures/alphafold/*.pdb'))} files"
)
print(f"there are {df.alphafold_path.isna().sum()} records without 3D structures path")


In [27]:
from utils.file_utils import write_json
# Collect the records that have no local AlphaFold structure path, keep one row
# per distinct sequence, and export them to a CSV file.
# Built as a single chain without inplace=True, so the result is a fresh frame
# rather than an in-place edit of a slice of df (which pandas may flag with
# SettingWithCopyWarning).
no_3d_struct = (
    df.loc[df.alphafold_path.isna(), ["PDB_wild", "uniprot", "sequence"]]
    .drop_duplicates(subset=["sequence"])
)
no_3d_struct.to_csv("./data/main_dataset/no_3d_struct.csv", index=False)

# The sequences in no_3d_struct.csv are then used to obtain the missing 3D structures
# by running AlphaFold on each sequence in AlphaFold's shared Colab notebook:
# https://colab.research.google.com/github/deepmind/alphafold/blob/main/notebooks/AlphaFold.ipynb#scrollTo=woIxeCPygt7K