# AlphaFold - preparation of files for prediction

In [1]:
import pandas as pd
import os
from typing import List, Dict, Optional, Set, Tuple

## Load the `proteins` and `chains` dataframes from .csv files

In [2]:
# Read the per-protein table from disk and display it.
proteins = pd.read_csv(filepath_or_buffer="proteins/proteins.csv")
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,unique_chains,inferred_fasta_path,masks_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,['8XPV:A'],proteins/inferred_fasta_files/8XPV.fasta,proteins/mask_files/8XPV_mask.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,['8GQ4:A'],proteins/inferred_fasta_files/8GQ4.fasta,proteins/mask_files/8GQ4_mask.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,['8TIF:A'],proteins/inferred_fasta_files/8TIF.fasta,proteins/mask_files/8TIF_mask.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,['8H3Z:A'],proteins/inferred_fasta_files/8H3Z.fasta,proteins/mask_files/8H3Z_mask.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,['8ALL:A'],proteins/inferred_fasta_files/8ALL.fasta,proteins/mask_files/8ALL_mask.fasta
...,...,...,...,...,...,...,...
744,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,['8HNE:A'],proteins/inferred_fasta_files/8HNE.fasta,proteins/mask_files/8HNE_mask.fasta
745,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,['8FIN:A'],proteins/inferred_fasta_files/8FIN.fasta,proteins/mask_files/8FIN_mask.fasta
746,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,['8J0A:A'],proteins/inferred_fasta_files/8J0A.fasta,proteins/mask_files/8J0A_mask.fasta
747,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,['8HDU:A'],proteins/inferred_fasta_files/8HDU.fasta,proteins/mask_files/8HDU_mask.fasta


In [3]:
# Read the per-chain table from disk and display it.
chains = pd.read_csv(filepath_or_buffer="proteins/chains.csv")
chains

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path,masks_path,chain_inferred_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,8XPV:A,proteins/inferred_fasta_files/8XPV.fasta,proteins/mask_files/8XPV_mask.fasta,proteins/chain_fasta_files/8XPV:A.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,8GQ4:A,proteins/inferred_fasta_files/8GQ4.fasta,proteins/mask_files/8GQ4_mask.fasta,proteins/chain_fasta_files/8GQ4:A.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,8TIF:A,proteins/inferred_fasta_files/8TIF.fasta,proteins/mask_files/8TIF_mask.fasta,proteins/chain_fasta_files/8TIF:A.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,8H3Z:A,proteins/inferred_fasta_files/8H3Z.fasta,proteins/mask_files/8H3Z_mask.fasta,proteins/chain_fasta_files/8H3Z:A.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,8ALL:A,proteins/inferred_fasta_files/8ALL.fasta,proteins/mask_files/8ALL_mask.fasta,proteins/chain_fasta_files/8ALL:A.fasta
...,...,...,...,...,...,...,...,...
1042,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,8HNE:A,proteins/inferred_fasta_files/8HNE.fasta,proteins/mask_files/8HNE_mask.fasta,proteins/chain_fasta_files/8HNE:A.fasta
1043,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,8FIN:A,proteins/inferred_fasta_files/8FIN.fasta,proteins/mask_files/8FIN_mask.fasta,proteins/chain_fasta_files/8FIN:A.fasta
1044,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,8J0A:A,proteins/inferred_fasta_files/8J0A.fasta,proteins/mask_files/8J0A_mask.fasta,proteins/chain_fasta_files/8J0A:A.fasta
1045,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,8HDU:A,proteins/inferred_fasta_files/8HDU.fasta,proteins/mask_files/8HDU_mask.fasta,proteins/chain_fasta_files/8HDU:A.fasta


Preparing fasta files containing sequences of separate chains:

## Divide fasta files into 21 batches containing approximately 50 files
(run on MetaCentrum using OnDemand)

## Obtain a directory containing only the pdb files of relaxed best models
(run on MetaCentrum using OnDemand)

## Add paths to the `chains` dataframe and save it as a .csv file

In [4]:
def check_file(path):
    """Return `path` unchanged if it names an existing regular file, else None.

    Directories and non-existent paths both yield None.
    """
    return path if os.path.isfile(path) else None


# Record, for every chain, the path of its AlphaFold prediction PDB file, or
# None when that file is not present on disk.
# Only `chain_id` is needed to build the path, so map over that single column
# instead of a row-wise DataFrame.apply(axis=1): this avoids constructing a
# Series per row and always yields a Series, even when `chains` has no rows.
chains["AF_prediction_pdb_path"] = chains["chain_id"].map(
    lambda chain_id: check_file(f"proteins/alphafold_prediction_pdb_files/{chain_id}.pdb")
)
chains

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path,masks_path,chain_inferred_fasta_path,AF_prediction_pdb_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,8XPV:A,proteins/inferred_fasta_files/8XPV.fasta,proteins/mask_files/8XPV_mask.fasta,proteins/chain_fasta_files/8XPV:A.fasta,proteins/alphafold_prediction_pdb_files/8XPV:A...
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,8GQ4:A,proteins/inferred_fasta_files/8GQ4.fasta,proteins/mask_files/8GQ4_mask.fasta,proteins/chain_fasta_files/8GQ4:A.fasta,proteins/alphafold_prediction_pdb_files/8GQ4:A...
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,8TIF:A,proteins/inferred_fasta_files/8TIF.fasta,proteins/mask_files/8TIF_mask.fasta,proteins/chain_fasta_files/8TIF:A.fasta,proteins/alphafold_prediction_pdb_files/8TIF:A...
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,8H3Z:A,proteins/inferred_fasta_files/8H3Z.fasta,proteins/mask_files/8H3Z_mask.fasta,proteins/chain_fasta_files/8H3Z:A.fasta,proteins/alphafold_prediction_pdb_files/8H3Z:A...
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,8ALL:A,proteins/inferred_fasta_files/8ALL.fasta,proteins/mask_files/8ALL_mask.fasta,proteins/chain_fasta_files/8ALL:A.fasta,proteins/alphafold_prediction_pdb_files/8ALL:A...
...,...,...,...,...,...,...,...,...,...
1042,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,8HNE:A,proteins/inferred_fasta_files/8HNE.fasta,proteins/mask_files/8HNE_mask.fasta,proteins/chain_fasta_files/8HNE:A.fasta,proteins/alphafold_prediction_pdb_files/8HNE:A...
1043,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,8FIN:A,proteins/inferred_fasta_files/8FIN.fasta,proteins/mask_files/8FIN_mask.fasta,proteins/chain_fasta_files/8FIN:A.fasta,proteins/alphafold_prediction_pdb_files/8FIN:A...
1044,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,8J0A:A,proteins/inferred_fasta_files/8J0A.fasta,proteins/mask_files/8J0A_mask.fasta,proteins/chain_fasta_files/8J0A:A.fasta,proteins/alphafold_prediction_pdb_files/8J0A:A...
1045,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,8HDU:A,proteins/inferred_fasta_files/8HDU.fasta,proteins/mask_files/8HDU_mask.fasta,proteins/chain_fasta_files/8HDU:A.fasta,proteins/alphafold_prediction_pdb_files/8HDU:A...


In [5]:
# Persist the augmented chains table as a comma-separated file (the default
# separator), without writing the DataFrame index as an extra column.
chains.to_csv(path_or_buf="proteins/chains_alphafold.csv", index=False)