# Loading proteins

In [1]:
from IPython.utils import io

# Install the third-party packages used below. The console output of the three
# conda commands is captured so that it does not flood the notebook.
with io.capture_output() as captured:
    !conda install -y requests
    !conda install -y biopython
    !conda install -y pandas

# Show the captured log only when it mentions "Error"/"ERROR"; otherwise print a short confirmation.
print(captured.stdout if ("Error" in captured.stdout) or ("ERROR" in captured.stdout) else "Installation successful")

Installation successful


Prepare filesystem:

In [2]:
import os

# Directory for the PDB files downloaded from RCSB.
ORIGINAL_PDB_FILES_DIRECTORY = "proteins/original_pdb_files"
# exist_ok=True keeps the cell re-runnable; makedirs also creates the parent "proteins" directory.
os.makedirs(ORIGINAL_PDB_FILES_DIRECTORY, exist_ok=True)

# Directory for the FASTA files downloaded from RCSB.
ORIGINAL_FASTA_FILES_DIRECTORY = "proteins/original_fasta_files"
os.makedirs(ORIGINAL_FASTA_FILES_DIRECTORY, exist_ok=True)

# Directory for the FASTA files generated from the downloaded PDB structures.
INFERRED_FASTA_FILES_DIRECTORY = "proteins/inferred_fasta_files"
os.makedirs(INFERRED_FASTA_FILES_DIRECTORY, exist_ok=True)

## Obtaining PDB IDs of protein structures added to PDB after 22. 7. 2022
Useful functions:

In [3]:
import requests, json
from random import sample, seed


def display_query_results(results):
    """Print a short summary of a search response.

    Prints the query ID, result type, total match count, number of returned
    items, and then at most the first ten items of ``results['result_set']``.
    """
    print(f"Query ID: {results['query_id']}")
    print(f"Result type: {results['result_type']}")
    print(f"Total count of matches: {results['total_count']}")
    returned = results['result_set']
    print(f"Total count of returned entities: {len(returned)}")
    # Preview is capped at ten items; shorter result sets are shown in full.
    to_display = 10 if len(returned) > 10 else len(returned)
    print(f"First {to_display} items in result set:")
    for position, item in enumerate(returned):
        if position >= to_display:
            break
        print(item)


def retrieve_query_from_pdb(request):
    """Send a search request to the RCSB search endpoint and return the parsed response.

    Parameters:
        request: JSON-serialisable dict describing the query.

    Returns:
        The decoded JSON response (dict) when the server answers with HTTP 200;
        None for any other status code (including 204, i.e. no hits).

    Raises:
        requests.RequestException: on connection problems or when the server
            does not respond within the timeout.
    """
    from urllib.parse import quote

    query = json.dumps(request)
    # Percent-encode the JSON so that characters such as '&', '#', '+' or '%'
    # inside query values cannot corrupt the URL.
    data = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={quote(query)}",
                        timeout=60)
    if data.status_code == 200:
        print("The query was processed successfully!")
        results = data.json()
        display_query_results(results)
        return results
    if data.status_code == 204:
        # 204 carries no body: the query itself was valid but matched nothing.
        print("The query was processed successfully, but returned no results.")
        return None
    print(f"The request failed with status code {data.status_code}")
    return None


def select_random_pdb_ids(ids_list, sample_size, entity_ids_list = False):
    """Draw a reproducible random sample of PDB IDs.

    Parameters:
        ids_list: list of identifiers to sample from.
        sample_size: number of identifiers to return.
        entity_ids_list: when True, every item is treated as an entity ID of the
            form "<PDB ID>_<suffix>" and reduced to the part before the first "_".

    Returns:
        A list of ``sample_size`` identifiers sampled without replacement.

    Raises:
        ValueError: if ``sample_size`` exceeds the number of available IDs.
    """
    # Re-seeding on every call makes the selection depend only on the input list.
    seed(0)
    if entity_ids_list:
        # Several entity IDs can map to the same entry; dict.fromkeys removes the
        # duplicates while keeping first-seen order, so the sample is unchanged
        # whenever the input contained no duplicates.
        ids_list = list(dict.fromkeys(entity_id.split("_")[0] for entity_id in ids_list))
    return sample(ids_list, sample_size)

### Monomers
Firstly, we will create a dataset containing 500 monomer proteins.

This query returns polymer entity IDs from entries that:
- were deposited into PDB between 22. 7. 2022 and 22. 2. 2024
- contain only protein entities (no nucleic acids or oligosaccharides)
- have available pdb file
- are not synthetic
- the whole structure contains only one chain, which is not longer than 400 amino acids
- the sequence identity between the chains is at most 70% (to remove redundancy from the data and obtain a diverse dataset)

Unfortunately, it is not possible to perform the sequence identity filtering on whole entries. Therefore, the PDB IDs will be obtained from polymer entity IDs. 

In [4]:
# Search request for the monomer dataset: all terminal nodes below are combined with a logical AND.
monomers_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            # Deposit date between 2022-07-22 and 2024-02-22, both bounds inclusive.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-02-22",
                     "include_upper": True
                    }
                }
            },
            # Only entries whose polymer entities are all proteins.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            # Only entries flagged as compatible with the PDB file format.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            # Negated match: exclude source organism "synthetic construct".
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entity_source_organism.ncbi_scientific_name",
                 "operator": "exact_match",
                 "negation": True,
                 "value": "synthetic construct"
                }
            },
            # Exactly one protein polymer entity in the entry.
            # NOTE(review): this attribute counts entities; confirm it also implies a single chain.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.polymer_entity_count_protein",
                 "operator": "equals",
                 "value": 1
                }
            },
            # Sequence length of at most 400.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "entity_poly.rcsb_sample_sequence_length",
                 "operator": "less_or_equal",
                 "value": 400
                }
            },
            ] 
        },
    # Return every hit in compact form, grouped by sequence identity with a cutoff of 70;
    # only the group representatives are returned.
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact",
        "group_by": {
            "aggregation_method": "sequence_identity",
            "similarity_cutoff": 70
            },
        "group_by_return_type": "representatives"
        },
    # Results are polymer entity IDs rather than entry IDs.
    "return_type": "polymer_entity"
}


# Run the query; the helper prints a summary and returns the parsed response (None on failure).
monomers_result = retrieve_query_from_pdb(monomers_query)

The query was processed successfully!
Query ID: 1ea384d7-0c84-4aaa-b9d6-9763960e484a
Result type: polymer_entity
Total count of matches: 5355
Total count of returned entities: 1736
First 10 items in result set:
5SSZ_1
7FY1_1
7G1Q_1
7G1X_1
7YKB_1
7YKC_1
7YKM_1
7YL8_1
7YLE_1
7YLG_1


The 500 monomers used for the experiment will be selected at random.

In [5]:
# Sample 500 monomers; entity_ids_list=True reduces each entity ID to the part before "_" (the PDB ID).
monomers_pdb_ids = select_random_pdb_ids(monomers_result["result_set"], 500, entity_ids_list = True)
# Preview the first ten selected IDs.
monomers_pdb_ids[:10]

['8XPV',
 '8GQ4',
 '8TIF',
 '8H3Z',
 '8ALL',
 '8EIL',
 '8I86',
 '8HW1',
 '8GY0',
 '8ULM']

### Small protein complexes
Secondly, we will add 200 protein complexes to the dataset.

This query returns PDB IDs of entries that:
- were deposited into PDB between 22. 7. 2022 and 22. 2. 2024
- contain only protein entities (no nucleic acids or oligosaccharides)
- have available pdb file
- are not synthetic
- the whole structure contains two, three or four unique chains, which are not longer than 400 amino acids
- the PDB deposit group ID is empty

As mentioned above, it is not possible to remove redundancy from the data when searching for whole entries. To avoid choosing from a set containing multiple almost identical structures, only structures that do not belong to any PDB deposit group were chosen. Only two such groups matched the criteria, containing around 300 structures altogether. I think that this approach loses the least data while retaining as much diversity as possible. 

In [6]:
# Search request for the protein-complex dataset: all terminal nodes below are combined with a logical AND.
complexes_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            # Deposit date between 2022-07-22 and 2024-02-22, both bounds inclusive.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-02-22",
                     "include_upper": True
                    }
                }
            },
            # Only entries whose polymer entities are all proteins.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            # Only entries flagged as compatible with the PDB file format.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            # Negated match: exclude source organism "synthetic construct".
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entity_source_organism.ncbi_scientific_name",
                 "operator": "exact_match",
                 "negation": True,
                 "value": "synthetic construct"
                }
            },
            # Between two and four protein polymer entities, both bounds inclusive.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.polymer_entity_count_protein",
                 "operator": "range",
                 "value": {
                     "from": 2,
                     "include_lower": True,
                     "to": 4,
                     "include_upper": True
                    }
                }
            },
            # Sequence length of at most 400.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "entity_poly.rcsb_sample_sequence_length",
                 "operator": "less_or_equal",
                 "value": 400
                }
            },
            # Negated "exists": keep only entries without a deposit group ID.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_deposit_group.group_id",
                 "operator": "exists",
                 "negation": True
                }
            }
            ] 
        },
    # Return every hit in compact form (no grouping is requested here).
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact"
        },
    # Results are whole-entry (PDB) IDs.
    "return_type": "entry"
}

# Run the query; the helper prints a summary and returns the parsed response (None on failure).
complexes_result = retrieve_query_from_pdb(complexes_query)

The query was processed successfully!
Query ID: d8258233-8f60-49a4-a000-eadc1860e23f
Result type: entry
Total count of matches: 1918
Total count of returned entities: 1918
First 10 items in result set:
7YKA
7YKF
7YKG
7YKH
7YKI
7YKJ
7YLB
7YLM
7YLS
7YMD


The 200 heterooligomers used for the experiment will be selected at random.

In [7]:
# Sample 200 complexes; the result set already holds entry IDs, so no entity-ID conversion is requested.
complexes_pdb_ids = select_random_pdb_ids(complexes_result["result_set"], 200)
# Preview the first ten selected IDs.
complexes_pdb_ids[:10]

['8SS2',
 '8EYR',
 '8OXV',
 '8TS7',
 '8FA9',
 '8ALR',
 '8CEA',
 '8GO9',
 '8G7B',
 '8F6D']

### Synthetic constructs
Lastly, 50 synthetic proteins will be added to the dataset.

This query returns PDB IDs of entries that:
- were deposited into PDB between 22. 7. 2022 and 22. 2. 2024
- contain only protein entities (no nucleic acids or oligosaccharides)
- have available pdb file
- are purely synthetic
- the whole structure contains at most four unique chains, whose length is between 50 and 400 amino acids (including the borders)
- the PDB deposit group ID is empty

The requirement for the chains to be at least 50 amino acids long was added because the database contains a significant number of very short synthetic peptides (almost half of the matching synthetic proteins were shorter than 50 amino acids).

In [8]:
# Search request for the synthetic-protein dataset: all terminal nodes below are combined with a logical AND.
synthetic_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            # Deposit date between 2022-07-22 and 2024-02-22, both bounds inclusive.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_accession_info.deposit_date",
                 "operator": "range",
                 "value": {
                     "from": "2022-07-22",
                     "include_lower": True,
                     "to": "2024-02-22",
                     "include_upper": True
                    }
                }
            },
            # Only entries whose polymer entities are all proteins.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.selected_polymer_entity_types",
                 "operator": "exact_match",
                 "value": "Protein (only)"
                }
            },
            # Only entries flagged as compatible with the PDB file format.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_database_status.pdb_format_compatible",
                 "operator": "exact_match",
                 "value": "Y"
                }
            },
            # Require source organism "synthetic construct" (no negation, unlike the other two queries).
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entity_source_organism.ncbi_scientific_name",
                 "operator": "exact_match",
                 "value": "synthetic construct"
                }
            },
            # Negated word match on the parent scientific name.
            # NOTE(review): presumably rejects entries that also contain an entity from any listed
            # natural lineage, so that only purely synthetic entries remain; confirm.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entity_source_organism.ncbi_parent_scientific_name",
                 "operator": "contains_words",
                 "negation": True,
                 "value": "Eukaryota Archaea Bacteria Riboviria Duplodnaviria"
                }
            },
            # At most four protein polymer entities.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "rcsb_entry_info.polymer_entity_count_protein",
                 "operator": "less_or_equal",
                 "value": 4
                }
            },
            # Sequence length between 50 and 400, both bounds inclusive.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "entity_poly.rcsb_sample_sequence_length",
                 "operator": "range",
                 "value": {
                     "from": 50,
                     "include_lower": True,
                     "to": 400,
                     "include_upper": True
                    }
                }
            },
            # Negated "exists": keep only entries without a deposit group ID.
            {"type": "terminal",
             "service": "text",
             "parameters": {
                 "attribute": "pdbx_deposit_group.group_id",
                 "operator": "exists",
                 "negation": True
                }
            }
            ] 
        },
    # Return every hit in compact form (no grouping is requested here).
    "request_options": {
        "return_all_hits": True,
        "results_verbosity": "compact"
    },
    # Results are whole-entry (PDB) IDs.
    "return_type": "entry"
}


# Run the query; the helper prints a summary and returns the parsed response (None on failure).
synthetic_result = retrieve_query_from_pdb(synthetic_query)

The query was processed successfully!
Query ID: d8732d55-4833-4ecc-b67e-0f5cb5ecc2e2
Result type: entry
Total count of matches: 102
Total count of returned entities: 102
First 10 items in result set:
8AO0
8AO1
8AW4
8B30
8BCS
8BCT
8BL6
8BO9
8BU0
8C3E


We will select 50 synthetic proteins at random:

In [9]:
# Sample 50 synthetic entries; the result set already holds entry IDs.
synthetic_pdb_ids = select_random_pdb_ids(synthetic_result["result_set"], 50)
# Preview the first ten selected IDs.
synthetic_pdb_ids[:10]

['8FIQ',
 '8U5A',
 '8FJG',
 '8BCT',
 '8ETY',
 '8H7C',
 '8GJG',
 '8FJE',
 '8F54',
 '8GAQ']

## Pandas dataframe
Create pandas dataframe to store information about the proteins and files:

In [10]:
import pandas as pd
import numpy as np

# One row per selected entry: the three ID lists are concatenated in the order
# monomers, complexes, synthetic, and each label is repeated once per ID of its list
# so that both columns line up.
proteins = pd.DataFrame({'pdb_id': np.concatenate([monomers_pdb_ids, complexes_pdb_ids, synthetic_pdb_ids]),
                         'label': np.repeat(['monomer', 'complex', 'synthetic'],
                                            [len(monomers_pdb_ids), len(complexes_pdb_ids), len(synthetic_pdb_ids)])})

proteins

Unnamed: 0,pdb_id,label
0,8XPV,monomer
1,8GQ4,monomer
2,8TIF,monomer
3,8H3Z,monomer
4,8ALL,monomer
...,...,...
745,8FIT,synthetic
746,8J1W,synthetic
747,8HDV,synthetic
748,8AO0,synthetic


## Download PDB files according to the PDB IDs

In [11]:
def download_structure(pdb_id, directory):
    """Download the PDB file of one entry from files.rcsb.org and save it locally.

    Parameters:
        pdb_id: PDB ID of the entry; it is lower-cased for the download URL and
            used unchanged in the local file name.
        directory: existing directory in which ``<pdb_id>.pdb`` is written.

    Returns:
        Path of the written file, or None when the download or the write fails
        (network error, timeout, non-OK HTTP status, body equal to 'N/A',
        or a file-system error).
    """
    try:
        # download 3D structure based on PDB ID:
        pdb_id_lower = pdb_id.lower()
        response = requests.get(f'https://files.rcsb.org/download/{pdb_id_lower}.pdb', timeout=60)
        if not response.ok or response.text == 'N/A':
            return None
        structure = response.text

        # save the downloaded structure to a pdb file:
        path_file = f'{directory}/{pdb_id}.pdb'
        with open(path_file, 'w') as f:
            f.write(structure)
        return path_file

    # Only expected network / file-system failures are treated as "not available";
    # KeyboardInterrupt and programming errors are no longer silently swallowed.
    except (requests.RequestException, OSError):
        return None


# Download the PDB file of every entry; rows whose download fails get None in the new column.
proteins["original_pdb_path"] = proteins.apply(lambda row: download_structure(row["pdb_id"], ORIGINAL_PDB_FILES_DIRECTORY), axis = 1)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb
...,...,...,...
745,8FIT,synthetic,proteins/original_pdb_files/8FIT.pdb
746,8J1W,synthetic,proteins/original_pdb_files/8J1W.pdb
747,8HDV,synthetic,proteins/original_pdb_files/8HDV.pdb
748,8AO0,synthetic,proteins/original_pdb_files/8AO0.pdb


Filter out proteins whose pdb files cannot be downloaded:

In [12]:
# Remove rows containing any missing value (here: entries whose PDB download returned None).
# NOTE: this modifies `proteins` in place.
proteins.dropna(inplace = True)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb
...,...,...,...
745,8FIT,synthetic,proteins/original_pdb_files/8FIT.pdb
746,8J1W,synthetic,proteins/original_pdb_files/8J1W.pdb
747,8HDV,synthetic,proteins/original_pdb_files/8HDV.pdb
748,8AO0,synthetic,proteins/original_pdb_files/8AO0.pdb


## Identify unique chains using downloaded fasta files

Download fasta files:

In [13]:
def download_fasta(pdb_id, directory):
    """Download the FASTA file of one entry from www.rcsb.org and save it locally.

    Parameters:
        pdb_id: PDB ID of the entry; it is lower-cased for the download URL and
            used unchanged in the local file name.
        directory: existing directory in which ``<pdb_id>.fasta`` is written.

    Returns:
        Path of the written file, or None when the download or the write fails
        (network error, timeout, non-OK HTTP status, body equal to 'N/A',
        or a file-system error).
    """
    try:
        # download fasta file based on PDB ID:
        pdb_id_lower = pdb_id.lower()
        response = requests.get(f'https://www.rcsb.org/fasta/entry/{pdb_id_lower}', timeout=60)
        if not response.ok or response.text == 'N/A':
            return None
        sequence = response.text

        # save the downloaded sequence as a fasta file:
        path_file = f'{directory}/{pdb_id}.fasta'
        with open(path_file, 'w') as f:
            f.write(sequence)
        return path_file

    # Only expected network / file-system failures are treated as "not available";
    # KeyboardInterrupt and programming errors are no longer silently swallowed.
    except (requests.RequestException, OSError):
        return None


# Download the FASTA file of every entry; rows whose download fails get None in the new column.
proteins["original_fasta_path"] = proteins.apply(lambda row: download_fasta(row["pdb_id"], ORIGINAL_FASTA_FILES_DIRECTORY), axis = 1)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta
...,...,...,...,...
745,8FIT,synthetic,proteins/original_pdb_files/8FIT.pdb,proteins/original_fasta_files/8FIT.fasta
746,8J1W,synthetic,proteins/original_pdb_files/8J1W.pdb,proteins/original_fasta_files/8J1W.fasta
747,8HDV,synthetic,proteins/original_pdb_files/8HDV.pdb,proteins/original_fasta_files/8HDV.fasta
748,8AO0,synthetic,proteins/original_pdb_files/8AO0.pdb,proteins/original_fasta_files/8AO0.fasta


Filter out proteins whose fasta files cannot be downloaded:

In [14]:
# Remove rows containing any missing value (here: entries whose FASTA download returned None).
# NOTE: this modifies `proteins` in place.
proteins.dropna(inplace = True)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta
...,...,...,...,...
745,8FIT,synthetic,proteins/original_pdb_files/8FIT.pdb,proteins/original_fasta_files/8FIT.fasta
746,8J1W,synthetic,proteins/original_pdb_files/8J1W.pdb,proteins/original_fasta_files/8J1W.fasta
747,8HDV,synthetic,proteins/original_pdb_files/8HDV.pdb,proteins/original_fasta_files/8HDV.fasta
748,8AO0,synthetic,proteins/original_pdb_files/8AO0.pdb,proteins/original_fasta_files/8AO0.fasta


Identify unique chains:

In [15]:
import re

def extract_chain_id(line):
    """Return the ID of the first chain listed in a FASTA header line.

    The second "|"-separated field of the header is expected to look like
    "Chain A", "Chains A, B" or "Chains A[auth H], B[auth L]". When the first
    chain carries an "[auth X]" annotation, the author-assigned ID ("X") is
    returned, because the IDs are later matched against chain IDs read from
    the PDB file; otherwise the plain ID is returned.

    Parameters:
        line: header line of a FASTA record (starting with ">").

    Returns:
        The chain ID as a string.

    Raises:
        IndexError: if the header has no second field or the field holds no chain ID.
    """
    chain_field = line.split("|")[1]
    # Skip the leading "Chain"/"Chains" keyword and keep only the first listed chain.
    first_chain = chain_field.split(" ", 1)[1].split(",")[0].strip()
    auth_annotation = re.search(r"\[auth\s+([^\]]+)\]", first_chain)
    if auth_annotation:
        return auth_annotation.group(1).strip()
    return first_chain


def identify_unique_chains(pdb_id, fasta_path):
    """Collect one "<pdb_id>:<chain>" identifier per record of a FASTA file.

    Every line starting with ">" is treated as a record header; its chain ID
    is obtained with ``extract_chain_id``.

    Parameters:
        pdb_id: PDB ID used as the prefix of every identifier.
        fasta_path: path of the FASTA file to read.

    Returns:
        Set of "<pdb_id>:<chain>" strings (empty when the file has no headers).
    """
    with open(fasta_path, "r") as fasta_file:
        headers = (row for row in fasta_file if row.startswith(">"))
        # The set comprehension consumes the generator while the file is still open.
        return {f"{pdb_id}:{extract_chain_id(header)}" for header in headers}


# For every entry, store the set of "<pdb_id>:<chain>" identifiers found in its downloaded FASTA file.
proteins["unique_chains"] = proteins.apply(lambda row: identify_unique_chains(row["pdb_id"], row["original_fasta_path"]), axis = 1)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,unique_chains
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,{8XPV:A}
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,{8GQ4:A}
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,{8TIF:A}
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,{8H3Z:A}
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,{8ALL:A}
...,...,...,...,...,...
745,8FIT,synthetic,proteins/original_pdb_files/8FIT.pdb,proteins/original_fasta_files/8FIT.fasta,{8FIT:A}
746,8J1W,synthetic,proteins/original_pdb_files/8J1W.pdb,proteins/original_fasta_files/8J1W.fasta,{8J1W:A}
747,8HDV,synthetic,proteins/original_pdb_files/8HDV.pdb,proteins/original_fasta_files/8HDV.fasta,{8HDV:A}
748,8AO0,synthetic,proteins/original_pdb_files/8AO0.pdb,proteins/original_fasta_files/8AO0.fasta,{8AO0:A}


Obtain sequences of identified chains from pdb files:

In [17]:
import warnings, Bio
from Bio import SeqIO
warnings.simplefilter('ignore', Bio.BiopythonWarning)


def get_aa_seq_from_pdb(pdb_path):
    """Read the sequences contained in a PDB file.

    Parameters:
        pdb_path: path of the PDB file, parsed with ``SeqIO.parse(..., 'pdb-atom')``.

    Returns:
        Set of ``(record description, sequence string)`` tuples, one per parsed record.
    """
    with warnings.catch_warnings():
        parsed = set()
        for rec in SeqIO.parse(pdb_path, 'pdb-atom'):
            parsed.add((rec.description, str(rec.seq)))
        return parsed


def make_fasta_from_sequences(pdb_id, directory, pdb_path, wanted_chains):
    """Write a FASTA file with the sequences of selected chains of a PDB file.

    Parameters:
        pdb_id: PDB ID, used for the output file name ``<pdb_id>.fasta``.
        directory: existing directory in which the FASTA file is written.
        pdb_path: path of the PDB file the sequences are read from.
        wanted_chains: collection of record descriptions to keep; a record is
            written only when its description is contained in it.

    Returns:
        Path of the written FASTA file. The file is created even when no record
        matches, in which case it is empty.
    """
    sequences = get_aa_seq_from_pdb(pdb_path)
    fasta_path = f"{directory}/{pdb_id}.fasta"
    with open(fasta_path, 'w') as f:
        # `sequences` is a set, whose iteration order can differ between runs;
        # sorting makes the record order in the output file reproducible.
        for chain, sequence in sorted(sequences):
            if chain in wanted_chains:
                f.write(f'>{chain}\n{sequence}\n')
    return fasta_path


# For every entry, write a FASTA file built from its PDB file, restricted to the
# chains listed in `unique_chains`, and store the path of that file.
proteins["inferred_fasta_path"] = proteins.apply(lambda row: make_fasta_from_sequences(row["pdb_id"],
                                                                                       INFERRED_FASTA_FILES_DIRECTORY,
                                                                                       row["original_pdb_path"],
                                                                                       row["unique_chains"]),
                                                 axis = 1)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,unique_chains,inferred_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,{8XPV:A},proteins/inferred_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,{8GQ4:A},proteins/inferred_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,{8TIF:A},proteins/inferred_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,{8H3Z:A},proteins/inferred_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,{8ALL:A},proteins/inferred_fasta_files/8ALL.fasta
...,...,...,...,...,...,...
745,8FIT,synthetic,proteins/original_pdb_files/8FIT.pdb,proteins/original_fasta_files/8FIT.fasta,{8FIT:A},proteins/inferred_fasta_files/8FIT.fasta
746,8J1W,synthetic,proteins/original_pdb_files/8J1W.pdb,proteins/original_fasta_files/8J1W.fasta,{8J1W:A},proteins/inferred_fasta_files/8J1W.fasta
747,8HDV,synthetic,proteins/original_pdb_files/8HDV.pdb,proteins/original_fasta_files/8HDV.fasta,{8HDV:A},proteins/inferred_fasta_files/8HDV.fasta
748,8AO0,synthetic,proteins/original_pdb_files/8AO0.pdb,proteins/original_fasta_files/8AO0.fasta,{8AO0:A},proteins/inferred_fasta_files/8AO0.fasta


Create pandas dataframe for chains:

In [18]:
# One row per chain: each element of the `unique_chains` set becomes its own row
# (with a fresh 0..n-1 index), and the column is renamed to `chain_id`.
chains = proteins.explode("unique_chains", ignore_index = True).rename(columns = {"unique_chains": "chain_id"})
chains

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,8XPV:A,proteins/inferred_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,8GQ4:A,proteins/inferred_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,8TIF:A,proteins/inferred_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,8H3Z:A,proteins/inferred_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,8ALL:A,proteins/inferred_fasta_files/8ALL.fasta
...,...,...,...,...,...,...
1052,8FIT,synthetic,proteins/original_pdb_files/8FIT.pdb,proteins/original_fasta_files/8FIT.fasta,8FIT:A,proteins/inferred_fasta_files/8FIT.fasta
1053,8J1W,synthetic,proteins/original_pdb_files/8J1W.pdb,proteins/original_fasta_files/8J1W.fasta,8J1W:A,proteins/inferred_fasta_files/8J1W.fasta
1054,8HDV,synthetic,proteins/original_pdb_files/8HDV.pdb,proteins/original_fasta_files/8HDV.fasta,8HDV:A,proteins/inferred_fasta_files/8HDV.fasta
1055,8AO0,synthetic,proteins/original_pdb_files/8AO0.pdb,proteins/original_fasta_files/8AO0.fasta,8AO0:A,proteins/inferred_fasta_files/8AO0.fasta


## Save both dataframes as csv files

In [19]:
# Persist both tables as comma-separated files without the index column.
# Note: `unique_chains` in `proteins` holds Python sets, which are written as their string form.
proteins.to_csv("proteins/proteins.csv", sep = ",", index = False)
chains.to_csv("proteins/chains.csv", sep = ",", index = False)