# Removing protein chains labelled "complex" that have high sequence similarity

In [1]:
import pandas as pd
import os
from typing import List

In [2]:
# Root directory of the per-chain FASTA files; files are looked up later as
# <PROTEIN_DIRECTORY>/<pdb_id>/<chain_id>.fasta
PROTEIN_DIRECTORY = "data/proteins"

# Table of protein chains with the columns pdb_id, label and chain_id.
# NOTE(review): the extra "Unnamed: 0" column looks like a row index saved into
# the CSV by an earlier step - confirm before relying on it.
chains = pd.read_csv("data/chains.csv")
chains

Unnamed: 0,pdb_id,label,chain_id
0,8P0E,monomer,8P0E:A
1,8PX8,monomer,8PX8:A
2,8B2E,monomer,8B2E:A
3,8HOE,monomer,8HOE:A
4,8TCE,monomer,8TCE:A
...,...,...,...
1458,8G9J,synthetic,8G9J:A
1459,8OYV,synthetic,8OYV:A
1460,8TNO,synthetic,8TNO:A
1461,8FJE,synthetic,8FJE:A


In [3]:
# Working directory for the clustering input (complex_sequences.fasta) and
# the clustering results (clustered.fasta)
CLUSTERING_DIRECTORY = "data/clustering"
# exist_ok=True keeps the cell re-runnable and avoids the race between
# checking for the directory and creating it
os.makedirs(CLUSTERING_DIRECTORY, exist_ok=True)

## Preparing a FASTA file with all "complex" sequences

In [4]:
# FASTA records (">chain_id\nsequence\n") accumulated by add_to_fasta
SEQUENCES = []
# Identifiers of every chain whose label is "complex"
CHAIN_IDS = set(chains.loc[chains["label"] == "complex", "chain_id"])

def get_sequence(fasta_path: str) -> str:
    """
    Read the single sequence stored in a two-line FASTA file.

    Parameters:
        fasta_path (str): path to a fasta file containing exactly two lines,
                          with the sequence on the second line

    Returns:
        string of letters from the second line, without surrounding
        whitespace or a new line at the end

    Raises:
        ValueError: if the file does not contain exactly two lines
    """
    with open(fasta_path, "r") as file:
        lines = file.readlines()

    # Explicit check instead of `assert`: assertions are skipped when Python
    # runs with -O, and the message tells the user which file is malformed.
    if len(lines) != 2:
        raise ValueError(
            f"Expected exactly 2 lines in '{fasta_path}', found {len(lines)}"
        )
    return lines[1].strip()


def add_to_fasta(chain_id: str, fasta_path: str) -> None:
    """
    Append one FASTA record for a chain to the global SEQUENCES list.

    Parameters:
        chain_id (str): identifier written on the header line after ">"
        fasta_path (str): path of the fasta file read with get_sequence

    Returns:
        None, the record is appended to SEQUENCES in place
    """
    SEQUENCES.append(f">{chain_id}\n{get_sequence(fasta_path)}\n")

# Collect one FASTA record for every chain labelled "complex"
complex_chains = chains[chains["label"] == "complex"]
for pdb_id, chain_id in zip(complex_chains["pdb_id"], complex_chains["chain_id"]):
    add_to_fasta(chain_id, f"{PROTEIN_DIRECTORY}/{pdb_id}/{chain_id}.fasta")

# Records are written in sorted order, so the file does not depend on row order
with open(f"{CLUSTERING_DIRECTORY}/complex_sequences.fasta", "w") as f:
    fasta_text = "".join(sorted(SEQUENCES))
    f.write(fasta_text)

## Clustering (on Metacentrum)

Run on Metacentrum OnDemand using Frontend Shell (version of cdhit: `cdhit/4.8.1-gcc-10.2.1-zlhcwe3`):

Output:

The representative sequences are in the file `clustered.fasta`; detailed information about the clusters is in `clustered.fasta.clstr`.

## Removing redundant chains

In [5]:
# clustered.fasta holds the representative sequences; each header line
# (">chain_id") names the chain kept for one cluster
with open(f"{CLUSTERING_DIRECTORY}/clustered.fasta", "r") as f:
    lines = f.readlines()

REPRESENTATIVES_CHAIN_IDS = [
    line.lstrip(">").rstrip() for line in lines if line.startswith(">")
]

# Every "complex" chain that is not a cluster representative is redundant
TO_REMOVE = CHAIN_IDS.difference(REPRESENTATIVES_CHAIN_IDS)

n_original = len(CHAIN_IDS)
n_clusters = len(REPRESENTATIVES_CHAIN_IDS)
n_removed = len(TO_REMOVE)

print(f"Original number of protein chains labelled 'complex': {n_original}")
print(f"Number of clusters: {n_clusters}")
print(f"Number of protein chains to remove: {n_removed}")
print(f"New number of protein chains labelled 'complex': {n_original - n_removed}")

Original number of protein chains labelled 'complex': 381
Number of clusters: 255
Number of protein chains to remove: 126
New number of protein chains labelled 'complex': 255


In [6]:
# Keep every chain except the redundant ones collected in TO_REMOVE
is_redundant = chains["chain_id"].isin(TO_REMOVE)
chains_filtered = chains.loc[~is_redundant]
chains_filtered

Unnamed: 0,pdb_id,label,chain_id
0,8P0E,monomer,8P0E:A
1,8PX8,monomer,8PX8:A
2,8B2E,monomer,8B2E:A
3,8HOE,monomer,8HOE:A
4,8TCE,monomer,8TCE:A
...,...,...,...
1458,8G9J,synthetic,8G9J:A
1459,8OYV,synthetic,8OYV:A
1460,8TNO,synthetic,8TNO:A
1461,8FJE,synthetic,8FJE:A


In [7]:
# Number of remaining chains per label after the redundant chains were dropped
chains_filtered["label"].value_counts()

label
monomer      980
complex      255
synthetic    102
Name: count, dtype: int64

In [8]:
# Output directory for the filtered chain table
FILTERED_DIRECTORY = "data/filtered"
# exist_ok=True keeps the cell re-runnable and avoids the race between
# checking for the directory and creating it
os.makedirs(FILTERED_DIRECTORY, exist_ok=True)

# index = False: the DataFrame row index is not written to the file
chains_filtered.to_csv(f"{FILTERED_DIRECTORY}/chains_filtered.csv", sep = ",", index = False)