## True Positive

In [19]:
import requests
import json

def search_pdb_with_json_query(json_query, timeout=120):
    """Run a query against the RCSB PDB Search API (v2) and return its hits.

    Parameters
    ----------
    json_query : dict
        Search request in the RCSB Search API JSON format.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 120).

    Returns
    -------
    list or None
        The ``result_set`` list of the response, an empty list when the
        response carries no results, or ``None`` when the request failed
        (the error is printed).
    """
    base_url = "https://search.rcsb.org/rcsbsearch/v2/query"

    try:
        # Hand the serialized query to ``params`` so that requests
        # percent-encodes it; interpolating raw JSON into the URL leaves
        # quotes, spaces and braces unescaped.
        response = requests.get(
            base_url,
            params={"json": json.dumps(json_query)},
            timeout=timeout,
        )

        # Turn HTTP error statuses into exceptions
        response.raise_for_status()

        # A 204 / empty body has no JSON to parse; report it as "no hits"
        if response.status_code == 204 or not response.content:
            return []

        # Extract and return the search results
        return response.json().get("result_set", [])
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions where
        # the decode error is not a RequestException subclass.
        print("Error occurred during the API request:", e)
        return None

In [20]:
# Leaf filters of the true-positive query. Each one is a "terminal" node of
# the text service; key order inside every node is kept as in the request
# that was originally sent.

# Sample sequence length of at least 20
tp_sequence_length_filter = {
    "type": "terminal",
    "service": "text",
    "parameters": {
        "attribute": "entity_poly.rcsb_sample_sequence_length",
        "operator": "greater_or_equal",
        "negation": False,
        "value": 20,
    },
}

# Selected polymer entity types must be exactly "Protein (only)"
tp_protein_only_filter = {
    "type": "terminal",
    "service": "text",
    "parameters": {
        "attribute": "rcsb_entry_info.selected_polymer_entity_types",
        "operator": "exact_match",
        "negation": False,
        "value": "Protein (only)",
    },
}

# Protein polymer entity count equals 2
tp_two_protein_entities_filter = {
    "type": "terminal",
    "service": "text",
    "parameters": {
        "attribute": "rcsb_entry_info.polymer_entity_count_protein",
        "operator": "equals",
        "negation": False,
        "value": 2,
    },
}

# Deposited polymer entity instance count equals 2
tp_two_deposited_instances_filter = {
    "type": "terminal",
    "service": "text",
    "parameters": {
        "attribute": "rcsb_entry_info.deposited_polymer_entity_instance_count",
        "operator": "equals",
        "negation": False,
        "value": 2,
    },
}

# Assembly polymer entity instance count equals 2
tp_two_assembly_instances_filter = {
    "type": "terminal",
    "service": "text",
    "parameters": {
        "attribute": "rcsb_assembly_info.polymer_entity_instance_count",
        "operator": "equals",
        "negation": False,
        "value": 2,
    },
}

# Structure determination methodology is "experimental"
tp_experimental_filter = {
    "type": "terminal",
    "service": "text",
    "parameters": {
        "attribute": "rcsb_entry_info.structure_determination_methodology",
        "value": "experimental",
        "operator": "exact_match",
    },
}

# Source organism parent scientific name is "Eukaryota"
tp_eukaryota_filter = {
    "type": "terminal",
    "service": "text",
    "parameters": {
        "attribute": "rcsb_entity_source_organism.ncbi_parent_scientific_name",
        "value": "Eukaryota",
        "operator": "exact_match",
    },
}

# Full request: all filters AND-ed together, entries returned, up to 10000
# rows, sorted by descending score.
json_query = {
    "query": {
        "type": "group",
        "logical_operator": "and",
        "nodes": [
            {
                "type": "group",
                "nodes": [
                    tp_sequence_length_filter,
                    tp_protein_only_filter,
                    tp_two_protein_entities_filter,
                ],
                "logical_operator": "and",
            },
            tp_two_deposited_instances_filter,
            tp_two_assembly_instances_filter,
            {
                "type": "group",
                "nodes": [
                    tp_experimental_filter,
                    tp_eukaryota_filter,
                ],
                "logical_operator": "and",
            },
        ],
        "label": "text",
    },
    "return_type": "entry",
    "request_options": {
        "paginate": {"start": 0, "rows": 10000},
        "results_content_type": ["experimental"],
        "sort": [{"sort_by": "score", "direction": "desc"}],
        "scoring_strategy": "combined",
    },
}
# Perform the search and get the results
search_results = search_pdb_with_json_query(json_query)

# The helper returns None when the request fails; stop here with a clear
# message instead of the TypeError that len(None) would raise.
if search_results is None:
    raise RuntimeError("PDB search failed; see the error message printed above")

print("The number of the TP from PDB: ", len(search_results))

The number of the TP from PDB:  8599


In [23]:
# Randomly select pairs from this set
import random
def random_select(search_results, sampling_num):
    """Return the identifiers of ``sampling_num`` distinct, randomly chosen results.

    Parameters
    ----------
    search_results : sequence of dict
        Search hits; each one must have an ``'identifier'`` key.
    sampling_num : int
        Number of distinct hits to draw. Values <= 0 yield an empty list.

    Returns
    -------
    list
        Identifiers of the sampled hits, in the order they were drawn.

    Raises
    ------
    ValueError
        If more hits are requested than are available (the rejection loop
        used previously would never terminate in that case).
    """
    if sampling_num <= 0:
        return []
    if sampling_num > len(search_results):
        raise ValueError(
            f"cannot sample {sampling_num} entries from {len(search_results)} search results"
        )
    # random.sample draws without replacement, so no duplicate check is needed
    chosen_indexes = random.sample(range(len(search_results)), sampling_num)
    return [search_results[index]['identifier'] for index in chosen_indexes]

In [24]:
import requests
def get_fasta_from_pdb(entry_id, timeout=60):
    """Retrieve the FASTA text of a PDB entry from RCSB.

    Parameters
    ----------
    entry_id : str
        PDB entry identifier, inserted into the download URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Returns
    -------
    str or None
        The response body when the server answers with status 200,
        otherwise ``None``. ``None`` is also returned (after printing the
        error) when the request itself fails, so that one bad download does
        not abort a long batch.
    """
    # URL of the RCSB FASTA download for this entry
    base_url = f"https://www.rcsb.org/fasta/entry/{entry_id}"

    try:
        # Send a GET request to fetch the FASTA data
        response = requests.get(base_url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching FASTA for {entry_id}:", e)
        return None

    if response.status_code == 200:
        return response.text
    return None

In [42]:
def update_fasta_input_and_pair(fasta_sequence: str, input_file: str, custom_pair: str):
    """Append one entry's chains to the FASTA input file and to the pair list.

    ``fasta_sequence`` is expected to consist of records of the form::

        >NAME|further|header|fields
        SEQUENCE

    i.e. a header line followed by a single upper-case sequence line. The
    text between ``>`` and the first ``|`` of each header is used as the
    name. Every record is appended to ``input_file`` as ``>NAME`` /
    ``SEQUENCE`` lines, and one line with all names joined by ``;`` is
    appended to ``custom_pair``.

    Nothing is written (and the function returns ``None``) when the text is
    ``None`` or empty, when headers and sequences cannot be matched
    one-to-one, or when any sequence is shorter than 20 residues.

    Design note: the names come from the FASTA headers rather than UniProt
    IDs, because some entry IDs fail to map to UniProt and most deposited
    sequences are not identical to the UniProt sequence. Keeping the
    deposited sequence makes the exact amino-acid sequence easy to track.
    """
    # A failed download is passed in as None; there is nothing to record.
    if not fasta_sequence:
        return

    names = []
    sequences = []
    for raw_line in fasta_sequence.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            # Blank lines carry no information (and used to crash the parser)
            continue
        if line.startswith(">"):
            # Header: keep only the first '|'-separated field, minus the '>'
            names.append(line.split("|", 1)[0].strip()[1:])
        elif line.isupper():
            sequences.append(line)

    # Refuse to write anything if the records cannot be paired up reliably
    if not names or len(names) != len(sequences):
        return
    if any(len(sequence) < 20 for sequence in sequences):
        return

    with open(input_file, "a") as file:
        for name, sequence in zip(names, sequences):
            file.write(f">{name}\n")
            file.write(f"{sequence}\n")
    with open(custom_pair, "a") as file:
        file.write(";".join(names) + "\n")
    return

In [43]:
# Collect the identifier of every true-positive hit
n = len(search_results)
tp_entries = [hit['identifier'] for hit in search_results]

fasta_input = "/n/home10/ytingliu/alphapulldown_new/8599_tp_input.fasta"
pair_input = "/n/home10/ytingliu/alphapulldown_new/8599_tp_pairs.txt"

# Download each entry's FASTA and append it to the input / pair files.
# get_fasta_from_pdb returns None when a download fails; such entries are
# skipped and remembered instead of being passed on to the writer.
failed_entries = []
for entry_id in tp_entries:
    fasta_sequence = get_fasta_from_pdb(entry_id)
    if fasta_sequence is None:
        failed_entries.append(entry_id)
        continue
    update_fasta_input_and_pair(fasta_sequence, fasta_input, pair_input)

if failed_entries:
    print(f"Could not download FASTA for {len(failed_entries)} entries:", failed_entries)

## True Negative

In [8]:
# 1. kinase with its protein substrate

In [9]:
# 2. pairs for prokaryotes and eukaryotes

# Bacteria (eubacteria) (taxonomy_id:2) ba review = True 336,064 results
# Eukaryota (eucaryotes) (taxonomy_id:2759) review = True 197,016 results
# Archaea (taxonomy_id:2157) review = True 19,716 results
# Homo sapiens (species) (taxonomy_id:9606) review = True 20,429 results

# remove based on sequence similarity? 

# Because large UniProt queries are paginated, the best approach is to download all results once and use them as the base data.
# Random sampling through the paginated API is inefficient.

In [47]:
# generate random pairs between archaea and Homo sapiens
import random
import os
def _read_name_sequence_records(path):
    """Read a file of alternating name / sequence lines into (name, sequence) tuples."""
    with open(path, "r") as handle:
        # Drop blank lines (e.g. a trailing empty line) so they cannot shift
        # the name/sequence pairing or run past the end of the file.
        lines = [line.strip() for line in handle if line.strip()]
    if len(lines) % 2:
        raise ValueError(
            f"{path}: expected alternating name/sequence lines, "
            f"found {len(lines)} non-blank lines"
        )
    return list(zip(lines[0::2], lines[1::2]))


def random_pair(file1_path, file2_path, n):
    """Build ``n`` random cross-file protein pairs.

    Both files must contain alternating lines: a name line followed by its
    sequence line. ``n`` records are sampled without replacement from each
    file, and the i-th sample of the first file is paired with the i-th
    sample of the second.

    Parameters
    ----------
    file1_path, file2_path : str
        Paths of the two name/sequence files.
    n : int
        Number of pairs to generate.

    Returns
    -------
    list of tuple
        ``[((name1, seq1), (name2, seq2)), ...]`` with ``n`` elements.

    Raises
    ------
    ValueError
        If a file has an odd number of non-blank lines, or if ``n`` exceeds
        the number of records in either file (raised by ``random.sample``).
    """
    records_1 = _read_name_sequence_records(file1_path)
    records_2 = _read_name_sequence_records(file2_path)

    selected_proteins_1 = random.sample(records_1, n)
    selected_proteins_2 = random.sample(records_2, n)

    return list(zip(selected_proteins_1, selected_proteins_2))


In [48]:
# Output files for the true-negative set
fasta_input = "/n/home10/ytingliu/alphapulldown_new/6000_tn_input.fasta"
pair_input = "/n/home10/ytingliu/alphapulldown_new/6000_tn_pairs.txt"

# Source files: taxonomy_id 2157 and taxonomy_id 9606 exports
file1_path = "/n/home10/ytingliu/uniprot_database/uniprotkb_taxonomy_id_2157_AND_reviewed.txt"
file2_path = "/n/home10/ytingliu/uniprot_database/uniprotkb_taxonomy_id_9606_AND_reviewed.txt"

# Draw 6000 random cross-file pairs
res = random_pair(file1_path=file1_path, file2_path=file2_path, n=6000)

In [49]:
# Append both members of every pair to the FASTA file: name line, then
# sequence line, first protein before second.
with open(fasta_input, "a") as file:
    for first_protein, second_protein in res:
        for name_line, sequence in (first_protein, second_protein):
            file.write(f"{name_line}\n")
            file.write(f"{sequence}\n")

# Append one "name1;name2" line per pair; the leading character of each
# stored name line is dropped.
with open(pair_input, "a") as file:
    for first_protein, second_protein in res:
        first_name = first_protein[0][1:]
        second_name = second_protein[0][1:]
        file.write(f"{first_name};{second_name}\n")

In [6]:
# Replenish the negative dataset:
# add new combinations from the existing MSAs (rather than from the FASTA list)
# by extracting and splitting the TN pair names
def get_subdirectories(directory):
    """List the sub-directory names found inside every "6000_tn" directory.

    Each item of ``directory`` whose name contains ``"6000_tn"`` and which is
    itself a directory is scanned; the names (not full paths) of the
    directories it contains are collected.

    Parameters
    ----------
    directory : str
        Directory to scan.

    Returns
    -------
    list of str
        Names of the sub-directories, in ``os.listdir`` order.
    """
    subdirectories = []
    for item in os.listdir(directory):
        if "6000_tn" not in item:
            continue
        target_dir = os.path.join(directory, item)
        # A plain file can also match the name filter; os.listdir would
        # raise NotADirectoryError on it, so skip anything that is not a
        # directory.
        if not os.path.isdir(target_dir):
            continue
        for sub_item in os.listdir(target_dir):
            if os.path.isdir(os.path.join(target_dir, sub_item)):
                subdirectories.append(sub_item)
    return subdirectories

# Root directory whose "6000_tn" folders are scanned for sub-directories
directory_path = "/n/holyscratch01/ramanathan_lab/yuting/outputs"

subdirectories = get_subdirectories(directory=directory_path)


Q8PSJ3_and_Q5T280


In [9]:
# Collect the two names on either side of the first "_and_" in every
# sub-directory name.
valid_fasta = set()
for pair_name in subdirectories:
    if '_and_' not in pair_name:
        continue
    members = pair_name.split('_and_')
    valid_fasta.update(members[:2])

file_path = "valid_tn_msas.txt"

# Write the collected names to a text file, one per line
with open(file_path, "w") as file:
    file.writelines(element + "\n" for element in valid_fasta)

In [11]:
# add 500 new pairs using the existing MSAs to enrich the dataset
pair_input = "/n/home10/ytingliu/alphapulldown_new/6000_tn_pairs.txt"
archaea = []
human = []
pairs = set()
new_pairs = []
with open(pair_input, 'r') as file:
    for line in file:
        # `fields` instead of `res`, so the pair list produced by
        # random_pair in an earlier cell is not overwritten
        fields = line.strip().split(';')
        if len(fields) < 2:
            # blank or malformed line: nothing to pair
            continue
        archaea.append(fields[0])
        human.append(fields[1])
        pairs.add((fields[0], fields[1]))

# Only names listed in valid_fasta may be recombined. Filtering up front
# gives the same distribution as rejecting invalid draws afterwards.
valid_archaea = [name for name in archaea if name in valid_fasta]
valid_human = [name for name in human if name in valid_fasta]

# Make sure enough unused combinations exist; otherwise the sampling loop
# below could never finish.
num_new_pairs = 500
possible_pairs = len(set(valid_archaea)) * len(set(valid_human))
used_pairs = sum(1 for a, h in pairs if a in valid_fasta and h in valid_fasta)
if possible_pairs - used_pairs < num_new_pairs:
    raise ValueError(
        f"only {possible_pairs - used_pairs} unused valid pairs available, "
        f"cannot generate {num_new_pairs}"
    )

count = 0
chosen_pairs = set()  # new pairs picked so far, to avoid emitting duplicates
while count < num_new_pairs:
    selected_archaea = random.choice(valid_archaea)
    selected_human = random.choice(valid_human)
    candidate = (selected_archaea, selected_human)
    if candidate in pairs or candidate in chosen_pairs:
        continue
    chosen_pairs.add(candidate)
    new_pairs.append([selected_archaea, selected_human])
    count += 1

with open("/n/home10/ytingliu/alphapulldown_new/500_tn_pairs.txt", "a") as file:
    for pair in new_pairs:
        file.write(f"{pair[0]};{pair[1]}\n")

In [1]:
# Load the existing pair file: every line is split on ';', the first field is
# collected in `archaea`, the second in `human`, and the (first, second)
# tuple is added to the `pairs` set.
pair_input = "/n/home10/ytingliu/alphapulldown_new/6000_tn_pairs.txt"
archaea = []
human = []
pairs = set()
new_pairs = []
with open(pair_input, 'r') as file:
    for line in file:
        # NOTE(review): a line without ';' makes res[1] raise IndexError
        res = line.strip().split(';')
        archaea.append(res[0])
        human.append(res[1])
        pairs.add((res[0], res[1]))