### This Notebook is the investigation for one residue of the genomic position
We need to resolve this

In [154]:
# Imports
import argparse
from json import JSONDecodeError

import pandas as pd
import requests
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from functools import reduce

from Bio.Data.CodonTable import CodonTable
from Bio.Seq import Seq
from Bio.SeqUtils import IUPACData

In [155]:
# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

The residue used here is **Cys115** of the **3e7g** structure

In [156]:
# PLIP INPUT (contain wanted data)
plip_json_input = (
    # "gene_mapped_structures.json"
    spark.read.json("gene_mapped_structures.json")
    .select("pdbStructureId", "chains", f.explode("compoundIds").alias("pdbCompoundId"))
    .select("pdbStructureId", "pdbCompoundId", f.explode("chains").alias("chains"))

    )

plip_json_input_v2 = (plip_json_input
                    .withColumn("chainId", plip_json_input["chains.chainId"])
                    .withColumn("geneId", plip_json_input["chains.geneId"])
                    .withColumn("uniprotId", plip_json_input["chains.uniprot"])
                    .drop("chains")
                    )

# PLIP OUTPUT
plip_csv_output = (
    # "output.csv"
    spark.read.csv("output.csv", header=True, sep=",")
    .withColumnRenamed("pdb_structure_id", "pdbStructureId")
    .withColumnRenamed("compound_id", "pdbCompoundId")
    .withColumnRenamed("prot_chain_id", "chainId")
    )

# JOIN to have gene id (used for filter mapping file on the gene id)
plip_output_target_id = (
    plip_json_input_v2
    .join(plip_csv_output, on=["pdbStructureId", "chainId", "pdbCompoundId"])
    .withColumnRenamed("interaction_type", "intType")
    .withColumnRenamed("prot_residue_number", "protResNb")
    .withColumnRenamed("prot_residue_type", "protResType")
)

# Target df
target_df = (
    spark.read
    .parquet("../targets")
    .select("id", "genomicLocation")
    .withColumn("chromosome", f.col("genomicLocation.chromosome"))
    .withColumnRenamed("id", "geneId")
    .drop("genomicLocation")
)

plip_output_agg = (
    plip_output_target_id
    .join(target_df, on='geneId')
    .groupby([f.col('geneId'),
            f.col('uniprotId'),
            f.col("pdbStructureId").alias("pdbStructId")
            ])

    .agg(f.collect_set(f.struct(
            f.col('pdbCompoundId'),
            f.col('chromosome'),
            f.col('intType'),
            f.col('chainId'),
            f.col('protResType'),
            f.col('protResNb')))        
        .alias("chr, intType, chain, resType, resNb"),

        f.collect_set(f.col("pdbCompoundId")).alias("pdbCompId")
        )
    )


                                                                                

In [157]:
my_residue_of_interest = (
    plip_output_agg
    .filter(f.col("pdbStructId").rlike("3e7g"))
    .select("*", f.explode(f.col("chr, intType, chain, resType, resNb")).alias("resInfos"))
    .filter(f.col("resInfos.protResNb").rlike("115"))
    .filter(f.col("resInfos.chainId").rlike("A"))
    .withColumn("pdbCompoundId", f.col("resInfos.pdbCompoundId"))
    .withColumn("chromosome", f.col("resInfos.chromosome"))
    .withColumn("intType", f.col("resInfos.intType"))
    .withColumn("chainId", f.col("resInfos.chainId"))
    .withColumn("protResType", f.col("resInfos.protResType"))
    .withColumn("protResNb", f.col("resInfos.protResNb"))
    .drop("chr, intType, chain, resType, resNb", "pdbCompId", "resInfos")
)

In [158]:
my_residue_of_interest.show()

                                                                                

+---------------+---------+-----------+-------------+----------+-------------+-------+-----------+---------+
|         geneId|uniprotId|pdbStructId|pdbCompoundId|chromosome|      intType|chainId|protResType|protResNb|
+---------------+---------+-----------+-------------+----------+-------------+-------+-----------+---------+
|ENSG00000007171|   P35228|       3e7g|           ZN|        17|metal_complex|      A|        CYS|      115|
+---------------+---------+-----------+-------------+----------+-------------+-------+-----------+---------+



In [159]:
# Hard coding because the translation id in the xml file
pdb_struct_id = "3e7g"
geneId = "ENSG00000007171"
translationId = "ENSP00000327251"
protResNb = "115"

Residue genomic position which is working

In [160]:
url = f"https://rest.ensembl.org/map/translation/{translationId}/{protResNb}..{protResNb}?content-type=application/json"

response = requests.get(url)
res_api_info = response.json()

chr = res_api_info["mappings"][0]["seq_region_name"]
start_gen = res_api_info["mappings"][0]["start"]
end_gen = res_api_info["mappings"][0]["end"]
strand = res_api_info["mappings"][0]["strand"]

url = f"https://rest.ensembl.org/sequence/region/human/{chr}:{start_gen}..{end_gen}:{strand}?content-type=text/plain"

response = requests.get(url)
codon = response.text
my_rna = Seq(codon)
amino_acid_1 = str(my_rna.translate())
try:
    amino_acid_3 = IUPACData.protein_letters_1to3[amino_acid_1].upper()
except KeyError:
    amino_acid_3 = '*'

print(amino_acid_3)

CYS


Try to find the same genomic location with ensembl mapping graph api

In [161]:
def get_position(row: pd.Series) -> int:
    aa_in_range = row['protResNb'] - row['author_residue_start']
    base_in_range = aa_in_range * 3
    return base_in_range + row['genome_start']

In [162]:
# Quick conversion to pandas and integer
my_residue_of_interest_pd = my_residue_of_interest.toPandas().astype({'protResNb': 'int32'})

                                                                                

In [163]:
url = f'https://www.ebi.ac.uk/pdbe/graph-api/mappings/ensembl/{pdb_struct_id}'
headers={'Content-Type': 'application/json'}
response = requests.get(url, headers=headers)
e_mapping_file = response.json()

In [164]:
all_mappings = (
        # Extract mappings for all genes:
        pd.DataFrame(reduce(lambda x, y: x + y['mappings'], e_mapping_file[pdb_struct_id]['Ensembl'].values(), []))

        # Extract protein position:
        .assign(
            author_residue_start = lambda df: df.start.apply(lambda x: x['author_residue_number']),
            author_residue_end = lambda df: df.end.apply(lambda x: x['author_residue_number']),
        )

        # Selecting columns:
        [['chain_id', 'accession', 'genome_start', 'genome_end', 'author_residue_start', 'author_residue_end']]

        # Dropping isoforms:
        .assign(
            accession = lambda df: df.accession.str.replace(r'-\d+', '', regex=True),
            pdb_struct_id = pdb_struct_id
        )
        .drop_duplicates()

        .rename(columns={"chain_id": "chainId"})        
    )

In [165]:
# Joining mappings with plip data (Reduce the whole set to only same chain id)
map_plip_o_joined = all_mappings.merge(my_residue_of_interest_pd, on='chainId', how='inner')

In [166]:
# Only keep the residues that are in the range and compute the position
map_plip_o_joined_filt = (
    map_plip_o_joined
        # Filter for position match:
        .query('author_residue_start <= protResNb and author_residue_end >= protResNb')
        # Compute position:
        .assign(
            residue_start_genomic = lambda df: df.apply(get_position, axis=1)
        )
)

In [167]:
map_plip_o_joined_filt

Unnamed: 0,chainId,accession,genome_start,genome_end,author_residue_start,author_residue_end,pdb_struct_id,geneId,uniprotId,pdbStructId,pdbCompoundId,chromosome,intType,protResType,protResNb,residue_start_genomic
2,A,P35228,27787826,27787973,107,156,3e7g,ENSG00000007171,P35228,3e7g,ZN,17,metal_complex,CYS,115,27787850
3,A,PRO_0000170930,27787826,27787973,107,156,3e7g,ENSG00000007171,P35228,3e7g,ZN,17,metal_complex,CYS,115,27787850


In [169]:
# !! Strand manually entered !!
def determine_aa(row):
    chr = row[11]
    start_gen = row[2]
    end_gen = row[3]

    url = f"https://rest.ensembl.org/sequence/region/human/{chr}:{start_gen}..{end_gen}:-1?content-type=text/plain"

    response = requests.get(url)
    codon = response.text
    my_rna = Seq(codon)
    amino_acid_1 = str(my_rna.translate())
    try:
        amino_acid_3 = IUPACData.protein_letters_1to3[amino_acid_1].upper()
    except KeyError:
        amino_acid_3 = '*'

    return amino_acid_3

In [172]:
(
    map_plip_o_joined_filt
    .assign(
        protResTypeComputed = lambda df: df.apply(determine_aa, axis=1)
        )
)



Unnamed: 0,chainId,accession,genome_start,genome_end,author_residue_start,author_residue_end,pdb_struct_id,geneId,uniprotId,pdbStructId,pdbCompoundId,chromosome,intType,protResType,protResNb,residue_start_genomic,protResTypeComputed
2,A,P35228,27787826,27787973,107,156,3e7g,ENSG00000007171,P35228,3e7g,ZN,17,metal_complex,CYS,115,27787850,*
3,A,PRO_0000170930,27787826,27787973,107,156,3e7g,ENSG00000007171,P35228,3e7g,ZN,17,metal_complex,CYS,115,27787850,*
