In [None]:
import pandas as pd
import requests
from functools import reduce

"""
 pdb_structure_id    | 1d2s            
 prot_chain_id       | A               
 compound_id         | DHT             
 interaction_type    | hbond           
 prot_residue_number | 65              
 prot_residue_type   | ASP             
 target_id           | ENSG00000129214 
 uniprot_id          | P04278 
"""

In [None]:
# Example interaction taken from the PLIP output record shown above:
# structure id, protein chain, and residue number of the interacting residue.
pdb_id, chain_id, residue_no = '1d2s', 'A', 65

In [None]:
# Fetching data from PDB API: Ensembl mappings for the selected structure.
URL = f'https://www.ebi.ac.uk/pdbe/api/mappings/ensembl/{pdb_id}'
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(URL, timeout=30)
# Fail loudly on HTTP errors instead of parsing an error body as mapping data.
response.raise_for_status()
data = response.json()

In [None]:
all_mappings = (
    # Extract mappings for all genes (one row per mapping entry):
    pd.DataFrame(reduce(lambda x, y: x + y['mappings'], data[pdb_id]['Ensembl'].values(), []))

    # Extract protein position (author residue numbers bounding each entry):
    .assign(
        author_residue_start = lambda df: df.start.apply(lambda x: x['author_residue_number']),
        author_residue_end = lambda df: df.end.apply(lambda x: x['author_residue_number']),
    )

    # Selecting columns:
    [['chain_id', 'accession', 'genome_start', 'genome_end', 'author_residue_start', 'author_residue_end']]

    # Dropping isoforms: strip the '-<digits>' suffix from the accession.
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would leave the suffix in place.
    .assign(
        accession = lambda df: df.accession.str.replace(r'-\d+', '', regex=True)
    )
    .drop_duplicates()

    # Filter for chain ID (variable referenced with @ rather than interpolated into the query string):
    .query('chain_id == @chain_id')

    # Filter for position match:
    .query('author_residue_start <= @residue_no and author_residue_end >= @residue_no')
)
all_mappings

In [13]:
import argparse
from email import header
from json import JSONDecodeError
import logging
import psutil

import pandas as pd
import requests
from pandarallel import pandarallel
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType
from functools import reduce

from Bio.Data.CodonTable import CodonTable
from Bio.Seq import Seq
from Bio.SeqUtils import IUPACData


In [14]:
# Notebook-wide setup: a Spark session with master 'local[*]' and Pandarallel
# with as many workers as psutil reports CPUs, showing a progress bar.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

pandarallel.initialize(progress_bar=True, nb_workers=psutil.cpu_count())

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [15]:
def fetch_gapi_ensembl_mapping(row, timeout=30):
    """Fetch the PDBe graph-API Ensembl mapping of one structure and map its residues.

    Args:
        row: One row of the aggregated PLIP frame. Only position 2 (PDB
            structure id) and position 3 (collection of residue records) are read.
        timeout: Seconds to wait for the PDBe server before giving up.

    Returns:
        The DataFrame returned by ``filter_dict_file`` for this structure, or
        ``None`` when the request fails, the response is not JSON, or an
        expected key is missing from the mapping.
    """
    # Positional access that works for a pandas Series (through .iloc) as well
    # as for a plain sequence; integer [] on a label-indexed Series is deprecated.
    fields = getattr(row, 'iloc', row)
    pdb_struct_id = fields[2]
    residue_info = pd.DataFrame(fields[3]).values.tolist()

    url = f'https://www.ebi.ac.uk/pdbe/graph-api/mappings/ensembl/{pdb_struct_id}'
    # "Accept" (not "Content-Type") is the request header that asks for JSON.
    headers = {'Accept': 'application/json'}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # Best effort: one unreachable structure must not abort the whole apply.
        logging.warning('Request for structure %s failed: %s', pdb_struct_id, error)
        return None

    # startswith: the header may carry parameters such as "; charset=utf-8".
    content_type = response.headers.get('Content-Type', '')
    if not content_type.startswith('application/json'):
        return None

    try:
        e_mapping_file = response.json()
    except ValueError:
        # Body announced as JSON but not decodable.
        logging.warning('Undecodable JSON for structure %s', pdb_struct_id)
        return None

    try:
        return filter_dict_file(e_mapping_file, pdb_struct_id, residue_info)
    except KeyError:
        # The mapping lacks the structure id or another expected key.
        return None

In [51]:
def get_position(row: pd.Series) -> int:
    """Return the genomic coordinate computed for the residue described by ``row``.

    The residue's offset from the first residue of the mapped range
    (``resNb - author_residue_start``) is converted to bases (three per
    residue) and added to ``genome_start``.

    NOTE(review): the offset is always added to ``genome_start``; whether this
    is appropriate for reverse-strand genes should be confirmed.
    """
    base_offset = 3 * (row['resNb'] - row['author_residue_start'])
    return row['genome_start'] + base_offset

def filter_dict_file(e_mapping_file, pdb_struct_id, residue_infos):
    """Join PLIP residue records with the PDBe/Ensembl mappings of one structure.

    Args:
        e_mapping_file: Parsed mapping JSON, read as
            ``e_mapping_file[pdb_struct_id]['Ensembl'][<gene>]['mappings']``.
        pdb_struct_id: PDB structure id used as key into ``e_mapping_file``.
        residue_infos: Rows of six values, in the order pdbCompoundId, chr,
            intType, chainId, resType, resNb (resNb must be convertible to int).

    Returns:
        DataFrame with one row per (mapping entry, residue) pair that shares a
        chain and whose residue number lies inside the entry's author residue
        range, plus the column ``residue_start_genomic``. The frame is empty
        (but keeps all columns) when no residue matches.

    Raises:
        KeyError: if ``pdb_struct_id`` or ``'Ensembl'``/``'mappings'`` is
            missing from ``e_mapping_file``.
    """
    # Reading residue infos generated by plip + convert residue number to numeric
    residue_info = (
        pd.DataFrame(residue_infos, columns = ['pdbCompoundId', 'chr', 'intType', 'chainId', 'resType', 'resNb'])
        .astype({'resNb': 'int32'})
    )

    # Flatten the per-gene mapping lists into a single list of mapping entries.
    mappings = [
        mapping
        for gene in e_mapping_file[pdb_struct_id]['Ensembl'].values()
        for mapping in gene['mappings']
    ]

    all_mappings = (
        pd.DataFrame(mappings)

        # Extract protein position:
        .assign(
            author_residue_start = lambda df: df.start.apply(lambda x: x['author_residue_number']),
            author_residue_end = lambda df: df.end.apply(lambda x: x['author_residue_number'])
        )

        # Selecting columns:
        [['chain_id', 'accession', 'genome_start', 'genome_end', 'author_residue_start', 'author_residue_end']]

        # Dropping isoforms (strip the '-<digits>' accession suffix):
        .assign(
            accession = lambda df: df.accession.str.replace(r'-\d+', '', regex=True),
            pdb_struct_id = pdb_struct_id
        )
        .drop_duplicates()

        .rename(columns={"chain_id": "chainId"})
    )

    return (
        # Joining mappings with plip data:
        all_mappings.merge(residue_info, on='chainId', how='inner')

        # Filter for position match:
        .query('author_residue_start <= resNb and author_residue_end >= resNb')

        # Compute position. Same formula as get_position, but vectorised: a
        # row-wise apply on an empty frame returns a DataFrame, which cannot be
        # assigned to a single column, so the "no match" case used to fail.
        .assign(
            residue_start_genomic = lambda df: (df.resNb - df.author_residue_start) * 3 + df.genome_start
        )
    )



pandas.core.series.Series

In [84]:

# `mapped_positions` is expected to hold one mapping frame per structure, e.g. from:
#     mapped_positions = genomic_pos_pd.head(20).apply(fetch_gapi_ensembl_mapping, axis=1)
#     mapped_positions.iloc[1].head()
# Stack the per-structure frames into a single frame.
per_structure_frames = mapped_positions.to_list()
mapped_positions_df = pd.concat(per_structure_frames)

def get_gene_id(uniprot_id: str, timeout: float = 30) -> tuple:
    """Look up the Ensembl gene id cross-referenced to an accession.

    Args:
        uniprot_id: Accession sent to the Ensembl REST ``xrefs/symbol``
            endpoint for homo_sapiens.
        timeout: Seconds to wait for the Ensembl server.

    Returns:
        ``(gene_id, uniprot_id)`` where ``gene_id`` is the id of the first
        returned entry of type ``'gene'``, or ``None`` when there is no such
        entry or the service did not return a list.
    """
    URL = f'https://rest.ensembl.org/xrefs/symbol/homo_sapiens/{uniprot_id}?content-type=application/json'
    data = requests.get(URL, timeout=timeout).json()

    # Anything other than a list (e.g. an error object) would otherwise be
    # iterated key by key and fail with a TypeError on x['type'].
    if not isinstance(data, list):
        return (None, uniprot_id)

    gene_id = next((x['id'] for x in data if x.get('type') == 'gene'), None)
    return (gene_id, uniprot_id)

def get_gene_info(ensembl_gene_id: str, uniprot_id: str) -> dict:
    """Fetch basic gene coordinates from the Ensembl REST ``lookup/id`` endpoint.

    Returns an empty dict when ``ensembl_gene_id`` is None. Otherwise returns
    the fields start, end, seq_region_name, strand and id of the lookup
    response (none of them if any is missing), plus ``accession`` set to
    ``uniprot_id``.
    """
    # Nothing to look up without a gene id.
    if ensembl_gene_id is None:
        return {}

    wanted_fields = ['start', 'end', 'seq_region_name', 'strand', 'id']
    lookup_url = f'https://rest.ensembl.org/lookup/id/{ensembl_gene_id}?content-type=application/json'
    payload = requests.get(lookup_url).json()

    gene_info = {}
    try:
        for field in wanted_fields:
            gene_info[field] = payload[field]
    except KeyError:
        # All-or-nothing: discard partially collected fields.
        gene_info = {}

    gene_info['accession'] = uniprot_id
    return gene_info

# Resolve each distinct accession to its gene record (two REST calls per accession).
unique_accessions = pd.Series(mapped_positions_df.accession.unique())
gene_infos = unique_accessions.apply(lambda accession: get_gene_info(*get_gene_id(accession)))

# One row per accession; rows with any missing field are dropped.
gene_info_df = pd.DataFrame(gene_infos.to_list()).dropna()

# Keep only mapped positions whose accession has a complete gene record.
annotated_rows = mapped_positions_df.merge(gene_info_df, how='inner', on='accession')

In [93]:
# QC 1: rows where the chromosome of the looked-up gene (seq_region_name)
# differs from the chromosome carried along with the residue (chr).
print('positions on the wrong chromosome:')
print(
    annotated_rows
    .loc[annotated_rows.seq_region_name != annotated_rows.chr]
)

# QC 2: rows whose computed position is not within [start, end] of the gene.
# The whole conjunction is built first and negated afterwards: unary ~ binds
# tighter than &, so `~ (a) & (b)` would mean `(~a) & b` and never report
# positions lying beyond the gene end.
inside_gene = (
    (annotated_rows.start <= annotated_rows.residue_start_genomic) &
    (annotated_rows.end >= annotated_rows.residue_start_genomic)
)
print('Mapped location outside gene:')
print(
    annotated_rows
    .loc[~inside_gene]
)

annotated_rows[['strand', 'id']].drop_duplicates()

positions on the wrong chromosome:
Empty DataFrame
Columns: [chainId, accession, genome_start, genome_end, author_residue_start, author_residue_end, pdb_struct_id, pdbCompoundId, chr, intType, resType, resNb, residue_start_genomic, start, end, seq_region_name, strand, id]
Index: []
Mapped location outside gene:
Empty DataFrame
Columns: [chainId, accession, genome_start, genome_end, author_residue_start, author_residue_end, pdb_struct_id, pdbCompoundId, chr, intType, resType, resNb, residue_start_genomic, start, end, seq_region_name, strand, id]
Index: []


Unnamed: 0,strand,id
0,-1.0,ENSG00000007171
45,-1.0,ENSG00000096060
52,1.0,ENSG00000106723
63,1.0,ENSG00000175793
67,-1.0,ENSG00000101347
110,1.0,ENSG00000101966
114,-1.0,ENSG00000133703
164,1.0,ENSG00000168487
167,-1.0,ENSG00000148219
170,-1.0,ENSG00000186642


In [183]:
def get_res_sequence(row, timeout: float = 30) -> str:
    """Fetch three bases at a computed position from Ensembl and translate them.

    Args:
        row: Mapping with the keys 'chr', 'residue_start_genomic' and 'strand'.
        timeout: Seconds to wait for the Ensembl server.

    Returns:
        The upper-case three-letter code of the translated amino acid, or
        '*' when the translated symbol has no three-letter code.
    """
    chromosome = row['chr']  # local name avoids shadowing the builtin chr()
    strand = int(row['strand'])
    # The requested window starts one base before the computed position and
    # spans three bases.
    region_start = row['residue_start_genomic'] - 1
    URL = f'https://rest.ensembl.org/sequence/region/human/{chromosome}:{region_start}..{region_start + 2}:{strand}?content-type=text/plain'
    # strip(): surrounding whitespace in the plain-text body must not reach translate().
    codon = requests.get(URL, timeout=timeout).text.strip()
    amino_acid_1 = str(Seq(codon).translate())
    return IUPACData.protein_letters_1to3.get(amino_acid_1, '*').upper()

# Check on the first 50 rows with strand == -1: translate the bases fetched for
# each distinct position and list the rows where the result differs from resType.
reverse_strand_rows = annotated_rows.query('strand == -1').head(50)

amino_acid_translations = (
    reverse_strand_rows[['chr', 'strand', 'residue_start_genomic']]
    .drop_duplicates()
    .assign(calculated_aa = lambda df: df.apply(get_res_sequence, axis=1))
)

(
    reverse_strand_rows
    .merge(amino_acid_translations, how='left', on=['chr', 'strand', 'residue_start_genomic'])
    .query('calculated_aa != resType')
    .reset_index(drop=True)
)

Unnamed: 0,chainId,accession,genome_start,genome_end,author_residue_start,author_residue_end,pdb_struct_id,pdbCompoundId,chr,intType,resType,resNb,residue_start_genomic,start,end,seq_region_name,strand,id,calculated_aa
0,A,P35228,27787826,27787973,107,156,3e7g,ZN,17,metal_complex,CYS,115,27787850,27756766.0,27800529.0,17,-1.0,ENSG00000007171,SER
1,A,P35228,27787826,27787973,107,156,3e7g,ZN,17,metal_complex,CYS,110,27787835,27756766.0,27800529.0,17,-1.0,ENSG00000007171,LEU
2,A,P35228,27779056,27779230,335,393,3e7g,AT2,17,hydroph_interaction,PRO,350,27779101,27756766.0,27800529.0,17,-1.0,ENSG00000007171,LYS
3,A,P35228,27779056,27779230,335,393,3e7g,AT2,17,hbond,GLU,377,27779182,27756766.0,27800529.0,17,-1.0,ENSG00000007171,LEU
4,A,P35228,27779056,27779230,335,393,3e7g,AT2,17,hydroph_interaction,PHE,369,27779158,27756766.0,27800529.0,17,-1.0,ENSG00000007171,LEU
5,A,P35228,27779056,27779230,335,393,3e7g,H4B,17,hbond,ARG,381,27779194,27756766.0,27800529.0,17,-1.0,ENSG00000007171,GLY
6,A,P35228,27774451,27774643,428,492,3e7g,H4B,17,pication,TRP,463,27774556,27756766.0,27800529.0,17,-1.0,ENSG00000007171,ASP
7,A,P35228,27774451,27774643,428,492,3e7g,H4B,17,hydroph_interaction,GLU,479,27774604,27756766.0,27800529.0,17,-1.0,ENSG00000007171,ILE
8,A,P35228,27774451,27774643,428,492,3e7g,H4B,17,hbond,ILE,462,27774553,27756766.0,27800529.0,17,-1.0,ENSG00000007171,CYS
9,A,P35228,27774451,27774643,428,492,3e7g,H4B,17,hbond,TRP,463,27774556,27756766.0,27800529.0,17,-1.0,ENSG00000007171,ASP


In [173]:
# Translate the bases fetched for each distinct position among the first five
# annotated rows and list the rows where the result differs from resType.
first_rows = annotated_rows.head()

amino_acid_translations = (
    first_rows[['chr', 'strand', 'residue_start_genomic']]
    .drop_duplicates()
    .assign(calculated_aa = lambda df: df.apply(get_res_sequence, axis=1))
)

(
    first_rows
    .merge(amino_acid_translations, how='left', on=['chr', 'strand', 'residue_start_genomic'])
    .query('calculated_aa != resType')
)


Unnamed: 0,chainId,accession,genome_start,genome_end,author_residue_start,author_residue_end,pdb_struct_id,pdbCompoundId,chr,intType,resType,resNb,residue_start_genomic,start,end,seq_region_name,strand,id,calculated_aa
0,A,P35228,27787826,27787973,107,156,3e7g,ZN,17,metal_complex,CYS,115,27787850,27756766.0,27800529.0,17,-1.0,ENSG00000007171,LEU
1,A,P35228,27787826,27787973,107,156,3e7g,ZN,17,metal_complex,CYS,110,27787835,27756766.0,27800529.0,17,-1.0,ENSG00000007171,SER
2,A,P35228,27779056,27779230,335,393,3e7g,AT2,17,hydroph_interaction,PRO,350,27779101,27756766.0,27800529.0,17,-1.0,ENSG00000007171,*
3,A,P35228,27779056,27779230,335,393,3e7g,AT2,17,hbond,GLU,377,27779182,27756766.0,27800529.0,17,-1.0,ENSG00000007171,VAL
4,A,P35228,27779056,27779230,335,393,3e7g,AT2,17,hydroph_interaction,PHE,369,27779158,27756766.0,27800529.0,17,-1.0,ENSG00000007171,THR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,B,Q9UIF8,159332686,159332830,1933,1981,6fi1,ZN,2,metal_complex,CYS,1949,159332734,159318979.0,159616569.0,2,-1.0,ENSG00000123636,
219,B,Q9UIF8,159332686,159332830,1933,1981,6fi1,ZN,2,metal_complex,CYS,1978,159332821,159318979.0,159616569.0,2,-1.0,ENSG00000123636,
220,B,Q9UIF8,159332686,159332830,1933,1981,6fi1,ZN,2,metal_complex,CYS,1975,159332812,159318979.0,159616569.0,2,-1.0,ENSG00000123636,
221,B,Q9UIF8,159332686,159332830,1933,1981,6fi1,ZN,2,metal_complex,CYS,1952,159332743,159318979.0,159616569.0,2,-1.0,ENSG00000123636,


In [96]:
# NOTE(review): `annotated_rows_codon` is not defined in any visible cell, and
# the mask mixes it with `annotated_rows` — confirm both frames share one index.
print('Amino Acids obtained from position computations:')
# Build the conjunction first and negate it afterwards: unary ~ binds tighter
# than &, so `~ (a) & (b)` would evaluate as `(~a) & b`.
inside_gene = (
    (annotated_rows_codon.start <= annotated_rows.residue_start_genomic) &
    (annotated_rows_codon.end >= annotated_rows.residue_start_genomic)
)
print(
    annotated_rows_codon
    .loc[~inside_gene]
)

annotated_rows[['strand', 'id']].drop_duplicates()

residue_start_genomic
27787850     <function get_res_sequence at 0x13b871310>
27787835     <function get_res_sequence at 0x13b871310>
27779101     <function get_res_sequence at 0x13b871310>
27779182     <function get_res_sequence at 0x13b871310>
27779158     <function get_res_sequence at 0x13b871310>
                                ...                    
159332734    <function get_res_sequence at 0x13b871310>
159332821    <function get_res_sequence at 0x13b871310>
159332812    <function get_res_sequence at 0x13b871310>
159332743    <function get_res_sequence at 0x13b871310>
31993982     <function get_res_sequence at 0x13b871310>
Name: chr, Length: 223, dtype: object

In [17]:
# PLIP INPUT (contain wanted data)
# Read "gene_mapped_structures.json" and explode it twice, so that each row
# holds one (structure, compound, chain) combination.
structures_by_compound = (
    spark.read.json('gene_mapped_structures.json')
    .select("pdbStructureId", "chains", f.explode("compoundIds").alias("pdbCompoundId"))
)
plip_json_input = structures_by_compound.select(
    "pdbStructureId", "pdbCompoundId", f.explode("chains").alias("chains")
)

# Promote the fields of the chain struct to top-level columns and drop the struct.
plip_json_input_v2 = plip_json_input.select(
    "pdbStructureId",
    "pdbCompoundId",
    f.col("chains.chainId").alias("chainId"),
    f.col("chains.geneId").alias("geneId"),
    f.col("chains.uniprot").alias("uniprotId"),
)

                                                                                

In [18]:
# PLIP OUTPUT
# Read "output.csv" and rename its key columns to the names used by the PLIP input frame.
plip_csv_renames = {
    "pdb_structure_id": "pdbStructureId",
    "compound_id": "pdbCompoundId",
    "prot_chain_id": "chainId",
}
plip_csv_output = reduce(
    lambda frame, names: frame.withColumnRenamed(*names),
    plip_csv_renames.items(),
    spark.read.csv('output.csv', header=True, sep=","),
)

In [19]:
# JOIN to have gene id (used for filter mapping file on the gene id)
plip_join_keys = ["pdbStructureId", "chainId", "pdbCompoundId"]
plip_output_target_id = plip_json_input_v2.join(plip_csv_output, on=plip_join_keys)

# Shorten the residue/interaction column names.
for plip_name, short_name in (
    ("interaction_type", "intType"),
    ("prot_residue_number", "protResNb"),
    ("prot_residue_type", "protResType"),
):
    plip_output_target_id = plip_output_target_id.withColumnRenamed(plip_name, short_name)

In [20]:
# Target df: gene id and chromosome of every target in the "../targets" parquet.
target_df = (
    spark.read
    .parquet("../targets")
    .select(
        f.col("id").alias("geneId"),
        f.col("genomicLocation.chromosome").alias("chromosome"),
    )
)

In [21]:
# One record per interacting residue. The field order matches the column order
# filter_dict_file assigns to these values (compound, chr, intType, chain, type, number).
residue_record = f.struct(
    f.col('pdbCompoundId'),
    f.col('chromosome'),
    f.col('intType'),
    f.col('chainId'),
    f.col('protResType'),
    f.col('protResNb'),
)

# Attach the chromosome of each gene, then collect per (gene, accession,
# structure) the distinct residue records and the distinct compound ids.
plip_output_agg = (
    plip_output_target_id
    .join(target_df, on='geneId')
    .groupby(
        f.col('geneId'),
        f.col('uniprotId'),
        f.col("pdbStructureId").alias("pdbStructId"),
    )
    .agg(
        f.collect_set(residue_record).alias("chr, intType, chain, resType, resNb"),
        f.collect_set(f.col("pdbCompoundId")).alias("pdbCompId"),
    )
)

In [22]:
# Work on a small seeded sample. Note that this rebinds the same name, so
# re-running the cell samples the already sampled frame again.
plip_output_agg = plip_output_agg.sample(fraction=0.001, seed=3)

In [23]:
# Print the first rows of the sampled aggregate (show() defaults spelled out).
plip_output_agg.show(n=20, truncate=True)

                                                                                

+---------------+---------+-----------+-----------------------------------+--------------+
|         geneId|uniprotId|pdbStructId|chr, intType, chain, resType, resNb|     pdbCompId|
+---------------+---------+-----------+-----------------------------------+--------------+
|ENSG00000007171|   P35228|       3e7g|               [{AT2, 17, hbond,...|[H4B, AT2, ZN]|
|ENSG00000096060|   Q13451|       3o5r|               [{FK5, 6, hydroph...|         [FK5]|
|ENSG00000106723|   Q9Y657|       5y5w|               [{M3L, 9, picatio...|         [M3L]|
|ENSG00000175793|   P31947|       6xwd|               [{SEP, 1, saltbri...|         [SEP]|
|ENSG00000101347|   Q9Y3Z3|       4q7h|               [{GTP, 20, hbond,...|     [GTP, ZN]|
|ENSG00000101966|   P98170|       2poi|               [{ZN, X, metal_co...|          [ZN]|
|ENSG00000133703|   P01116|       6mta|               [{GNP, 12, hbond,...|         [GNP]|
|ENSG00000168487|   P13497|       6btn|               [{ZN, 8, metal_co...|          [ZN]|

In [36]:
# Pandas Apply
# Materialise the aggregated Spark frame in pandas so that each structure row
# can be passed to fetch_gapi_ensembl_mapping. This definition used to be
# commented out, which made the cell fail on a fresh kernel.
genomic_pos_pd = plip_output_agg.toPandas()

# NOTE(review): a later cell reads a "resInfos" column that no visible cell
# creates; the unfinished assignment below suggests it was meant to hold these results.
# genomic_pos_pd["resInfos"] = genomic_pos_pd.head().apply(

# Preview: map the residues of the first two structures.
genomic_pos_pd.head(2).apply(fetch_gapi_ensembl_mapping, axis=1)

[['AT2', '17', 'hbond', 'C', 'GLU', '377'], ['AT2', '17', 'hydroph_interaction', 'A', 'PRO', '350'], ['H4B', '17', 'pication', 'A', 'TRP', '463'], ['AT2', '17', 'hydroph_interaction', 'B', 'PHE', '369'], ['H4B', '17', 'hbond', 'C', 'TRP', '463'], ['AT2', '17', 'hbond', 'A', 'GLU', '377'], ['AT2', '17', 'hbond', 'B', 'GLU', '377'], ['AT2', '17', 'hydroph_interaction', 'D', 'PHE', '369'], ['H4B', '17', 'pication', 'B', 'TRP', '463'], ['H4B', '17', 'pistack', 'D', 'TRP', '463'], ['AT2', '17', 'hbond', 'D', 'GLU', '377'], ['H4B', '17', 'hydroph_interaction', 'D', 'GLU', '479'], ['ZN', '17', 'metal_complex', 'C', 'CYS', '110'], ['ZN', '17', 'metal_complex', 'B', 'CYS', '115'], ['AT2', '17', 'hydroph_interaction', 'C', 'PHE', '369'], ['H4B', '17', 'hbond', 'B', 'TRP', '463'], ['H4B', '17', 'pication', 'C', 'TRP', '463'], ['H4B', '17', 'hydroph_interaction', 'A', 'GLU', '479'], ['H4B', '17', 'hbond', 'A', 'ILE', '462'], ['H4B', '17', 'hbond', 'D', 'ARG', '381'], ['AT2', '17', 'hydroph_interac

  accession = lambda df: df.accession.str.replace(r'-\d+', '')


[['FK5', '6', 'hydroph_interaction', 'A', 'TYR', '113'], ['FK5', '6', 'hydroph_interaction', 'A', 'TYR', '57'], ['FK5', '6', 'hbond', 'A', 'ILE', '87'], ['FK5', '6', 'hydroph_interaction', 'A', 'VAL', '86'], ['FK5', '6', 'hydroph_interaction', 'A', 'ILE', '122'], ['FK5', '6', 'hydroph_interaction', 'A', 'TRP', '90'], ['FK5', '6', 'hydroph_interaction', 'A', 'PHE', '77']]


  accession = lambda df: df.accession.str.replace(r'-\d+', '')


0    [[A, P35228, 27787826, 27787973, 107, 156, ZN,...
1    [[A, Q13451, 35637158, 35637302, 36, 84, FK5, ...
dtype: object

In [29]:
# Convert the pandas frame back to Spark and print its first two records,
# untruncated and in vertical layout.
genomic_pos_preview = spark.createDataFrame(genomic_pos_pd)
genomic_pos_preview.show(n=2, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Final DF: keep the identifier and residue columns, drop rows with a missing
# value, emit one row per element of resInfos, and write the result as JSON records.
final_columns = ["geneId", "pdbStructId", "resInfos"]
genomic_pos_rm_null_pd = genomic_pos_pd[final_columns].dropna()
final_df = genomic_pos_rm_null_pd.explode('resInfos')

output_path = args.output_folder + "/residue_genomic_position_2.json"
final_df.to_json(output_path, orient="records")

In [None]:
# First toy frame for the merge demonstration (ids '1' to '5').
dummy_data1 = dict(
    id=list('12345'),
    Feature1=list('ACEGI'),
    Feature2=list('BDFHJ'),
)
df1 = pd.DataFrame(dummy_data1, columns=['id', 'Feature1', 'Feature2'])

In [None]:
# Second toy frame; only ids '1' and '2' also occur in the first frame.
dummy_data2 = dict(
    id=list('12678'),
    Feature1=list('KMOQS'),
    Feature2=list('LNPRT'),
)
df2 = pd.DataFrame(dummy_data2, columns=['id', 'Feature1', 'Feature2'])

In [None]:
# Inner join on 'id': keeps only the ids present in both toy frames.
df_inner = df1.merge(df2, how='inner', on='id')


In [None]:
# Display the result of the inner merge.
df_inner