### Write json file for plip output

In [1]:
import pandas as pd
import json
import requests

from pyspark.sql.types import ArrayType, StringType, IntegerType, StructType, StructField
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# establish spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/05 12:36:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [11]:
pd.set_option('display.max_rows', 100)

In [2]:
df = (
    spark.read.csv('output.csv', sep=',', header=True)
    # Somehow there are duplications:
    .distinct() 
    .groupBy(['pdb_structure_id', 'compound_id', 'prot_residue_number','prot_chain_id', 'prot_residue_type'])
    .agg(
        f.collect_set(f.col('interaction_type')).alias('interaction_types')
    )
)

                                                                                

In [None]:
# df.write.json('plip_output_aggregated.json')

In [None]:
# %%bash 

# cat /Users/dsuveges/project_data/marine/plip_output_aggregated.json/*json \
#     | gzip > /Users/dsuveges/project_data/marine/plip_output_aggregated.json.gz

In [3]:
import pandas as pd
from functools import reduce

# 319408 -> unique 265603
df = (
    pd.read_json('plip_output_aggregated.json.gz', orient='records', lines=True)
)

df.head()

Unnamed: 0,pdb_structure_id,compound_id,prot_residue_number,prot_chain_id,prot_residue_type,interaction_types
0,11gs,GSH,52,A,LEU,[hbond]
1,13gs,GSH,52,A,LEU,[hbond]
2,13gs,SAS,10,A,VAL,[hydroph_interaction]
3,13gs,SAS,13,A,ARG,[saltbridge]
4,13gs,SAS,35,B,VAL,[hbond]


### Group by pdb structure and take only one group (one structure)

In [4]:
# Grouping data by pdb structure id:
grouped = df.groupby('pdb_structure_id')

# Selecting one of the groups:
test_df = grouped.get_group('3e7g')
test_df.head()

Unnamed: 0,pdb_structure_id,compound_id,prot_residue_number,prot_chain_id,prot_residue_type,interaction_types
8288,3e7g,AT2,347,B,TYR,[hbond]
8289,3e7g,AT2,350,A,PRO,[hydroph_interaction]
8290,3e7g,AT2,350,B,PRO,[hydroph_interaction]
8291,3e7g,AT2,373,C,TYR,[hbond]
8292,3e7g,H4B,463,A,TRP,"[pication, hbond]"


### The function to get uniprot residue position and Uniprot accession

In [5]:
def get_pdb_sifts_mapping(pdb_id: str) -> pd.DataFrame:
    URL = f'https://www.ebi.ac.uk/pdbe/graph-api/mappings/ensembl/{pdb_id}'
    data = requests.get(URL).json()

    return (
        pd.DataFrame(reduce(lambda x,y: x + y['mappings'], data[pdb_id]['Ensembl'].values(), []))
        .assign(
            author_start = lambda df: df.start.apply(lambda start: start['author_residue_number']),
            author_end = lambda df: df.end.apply(lambda end: end['author_residue_number']),
            uniprot_position = lambda df: df.apply(lambda row: list(range(row['unp_start'], row['unp_end']+1)), axis=1),
            diff = lambda df: df.apply(lambda row: row['author_start'] - row['unp_start'], axis=1)
        )
        .explode('uniprot_position')
        .assign(
            prot_residue_number = lambda df: df.apply(lambda row: row['uniprot_position'] + row['diff'], axis=1)
        )
        [['accession', 'chain_id', 'uniprot_position', 'prot_residue_number']]
        .rename(columns={'chain_id': 'prot_chain_id'})
        .drop_duplicates()
    )


# def map2uniprot(plip_df: pd.DataFrame) -> pd.DataFrame:
#     pdb_id = plip_df.pdb_id.iloc

pdb_id = test_df.pdb_structure_id.iloc[0]
sifts_df = get_pdb_sifts_mapping(pdb_id)
sifts_df.head()

Unnamed: 0,accession,prot_chain_id,uniprot_position,prot_residue_number
0,P35228-2,A,82,82
0,P35228-2,A,83,83
0,P35228-2,A,84,84
0,P35228-2,A,85,85
0,P35228-2,A,86,86


### Merge the output of the function "sifts_df" with the input of the fuction (one group) to obtain the other infos on the residue

In [6]:
residue_pos_df = (
    test_df
    .merge(sifts_df, on=['prot_chain_id', 'prot_residue_number'], how='left')
)
residue_pos_df

Unnamed: 0,pdb_structure_id,compound_id,prot_residue_number,prot_chain_id,prot_residue_type,interaction_types,accession,uniprot_position
0,3e7g,AT2,347,B,TYR,[hbond],P35228-2,308
1,3e7g,AT2,347,B,TYR,[hbond],PRO_0000170930,347
2,3e7g,AT2,347,B,TYR,[hbond],P35228,347
3,3e7g,AT2,350,A,PRO,[hydroph_interaction],P35228-2,311
4,3e7g,AT2,350,A,PRO,[hydroph_interaction],PRO_0000170930,350
...,...,...,...,...,...,...,...,...
115,3e7g,H4B,479,D,GLU,[hydroph_interaction],P35228,479
116,3e7g,H4B,479,D,GLU,[hydroph_interaction],P35228-2,440
117,3e7g,ZN,115,C,CYS,[metal_complex],P35228-2,115
118,3e7g,ZN,115,C,CYS,[metal_complex],PRO_0000170930,115


### Open the crossref Uniprot Ensembl to add the Ensembl id

In [7]:
schema = StructType([
    StructField("UniProtKB-AC", StringType(), True),
    StructField("ID_type", StringType(), True),
    StructField("ID", StringType(), True)
    ])

cross_ref_uniprot = (
                    spark
                        .read.csv("../cross_ref_uniprot_ensembl_prot/HUMAN_9606_idmapping.tsv", sep="\t", schema=schema)
                        .filter(f.col('ID_type').rlike('Ensembl_PRO'))
                        .select(f.col('UniProtKB-AC'), f.col('ID'))
                        .withColumnRenamed("UniProtKB-AC", "accession")
                        .withColumnRenamed("ID", "ensemblProtId")
                        .toPandas()
    )
# 6289868 rows full dataset
# 115247 rows filtered dataset
cross_ref_uniprot
# Splice variants (different UTR regions but same sequence so, should be the same genomic location)

                                                                                

Unnamed: 0,accession,ensemblProtId
0,P31946,ENSP00000300161
1,P31946,ENSP00000361930
2,P62258,ENSP00000264335
3,P62258-2,ENSP00000461762
4,P62258-2,ENSP00000481059
...,...,...
115242,E9PPE7,ENSP00000432733
115243,E9PRZ4,ENSP00000432213
115244,A0A5F9ZHV4,ENSP00000500622
115245,A0A7P0T8F6,ENSP00000505116


### Merge the crossref df with the residue position df on Uniprot id

In [8]:
residue_pos_df = (
    cross_ref_uniprot
    .merge(residue_pos_df, on='accession', how='inner')
    # .drop(["accession"])
)
residue_pos_df

Unnamed: 0,accession,ensemblProtId,pdb_structure_id,compound_id,prot_residue_number,prot_chain_id,prot_residue_type,interaction_types,uniprot_position
0,P35228,ENSP00000327251,3e7g,AT2,347,B,TYR,[hbond],347
1,P35228,ENSP00000327251,3e7g,AT2,350,A,PRO,[hydroph_interaction],350
2,P35228,ENSP00000327251,3e7g,AT2,350,B,PRO,[hydroph_interaction],350
3,P35228,ENSP00000327251,3e7g,AT2,373,C,TYR,[hbond],373
4,P35228,ENSP00000327251,3e7g,H4B,463,A,TRP,"[pication, hbond]",463
...,...,...,...,...,...,...,...,...,...
75,P35228-2,ENSP00000482291,3e7g,H4B,381,D,ARG,[hbond],342
76,P35228-2,ENSP00000482291,3e7g,H4B,463,D,TRP,"[pication, hbond, pistack]",424
77,P35228-2,ENSP00000482291,3e7g,AT2,377,C,GLU,[hbond],338
78,P35228-2,ENSP00000482291,3e7g,H4B,479,D,GLU,[hydroph_interaction],440


### Open the mapping residue index to genomic position file

In [9]:
generated_mapping = (
    pd.read_csv("residue_gen_pos_output/generated_mappings.tsv.gz", sep="\t")
    .rename(columns={"protein_id": "ensemblProtId", "gene_id": "geneId", "amino_acid_position": "uniprot_position"})
)
generated_mapping

  pd.read_csv("residue_gen_pos_output/generated_mappings.tsv.gz", sep="\t")


Unnamed: 0,pos1,pos2,pos3,ensemblProtId,geneId,chr,strand,uniprot_position
0,127588499,127588500,127588501,ENSP00000000233,ENSG00000004059,7,+,1
1,127588502,127588503,127588504,ENSP00000000233,ENSG00000004059,7,+,2
2,127588505,127588506,127588507,ENSP00000000233,ENSG00000004059,7,+,3
3,127588508,127588509,127588510,ENSP00000000233,ENSG00000004059,7,+,4
4,127588511,127588512,127588513,ENSP00000000233,ENSG00000004059,7,+,5
...,...,...,...,...,...,...,...,...
42439476,22731559,22731560,22731561,ENSP00000512964,ENSG00000136244,7,+,263
42439477,22731562,22731563,22731564,ENSP00000512964,ENSG00000136244,7,+,264
42439478,22731565,22731566,22731567,ENSP00000512964,ENSG00000136244,7,+,265
42439479,22731568,22731569,22731570,ENSP00000512964,ENSG00000136244,7,+,266


### Merge the mapping residue to genomic position DF WITH the residue position with the good ensembl protein id DF

In [12]:
residue_gen_pos_df = (
    residue_pos_df
    .merge(generated_mapping, on=['ensemblProtId', 'uniprot_position'], how='inner')
    # .drop(["accession"])
)
residue_gen_pos_df

Unnamed: 0,accession,ensemblProtId,pdb_structure_id,compound_id,prot_residue_number,prot_chain_id,prot_residue_type,interaction_types,uniprot_position,pos1,pos2,pos3,geneId,chr,strand
0,P35228,ENSP00000327251,3e7g,AT2,347,B,TYR,[hbond],347,27765712,27765711,27765710,ENSG00000007171,17,-
1,P35228,ENSP00000327251,3e7g,AT2,347,C,TYR,[hbond],347,27765712,27765711,27765710,ENSG00000007171,17,-
2,P35228,ENSP00000327251,3e7g,AT2,347,D,TYR,[hbond],347,27765712,27765711,27765710,ENSG00000007171,17,-
3,P35228,ENSP00000327251,3e7g,AT2,350,A,PRO,[hydroph_interaction],350,27765703,27765702,27765701,ENSG00000007171,17,-
4,P35228,ENSP00000327251,3e7g,AT2,350,B,PRO,[hydroph_interaction],350,27765703,27765702,27765701,ENSG00000007171,17,-
5,P35228,ENSP00000327251,3e7g,AT2,350,D,PRO,[hydroph_interaction],350,27765703,27765702,27765701,ENSG00000007171,17,-
6,P35228,ENSP00000327251,3e7g,AT2,350,C,PRO,[hydroph_interaction],350,27765703,27765702,27765701,ENSG00000007171,17,-
7,P35228,ENSP00000327251,3e7g,AT2,373,C,TYR,[hbond],373,27765634,27765633,27765632,ENSG00000007171,17,-
8,P35228,ENSP00000327251,3e7g,AT2,373,D,TYR,[hbond],373,27765634,27765633,27765632,ENSG00000007171,17,-
9,P35228,ENSP00000327251,3e7g,H4B,463,A,TRP,"[pication, hbond]",463,27767746,27767745,27767744,ENSG00000007171,17,-
