### Investigation of potential new mechanisms of action or indications

In [2]:
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType
import pandas as pd

In [3]:
# Start (or reuse) a Spark session running in local mode (`local[*]`), with
# spark.driver.memory set to 15g.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "15g")
    .master('local[*]')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/17 09:54:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/17 09:54:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Read the compound/variant/disease evidence, expose `name` as `targetName`,
# and discard every row that has a null in any column.
disease_evidence = (
    spark.read
    .parquet("pdb2variants2disease")
    .withColumnRenamed('name', 'targetName')
    .dropna(how="any")
)
disease_evidence.show(n=1)

+-----------+-----------------+--------------------+------+--------------------+------------+
|pdbCompound|       variantIds|         diseaseName|symbol|          targetName|    drugName|
+-----------+-----------------+--------------------+------+--------------------+------------+
|        CJH|[16_72014532_C_T]|Postaxial acrofac...| DHODH|dihydroorotate de...|CHEMBL483995|
+-----------+-----------------+--------------------+------+--------------------+------------+
only showing top 1 row



In [69]:
# Target index: keep only the identifier and the approved name, renamed to the
# column names the later joins rely on (`targetId`, `targetName`).
targets = (
    spark.read
    .parquet('../../targets')
    .select('id', 'approvedName')
    .toDF('targetId', 'targetName')
    .persist()
)
targets.show(n=1)

# Disease index: keep only the identifier and the name, renamed to the column
# names the later joins rely on (`diseaseId`, `diseaseName`).
diseases = (
    spark.read
    .parquet('../../diseases')
    .select('id', 'name')
    .toDF('diseaseId', 'diseaseName')
    .persist()
)
diseases.show(n=1)

# Molecule index: identifier, InChIKey, name (exposed as `drugName`) and the
# linkedTargets / linkedDiseases columns.
molecule = (
    spark.read
    .parquet("../../molecule/")
    .select(
        'id',
        f.col('inchiKey').alias('inchikey'),
        f.col('name').alias('drugName'),
        'linkedTargets',
        'linkedDiseases',
    )
    .persist()
)
molecule.show(n=1)

22/05/17 12:42:43 WARN CacheManager: Asked to cache already cached data.
22/05/17 12:42:43 WARN CacheManager: Asked to cache already cached data.


+---------------+--------------------+
|       targetId|          targetName|
+---------------+--------------------+
|ENSG00000002016|RAD52 homolog, DN...|
+---------------+--------------------+
only showing top 1 row

+----------+--------------------+
| diseaseId|         diseaseName|
+----------+--------------------+
|GO_0044238|primary metabolic...|
+----------+--------------------+
only showing top 1 row

+----------+--------------------+----------+-------------+--------------+
|        id|            inchikey|  drugName|linkedTargets|linkedDiseases|
+----------+--------------------+----------+-------------+--------------+
|CHEMBL1006|JKOQGQFVAUAYPM-UH...|AMIFOSTINE|         null|          null|
+----------+--------------------+----------+-------------+--------------+
only showing top 1 row



22/05/17 12:42:43 WARN CacheManager: Asked to cache already cached data.


In [97]:
# Drugs that have linked targets: one row per linked target ID, resolved to the
# target name through the target index, then collapsed to a set of names per drug.
drugs_w_linked_target_names = (
    molecule
    .where(f.col('linkedTargets').isNotNull())
    .select(
        f.col('id').alias('drugId'),
        f.explode(f.col('linkedTargets.rows')).alias('targetId'),
    )
    .join(targets, 'targetId', 'left')
    .groupBy('drugId')
    .agg(f.collect_set('targetName').alias('linkedTargetName'))
    .persist()
)

# Drugs that have linked diseases: one row per linked disease ID, resolved to the
# disease name through the disease index, then collapsed to a set of names per drug.
drugs_w_linked_disease_names = (
    molecule
    .where(f.col('linkedDiseases').isNotNull())
    .select(
        f.col('id').alias('drugId'),
        f.explode(f.col('linkedDiseases.rows')).alias('diseaseId'),
    )
    .join(diseases, 'diseaseId', 'left')
    .groupBy('drugId')
    .agg(f.collect_set('diseaseName').alias('linkedDiseaseName'))
    .persist()
)

# Every molecule (ID + name) with its linked disease names and linked target
# names attached; molecules lacking either keep a null in that column.
resolved_molecules = (
    molecule
    .select(f.col('id').alias('drugId'), 'drugName')
    .join(drugs_w_linked_disease_names, 'drugId', 'left')
    .join(drugs_w_linked_target_names, 'drugId', 'left')
)

# Attach the resolved molecule annotation to each evidence row, matching on the
# drug name; evidence without a matching molecule is kept (left join).
joined_mapped_linked = (
    disease_evidence
    .select(
        'drugName',
        f.col('diseaseName').alias('evidenceDiseaseName'),
        f.col('targetName').alias('interactingTargetName'),
        'variantIds',
    )
    .join(resolved_molecules, 'drugName', 'left')
    .persist()
)
joined_mapped_linked.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 drugName              | CHEMBL121790                                                                                                                                                                                                                                                                                                                        
 evidenceDiseaseName   | marfan syndrome/loeys-dietz syndrome/familial thoracic aortic aneurysms and dissections                                                                                                                                                                                            

22/05/17 14:07:19 WARN CacheManager: Asked to cache already cached data.
22/05/17 14:07:19 WARN CacheManager: Asked to cache already cached data.
22/05/17 14:07:19 WARN CacheManager: Asked to cache already cached data.


In [25]:
# Distinct variant IDs among the evidence rows whose drug has linked diseases,
# collected to the driver.
v_list = (
    joined_mapped_linked
    .where(f.col('linkedDiseaseName').isNotNull())
    .select(f.explode(f.col('variantIds')).alias('variantId'))
    .distinct()
    .collect()
)
# Same IDs with the underscores swapped for spaces.
variant_list = [row['variantId'].replace('_', ' ') for row in v_list]

In [27]:
# Get residue nb and type
import io
import requests
import pandas as pd

def get_variant_info_protvar(variant_id, timeout=None):
    """Download ProtVar annotations for the given variants as a pandas DataFrame.

    Args:
        variant_id: JSON-serialisable payload posted to the ProtVar download
            endpoint (the notebook passes a list of variant strings).
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            The default ``None`` waits indefinitely, as before.

    Returns:
        DataFrame restricted to the columns User_input, Gene, Codon_change,
        Amino_acid_change, Protein_name, Amino_acid_position and Consequences.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
        KeyError: if the returned CSV lacks one of the expected columns.
    """
    URL = 'https://www.ebi.ac.uk/ProtVar/api/download/stream'

    headers = {
        'accept': '*/*',
    }

    # The function/population/structure annotation blocks are switched off.
    params = {
        'function': 'false',
        'population': 'false',
        'structure': 'false',
    }

    columns = [
        'User_input', 'Gene', 'Codon_change', 'Amino_acid_change',
        'Protein_name', 'Amino_acid_position', 'Consequences',
    ]

    response = requests.post(
        URL, params=params, headers=headers, json=variant_id, timeout=timeout
    )
    # Fail here on an error status rather than parsing an error body as CSV.
    response.raise_for_status()

    table = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    return table[columns]

# Alternative kept for reference: load the ProtVar table straight into Spark.
# protVarMappings = spark.createDataFrame(get_variant_info_protvar(variant_list))
# protVarMappings.show()

# Query ProtVar once for the whole variant list and keep the result in pandas.
protVarMappings_pdf = get_variant_info_protvar(variant_list)
protVarMappings_pdf.head()

Unnamed: 0,User_input,Gene,Codon_change,Amino_acid_change,Protein_name,Amino_acid_position,Consequences
0,18 31598592 G A,TTR,Ggc/Agc,Gly/Ser,Transthyretin,121.0,missense
1,4 73412007 G A,ALB,cGc/cAc,Arg/His,Albumin,242.0,missense
2,18 31595247 C A,TTR,Cau/Aau,His/Asn,Transthyretin,110.0,missense
3,4 73412007 G C,ALB,cGc/cCc,Arg/Pro,Albumin,242.0,missense
4,18 31595242 A G,TTR,cAu/cGu,His/Arg,Transthyretin,108.0,missense


In [54]:
def get_mutation(row):
    """Build a mutation label such as ``Gly121Ser`` from one ProtVar row.

    The integer residue position replaces every ``/`` in the amino-acid
    change, so ``'Gly/Ser'`` at position ``121.0`` becomes ``'Gly121Ser'``.

    Args:
        row: Mapping-like row exposing ``'Amino_acid_position'`` and
            ``'Amino_acid_change'``.

    Returns:
        The mutation label, or ``None`` when the position is not convertible
        to an integer (e.g. NaN) or the change is missing or not a string.
    """
    try:
        position = int(row['Amino_acid_position'])
        return row['Amino_acid_change'].replace('/', str(position))
    # Only data problems are tolerated; KeyboardInterrupt and SystemExit
    # now propagate instead of being swallowed by a bare ``except``.
    except (ValueError, TypeError, OverflowError, AttributeError, KeyError):
        return None


# variantId (underscore-separated) -> space-joined mutation labels. Rows for
# which get_mutation returned nothing are left out of the joined string.
protVar_map = (
    protVarMappings_pdf
    .assign(
        variantId=lambda frame: frame['User_input'].str.replace(' ', '_'),
        mutation=lambda frame: frame.apply(get_mutation, axis=1),
    )
    .loc[:, ['variantId', 'mutation']]
    .groupby('variantId')['mutation']
    .agg(lambda labels: ' '.join(labels.dropna().to_list()))
    .to_dict()
)
protVar_map

{'10_102834074_G_A': 'Arg239*',
 '10_102835316_C_T': 'Arg125Gln',
 '10_102835350_A_C': 'Phe114Val',
 '10_102837075_C_T': 'Arg96Gln',
 '10_102837076_G_A': 'Arg96Trp',
 '10_103090689_A_T': 'Tyr457*',
 '10_103101286_G_A': 'Arg144*',
 '10_43116635_C_G': 'Leu730Val',
 '10_43116637_A_G': 'Leu730Leu',
 '10_43116661_G_A': 'Val738Val',
 '10_43118450_A_G': 'Ile788Val',
 '10_43119548_G_A': 'Val804Met',
 '10_43119548_G_C': 'Val804Leu',
 '10_43119548_G_T': 'Val804Leu',
 '10_43119550_G_A': 'Val804Val',
 '10_43119550_G_T': 'Val804Val',
 '10_43119551_G_C': 'Glu805Gln',
 '10_43119554_T_C': 'Tyr806His',
 '10_43119555_A_G': 'Tyr806Cys',
 '10_43119556_C_T': 'Tyr806Tyr',
 '10_43119557_G_T': 'Ala807Ser',
 '10_43119558_C_T': 'Ala807Val',
 '10_43119570_C_G': 'Ser811Cys',
 '10_43120114_C_G': 'Leu881Val',
 '10_43120116_G_A': 'Leu881Leu',
 '10_43120149_T_C': 'Asp892Asp',
 '10_92607199_G_T': 'Gly117Cys',
 '10_92607231_G_A': 'Trp127*',
 '10_93600698_C_T': 'Ala73Thr',
 '10_93600920_G_C': 'Arg37Gly',
 '10_94947919_T

In [122]:
# Scratch check of dict behaviour: start with a single entry and print it.
ca = {'ici': 'et la'}
print(ca)

{'ici': 'et la'}


In [123]:
# Scratch inputs: the keys 1..6 and a lookup table from each key to a word.
tests = list(range(1, 7))
test_dict = dict(zip(tests, ['un', 'deux', 'trois', 'quatre', 'cinq', 'six']))

In [124]:
# Scratch check: add to `ca` every key of `tests` that has an entry in `test_dict`.
ca.update((number, test_dict[number]) for number in tests if number in test_dict)
print(ca)

{'ici': 'et la', 1: 'un', 2: 'deux', 3: 'trois', 4: 'quatre', 5: 'cinq', 6: 'six'}


In [127]:
def _join_known_mutations(variantIds):
    """Return the space-joined ``protVar_map`` entries for ``variantIds``.

    Variant IDs that are not keys of ``protVar_map`` are skipped. A null
    array yields null instead of raising ``TypeError`` inside the UDF.
    """
    if variantIds is None:
        return None
    return ' '.join(protVar_map[variant] for variant in variantIds if variant in protVar_map)


# Spark UDF wrapper: array of variant IDs -> string of mutation labels.
map_mutations = f.udf(_join_known_mutations, StringType())

# Evidence rows whose drug has linked diseases, annotated with the mutations
# of their variants.
w_mutations = (
    joined_mapped_linked
    .filter(f.col('linkedDiseaseName').isNotNull())
    .withColumn('mutations', map_mutations(f.col('variantIds')))
    .persist()
)

w_mutations.show(5, False, True)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 drugName              | INDOMETHACIN                                                                                                     

22/05/17 14:44:31 WARN CacheManager: Asked to cache already cached data.


In [89]:
# Inspect the column names and types of the mutation-annotated evidence.
w_mutations.printSchema()

root
 |-- drugName: string (nullable = true)
 |-- evidenceDiseaseName: string (nullable = true)
 |-- interactingTargetName: string (nullable = true)
 |-- variantIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- drugId: string (nullable = true)
 |-- linkedDiseaseName: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- linkedTargetName: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- mutations: string (nullable = true)



In [67]:
# One record per (drug, interacting target) pair: the sets of evidence diseases
# and mutation strings, plus the first drugId / linked names seen in the group.
(
    w_mutations
    .groupBy(['drugName', 'interactingTargetName'])
    .agg(
        f.collect_set('evidenceDiseaseName').alias('evidenceDiseaseNames'),
        f.first('drugId'),
        f.first('linkedDiseaseName'),
        f.first('linkedTargetName'),
        f.collect_set('mutations'),
    )
    # TODO: derive a 'newDiseases' column (a compareList() helper is not defined yet).
    .show(n=20, truncate=False, vertical=True)
)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [19]:
# NOTE(review): `joined_mapped_linked_pd` and its `mappedtargetName` column are not
# defined anywhere in the visible notebook; presumably they are left over from an
# earlier kernel session. Confirm before re-running from a fresh kernel.
#
# For each row, keep the elements of `mappedtargetName` that are not in
# `linkedTargetName` and store them in `new_values`:
interesting_joined_mapped_linked = (
    joined_mapped_linked_pd
    .assign(new_values = lambda df: df.apply(lambda row: [a for a in row['mappedtargetName'] if a not in row['linkedTargetName']], axis=1))
)
# Convert the pandas result back into a Spark DataFrame.
interesting_joined_mapped_linked_sp = spark.createDataFrame(interesting_joined_mapped_linked)
# Export is disabled; uncomment to write the result as parquet.
# interesting_joined_mapped_linked_sp.write.parquet('interesting_drug_repurpose')

In [20]:
# Number of rows in the Spark copy of the new-target candidates.
interesting_joined_mapped_linked_sp.count()

717