In [2]:
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

# Attach to the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Colocalisation table, keeping only rows whose right_type is something other
# than "gwas".
coloc = (
    spark.read.parquet("gs://genetics-portal-dev-data/22.09.1/outputs/v2d_coloc")
    .where(F.col("right_type") != "gwas")
)

                                                                                

In [4]:
### load ot_genetics_portal 

# High-impact consequence terms, keyed by Sequence Ontology id.
# Source: https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html
_HIGH_IMPACT_CONSEQUENCES = {
    "SO_0001589": "frameshift_variant",
    "SO_0001587": "stop_gained",
    "SO_0001574": "splice_acceptor_variant",
    "SO_0001575": "splice_donor_variant",
    "SO_0002012": "start_lost",
    "SO_0001578": "stop_lost",
    "SO_0001893": "transcript_ablation",
}

# Ids only, in the order listed above.
var_filter_lof = list(_HIGH_IMPACT_CONSEQUENCES)

# Single-term id lists; per their names, the gain- and loss-of-function terms.
gof, lof = ["SO_0002053"], ["SO_0002054"]

# Genetics Portal evidence (read from the 23.06 release path), trimmed to the
# columns used here and annotated with two derived labels.
evd = (
    spark.read.parquet(
        "gs://open-targets-data-releases/23.06/output/etl/parquet/evidence/sourceId=ot_genetics_portal"
    )
    .select(
        "studyId",
        F.col("variantId").alias("locusId"),
        "targetId",
        "diseaseId",
        "oddsRatio",
        "beta",
        "variantFunctionalConsequenceFromQtlId",
        "variantFunctionalConsequenceId",
        # "score",
    )
    # Direction of effect: a positive beta or an odds ratio above 1 is "risk";
    # otherwise a negative beta or an odds ratio below 1 is "protective".
    .withColumn(
        "effect",
        F.when((F.col("beta") > 0) | (F.col("oddsRatio") > 1), F.lit("risk"))
        .when((F.col("beta") < 0) | (F.col("oddsRatio") < 1), F.lit("protective"))
        .otherwise(F.lit("unknown")),  # 23% of evidence has unknown direction of effect
    )
    # "lof" when the variant consequence is one of the high-impact terms in
    # var_filter_lof, "noInfo" otherwise.
    .withColumn(
        "functionalVar",
        F.when(
            F.col("variantFunctionalConsequenceId").isin(var_filter_lof),
            F.lit("lof"),
        ).otherwise("noInfo"),
    )
    .persist()
)

In [5]:
### load v2d for exploring

v2d = spark.read.parquet("gs://genetics-portal-dev-data/22.09.1/outputs/v2d")

# Study-level view of each lead variant, keyed by a "<chrom>_<pos>_<ref>_<alt>"
# locus id so it can be matched against the coloc locus ids.
# NOTE(review): v2d is expected to hold one row per lead/tag variant pair
# (confirm against the table schema). Selecting only lead- and study-level
# columns would then repeat every row once per tag variant, and joining this
# frame onto another table by (left_locus_id, study_id) would multiply that
# table's rows. distinct() collapses the repeats; it is a no-op if there are none.
v2d_gwas = v2d.select(
    F.concat_ws("_", "lead_chrom", "lead_pos", "lead_ref", "lead_alt").alias(
        "left_locus_id"
    ),
    F.col("beta"),
    F.col("odds_ratio"),
    F.col("study_id"),
    F.col("trait_reported"),
    F.col("trait_efos"),
    F.col("n_cases"),
    F.col("n_initial"),
).distinct()

In [6]:
### load QTL tissues mapped to therapy areas
# The first CSV row supplies the column names.
onto_samples = spark.read.option("header", True).csv(
    "hdfs:///mydata/updated/mappedSQLtissuesGTP3_5.csv"
)

                                                                                

In [7]:
# Preview the raw tissue-mapping table as loaded from the CSV.
onto_samples.show()

+--------------------+--------------------+--------------------+----+----+
|            original|             curated|  curated_simplified| _c3| _c4|
+--------------------+--------------------+--------------------+----+----+
|Whole_Blood - EFO...|Whole_Blood - EFO...|Whole_Blood - EFO...|null|null|
|VAGINA - OTAR_000...|Vagina - OTAR_000...|Vagina - OTAR_000...|null|null|
|Vagina - OTAR_000...|Vagina - OTAR_000...|Vagina - OTAR_000...|null|null|
|UTERUS - OTAR_000...|Uterus - OTAR_000...|Uterus - OTAR_000...|null|null|
|Uterus - OTAR_000...|Uterus - OTAR_000...|Uterus - OTAR_000...|null|null|
|UBERON_0001969 - ...|UBERON_0001969 - ...|UBERON_0001969 - ...|null|null|
|UBERON_0000178 - ...|UBERON_0000178 - ...|UBERON_0000178 - ...|null|null|
|TREG_NAIVE - EFO_...|TREG_NAIVE - EFO_...|TREG_NAIVE - EFO_...|null|null|
|TREG_MEMORY - EFO...|TREG_MEMORY - EFO...|TREG_MEMORY - EFO...|null|null|
|TRANSVERSE_COLON ...|TRANSVERSE_COLON ...|TRANSVERSE_COLON ...|null|null|
|THYROID - EFO_000...|THY

In [8]:
### take ontology of samples
# "original" and "curated_simplified" are " - "-separated strings: take field 0
# of "original" as the biofeature name, and fields 1 and 2 of
# "curated_simplified" as the EFO id and therapy area. The raw columns are
# dropped afterwards.
_curated_fields = F.split(F.col("curated_simplified"), " - ")
samplesOnto = (
    onto_samples
    .withColumn("right_bio_feature", F.split(F.col("original"), " - ").getItem(0))
    .withColumn("therapyArea", _curated_fields.getItem(2))
    .withColumn("EFO", _curated_fields.getItem(1))
    .drop("curated_simplified", "original", "curated", "_c3", "_c4")
)

In [9]:
# Inspect the parsed biofeature -> therapy area / EFO mapping (up to 200 rows).
samplesOnto.show(200)

[Stage 6:>                                                          (0 + 1) / 1]

+--------------------+--------------------+------------+
|   right_bio_feature|         therapyArea|         EFO|
+--------------------+--------------------+------------+
|         Whole_Blood| hematologic disease| EFO_0005803|
|              VAGINA|reproductive syst...|OTAR_0000017|
|              Vagina|reproductive syst...|OTAR_0000017|
|              UTERUS|reproductive syst...|OTAR_0000017|
|              Uterus|reproductive syst...|OTAR_0000017|
|      UBERON_0001969| hematologic disease| EFO_0005803|
|      UBERON_0000178| hematologic disease| EFO_0005803|
|          TREG_NAIVE|immune system dis...| EFO_0000540|
|         TREG_MEMORY|immune system dis...| EFO_0000540|
|    TRANSVERSE_COLON|gastrointestinal ...| EFO_0010282|
|             THYROID|endocrine system ...| EFO_0001379|
|             Thyroid|endocrine system ...| EFO_0001379|
|          TH2_MEMORY|immune system dis...| EFO_0000540|
|         TH17_MEMORY|immune system dis...| EFO_0000540|
|       TH1-17_MEMORY|immune sy

                                                                                

In [10]:
#### make format for left_locus and right locus in coloc
## assessment of beta
## join biosamples with the phenotype
_qtl_beta = F.col("left_var_right_study_beta")

coloc2 = (
    coloc.select(
        # Locus ids are "<chrom>_<pos>_<ref>_<alt>" strings.
        F.concat_ws("_", "left_chrom", "left_pos", "left_ref", "left_alt").alias(
            "left_locus_id"
        ),
        F.concat_ws("_", "right_chrom", "right_pos", "right_ref", "right_alt").alias(
            "right_locus_id"
        ),
        F.col("left_study").alias("left_study_id"),
        F.col("right_study").alias("right_study_id"),
        "right_gene_id",
        "coloc_h4",
        "left_var_right_study_beta",
        "left_type",
        "right_type",
        "right_bio_feature",
        "is_flipped",
    )
    # Label by the sign of the beta: positive -> "gof", negative -> "lof",
    # anything else (e.g. zero or null) -> "neutral".
    .withColumn(
        "beta_assessed",
        F.when(_qtl_beta > 0, F.lit("gof"))
        .when(_qtl_beta < 0, F.lit("lof"))
        .otherwise(F.lit("neutral")),
    )
    # Left join: coloc rows without a match in samplesOnto are kept, with null
    # therapyArea / EFO.
    .join(samplesOnto, on=["right_bio_feature"], how="left")
)

In [11]:
### check for disparities using count of different assessment for beta for target
_pair_keys = ["left_locus_id", "left_study_id", "right_gene_id"]

# Number of distinct beta_assessed labels seen for each (locus, study, gene).
disparities = coloc2.groupBy(*_pair_keys).agg(
    F.size(F.collect_set("beta_assessed")).alias("count")
)

### add the label of which left_locus_id, left_study_id and right_gene_id are having contradictions
coloc3 = coloc2.join(disparities, on=_pair_keys, how="left")

### dataset focusing only on the disparities (more than one label), with the
### number of distinct right studies and biofeatures behind each
coloc4 = (
    coloc3.filter(F.col("count") > 1)
    .groupBy(*_pair_keys)
    .agg(
        F.size(F.collect_set("right_study_id")),
        F.size(F.collect_set("right_bio_feature")),
    )
)

In [12]:
## we should include the study right type because is the one that could give us contradictions
# Rows from groups with more than one beta_assessed label, ignoring neutral rows.
_contradicting = coloc3.filter(
    (F.col("count") > 1) & (F.col("beta_assessed") != "neutral")
)

# One output row per (locus, study, gene, right study); the pivot yields one set
# of tissues / therArea / nr_tissues columns per beta_assessed label.
(
    _contradicting
    .groupBy("left_locus_id", "left_study_id", "right_gene_id", "right_study_id")
    .pivot("beta_assessed")
    .agg(
        F.collect_set("right_bio_feature").alias("tissues"),
        F.collect_set("therapyArea").alias("therArea"),
        F.size(F.collect_set("right_bio_feature")).alias("nr_tissues"),
    )
    .show(vertical=True, truncate=False)
)

### groupBy therapeutic area
###

                                                                                

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 left_locus_id  | 10_100152307_T_C                                                                                                                                                                                                              

                                                                                

In [24]:
# Per (locus, study, gene) summary of the non-neutral rows from groups that
# carry more than one beta_assessed label: tissues, therapy areas and number of
# right studies behind each direction, plus the therapy areas shared by both.
# right_study_id is deliberately not a grouping key here; distinct studies are
# counted per direction instead.
coloc_study = (
    coloc3.filter((F.col("count") > 1) & (F.col("beta_assessed") != "neutral"))
    .groupBy("left_locus_id", "left_study_id", "right_gene_id")
    # Listing the pivot values spares Spark the extra pass it needs to discover
    # them, and guarantees that the gof_* / lof_* columns used below exist
    # whatever the data contains.
    .pivot("beta_assessed", ["gof", "lof"])
    .agg(
        F.collect_set("right_bio_feature").alias("tissues"),
        F.collect_set("therapyArea").alias("therArea"),
        F.size(F.collect_set("right_bio_feature")).alias("nr_tissues"),
        F.size(F.collect_set("therapyArea")).alias("nr_therArea"),
        F.countDistinct("right_study_id").alias("count_Studies"),
    )
    # Therapy areas that appear under both directions for the same group.
    .withColumn(
        "therAreaOverlap",
        F.array_intersect(F.col("gof_therArea"), F.col("lof_therArea")),
    )
    # Size of the overlap, taken from the column above instead of recomputing
    # the intersection.
    .withColumn("nr_therAreaOverlap", F.size(F.col("therAreaOverlap")))
    .persist()
)

                                                                                

In [25]:
# Number of (locus, study, gene) rows in the coloc_study summary.
coloc_study.count()

                                                                                

259827

In [26]:
# How many summary rows fall at each therapy-area overlap size.
_overlap_sizes = coloc_study.groupBy("nr_therAreaOverlap").count()
_overlap_sizes.show()

+------------------+------+
|nr_therAreaOverlap| count|
+------------------+------+
|                12|    81|
|                 1| 71626|
|                 6|  2246|
|                 3|  7598|
|                 5|  3031|
|                 9|   738|
|                 4|  5228|
|                 8|  1148|
|                 7|  1700|
|                10|   453|
|                11|   314|
|                 2| 16867|
|                 0|148777|
|                13|    20|
+------------------+------+



In [12]:
# Number of distinct right-hand studies present in coloc3.
coloc3.select("right_study_id").dropDuplicates().count()

                                                                                

38

In [97]:
# Export the distinct right_bio_feature values found in coloc3 to a local CSV.
# Written with the pandas defaults, so the index column is included.
_tissues_pdf = coloc3.select("right_bio_feature").distinct().toPandas()
_tissues_pdf.to_csv("tissues_coloc.csv")

                                                                                

In [68]:
# NOTE(review): 243220 + 16607 = 259827, which equals the coloc_study.count()
# output recorded in this notebook. Where the two addends and the 666261
# denominator come from is not shown -- TODO derive them in code or name them.
print((243220+16607)/666261)

0.389977801492208


In [38]:
# Attach the GWAS study-level fields from v2d_gwas to every coloc row, matching
# on the left locus and left study; unmatched coloc rows are kept (left join).
# NOTE(review): this rebinds coloc3. The frame previously bound to that name
# carried the disparity "count" column, which this one does not, so cells that
# filter on "count" must run before this cell.
_gwas_by_left_study = v2d_gwas.withColumnRenamed("study_id", "left_study_id")
coloc3 = coloc2.join(
    _gwas_by_left_study,
    on=["left_locus_id", "left_study_id"],
    how="left",
).persist()

In [39]:
# Inspect the rows flagged as flipped, one field per line and untruncated.
_flipped = coloc3.where(F.col("is_flipped") == "true")
_flipped.show(vertical=True, truncate=False)

[Stage 42:>                                                         (0 + 1) / 1]

-RECORD 0---------------------------------------------
 left_locus_id             | 10_100152307_T_C         
 left_study_id             | GCST90025955             
 right_locus_id            | 10_100129660_T_C         
 right_study_id            | GTEx-sQTL                
 right_gene_id             | ENSG00000155287          
 coloc_h4                  | 0.9678297263356717       
 left_var_right_study_beta | -0.4308566451072693      
 left_type                 | gwas                     
 right_type                | sqtl                     
 right_bio_feature         | Brain_Cortex             
 is_flipped                | true                     
 beta                      | 0.0280807                
 odds_ratio                | null                     
 trait_reported            | Apolipoprotein A1 levels 
 trait_efos                | [EFO_0004614]            
 n_cases                   | null                     
 n_initial                 | 398508                   
-RECORD 1-

                                                                                

In [46]:
# Distinct genes per (left locus, right locus, left study), largest first.
_genes_per_pair = coloc3.groupBy(
    "left_locus_id", "right_locus_id", "left_study_id"
).agg(F.countDistinct("right_gene_id").alias("count"))
_genes_per_pair.orderBy(F.desc("count")).show()

23/09/14 10:43:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_155_27 !
23/09/14 10:43:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_155_7 !
23/09/14 10:43:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_155_95 !
23/09/14 10:43:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_155_121 !
23/09/14 10:43:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_155_197 !
23/09/14 10:43:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_155_91 !
23/09/14 10:43:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_155_81 !
23/09/14 10:43:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_155_160 !
23/09/14 10:43:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_155_116 !
23/09/14 10:43:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_155_54 !
23/09/14 10:43:09 WARN BlockManagerMasterEndpoi

KeyboardInterrupt: 

