In [None]:
#### 

import pyspark
from pyspark.sql import DataFrame, SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window

from psutil import virtual_memory
from pyspark import SparkFiles
from pyspark.conf import SparkConf
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col


def detect_spark_memory_limit():
    """Return how many whole GiB of memory Spark should be allowed to use.

    Spark does not automatically use all available memory on a machine. When working on large datasets, this may
    cause Java heap space errors, even though there is plenty of RAM available. To fix this, we detect the total amount
    of physical memory and allow Spark to use (almost) all of it.

    Returns:
        int: 90% of the total physical memory in GiB, rounded down, but never less than 1 so that the
        "<n>g" memory settings built from this value are never "0g" on machines with under 2 GiB of RAM.
    """
    # `>> 30` converts bytes to whole GiB (integer division by 2**30).
    total_gib = virtual_memory().total >> 30
    return max(1, int(total_gib * 0.9))


# Give the driver and the executor the same detected memory budget.
spark_mem_limit = detect_spark_memory_limit()
_spark_memory_setting = f"{spark_mem_limit}g"

spark_conf = SparkConf()
spark_conf.set("spark.driver.memory", _spark_memory_setting)
spark_conf.set("spark.executor.memory", _spark_memory_setting)
spark_conf.set("spark.driver.maxResultSize", "0")
spark_conf.set("spark.debug.maxToStringFields", "2000000000")
spark_conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "500000")
# Disabled: spark_conf.set("spark.executor.heartbeatInterval", "3600s")
# Arrow is enabled because of this issue:
# https://stackoverflow.com/questions/69973790/pyspark-spark-sparkexception-job-aborted-due-to-stage-failure-task-0-in-stage
spark_conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark_conf.set("spark.ui.showConsoleProgress", "false")

# Local session bound to the loopback interface (run locally).
_session_builder = SparkSession.builder.config(conf=spark_conf).master("local[*]")
_session_builder = _session_builder.config("spark.driver.bindAddress", "127.0.0.1")
_session_builder = _session_builder.config("spark.driver.host", "localhost")
spark = _session_builder.getOrCreate()

In [None]:
# 1. Sequence Ontology (SO) terms used to subset the variants of interest.
#
# Note: the ontology writes identifiers as "SO:XXXXXXX", but the datasets store them as "SO_XXXXXXX".

# High-impact variant consequences, see
# https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html
var_filter_lof = [
    "SO_0001589",  # frameshift_variant
    "SO_0001587",  # stop_gained
    "SO_0001574",  # splice_acceptor_variant
    "SO_0001575",  # splice_donor_variant
    "SO_0002012",  # start_lost
    "SO_0001578",  # stop_lost
    "SO_0001893",  # transcript_ablation
    # "SO:0001889",  # transcript_amplification: the only HIGH impact term that increases protein; left out.
]

# Single-term lists (wrapped in lists so they can be turned into array literals later).
gof = ["SO_0002053"]
lof = ["SO_0002054"]
## Build the Sequence Ontology parent -> children lookup
so_path = "/Users/juanr/Desktop/Target_Engine/data_download/sequenceOntology_20221118.csv"
so_ontology = spark.read.csv(so_path, header=True)

# One row per (Accession, single parent): 'Parents' is split on "," and exploded.
# explode_outer keeps terms whose 'Parents' is null (their 'Parentalind' is null).
_so_parent_links = so_ontology.select(
    "Accession",
    F.explode_outer(F.split(F.col("Parents"), ",")).alias("Parentalind"),
)

# Collect the children of every parent, then right-join back onto the full ontology so that
# every ontology row is kept, including terms that are nobody's parent.
_so_children_by_parent = _so_parent_links.groupBy("Parentalind").agg(
    F.collect_list("Accession").alias("childrens")
)
building = _so_children_by_parent.join(
    so_ontology,
    F.col("Parentalind") == so_ontology.Accession,
    "right",
)


### Load the evidence data sources (one parquet directory per `sourceId=` partition).
# All locations are derived from a few root constants so the notebook can be pointed at another
# download by editing them in one place. The resulting path values are unchanged.

TARGET_ENGINE_DIR = "/Users/juanr/Desktop/Target_Engine"
EVIDENCE_DIR = f"{TARGET_ENGINE_DIR}/DownloadFebruary_Release23.02/evidence"
JANUARY_DOWNLOAD_DIR = f"{TARGET_ENGINE_DIR}/downloadedEvidencesJanuary"

otgenetics_evidence_path = f"{EVIDENCE_DIR}/sourceId=ot_genetics_portal"
otgenetics = spark.read.parquet(otgenetics_evidence_path)
gene_burden_path = f"{EVIDENCE_DIR}/sourceId=gene_burden"
gene_burden = spark.read.parquet(gene_burden_path)
eva_path = f"{EVIDENCE_DIR}/sourceId=eva"
eva_germline = spark.read.parquet(eva_path)
eva_somatic_path = f"{EVIDENCE_DIR}/sourceId=eva_somatic"
eva_somatic = spark.read.parquet(eva_somatic_path)
gel_path = f"{EVIDENCE_DIR}/sourceId=genomics_england"
gel = spark.read.parquet(gel_path)
g2p_path = f"{EVIDENCE_DIR}/sourceId=gene2phenotype"
g2p = spark.read.parquet(g2p_path)
uniprot_path = f"{EVIDENCE_DIR}/sourceId=uniprot_literature"
uniprot = spark.read.parquet(uniprot_path)
uniprotvar_path = f"{EVIDENCE_DIR}/sourceId=uniprot_variants"
uniprotvar = spark.read.parquet(uniprotvar_path)
orphanet_path = f"{EVIDENCE_DIR}/sourceId=orphanet"
orphanet = spark.read.parquet(orphanet_path)
clingen_path = f"{EVIDENCE_DIR}/sourceId=clingen"
clingen = spark.read.parquet(clingen_path)
cgc_path = f"{EVIDENCE_DIR}/sourceId=cancer_gene_census"
cgc = spark.read.parquet(cgc_path)
intogen_path = f"{EVIDENCE_DIR}/sourceId=intogen"
intogen = spark.read.parquet(intogen_path)
impc_path = f"{EVIDENCE_DIR}/sourceId=impc"
impc = spark.read.parquet(impc_path)
# The trailing slash is kept exactly as in the original path.
chembl_evidences = f"{EVIDENCE_DIR}/sourceId=chembl/"
chembl = spark.read.parquet(chembl_evidences)


## Other (non-evidence) datasets
target_path = f"{JANUARY_DOWNLOAD_DIR}/targets/"
target = spark.read.parquet(target_path)
disease_path = f"{TARGET_ENGINE_DIR}/data_download/Parquet/diseases/"
diseases = spark.read.parquet(disease_path)
dis_name = diseases.select('id', 'name')
indication_path = f"{JANUARY_DOWNLOAD_DIR}/indication/"
indication = spark.read.parquet(indication_path)
drug_path = f"{JANUARY_DOWNLOAD_DIR}/molecule/"
drug = spark.read.parquet(drug_path)
mecact_path = f"{JANUARY_DOWNLOAD_DIR}/mechanismOfAction/"
mecact = spark.read.parquet(mecact_path)

#### GENE BURDEN

### Studies using LoF or PTV variants were annotated manually (CSV below).
# The join column is renamed to 'stMethod'.
burden_lof_path = "/Users/juanr/Desktop/Target_Engine/Conteo_estudios_geneBurden_20230117.csv"
burden_lof = (
    spark.read.csv(burden_lof_path, header=True)
    .withColumnRenamed('statisticalMethodOverview', 'stMethod')
)

#### For gene burden no variant filtering has to be applied.

### EVA/ClinVar

##- Manual annotation of which clinicalSignificances are meaningful: pathogenic, risk factor, protective

clinSign_germline_path = "/Users/juanr/Desktop/Target_Engine/eva_clinSig_20230117.csv"
clinSign_somatic_path = "/Users/juanr/Desktop/Target_Engine/eva_somatic_clinSig_20230117.csv"

clinSign_germline = (
    spark.read.csv(clinSign_germline_path, header=True)
    .withColumnRenamed('clinicalSignificances', 'significances')
)
clinSign_somatic = (
    spark.read.csv(clinSign_somatic_path, header=True)
    .withColumnRenamed('clinicalSignificances', 'significances')
)

##- Turn the clinicalSignificances array into one comma-joined string so it can be inspected.
_clin_sig_joined = F.concat_ws(",", F.col("clinicalSignificances"))

eva_somatic_toAsses = eva_somatic.withColumn('clinicalSignificances', _clin_sig_joined)
eva_germline_toAsses = eva_germline.withColumn('clinicalSignificances', _clin_sig_joined)


## Annotate each target as TSG / oncogene / bivalent using 'hallmarks.attributes'

# Descriptions of hallmark attributes that are kept for the classification below.
oncotsg_list = ['TSG','oncogene','Oncogene','oncogene','oncogene,TSG','TSG,oncogene','fusion,oncogene','oncogene,fusion']

# One row per target ('target_id', 'approvedSymbol') with:
#   description         - set of matching hallmark descriptions
#   description_splited - the same set joined with ","
#   TSorOncogene        - 'bivalent' | 'oncogene' | 'TSG' | 'noEvaluable'
oncolabel = (
    target
    # explode_outer without an alias names the generated struct column 'col'.
    .select('id', 'approvedSymbol', F.explode_outer(F.col('hallmarks.attributes')))
    .select('id', 'approvedSymbol', 'col.description')
    .filter(F.col('description').isin(oncotsg_list))
    .groupBy('id', 'approvedSymbol')
    .agg(F.collect_set('description').alias('description'))
    .withColumn('description_splited', F.concat_ws(",", F.col('description')))
    .withColumn(
        'TSorOncogene',
        # Both roles mentioned anywhere in the joined text -> bivalent.
        F.when(
            F.col('description_splited').rlike('ncogene') & F.col('description_splited').rlike('TSG'),
            F.lit('bivalent'),
        )
        # Raw strings: '\s' inside a normal string literal is an invalid escape sequence in Python.
        # NOTE(review): these patterns only match when the keyword is followed by whitespace or the
        # end of the string, so a joined text such as 'oncogene,fusion' ends up as 'noEvaluable';
        # confirm that this is intended.
        .when(F.col('description_splited').rlike(r'ncogene(\s|$)'), F.lit('oncogene'))
        .when(F.col('description_splited').rlike(r'TSG(\s|$)'), F.lit('TSG'))
        .otherwise(F.lit('noEvaluable')),
    )
    .withColumnRenamed('id', 'target_id')
)

#### Generic pattern for keyword matching: rlike('(' + Keywords + r')(\s|$)')



### Action types, joined to the ChEMBL evidence further down to obtain the mechanism of action.

# Action types labelled as 'inhibitor'.
inhibitors = [
    'RNAI INHIBITOR',
    'NEGATIVE MODULATOR',
    'NEGATIVE ALLOSTERIC MODULATOR',
    'ANTAGONIST',
    'ANTISENSE INHIBITOR',
    'BLOCKER',
    'INHIBITOR',
    'DEGRADER',
    'INVERSE AGONIST',
    'ALLOSTERIC ANTAGONIST',
]

# Action types labelled as 'activator'.
activators = [
    'PARTIAL AGONIST',
    'ACTIVATOR',
    'POSITIVE ALLOSTERIC MODULATOR',
    'POSITIVE MODULATOR',
    'AGONIST',
    'SEQUESTERING AGENT',
]

# Names of the two pivoted count columns.
columnas = ['activator', 'inhibitor']
# Every evaluable action type: activators first, then inhibitors.
both = [*activators, *inhibitors]

# Flatten mechanism-of-action rows to one row per (target, drug) combination:
# first explode the drug ids, then the targets, then remove exact duplicates.
_moa_per_drug = mecact.select(
    F.explode_outer('chemblIds').alias('drugId2'),
    'actionType',
    'mechanismOfAction',
    'targets',
)
_moa_per_drug_and_target = _moa_per_drug.select(
    F.explode_outer('targets').alias('targetId2'),
    'drugId2',
    'actionType',
    'mechanismOfAction',
)
actiontype2 = _moa_per_drug_and_target.dropDuplicates()


# ChEMBL evidence reduced to the columns needed here.
chembl1 = chembl.select('targetId', 'drugId', 'diseaseId', 'clinicalPhase', 'diseaseFromSourceId')

# Attach the action type of every (drug, target) pair and bucket it into
# 'inhibitor' / 'activator' / 'noEvaluable'.
chembl2 = (
    chembl1
    .join(
        actiontype2,
        (actiontype2.drugId2 == F.col('drugId')) & (actiontype2.targetId2 == F.col('targetId')),
        'left',
    )
    .drop('targetId2', 'drugId2')
    ###.dropDuplicates()
    .withColumn(
        'twoCategories_new',
        F.when(F.col('actionType').isin(inhibitors), F.lit('inhibitor'))
        .when(F.col('actionType').isin(activators), F.lit('activator'))
        .otherwise(F.lit('noEvaluable')),
    )
)

# Count evaluable rows per target-disease pair, one column per category.
# The pivot values are passed explicitly (`columnas`), so both count columns always exist, even
# when one category is absent from the data, and Spark does not need an extra pass to discover
# the distinct values. The column order matches the implicit (sorted) order used before.
chembl3 = (
    chembl2
    .filter(F.col('twoCategories_new') != 'noEvaluable')
    .groupBy('targetId', 'diseaseId')
    .pivot('twoCategories_new', columnas)
    .agg(F.count('targetId'))
)

# 'coherency' is 'dispar' when a pair has both activator and inhibitor rows, null otherwise.
chembl4 = (
    chembl3
    .select(
        'targetId',
        'diseaseId',
        ##'clinicalPhase',
        *(F.col(c).cast("int").alias(c) for c in columnas),
    )
    .withColumn(
        'coherency',
        F.when(
            F.col('activator').isNotNull() & F.col('inhibitor').isNotNull(),
            # (activator - inhibitor) != activator  <=>  inhibitor != 0
            F.when(
                (F.col('activator') - F.col('inhibitor')) != F.col('activator'),
                F.lit('dispar'),
            ),
        ),
    )
)

In [None]:
### Stack all evidence datasets into one DataFrame.
# Columns are matched by name; columns missing from a dataset are filled with nulls.
# NOTE(review): the result is named `all`, which shadows Python's builtin `all()`. The name is
# kept because later cells read it.

dfs = [
    otgenetics,
    gene_burden,
    eva_germline,
    eva_somatic,
    g2p,
    orphanet,
    cgc,
    intogen,
    impc,
    chembl,
]

all = dfs[0]
for df in dfs[1:]:
    all = all.unionByName(df, allowMissingColumns=True)

# Displayed as the cell output (forces a full evaluation of the union).
all.count()

In [None]:
#### 20230203 ### 
# Build `prueba_assessment`: the stacked evidence (`all`) plus, per row,
#   Assessment  - direction-of-effect label chosen by a per-datasource rule set
#                 (values written below: GoF_risk, GoF_protect, LoF_risk, LoF_protect, KO_risk,
#                 bivalent_risk, dispar, noEvaluable; null when no branch assigns a label)
#   homogenized - Assessment with 'KO_risk' mapped to 'LoF_risk'
#   tendency    - 'Risk' / 'Protect' / 'noEvaluable', from the homogenized label
#   variation   - 'LoF' / 'GoF' / 'noEvaluable', from the homogenized label
# NOTE(review): explode_outer on 'mutatedSamples' yields one output row per array element, so
# evidence rows with several mutatedSamples entries are repeated; confirm that the downstream
# counts expect this.
# NOTE(review): the left join with actiontype2 repeats a row once per matching action type; verify.
prueba_assessment=(all
.withColumn("beta",F.col("beta").cast('float')) ## from ot genetics & gene burden
.withColumn("OddsRatio",F.col("OddsRatio").cast('float')) ## from ot genetics & gene burden
.withColumn('clinicalSignificances',F.concat_ws(",",F.col("clinicalSignificances"))) ### from eva
.withColumn('exploded',F.explode_outer(F.col('mutatedSamples'))) ### for cgc and intogen
.withColumn('variantConsequence',F.col('exploded.functionalConsequenceId'))### for cgc and intogen
### .withColumn('numberSamplesSameMutationType',F.col('exploded.numberSamplesWithMutationType'))### for cgc and intogen
.withColumn("mutatedSamplesVariantInfo", F.coalesce(F.col('mutatedSamples.functionalConsequenceId'), F.array()))### for cgc and intogen

.join(oncolabel, 
oncolabel.target_id==F.col('targetId'), 'left')### for cgc 
.join(burden_lof, 
burden_lof.stMethod == F.col('statisticalMethodOverview'), 'left') ### for gene burden
.join(actiontype2, ## for chembl
    (actiontype2.drugId2==F.col('drugId')) & 
    (actiontype2.targetId2==F.col('targetId')),
 'left')
##.drop('targetId2','drugId2')
###.dropDuplicates()

.withColumn('Assessment',

#### Ot_genetics Portal ### updated to include the coloc+gwas analysis 
# Within each matched branch the direction comes from OddsRatio (>1 risk, <1 protect) when beta
# is null, or from beta (>0 risk, <0 protect) when the odds ratio is null; a matched branch whose
# inner test does not fire yields null.
        F.when(F.col('datasourceId')=='ot_genetics_portal',

                F.when(  ### label 14 evidences that are contradictory
                        (                       
                        (F.col('variantFunctionalConsequenceFromQtlId')=='SO_0002315') &
                        (F.col('variantFunctionalConsequenceId').isin(var_filter_lof)) 
                        )
                        ,
                        F.lit('dispar'))
                ### evidences with gwas+coloc increased expression without +var_lof
                .when(  
                        (
                        (F.col('beta').isNull()) &
                        (F.col('variantFunctionalConsequenceFromQtlId')=='SO_0002315') &
                        (F.col('variantFunctionalConsequenceId').isin(var_filter_lof) == False)
                        ),
                                F.when((F.col('OddsRatio') >1), F.lit('GoF_risk'))
                                 .when((F.col('OddsRatio') <1) , F.lit('GoF_protect'))                 
                        )
                .when(
                        (
                        (F.col('oddsRatio').isNull()) &
                        (F.col('variantFunctionalConsequenceFromQtlId')=='SO_0002315') &
                        (F.col('variantFunctionalConsequenceId').isin(var_filter_lof) == False)
                        ),
                                F.when((F.col('beta') <0), F.lit('GoF_protect'))
                                .when((F.col('beta') >0), F.lit('GoF_risk'))
                        ) 


                ### evidences with coherent Gwas-coloc + var_lof
                .when(  
                        (
                        (F.col('beta').isNull()) &
                        (F.col('variantFunctionalConsequenceFromQtlId')=='SO_0002316') &
                        (F.col('variantFunctionalConsequenceId').isin(var_filter_lof))
                        ),
                                F.when((F.col('OddsRatio') >1), F.lit('LoF_risk'))
                                 .when((F.col('OddsRatio') <1) , F.lit('LoF_protect'))                 
                        )
                .when(
                        (
                        (F.col('oddsRatio').isNull()) &
                        (F.col('variantFunctionalConsequenceFromQtlId')=='SO_0002316') &
                        (F.col('variantFunctionalConsequenceId').isin(var_filter_lof))
                        ),
                                F.when((F.col('beta') <0), F.lit('LoF_protect'))
                                .when((F.col('beta') >0), F.lit('LoF_risk'))
                        ) 
                ### evidences with coloc+GWAS data but no LoF variants
                .when(  
                        (
                        (F.col('beta').isNull()) &
                        (F.col('variantFunctionalConsequenceFromQtlId')=='SO_0002316') &
                        (F.col('variantFunctionalConsequenceId').isin(var_filter_lof) == False)
                        ),
                                F.when((F.col('OddsRatio') >1), F.lit('LoF_risk'))
                                 .when((F.col('OddsRatio') <1) , F.lit('LoF_protect'))                 
                        )
                .when(
                        (
                        (F.col('oddsRatio').isNull()) &
                        (F.col('variantFunctionalConsequenceFromQtlId')=='SO_0002316') &
                        (F.col('variantFunctionalConsequenceId').isin(var_filter_lof) == False)
                        ),
                                F.when((F.col('beta') <0), F.lit('LoF_protect'))
                                .when((F.col('beta') >0), F.lit('LoF_risk'))
                        ) 
                ### evidences with coherent non/inconclusive gwas+coloc + var_lof
                .when(  
                        (
                        (F.col('beta').isNull()) &
                                (
                                (F.col('variantFunctionalConsequenceFromQtlId')=='SO_0002314') |
                                (F.col('variantFunctionalConsequenceFromQtlId').isNull())
                                ) &
                        (F.col('variantFunctionalConsequenceId').isin(var_filter_lof))
                        ),
                                F.when((F.col('OddsRatio') >1), F.lit('LoF_risk'))
                                 .when((F.col('OddsRatio') <1) , F.lit('LoF_protect'))                 
                        )
                .when(  
                        (
                        (F.col('oddsRatio').isNull()) &
                                (
                                (F.col('variantFunctionalConsequenceFromQtlId')=='SO_0002314') |
                                (F.col('variantFunctionalConsequenceFromQtlId').isNull())
                                ) &
                        (F.col('variantFunctionalConsequenceId').isin(var_filter_lof))
                        ),
                                F.when((F.col('beta') <0), F.lit('LoF_protect'))
                                .when((F.col('beta') >0), F.lit('LoF_risk'))
                        ) 
                
                .otherwise(F.lit('noEvaluable')))  ### evidence with a risk direction but without a LoF variant also ends up here

#### Gene burden                            
# Only studies whose manual annotation column 'whatToDo' equals 'get' are labelled.
        .when(F.col('datasourceId')=='gene_burden',
        ### .filter(F.col('variantType').isin(var_filter))
                F.when(
                        (
                        (F.col('whatToDo')=='get')&
                        (F.col('beta').isNull()) &  
                        (F.col('OddsRatio') > 1)
                        ), 
                        F.lit('LoF_risk')
                        )
                .when(
                        (
                        (F.col('whatToDo')=='get')&
                        (F.col('beta').isNull()) & 
                        (F.col('OddsRatio') <1) 
                        ),
                        F.lit('LoF_protect')
                )
                .when(
                        (
                        (F.col('whatToDo')=='get')&
                        (F.col('OddsRatio').isNull()) & 
                        (F.col('beta') >0)
                        ), 
                        F.lit('LoF_risk')
                )
                .when(
                        (
                        (F.col('whatToDo')=='get')&
                        (F.col('OddsRatio').isNull()) & 
                        (F.col('beta') <0) 
                        ),
                        F.lit('LoF_protect')
                )
                .otherwise(F.lit('noEvaluable')) ### studies with a risk direction that do not test LoF or PTV also end up here 
                        )
#### Eva_germline                        
# '(pathogenic)$' requires the comma-joined significance text to END with 'pathogenic'.
        .when(F.col('datasourceId')=='eva',              
        #### .filter(F.col('variantFunctionalConsequenceId').isin(var_filter_lof))
                F.when(
                        (
                        ## (F.col('clinicalSignificances')!='likely pathogenic') &  
                        (F.col('variantFunctionalConsequenceId').isin(var_filter_lof)) &
                        F.col('clinicalSignificances').rlike('(pathogenic)$')
                        ),
                        F.lit('LoF_risk'))
                .when(
                        (
                        F.col('clinicalSignificances').contains('protective')&  
                        F.col('variantFunctionalConsequenceId').isin(var_filter_lof)),
                        F.lit('LoF_protect'))
                .otherwise(F.lit('noEvaluable')) ### everything that has information but is not pathogenic/protective + LoF
                        )
        #### Eva_somatic  
        .when(F.col('datasourceId')=='eva_somatic',              
        #### .filter(F.col('variantFunctionalConsequenceId').isin(var_filter_lof))
                F.when(
                        (
                        ##(F.col('clinicalSignificances')!='likely pathogenic') &  
                        (F.col('variantFunctionalConsequenceId').isin(var_filter_lof)) &
                        F.col('clinicalSignificances').rlike('(pathogenic)$')
                        ),
                        F.lit('LoF_risk'))
                .when(
                        (
                        F.col('clinicalSignificances').contains('protective')&  
                        F.col('variantFunctionalConsequenceId').isin(var_filter_lof)),
                        F.lit('LoF_protect'))
                .otherwise(F.lit('noEvaluable')) ### everything that has information but is not pathogenic/protective + LoF
                        )
#### G2P                     
        .when(F.col('datasourceId')=='gene2phenotype',  
                F.when(
                        F.col('variantFunctionalConsequenceId')=='SO_0002317', F.lit('LoF_risk')
                        ) ### absent gene product
                .when(
                        F.col('variantFunctionalConsequenceId')=='SO_0002315', F.lit('GoF_risk')
                        ) ### increased gene product level
                .otherwise(F.lit('noEvaluable')))
#### Orphanet 
        .when(F.col('datasourceId')=='orphanet',  
                F.when(
                        F.col('variantFunctionalConsequenceId')=='SO_0002054', F.lit('LoF_risk')
                        ) ### Loss of Function Variant
                .when(
                        F.col('variantFunctionalConsequenceId')=='SO_0002053', F.lit('GoF_risk')
                        ) ### Gain_of_Function Variant
                .otherwise(F.lit('noEvaluable')))
 #### CGC               
# The TSG/oncogene label joined from `oncolabel` decides first; otherwise the row is LoF_risk when
# any mutatedSamples consequence is in var_filter_lof.
        .when(F.col('datasourceId')=='cancer_gene_census',          
                F.when(
                        F.col('TSorOncogene')=='oncogene', F.lit('GoF_risk')
                )
                .when(
                        F.col('TSorOncogene')=='TSG', F.lit('LoF_risk')
                        )
                .when(
                        F.col('TSorOncogene')=='bivalent', F.lit('bivalent_risk'))

                .otherwise(
                        F.when(
                                F.arrays_overlap(
                                F.col('mutatedSamples.functionalConsequenceId'),
                                F.array([F.lit(i) for i in (var_filter_lof)])
                                ), F.lit('LoF_risk')
                                )
                        .otherwise(F.lit('noEvaluable'))
                        )
                ) #### Here we assume everything included confers risk, but only LoF can be assigned because there is no TSG/oncogene annotation
#### intogen
        .when(F.col('datasourceId')=='intogen',     
                F.when(
                        F.arrays_overlap(
                                F.col('mutatedSamples.functionalConsequenceId'),
                                F.array([F.lit(i) for i in (gof)])
                                ), F.lit('GoF_risk'))
                .when(
                        F.arrays_overlap(
                                F.col('mutatedSamples.functionalConsequenceId'),
                                F.array([F.lit(i) for i in (lof)])
                                ), F.lit('LoF_risk'))

                .otherwise(F.lit('noEvaluable'))
        )
#### impc        
        .when(F.col('datasourceId')=='impc', 
                F.when(
                        F.col('diseaseId').isNotNull(), F.lit('KO_risk')
                        )
                .otherwise(F.lit('noEvaluable')))
### chembl        
# Action types listed in `inhibitors` are labelled LoF_protect, those in `activators` GoF_protect.
        .when(F.col('datasourceId')=='chembl',
                F.when(
                        F.col('actionType').isin(inhibitors), F.lit('LoF_protect')
                        )
                .when(
                        F.col('actionType').isin(activators), F.lit('GoF_protect')
                )
                .otherwise(F.lit('noEvaluable'))
                )
)

### Homogenise the labels so that all data can be counted together:
.withColumn('homogenized',
    F.when(F.col('Assessment')=='KO_risk', F.lit('LoF_risk'))
    .otherwise(F.col('Assessment')))
.withColumn('tendency',
    F.when(F.col('homogenized').contains('risk'), F.lit('Risk'))
    .when(F.col('homogenized').contains('protect'), F.lit('Protect'))
    .otherwise(F.lit('noEvaluable')))
.withColumn('variation',
    F.when(F.col('homogenized').contains('LoF'), F.lit('LoF'))
    .when(F.col('homogenized').contains('GoF'), F.lit('GoF'))
    .otherwise(F.lit('noEvaluable')))
)

In [None]:
#### Checking contradictions within each datasource (03.02.2023) ####

# Assessment labels that carry no usable direction of effect. 'null' here is the literal string;
# real nulls are dropped by the filter anyway because the predicate evaluates to null for them.
terms = ['noEvaluable', 'bivalent_risk', 'null', 'dispar']

# The four homogenized labels, in the order used for the typed count columns below.
columns = [
    'GoF_risk',
    'LoF_protect',
    'LoF_risk',
    'GoF_protect',
]

# Evidence counts per (target, disease, datasource), one column per homogenized label.
# The pivot values are passed explicitly so that every label column exists even when a label is
# absent from the data, and Spark does not need an extra pass to discover the distinct values.
# sorted(columns) reproduces the column order of the implicit pivot.
coherency_toAssess = (
    prueba_assessment
    .filter(~F.col('Assessment').isin(terms))
    .groupBy('targetId', 'diseaseId', 'datasourceId')
    .pivot('homogenized', sorted(columns))
    .agg(F.count('targetId'))
)

# Label pairs that contradict each other when both have evidence, checked in this order.
_intra_conflicting_pairs = [
    ('GoF_risk', 'LoF_risk'),
    ('LoF_protect', 'LoF_risk'),
    ('GoF_protect', 'GoF_risk'),
    ('GoF_protect', 'LoF_protect'),
]

# Chained when(): the first pair with both counts present decides; no pair -> null.
_intra_coherency = None
for _first, _second in _intra_conflicting_pairs:
    _both_present = F.col(_first).isNotNull() & F.col(_second).isNotNull()
    # (first - second) != first  <=>  second != 0
    _flag = F.when((F.col(_first) - F.col(_second)) != F.col(_first), F.lit('dispar'))
    if _intra_coherency is None:
        _intra_coherency = F.when(_both_present, _flag)
    else:
        _intra_coherency = _intra_coherency.when(_both_present, _flag)

# Keys renamed with a '3' suffix (used by later joins), counts cast to int, plus 'coherency'
# ('dispar' or null).
coherency_assessed = (
    coherency_toAssess
    .select(
        F.col('targetId').alias('targetId3'),
        F.col('datasourceId').alias('datasourceId3'),
        F.col('diseaseId').alias('diseaseId3'),
        *(F.col(c).cast("int").alias(c) for c in columns),
    )
    .withColumn('coherency', _intra_coherency)
)


#### Build the dataset used to inspect disparities within a datasource

columnstoassess = prueba_assessment.drop('targetId', 'diseaseId', 'datasourceId').columns
terms = ['noEvaluable', 'bivalent_risk', 'null', 'dispar']

### Incoherencies dataset: add the disease name and approved symbol, and collect the evidence
### values that support each target-disease-datasource triple.

# Evidence rows that carry a usable direction of effect.
_intra_evaluable_evidence = prueba_assessment.filter(F.col('Assessment').isin(terms) == False)

# Keys of the triples flagged as 'dispar', renamed so they do not clash with the evidence columns.
_intra_dispar_keys = (
    coherency_assessed
    .filter(F.col('coherency') == 'dispar')
    .groupBy('datasourceId3', 'targetId3', 'diseaseId3')
    .agg(F.count('targetId3').alias('targetI'))
    .withColumnRenamed('targetId3', 'targetIdU')
    .withColumnRenamed('datasourceId3', 'datasourceI')
    .withColumnRenamed('diseaseId3', 'diseaseI')
)

# Bring back the underlying evidence rows and the disease name.
_intra_dispar_evidence = (
    _intra_dispar_keys
    .join(
        _intra_evaluable_evidence,
        (F.col('targetIdU') == prueba_assessment.targetId)
        & (F.col('datasourceI') == prueba_assessment.datasourceId)
        & (F.col('diseaseI') == prueba_assessment.diseaseId),
        'left',
    )
    .join(diseases.select('id', 'name',), F.col('diseaseId') == diseases.id, 'left')
)

# One row per triple with the supporting values collected into arrays.
_intra_dispar_summary = (
    _intra_dispar_evidence
    .groupBy('targetId', 'diseaseI', 'name', 'datasourceI')
    .agg(
        F.collect_list('clinicalSignificances').alias('clinicalSignificances'),
        F.collect_list('beta').alias('betaValues'),
        F.collect_list('OddsRatio').alias('oddsRatio'),
        F.collect_list('variantFunctionalConsequenceId').alias('varFunctConsId'),
        F.collect_list('drugId').alias('drugId'),
        F.collect_set('actionType').alias('actionTypeDif'),
        F.collect_list('diseaseFromSource'),
    )
    .withColumnRenamed('targetId', 'targetiddd')
    .withColumnRenamed('name', 'diseaseName')
)

intradatasource_disparities = (
    _intra_dispar_summary
    # Re-attach the per-label counts and the coherency flag (inner join).
    .join(
        coherency_assessed,
        (F.col('targetIddd') == coherency_assessed.targetId3)
        & (F.col('datasourceI') == coherency_assessed.datasourceId3)
        & (F.col('diseaseI') == coherency_assessed.diseaseId3),
    )
    ### add the approved symbol:
    .join(target.select('id', 'approvedSymbol'), F.col('targetiddd') == target.id, 'left')
)

In [None]:
### Include the calculation of overlapping
terms = ['noEvaluable', 'bivalent_risk', 'null', 'dispar']

#### Reminder of 'toprocess': evidence counts per target-disease pair (all datasources together),
#### one column per homogenized label, plus a 'filterOut' flag ('out' or 'keep').
# NOTE(review): `contradictIntra` is not defined in any earlier cell of this notebook, so this
# cell raises NameError on a fresh kernel (Restart & Run All). It must provide the columns
# 'targetId3', 'diseaseId3' and 'filterOut'; restore the cell that builds it.
# The pivot values are passed explicitly (in the order an implicit pivot would produce) so every
# label column exists even when a label is absent, and Spark skips the distinct-values pass.
toprocess = (
    prueba_assessment
    .filter(~F.col('Assessment').isin(terms))
    .groupBy('targetId', 'diseaseId')
    .pivot('homogenized', ['GoF_protect', 'GoF_risk', 'LoF_protect', 'LoF_risk'])
    .agg(F.count('targetId'))
    .join(
        contradictIntra.select('targetId3', 'diseaseId3', 'filterOut'),
        (F.col('targetId') == contradictIntra.targetId3)
        & (F.col('diseaseId') == contradictIntra.diseaseId3),
        'left',
    )
    # Pairs without a match in contradictIntra (or with any value other than 'out') become 'keep'.
    .withColumn(
        'filterOut',
        F.when(F.col('filterOut') == 'out', F.col('filterOut')).otherwise(F.lit('keep')),
    )
)
#####


# The four homogenized labels, in the order used for the typed count columns.
columns = ['GoF_risk', 'LoF_protect', 'LoF_risk', 'GoF_protect']

# (first label, second label, verdict) rules, checked in this order. A rule applies to a
# target-disease pair when both counts are present; the first applicable rule decides.
_inter_rules = [
    ('GoF_risk', 'LoF_risk', 'dispar'),
    ('LoF_protect', 'LoF_risk', 'dispar'),
    ('GoF_protect', 'GoF_risk', 'dispar'),
    ('GoF_protect', 'LoF_protect', 'dispar'),
    ('GoF_protect', 'LoF_risk', 'coherent'),
    ('LoF_protect', 'GoF_risk', 'coherent'),
]

_inter_coherency = None
for _first, _second, _verdict in _inter_rules:
    _both_present = F.col(_first).isNotNull() & F.col(_second).isNotNull()
    # (first - second) != first  <=>  second != 0
    _outcome = F.when((F.col(_first) - F.col(_second)) != F.col(_first), F.lit(_verdict))
    if _inter_coherency is None:
        _inter_coherency = F.when(_both_present, _outcome)
    else:
        _inter_coherency = _inter_coherency.when(_both_present, _outcome)
# Pairs to which no rule applies are labelled 'take'.
_inter_coherency = _inter_coherency.otherwise(F.lit('take'))

_inter_int_counts = [F.col(label).cast("int").alias(label) for label in columns]

# Per target-disease pair: the int counts, the 'filterOut' flag and the 'coherency' verdict.
# Rows flagged 'out' are kept here (the filter on 'filterOut' is intentionally not applied).
coherencyInter_assessed_wOut = (
    toprocess
    .select(F.col('targetId'), F.col('diseaseId'), F.col('filterOut'), *_inter_int_counts)
    .withColumn('coherency', _inter_coherency)
)

In [None]:
### Build the dataset of disparities across datasources

terms = ['noEvaluable', 'bivalent_risk', 'null', 'dispar']

### Incoherencies dataset: add the disease name and approved symbol, and collect the evidence
### values that support each target-disease pair.

# Evidence rows that carry a usable direction of effect.
_inter_evaluable_evidence = prueba_assessment.filter(F.col('Assessment').isin(terms) == False)

# Target-disease pairs flagged as 'dispar', renamed so they do not clash with the evidence columns.
_inter_dispar_keys = (
    coherencyInter_assessed_wOut
    .filter(F.col('coherency') == 'dispar')
    .groupBy('targetId', 'diseaseId')
    .agg(F.count('targetId').alias('targetI'))
    .withColumnRenamed('targetId', 'targetIdU')
    .withColumnRenamed('diseaseId', 'diseaseI')
)

# Bring back the underlying evidence rows (from every datasource) and the disease name.
_inter_dispar_evidence = (
    _inter_dispar_keys
    .join(
        _inter_evaluable_evidence,
        (F.col('targetIdU') == prueba_assessment.targetId)
        & (F.col('diseaseI') == prueba_assessment.diseaseId),
        'left',
    )
    .join(diseases.select('id', 'name',), F.col('diseaseId') == diseases.id, 'left')
)

# One row per pair with the supporting values collected into arrays.
_inter_dispar_summary = (
    _inter_dispar_evidence
    .groupBy('targetId', 'diseaseI', 'name')
    .agg(
        F.collect_list('clinicalSignificances').alias('clinicalSignificances'),
        F.collect_list('beta').alias('betaValues'),
        F.collect_list('OddsRatio').alias('oddsRatio'),
        F.collect_list('variantFunctionalConsequenceId').alias('varFunctConsId'),
        F.collect_list('drugId').alias('drugId'),
        F.collect_set('actionType').alias('actionTypeDif'),
        F.collect_set('datasourceId'),
        F.collect_list('diseaseFromSource'),
    )
    .withColumnRenamed('targetId', 'targetiddd')
    .withColumnRenamed('name', 'diseaseName')
)

interdatasource = (
    _inter_dispar_summary
    # Re-attach the per-label counts and the coherency verdict (inner join).
    .join(
        coherencyInter_assessed_wOut,
        (F.col('targetIddd') == coherencyInter_assessed_wOut.targetId)
        & (F.col('diseaseI') == coherencyInter_assessed_wOut.diseaseId),
    )
    ### add the approved symbol:
    .join(target.select('id', 'approvedSymbol'), F.col('targetiddd') == target.id, 'left')
)

## interdatasource.count()
### 13.548 counts

In [None]:
### Null counts cannot be used in the arithmetic below, so they are replaced by 0, but only
### AFTER 'minoritaryPercentage' is computed, so that comparisons against a null count fall
### through to the next label pair.

_lof_risk = F.col('LoF_risk')
_lof_protect = F.col('LoF_protect')
_gof_risk = F.col('GoF_risk')
_gof_protect = F.col('GoF_protect')

# Share (in %) of the smaller count within the first label pair for which both counts are present.
_minority_share = (
    # LoF_protect vs LoF_risk
    F.when(_lof_protect >= _lof_risk, (_lof_risk / (_lof_risk + _lof_protect)) * 100)
    .when(_lof_protect < _lof_risk, (_lof_protect / (_lof_risk + _lof_protect)) * 100)
    # GoF_protect vs GoF_risk
    .when(_gof_protect < _gof_risk, (_gof_protect / (_gof_protect + _gof_risk)) * 100)
    .when(_gof_protect >= _gof_risk, (_gof_risk / (_gof_protect + _gof_risk)) * 100)
    # GoF_risk vs LoF_risk
    .when(_gof_risk < _lof_risk, (_gof_risk / (_gof_risk + _lof_risk)) * 100)
    .when(_gof_risk >= _lof_risk, (_lof_risk / (_gof_risk + _lof_risk)) * 100)
    # GoF_protect vs LoF_protect
    .when(_gof_protect < _lof_protect, (_gof_protect / (_gof_protect + _lof_protect)) * 100)
    .when(_gof_protect >= _lof_protect, (_lof_protect / (_gof_protect + _lof_protect)) * 100)
)

interdatasourcePercentage = (
    interdatasource
    .withColumn('minoritaryPercentage', _minority_share)
    .fillna(value=0, subset=["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"])
    .withColumn('totalEvidences', F.expr("GoF_risk + LoF_protect + LoF_risk + GoF_protect"))
)
### .toPandas().to_csv('interdatasourcetest_TODAY.csv')


In [None]:
#### Keep the pairs labelled 'take' and study how their evidence is distributed

_take_count_columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]

countPairsEvidences = (
    coherencyInter_assessed_wOut
    .filter(F.col('coherency') == 'take')
    # Missing counts become 0 so that the sums below are never null.
    .fillna(value=0, subset=_take_count_columns)
    .withColumn('totalEvidences', F.expr("GoF_risk + LoF_protect + LoF_risk + GoF_protect"))
    # Same total without LoF_risk.
    .withColumn('totalEvidences_woLR', F.expr("GoF_risk + LoF_protect + GoF_protect"))
)

# Collected to the driver as a pandas DataFrame.
countPairsEvidences2 = countPairsEvidences.toPandas()

In [None]:

### Build the inputs for the matrix of coincidences between datasources.
from pyspark.sql.functions import monotonically_increasing_id


# Assessment labels that are excluded from the comparison.
terms = ['noEvaluable', 'bivalent_risk', 'null', 'dispar']

# One row per (target, disease, datasource) combination that survives the
# filter. The key columns are renamed so that they do not clash with
# 'targetId' / 'diseaseId' in the joins that use this frame.
tdds = (
    prueba_assessment
    .where(~F.col('Assessment').isin(terms))
    .groupBy('targetId', 'diseaseId', 'datasourceId')
    .agg(F.count('targetId'))
    .select(
        F.col('targetId').alias('targetIddd'),
        F.col('diseaseId').alias('diseaseIdddd'),
        F.col('datasourceId'),
    )
)

# One row per (target, disease) pair, tagged with a unique id 'nr', then
# expanded to one row per datasource in which the pair appears.
# NOTE(review): monotonically_increasing_id() is documented by Spark as
# non-deterministic; 'nr' is only meaningful within a single evaluation of
# this plan — consider persisting before reusing it in a self join.
analysis = (
    prueba_assessment
    .where(~F.col('Assessment').isin(terms))
    .groupBy('targetId', 'diseaseId')
    .agg(F.count('targetId'))
    .withColumn('nr', monotonically_increasing_id())
    .join(
        tdds,
        (F.col('targetId') == tdds.targetIddd) & (F.col('diseaseId') == tdds.diseaseIdddd),
        'left',
    )
)

# Keep only the columns used to build the matrix.
analysisdo = analysis.select('targetId', 'diseaseId', 'nr', 'datasourceId')

### Step 1. Collect, for every datasource, the set of pair ids ('nr') found in it.
## (in the user/item wording of this recipe: user_id = nr, item_id = datasourceId)
df_collect = (
    analysisdo
    .select("nr", "datasourceId")
    .groupBy("datasourceId")
    .agg(F.collect_set("nr").alias("nrs"))
)

### Step 2. Pair every datasource with every datasource (itself included).
# A join without a condition is a cartesian product; crossJoin states that
# explicitly and does not depend on the spark.sql.crossJoin.enabled setting.
df_crossjoin = df_collect.crossJoin(
    df_collect
    .withColumnRenamed("datasourceId", "datasourceId_y")
    .withColumnRenamed("nrs", "nrs_y")
)

### Step 3. Count the ids in the union and in the intersection of each pair of sets.
df_ui = (
    df_crossjoin
    .withColumn("nrs_union", F.size(F.array_union("nrs", "nrs_y")))
    .withColumn("nrs_intersect", F.size(F.array_intersect("nrs", "nrs_y")))
)

### Step 4. Pivot into datasource x datasource matrices.
# Each (datasourceId, datasourceId_y) cell holds a single row, so first()
# simply picks that value.
df_matrix_union = (
    df_ui
    .groupBy("datasourceId")
    .pivot("datasourceId_y")
    .agg(F.first("nrs_union"))
    .orderBy("datasourceId")
)

df_matrix_intrsct = (
    df_ui
    .groupBy("datasourceId")
    .pivot("datasourceId_y")
    .agg(F.first("nrs_intersect"))
    .orderBy("datasourceId")
)

In [None]:
### Matrix of contradictions per datasource and intradatasource
###
######
## To build the matrix of coincidences between datasources we need the pairs
## labelled 'coherent' and 'dispar'.

## 1) Coherency labels to keep.
toget = ['dispar', 'coherent']

## 2) Per target-disease pair: count of rows for each 'homogenized' value,
##    plus the 'filterOut' flag taken from contradictIntra.
##    This must be defined before 'testeado' below, which joins on it.
toprocess = (
    prueba_assessment
    .filter(~F.col('Assessment').isin(terms))
    .groupBy('targetId', 'diseaseId')
    .pivot('homogenized')
    .agg(F.count('targetId'))
    .join(
        contradictIntra.select('targetId3', 'diseaseId3', 'filterOut'),
        (F.col('targetId') == contradictIntra.targetId3)
        & (F.col('diseaseId') == contradictIntra.diseaseId3),
        'left',
    )
    # Pairs without an 'out' flag (including pairs with no match in
    # contradictIntra) are labelled 'keep'.
    .withColumn(
        'filterOut2',
        F.when(F.col('filterOut') == 'out', F.col('filterOut')).otherwise(F.lit('keep')),
    )
    .drop('filterOut')
)

## 3) Target-disease pairs labelled dispar/coherent; the key columns are
##    renamed so they do not clash with those of 'toprocess' in the join.
pairs_analysis = (
    coherencyInter_assessed_wOut
    .filter(F.col('coherency').isin(toget))
    .withColumnRenamed('targetId', 'targetId2')
    .withColumnRenamed('diseaseId', 'diseaseId2')
)

# Right join: every row of pairs_analysis is kept, matched or not.
testeado = toprocess.join(
    pairs_analysis,
    (pairs_analysis.targetId2 == F.col('targetId'))
    & (pairs_analysis.diseaseId2 == F.col('diseaseId')),
    'right',
)

## 4) Select the pairs for the matrix, give each a unique id 'nr' and expand
##    to one row per datasource in which the pair appears.
# NOTE(review): 'toprocess' drops its own 'filterOut' in favour of
# 'filterOut2', so the 'filterOut' tested here has to come from
# coherencyInter_assessed_wOut — confirm this is the intended column.
# NOTE(review): monotonically_increasing_id() is documented by Spark as
# non-deterministic; consider persisting before 'nr' is reused in a self join.
allcontradict_coherent = (
    testeado
    .filter(
        (F.col('coherency') == 'dispar') & (F.col('filterOut') == 'out')
        | (F.col('coherency') == 'dispar') & (F.col('filterOut') == 'keep')
        | (F.col('coherency') == 'coherent') & (F.col('filterOut') == 'keep')
    )
    .select('targetId', 'diseaseId')
    .withColumn('nr', monotonically_increasing_id())
    .join(
        tdds,
        (F.col('targetId') == tdds.targetIddd) & (F.col('diseaseId') == tdds.diseaseIdddd),
        'left',
    )
    .select('targetId', 'diseaseId', 'nr', 'datasourceId')
)


#### make the pipeline 



### Step 1. Collect, for every datasource, the set of pair ids ('nr') found in it.
## (in the user/item wording of this recipe: user_id = nr, item_id = datasourceId)
df_collect = (
    allcontradict_coherent
    .select("nr", "datasourceId")
    .groupBy("datasourceId")
    .agg(F.collect_set("nr").alias("nrs"))
)

### Step 2. Pair every datasource with every datasource (itself included).
# A join without a condition is a cartesian product; crossJoin states that
# explicitly and does not depend on the spark.sql.crossJoin.enabled setting.
df_crossjoin = df_collect.crossJoin(
    df_collect
    .withColumnRenamed("datasourceId", "datasourceId_y")
    .withColumnRenamed("nrs", "nrs_y")
)

### Step 3. Count the ids in the union and in the intersection of each pair of sets.
df_ui = (
    df_crossjoin
    .withColumn("nrs_union", F.size(F.array_union("nrs", "nrs_y")))
    .withColumn("nrs_intersect", F.size(F.array_intersect("nrs", "nrs_y")))
)

### Step 4. Pivot into datasource x datasource matrices.
# Each (datasourceId, datasourceId_y) cell holds a single row, so first()
# simply picks that value.
df_matrix_union = (
    df_ui
    .groupBy("datasourceId")
    .pivot("datasourceId_y")
    .agg(F.first("nrs_union"))
    .orderBy("datasourceId")
)

df_matrix_intrsct = (
    df_ui
    .groupBy("datasourceId")
    .pivot("datasourceId_y")
    .agg(F.first("nrs_intersect"))
    .orderBy("datasourceId")
)

In [None]:
### Matrix of contradictions per datasource and intradatasource <<removing COHERENT>>

# Only 'dispar' pairs are selected, with filterOut either 'out' or 'keep';
# 'coherent' pairs are not selected in this variant.
# NOTE(review): monotonically_increasing_id() is documented by Spark as
# non-deterministic; consider persisting before 'nr' is reused in a self join.
allcontradict_WOcoherent = (
    testeado
    .where(
        (F.col('coherency') == 'dispar')
        & F.col('filterOut').isin('out', 'keep')
    )
    .select('targetId', 'diseaseId')
    # Unique id per selected pair.
    .withColumn('nr', monotonically_increasing_id())
    # One row per datasource in which the pair appears.
    .join(
        tdds,
        (F.col('targetId') == tdds.targetIddd) & (F.col('diseaseId') == tdds.diseaseIdddd),
        'left',
    )
    .select('targetId', 'diseaseId', 'nr', 'datasourceId')
)


#### make the pipeline 

### Step 1. Collect, for every datasource, the set of pair ids ('nr') found in it.
## (in the user/item wording of this recipe: user_id = nr, item_id = datasourceId)
df_collect = (
    allcontradict_WOcoherent
    .select("nr", "datasourceId")
    .groupBy("datasourceId")
    .agg(F.collect_set("nr").alias("nrs"))
)

### Step 2. Pair every datasource with every datasource (itself included).
# A join without a condition is a cartesian product; crossJoin states that
# explicitly and does not depend on the spark.sql.crossJoin.enabled setting.
df_crossjoin = df_collect.crossJoin(
    df_collect
    .withColumnRenamed("datasourceId", "datasourceId_y")
    .withColumnRenamed("nrs", "nrs_y")
)

### Step 3. Count the ids in the union and in the intersection of each pair of sets.
df_ui = (
    df_crossjoin
    .withColumn("nrs_union", F.size(F.array_union("nrs", "nrs_y")))
    .withColumn("nrs_intersect", F.size(F.array_intersect("nrs", "nrs_y")))
)

### Step 4. Pivot into datasource x datasource matrices.
# Each (datasourceId, datasourceId_y) cell holds a single row, so first()
# simply picks that value.
df_matrix_union = (
    df_ui
    .groupBy("datasourceId")
    .pivot("datasourceId_y")
    .agg(F.first("nrs_union"))
    .orderBy("datasourceId")
)

df_matrix_intrsct_alldispar = (
    df_ui
    .groupBy("datasourceId")
    .pivot("datasourceId_y")
    .agg(F.first("nrs_intersect"))
    .orderBy("datasourceId")
)

In [None]:
### Matrix of ONLY the contradictions between datasources <<removing COHERENT and INTRAdatasource>>

# Only 'dispar' pairs whose filterOut is 'keep' are selected; 'dispar' pairs
# flagged 'out' and all 'coherent' pairs are not selected in this variant.
# NOTE(review): presumably filterOut == 'out' marks intra-datasource
# contradictions — confirm against where the flag is assigned.
# NOTE(review): monotonically_increasing_id() is documented by Spark as
# non-deterministic; consider persisting before 'nr' is reused in a self join.
allcontradict_WOcoherentWOintradata = (
    testeado
    .where(F.col('coherency') == 'dispar')
    .where(F.col('filterOut') == 'keep')
    .select('targetId', 'diseaseId')
    # Unique id per selected pair.
    .withColumn('nr', monotonically_increasing_id())
    # One row per datasource in which the pair appears.
    .join(
        tdds,
        (F.col('targetId') == tdds.targetIddd) & (F.col('diseaseId') == tdds.diseaseIdddd),
        'left',
    )
    .select('targetId', 'diseaseId', 'nr', 'datasourceId')
)


#### make the pipeline 



### Step 1. Collect, for every datasource, the set of pair ids ('nr') found in it.
## (in the user/item wording of this recipe: user_id = nr, item_id = datasourceId)
df_collect = (
    allcontradict_WOcoherentWOintradata
    .select("nr", "datasourceId")
    .groupBy("datasourceId")
    .agg(F.collect_set("nr").alias("nrs"))
)

### Step 2. Pair every datasource with every datasource (itself included).
# A join without a condition is a cartesian product; crossJoin states that
# explicitly and does not depend on the spark.sql.crossJoin.enabled setting.
df_crossjoin = df_collect.crossJoin(
    df_collect
    .withColumnRenamed("datasourceId", "datasourceId_y")
    .withColumnRenamed("nrs", "nrs_y")
)

### Step 3. Count the ids in the union and in the intersection of each pair of sets.
df_ui = (
    df_crossjoin
    .withColumn("nrs_union", F.size(F.array_union("nrs", "nrs_y")))
    .withColumn("nrs_intersect", F.size(F.array_intersect("nrs", "nrs_y")))
)

### Step 4. Pivot into datasource x datasource matrices.
# Each (datasourceId, datasourceId_y) cell holds a single row, so first()
# simply picks that value.
df_matrix_union = (
    df_ui
    .groupBy("datasourceId")
    .pivot("datasourceId_y")
    .agg(F.first("nrs_union"))
    .orderBy("datasourceId")
)

df_matrix_intrsct_alldispar_WOintradata = (
    df_ui
    .groupBy("datasourceId")
    .pivot("datasourceId_y")
    .agg(F.first("nrs_intersect"))
    .orderBy("datasourceId")
)