## First try with Pyspark!
### 1. Read datasets : TARGET from '/pub/databases/opentargets/platform/21.11/output/etl/parquet'
#### See parquet file manipulation with pyspark [here](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html)

**Apache Parquet file is a columnar storage format available to any project in the Hadoop ecosystem, regardless of the choice of data processing framework, data model, or programming language.**

In [9]:
import pyspark
from pyspark.sql.functions import (
    col, udf, struct, lit, split, regexp_replace, create_map, min as spark_min, max as spark_max,
    count, sum as spar_sum, explode, when, concat, lower, format_string
)

from pyspark.sql.types import (
    FloatType, ArrayType, StructType, StructField, StringType, IntegerType, TimestampType
)

from pyspark.sql import SparkSession, Row
from pyspark.conf import SparkConf
from itertools import chain
from pyspark import SparkFiles

# for heatmap:
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt


In [1]:
# establish spark connection

# SparkConf is used to specify the configuration of your Spark application. This is used to set Spark application parameters as key-value pairs.
sparkConf = (
    SparkConf()
    .set('spark.driver.memory', '15g')
    .set('spark.executor.memory', '15g')
    .set('spark.driver.maxResultSize', '0')
)

# 
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .master('local[*]')
    .getOrCreate()
)

NameError: name 'SparkConf' is not defined

In [11]:
# Read in target parquet file
file_types = ["part-00000", "part-00001", "part-00002", "part-00003"]
paths = {
    'test': '/Users/marinegirardey/Documents/opentarget_internship/targets/{}-ad8db45e-239a-4036-88a1-012033909e5a-c000.snappy.parquet'
}
for file_type in file_types:
    print(f'looking at the schema of {file_type}')
    for key, value in paths.items():
        print(f'Opening {key} dataset:')
        print(spark.read.parquet(value.format(file_type), header=True).printSchema())

looking at the schema of part-00000
Opening test dataset:


                                                                                

root
 |-- id: string (nullable = true)
 |-- approvedSymbol: string (nullable = true)
 |-- biotype: string (nullable = true)
 |-- transcriptIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genomicLocation: struct (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: integer (nullable = true)
 |-- alternativeGenes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- approvedName: string (nullable = true)
 |-- go: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |    |    |-- evidence: string (nullable = true)
 |    |    |-- aspect: string (nullable = true)
 |    |    |-- geneProduct: string (nullable = true)
 |    |    |-- ecoId: string (nullable = true)
 |-- hallmarks: struct (nullable = true)
 |    |-- attribu

In [12]:
# Read one file
df = (
    spark.read.parquet('/Users/marinegirardey/Documents/opentarget_internship/targets/part-00000-ad8db45e-239a-4036-88a1-012033909e5a-c000.snappy.parquet', header=True)
    .persist()
)
# Show the file in a meaningful way
print(df.show(1, vertical=True, truncate=False)) # Vertical : to organise data in a meaningful way, truncate : to have access to all the content of a line
print(type(df))

22/01/17 15:53:45 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 5:>                                                          (0 + 1) / 1]

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id                   | ENSG00000020219                                                                                                                                                                                                                       
 approvedSymbol       | CCT8L1P                                                                                                                                                                                                                               
 biotype              | processed_pseudogene                                                                                                                                                                                               

                                                                                

### 2. Filter the dataset

#### DataFrame filter() with Column Condition

In [13]:
# Exemple
df.filter(df.state == "OH").show(truncate=False)

AttributeError: 'DataFrame' object has no attribute 'state'

#### DataFrame filter() with SQL Expression

In [None]:
# Exemple
df.filter("gender == 'M'").show()

#### PySpark Filter with Multiple Conditions

In [None]:
# Exemple
df.filter( (df.state  == "OH") & (df.gender  == "M") ).show(truncate=False)

See more [here](https://sparkbyexamples.com/pyspark/pyspark-where-filter/)

**distinct()** : Used to drop/remove the duplicate rows (all columns) from DataFrame

**isNotNull()** : True if the current expression is NOT null.

**show()** : Prints the first n rows to the console.

In [14]:
(
    df
    .filter(col('id').isNotNull())
    .distinct()
    .show()
)

[Stage 6:>                                                          (0 + 1) / 1]

+---------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+------------------+--------------------+--------------------+----+--------------------+--------------------+--------------+--------------------+--------------------+-----------------+--------------------+
|             id| approvedSymbol|             biotype|       transcriptIds|     genomicLocation|    alternativeGenes|        approvedName|                  go|           hallmarks|            synonyms|      symbolSynonyms|        nameSynonyms|functionDescriptions|subcellularLocations|targetClass|   obsoleteSymbols|       obsoleteNames|          constraint| tep|          proteinIds|             dbXrefs|chemicalProbes|          homologues|        tractability|safetyLiabilities|            pathwa

                                                                                

In [15]:
(
    df
    .filter(col('id').isNotNull())
    .select(col('id'))
    .distinct()
#     .count()
    .show(truncate=False)
)

+---------------+
|id             |
+---------------+
|ENSG00000020219|
|ENSG00000059588|
|ENSG00000070182|
|ENSG00000070366|
|ENSG00000072071|
|ENSG00000073536|
|ENSG00000075290|
|ENSG00000083454|
|ENSG00000083782|
|ENSG00000086200|
|ENSG00000087087|
|ENSG00000087502|
|ENSG00000092201|
|ENSG00000100567|
|ENSG00000101198|
|ENSG00000102078|
|ENSG00000103544|
|ENSG00000105707|
|ENSG00000105993|
|ENSG00000106006|
+---------------+
only showing top 20 rows



In [16]:
paths = {
    'target_file': '/Users/marinegirardey/Documents/opentarget_internship/targets/part-00000-ad8db45e-239a-4036-88a1-012033909e5a-c000.snappy.parquet',
}
file_types = ["CancerG2P", "DDG2P", "EyeG2P", "SkinG2P"]


gene2phenotype_schema = (
    StructType()
    .add('id', StringType())
    .add('approvedSymbol', IntegerType())
    .add('biotype', StringType())
    .add('transcriptIds', StringType())
    .add('genomicLocation', StringType())
    .add('alternativeGenes', StringType())
    .add('approvedName', StringType())
    .add('go', StringType())
)

full_df = (
    spark.read.parquet(
        [paths['target_file'].format(file_type) for file_type in file_types], 
        schema=gene2phenotype_schema, enforceSchema=True, header=True
    )
    .persist()
)

print(full_df.count())
full_df.printSchema()     

Py4JJavaError: An error occurred while calling o77.parquet.
: java.lang.ClassCastException: java.util.ArrayList cannot be cast to java.lang.String
	at org.apache.spark.sql.internal.SessionState.$anonfun$newHadoopConfWithOptions$1(SessionState.scala:105)
	at org.apache.spark.sql.internal.SessionState.$anonfun$newHadoopConfWithOptions$1$adapted(SessionState.scala:105)
	at scala.collection.immutable.Map$Map1.foreach(Map.scala:193)
	at org.apache.spark.sql.internal.SessionState.newHadoopConfWithOptions(SessionState.scala:105)
	at org.apache.spark.sql.execution.datasources.DataSource.newHadoopConfiguration(DataSource.scala:116)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:369)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:274)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:245)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:245)
	at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:596)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)


In [17]:
(
    full_df
    .filter(col('id').isNotNull())
    .select('id')
#     .distinct()
#     .show(truncate=False)
    .count()
#     .show(1, vertical=True, truncate=False)
)

NameError: name 'full_df' is not defined

22/01/17 18:13:13 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 131931 ms exceeds timeout 120000 ms
22/01/17 18:13:13 WARN SparkContext: Killing executors is not supported by current scheduler.


In [None]:
df = spark.createDataFrame([(5, "hello")], ['a', 'b'])
df

DataFrame[a: bigint, b: string]

In [None]:
df.select(format_string('%d %s', df.a, df.b).alias('v')).collect()
[Row(v='5 hello')]

In [None]:
00001 + 2

SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (682028953.py, line 1)