# Observations

In [2]:
import findspark
findspark.init()
findspark.find()

'C:\\Users\\Mirna Elizondo\\anaconda3\\lib\\site-packages\\pyspark'

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ConditionFeatures').getOrCreate()

In [23]:
from pyspark.ml.feature import VectorSlicer, VectorAssembler, ChiSqSelector, VectorIndexer, UnivariateFeatureSelector, VarianceThresholdSelector
from pyspark.sql.functions import *
import numpy as np
from pyspark.sql.types import IntegerType

### Reading and Merging Data

In [17]:
df = spark.read.option("header",True).csv('../../synthea-sample-data/data/10k_synthea_covid19_csv/observations.csv').select('PATIENT','Code', 'Description')
deathDf = spark.read.option("header",True).csv('../../synthea-sample-data/data/10k_synthea_covid19_csv/patients.csv').select('Id', 'DEATHDATE')
deadSet = df.join(deathDf, (df.PATIENT == deathDf.Id)).na.drop().drop('Id', 'Code')
labels = spark.read.option("header",True).csv('dfCovid_DeceasedCovid.csv').select('PATIENT', 'covid-19', 'deceased & covid-19')
merged = df.join(deathDf, (df.PATIENT == deathDf.Id), 'left').drop( 'Id')

merged = merged.withColumn('deceased', when(col('DEATHDATE').isNotNull(), 1)).na.fill(0)
merged = merged.join(labels, ('PATIENT'), 'left').dropDuplicates()

merged.show()
deadSet.show()

+--------------------+-------+--------------------+---------+--------+--------+-------------------+
|             PATIENT|   Code|         Description|DEATHDATE|deceased|covid-19|deceased & covid-19|
+--------------------+-------+--------------------+---------+--------+--------+-------------------+
|001d6ed3-6837-430...| 2339-0|             Glucose|     null|       0|       0|                  0|
|001d6ed3-6837-430...| 2339-0|             Glucose|     null|       0|       1|                  0|
|001d6ed3-6837-430...| 6299-2|       Urea Nitrogen|     null|       0|       0|                  0|
|001d6ed3-6837-430...| 6299-2|       Urea Nitrogen|     null|       0|       1|                  0|
|001d6ed3-6837-430...|38483-4|          Creatinine|     null|       0|       0|                  0|
|001d6ed3-6837-430...|38483-4|          Creatinine|     null|       0|       1|                  0|
|001d6ed3-6837-430...|49765-1|             Calcium|     null|       0|       0|                  0|


In [18]:
groupedDf = merged.groupBy("PATIENT", 'Code').pivot("Code").agg(count("Code").alias("count")).na.fill(0)
merged =merged.select('PATIENT', 'deceased', 'covid-19', 'deceased & covid-19')
finalDf = groupedDf.join(merged, ['PATIENT'], 'left')
finalDf.printSchema()
print(len(finalDf.columns))

root
 |-- PATIENT: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- 10230-1: long (nullable = true)
 |-- 10480-2: long (nullable = true)
 |-- 10834-0: long (nullable = true)
 |-- 14804-9: long (nullable = true)
 |-- 14959-1: long (nullable = true)
 |-- 1742-6: long (nullable = true)
 |-- 1751-7: long (nullable = true)
 |-- 17861-6: long (nullable = true)
 |-- 18262-6: long (nullable = true)
 |-- 1920-8: long (nullable = true)
 |-- 1960-4: long (nullable = true)
 |-- 1975-2: long (nullable = true)
 |-- 1988-5: long (nullable = true)
 |-- 19926-5: long (nullable = true)
 |-- 19994-3: long (nullable = true)
 |-- 2019-8: long (nullable = true)
 |-- 2028-9: long (nullable = true)
 |-- 20454-5: long (nullable = true)
 |-- 20505-4: long (nullable = true)
 |-- 20565-8: long (nullable = true)
 |-- 20570-8: long (nullable = true)
 |-- 2069-3: long (nullable = true)
 |-- 2075-0: long (nullable = true)
 |-- 2085-9: long (nullable = true)
 |-- 2093-3: long (nullable = true)
 |-- 21

In [30]:
cols = list(set(finalDf.columns) - {'PATIENT', 'deceased', 'Code', 'Description', 'covid-19', 'deceased & covid-19'})
assembler = VectorAssembler().setInputCols(cols).setOutputCol('features')
finalDf = finalDf.withColumn("covid-19", finalDf["covid-19"].cast(IntegerType())).withColumn("deceased & covid-19", finalDf["deceased & covid-19"].cast(IntegerType()))
df = assembler.transform(finalDf)
df.printSchema()


root
 |-- PATIENT: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- 10230-1: long (nullable = true)
 |-- 10480-2: long (nullable = true)
 |-- 10834-0: long (nullable = true)
 |-- 14804-9: long (nullable = true)
 |-- 14959-1: long (nullable = true)
 |-- 1742-6: long (nullable = true)
 |-- 1751-7: long (nullable = true)
 |-- 17861-6: long (nullable = true)
 |-- 18262-6: long (nullable = true)
 |-- 1920-8: long (nullable = true)
 |-- 1960-4: long (nullable = true)
 |-- 1975-2: long (nullable = true)
 |-- 1988-5: long (nullable = true)
 |-- 19926-5: long (nullable = true)
 |-- 19994-3: long (nullable = true)
 |-- 2019-8: long (nullable = true)
 |-- 2028-9: long (nullable = true)
 |-- 20454-5: long (nullable = true)
 |-- 20505-4: long (nullable = true)
 |-- 20565-8: long (nullable = true)
 |-- 20570-8: long (nullable = true)
 |-- 2069-3: long (nullable = true)
 |-- 2075-0: long (nullable = true)
 |-- 2085-9: long (nullable = true)
 |-- 2093-3: long (nullable = true)
 |-- 21

## Pyspark Feature Selection

ChiSqSelector is deprecated in version 3.1.0 of Spark
- can still be implimented using UnivariateFeatureSelector (estimator=chisq)
    - tested on Enclave and implemented them
    
- VectorSlicer (removes constants (0)) and R-Formula(stats) not used

#### Chi- Squared Selector
Deceased

In [31]:
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="deceased")
chiResult = selector.fit(df).transform(df)
print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
chiResult.show()

ChiSqSelector output with top 10 features selected
+--------------------+------+-------+-------+-------+-------+-------+------+------+-------+-------+------+------+------+------+-------+-------+------+------+-------+-------+-------+-------+------+------+------+------+-------+------+------+-------+-------+-------+-------+-------+------+------+------+------+-------+------+-------+-------+-------+-------+------+------+------+------+-------+------+------+-------+------+------+------+------+-------+-------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+---------+-------+-------+------+------+-------+-------+-------+-------+-------+------+------+------+-------+------+------+------+------+------+------+------+------+------+------+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-------+-------+-------+-------+-------+------+------+------+------+-------+-----+

Covid-19

In [35]:
df = assembler.transform(finalDf)
df = df.na.drop()
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="covid-19")
chiResult = selector.fit(df).transform(df)
print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
chiResult.show()

ChiSqSelector output with top 10 features selected
+--------------------+------+-------+-------+-------+-------+-------+------+------+-------+-------+------+------+------+------+-------+-------+------+------+-------+-------+-------+-------+------+------+------+------+-------+------+------+-------+-------+-------+-------+-------+------+------+------+------+-------+------+-------+-------+-------+-------+------+------+------+------+-------+------+------+-------+------+------+------+------+-------+-------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+---------+-------+-------+------+------+-------+-------+-------+-------+-------+------+------+------+-------+------+------+------+------+------+------+------+------+------+------+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-------+-------+-------+-------+-------+------+------+------+------+-------+-----+

Deceased & Covid-19

In [37]:
df = assembler.transform(finalDf)
df = df.na.drop()
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="deceased & covid-19")
chiResult = selector.fit(df).transform(df)
print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
chiResult.show()

ChiSqSelector output with top 10 features selected
+--------------------+------+-------+-------+-------+-------+-------+------+------+-------+-------+------+------+------+------+-------+-------+------+------+-------+-------+-------+-------+------+------+------+------+-------+------+------+-------+-------+-------+-------+-------+------+------+------+------+-------+------+-------+-------+-------+-------+------+------+------+------+-------+------+------+-------+------+------+------+------+-------+-------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+---------+-------+-------+------+------+-------+-------+-------+-------+-------+------+------+------+-------+------+------+------+------+------+------+------+------+------+------+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-------+-------+-------+-------+-------+------+------+------+------+-------+-----+

#### Univariate Feature Selector
Deceased

In [38]:
df = assembler.transform(finalDf)
selector = UnivariateFeatureSelector(featuresCol="features", outputCol="selectedFeatures",
                                     labelCol="deceased", selectionMode="numTopFeatures")
selector.setFeatureType("continuous").setLabelType("continuous").setSelectionThreshold(10)

uniResult = selector.fit(df).transform(df)

print("UnivariateFeatureSelector output with top %d features selected using f_classif"
      % selector.getSelectionThreshold())
uniResult.select('features', 'selectedFeatures').show()

UnivariateFeatureSelector output with top 10 features selected using f_classif
+-----------------+----------------+
|         features|selectedFeatures|
+-----------------+----------------+
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
+-----------------+----------------+
only showing top 20 rows



Covid-19

In [39]:
df = assembler.transform(finalDf)
df = df.na.drop()

selector = UnivariateFeatureSelector(featuresCol="features", outputCol="selectedFeatures",
                                     labelCol="covid-19", selectionMode="numTopFeatures")
selector.setFeatureType("continuous").setLabelType("continuous").setSelectionThreshold(10)
''
uniResult = selector.fit(df).transform(df)

print("UnivariateFeatureSelector output with top %d features selected using f_classif"
      % selector.getSelectionThreshold())
uniResult.select('features', 'selectedFeatures').show()

UnivariateFeatureSelector output with top 10 features selected using f_classif
+-----------------+----------------+
|         features|selectedFeatures|
+-----------------+----------------+
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
+-----------------+----------------+
only showing top 20 rows



Deceased & Covid-19

In [40]:
df = assembler.transform(finalDf)
df = df.na.drop()

selector = UnivariateFeatureSelector(featuresCol="features", outputCol="selectedFeatures",
                                     labelCol="deceased & covid-19", selectionMode="numTopFeatures")
selector.setFeatureType("continuous").setLabelType("continuous").setSelectionThreshold(10)

uniResult = selector.fit(df).transform(df)

print("UnivariateFeatureSelector output with top %d features selected using f_classif"
      % selector.getSelectionThreshold())
uniResult.select('features', 'selectedFeatures').show()

UnivariateFeatureSelector output with top 10 features selected using f_classif
+-----------------+----------------+
|         features|selectedFeatures|
+-----------------+----------------+
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
|(189,[178],[2.0])|      (10,[],[])|
+-----------------+----------------+
only showing top 20 rows



#### Variance Threshold (0.3)

In [7]:
cols = list(set(df.columns) - {'PATIENT', 'deceased', 'Code', 'Description', 'covid-19', 'deceased & covid-19'})
df = df.select(cols)
selector = VarianceThresholdSelector(varianceThreshold=(0.3) , outputCol="selectedFeatures")

model = selector.fit(df)
varResult = model.transform(df)
model.selectedFeatures
print("Output: Features with variance lower than %f are removed." % selector.getVarianceThreshold())
varResult.select('features', 'selectedFeatures').show()

Output: Features with variance lower than 0.300000 are removed.
+-----------------+----------------+
|         features|selectedFeatures|
+-----------------+----------------+
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
|(189,[126],[1.0])|      (66,[],[])|
+-----------------+----------------+
only showing top 20 rows

