# Medications

In [1]:
import findspark
findspark.init()
findspark.find()

'C:\\Users\\Mirna Elizondo\\anaconda3\\lib\\site-packages\\pyspark'

In [8]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ConditionFeatures').getOrCreate()
from pyspark.ml.feature import VectorSlicer, VectorAssembler, ChiSqSelector, VectorIndexer, UnivariateFeatureSelector, VarianceThresholdSelector
from pyspark.sql.functions import *
import numpy as np
from pyspark.sql.types import IntegerType

### Reading and Merging Data

In [5]:
df = spark.read.option("header",True).csv('../../synthea-sample-data/data/10k_synthea_covid19_csv/medications.csv').select('PATIENT','Code', 'Description')
deathDf = spark.read.option("header",True).csv('../../synthea-sample-data/data/10k_synthea_covid19_csv/patients.csv').select('Id', 'DEATHDATE')
deadSet = df.join(deathDf, (df.PATIENT == deathDf.Id)).na.drop().drop('Id', 'Code')
labels = spark.read.option("header",True).csv('dfCovid_DeceasedCovid.csv').select('PATIENT', 'covid-19', 'deceased & covid-19')

merged = df.join(deathDf, (df.PATIENT == deathDf.Id), 'left').drop( 'Id')

merged = merged.withColumn('deceased', when(col('DEATHDATE').isNotNull(), 1)).na.fill(0)
merged = merged.join(labels, ('PATIENT'), 'left').dropDuplicates()

merged.show()
deadSet.show()

+--------------------+-------+--------------------+----------+--------+--------+-------------------+
|             PATIENT|   Code|         Description| DEATHDATE|deceased|covid-19|deceased & covid-19|
+--------------------+-------+--------------------+----------+--------+--------+-------------------+
|001d6ed3-6837-430...| 314231|Simvastatin 10 MG...|      null|       0|       0|                  0|
|001d6ed3-6837-430...| 314231|Simvastatin 10 MG...|      null|       0|       1|                  0|
|001d6ed3-6837-430...| 313782|Acetaminophen 325...|      null|       0|       0|                  0|
|001d6ed3-6837-430...| 313782|Acetaminophen 325...|      null|       0|       1|                  0|
|0071513f-d0e9-407...| 312938|Sertraline 100 MG...|      null|       0|       1|                  0|
|0071513f-d0e9-407...| 312938|Sertraline 100 MG...|      null|       0|       0|                  0|
|0071513f-d0e9-407...| 856980|Acetaminophen/Hyd...|      null|       0|       1|           

In [6]:
groupedDf = merged.groupBy("PATIENT", 'Code').pivot("Code").agg(count("Code").alias("count")).na.fill(0)
merged =merged.select('PATIENT', 'deceased', 'covid-19', 'deceased & covid-19')
finalDf = groupedDf.join(merged, ['PATIENT'], 'left')
finalDf.printSchema()
print(len(finalDf.columns))

root
 |-- PATIENT: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- 1000126: long (nullable = true)
 |-- 1014676: long (nullable = true)
 |-- 1014678: long (nullable = true)
 |-- 1043400: long (nullable = true)
 |-- 1049221: long (nullable = true)
 |-- 1049630: long (nullable = true)
 |-- 1049635: long (nullable = true)
 |-- 105078: long (nullable = true)
 |-- 105585: long (nullable = true)
 |-- 106258: long (nullable = true)
 |-- 106892: long (nullable = true)
 |-- 1091392: long (nullable = true)
 |-- 1094107: long (nullable = true)
 |-- 1100184: long (nullable = true)
 |-- 1114085: long (nullable = true)
 |-- 1116758: long (nullable = true)
 |-- 1190795: long (nullable = true)
 |-- 1234995: long (nullable = true)
 |-- 1359133: long (nullable = true)
 |-- 1363309: long (nullable = true)
 |-- 1367439: long (nullable = true)
 |-- 141918: long (nullable = true)
 |-- 1534809: long (nullable = true)
 |-- 1599803: long (nullable = true)
 |-- 1601380: long (nullable = true)


In [9]:
cols = list(set(finalDf.columns) - {'PATIENT', 'deceased', 'Code', 'Description', 'covid-19', 'deceased & covid-19'})
assembler = VectorAssembler().setInputCols(cols).setOutputCol('features')
finalDf = finalDf.withColumn("covid-19", finalDf["covid-19"].cast(IntegerType())).withColumn("deceased & covid-19", finalDf["deceased & covid-19"].cast(IntegerType()))
df = assembler.transform(finalDf)
df.printSchema()

root
 |-- PATIENT: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- 1000126: long (nullable = true)
 |-- 1014676: long (nullable = true)
 |-- 1014678: long (nullable = true)
 |-- 1043400: long (nullable = true)
 |-- 1049221: long (nullable = true)
 |-- 1049630: long (nullable = true)
 |-- 1049635: long (nullable = true)
 |-- 105078: long (nullable = true)
 |-- 105585: long (nullable = true)
 |-- 106258: long (nullable = true)
 |-- 106892: long (nullable = true)
 |-- 1091392: long (nullable = true)
 |-- 1094107: long (nullable = true)
 |-- 1100184: long (nullable = true)
 |-- 1114085: long (nullable = true)
 |-- 1116758: long (nullable = true)
 |-- 1190795: long (nullable = true)
 |-- 1234995: long (nullable = true)
 |-- 1359133: long (nullable = true)
 |-- 1363309: long (nullable = true)
 |-- 1367439: long (nullable = true)
 |-- 141918: long (nullable = true)
 |-- 1534809: long (nullable = true)
 |-- 1599803: long (nullable = true)
 |-- 1601380: long (nullable = true)


## Pyspark Feature Selection
ChiSqSelector is deprecated in version 3.1.0 of Spark
- can still be implimented using UnivariateFeatureSelector (estimator=chisq)
    - tested on Enclave and implemented them
    
- VectorSlicer (removes constants (0)) and R-Formula(stats) not used

#### Chi- Squared Selector
Deceased

In [10]:
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="deceased")
chiResult = selector.fit(df).transform(df)
print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
chiResult.show()

ChiSqSelector output with top 10 features selected
+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-------+-------+------+------+------+-------+-------+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+---

Covid-19

In [11]:
df = assembler.transform(finalDf)
df = df.na.drop()
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="covid-19")
chiResult = selector.fit(df).transform(df)
print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
chiResult.show()

ChiSqSelector output with top 10 features selected
+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-------+-------+------+------+------+-------+-------+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+---

Deceased & Covid-19

In [12]:
df = assembler.transform(finalDf)
df = df.na.drop()
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="deceased & covid-19")
chiResult = selector.fit(df).transform(df)
print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
chiResult.show()

ChiSqSelector output with top 10 features selected
+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-------+-------+------+------+------+-------+-------+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+---

### Univariate Feature Selection
Deceased

In [13]:
df = assembler.transform(finalDf)

selector = UnivariateFeatureSelector(featuresCol="features", outputCol="selectedFeatures",
                                     labelCol="deceased", selectionMode="numTopFeatures")
selector.setFeatureType("continuous").setLabelType("continuous").setSelectionThreshold(10)

uniResult = selector.fit(df).transform(df)

print("UnivariateFeatureSelector output with top %d features selected using f_classif"
      % selector.getSelectionThreshold())
uniResult.select('features', 'selectedFeatures').show()

UnivariateFeatureSelector output with top 10 features selected using f_classif
+-----------------+----------------+
|         features|selectedFeatures|
+-----------------+----------------+
| (169,[66],[2.0])|      (10,[],[])|
| (169,[66],[2.0])|      (10,[],[])|
| (169,[66],[2.0])|      (10,[],[])|
| (169,[66],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
| (169,[14],[1.0])|      (10,[],[])|
| (169,[14],[1.0])|      (10,[],[])|
| (169,[66],[1.0])|      (10,[],[])|
| (169,[66],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
| (169,[27],[1.0])|      (10,[],[])|
+-----------------+----------------+
only showing top 20 rows



Covid-19

In [14]:
df = assembler.transform(finalDf)
df = df.na.drop()

selector = UnivariateFeatureSelector(featuresCol="features", outputCol="selectedFeatures",
                                     labelCol="covid-19", selectionMode="numTopFeatures")
selector.setFeatureType("continuous").setLabelType("continuous").setSelectionThreshold(10)
''
uniResult = selector.fit(df).transform(df)

print("UnivariateFeatureSelector output with top %d features selected using f_classif"
      % selector.getSelectionThreshold())
uniResult.select('features', 'selectedFeatures').show()

UnivariateFeatureSelector output with top 10 features selected using f_classif
+-----------------+----------------+
|         features|selectedFeatures|
+-----------------+----------------+
| (169,[66],[2.0])|      (10,[],[])|
| (169,[66],[2.0])|      (10,[],[])|
| (169,[66],[2.0])|      (10,[],[])|
| (169,[66],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
| (169,[14],[1.0])|      (10,[],[])|
| (169,[14],[1.0])|      (10,[],[])|
| (169,[66],[1.0])|      (10,[],[])|
| (169,[66],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
| (169,[27],[1.0])|      (10,[],[])|
+-----------------+----------------+
only showing top 20 rows



Deceased & Covid-19

In [15]:
df = assembler.transform(finalDf)
df = df.na.drop()

selector = UnivariateFeatureSelector(featuresCol="features", outputCol="selectedFeatures",
                                     labelCol="deceased & covid-19", selectionMode="numTopFeatures")
selector.setFeatureType("continuous").setLabelType("continuous").setSelectionThreshold(10)

uniResult = selector.fit(df).transform(df)

print("UnivariateFeatureSelector output with top %d features selected using f_classif"
      % selector.getSelectionThreshold())
uniResult.select('features', 'selectedFeatures').show()

UnivariateFeatureSelector output with top 10 features selected using f_classif
+-----------------+----------------+
|         features|selectedFeatures|
+-----------------+----------------+
| (169,[66],[2.0])|      (10,[],[])|
| (169,[66],[2.0])|      (10,[],[])|
| (169,[66],[2.0])|      (10,[],[])|
| (169,[66],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
|(169,[160],[2.0])|      (10,[],[])|
| (169,[14],[1.0])|      (10,[],[])|
| (169,[14],[1.0])|      (10,[],[])|
| (169,[66],[1.0])|      (10,[],[])|
| (169,[66],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
|(169,[134],[1.0])|      (10,[],[])|
| (169,[27],[1.0])|      (10,[],[])|
+-----------------+----------------+
only showing top 20 rows



### Variance Threshold (0.3)

In [9]:
cols = list(set(df.columns) - {'PATIENT', 'deceased', 'Code', 'Description', 'covid-19', 'deceased & covid-19'})
df = df.select(cols)
selector = VarianceThresholdSelector(varianceThreshold=(0.3) , outputCol="selectedFeatures")

model = selector.fit(df)
varResult = model.transform(df)
model.selectedFeatures
print("Output: Features with variance lower than %f are removed." % selector.getVarianceThreshold())
varResult.select('features', 'selectedFeatures').show()

Output: Features with variance lower than 0.300000 are removed.
+-----------------+----------------+
|         features|selectedFeatures|
+-----------------+----------------+
| (169,[86],[1.0])|      (22,[],[])|
| (169,[86],[1.0])|      (22,[],[])|
|(169,[164],[1.0])|      (22,[],[])|
|(169,[164],[1.0])|      (22,[],[])|
| (169,[64],[1.0])|      (22,[],[])|
| (169,[64],[1.0])|      (22,[],[])|
|(169,[130],[1.0])|      (22,[],[])|
|(169,[130],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
| (169,[89],[1.0])|      (22,[],[])|
+-----------------+----------------+
only showing top 20 rows

