## Random Forest Classifier

This notebook trains Random Forest classifiers, a learning algorithm for classification that supports both binary and multiclass labels, as well as both continuous and categorical features.

In [1]:
import findspark
# findspark is initialised before pyspark is imported below.
findspark.init()
findspark.find()
# Memory budget shared by the driver and the executors.
MAX_MEMORY = "16g"

from pyspark.sql import SparkSession
# One builder step per line so each setting is easy to spot and to change.
spark = (
    SparkSession.builder
    .appName('ObservationsFeatures')
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)


from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorSlicer, VectorAssembler, ChiSqSelector, VectorIndexer, UnivariateFeatureSelector, VarianceThresholdSelector
from pyspark.sql.functions import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
import numpy as np
from sklearn.metrics import confusion_matrix
from pyspark.sql.types import IntegerType

### Reading and Merging Data

In [2]:
# Observation rows, reduced to the patient id and the observation code/description.
df = spark.read.csv('../../synthea-sample-data/data/10k_synthea_covid19_csv/observations.csv', header=True).select('PATIENT', 'Code', 'Description')
# Patient id together with the (possibly missing) death date.
deathDf = spark.read.csv('../../synthea-sample-data/data/10k_synthea_covid19_csv/patients.csv', header=True).select('Id', 'DEATHDATE')
# Observations restricted to rows that survive na.drop() after the inner join with the death dates.
deadSet = df.join(deathDf, df.PATIENT == deathDf.Id).na.drop().drop('Id', 'Code')
# Per-patient label columns produced by the feature-selection step.
labels = spark.read.csv('../FeatureSelection/dfCovid_DeceasedCovid.csv', header=True).select('PATIENT', 'covid-19', 'deceased & covid-19')

# Left joins keep every observation row. `deceased` is 1 when a death date is
# present; na.fill(0) turns the remaining null flags into 0.
merged = (
    df.join(deathDf, df.PATIENT == deathDf.Id, 'left')
    .drop('Id')
    .withColumn('deceased', when(col('DEATHDATE').isNotNull(), 1))
    .na.fill(0)
    .join(labels, 'PATIENT', 'left')
    .dropDuplicates()
)

In [3]:
# Pivot the observation codes into one count column per code (0 where absent).
# NOTE(review): grouping by both PATIENT and Code yields one row per
# (patient, code) pair, so each row has a single non-zero code column.
# Confirm whether one row per patient (groupBy("PATIENT")) was intended.
groupedDf = merged.groupBy("PATIENT", 'Code').pivot("Code").agg(count("Code").alias("count")).na.fill(0)
# Keep a single label row per patient. Without dropDuplicates() the selection
# still holds one row per observation row, and the left join below would repeat
# every feature row once for each of them.
merged = merged.select('PATIENT', 'deceased', 'covid-19', 'deceased & covid-19').dropDuplicates()
finalDf = groupedDf.join(merged, ['PATIENT'], 'left')


In [4]:
# Every remaining column is a pivoted observation code and is used as a feature;
# identifier and label columns are excluded.
# sorted() pins the feature order: iterating a set of strings can differ between
# kernel sessions, which would change which vector index belongs to which code.
cols = sorted(set(finalDf.columns) - {'PATIENT', 'deceased', 'Code', 'Description', 'covid-19', 'deceased & covid-19'})
assembler = VectorAssembler().setInputCols(cols).setOutputCol('features')
# The label columns are cast to integers so they can serve as numeric labels.
finalDf = finalDf.withColumn("covid-19", finalDf["covid-19"].cast(IntegerType())).withColumn("deceased & covid-19", finalDf["deceased & covid-19"].cast(IntegerType()))
df = assembler.transform(finalDf)
df.printSchema()

root
 |-- PATIENT: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- 10230-1: long (nullable = true)
 |-- 10480-2: long (nullable = true)
 |-- 10834-0: long (nullable = true)
 |-- 14804-9: long (nullable = true)
 |-- 14959-1: long (nullable = true)
 |-- 1742-6: long (nullable = true)
 |-- 1751-7: long (nullable = true)
 |-- 17861-6: long (nullable = true)
 |-- 18262-6: long (nullable = true)
 |-- 1920-8: long (nullable = true)
 |-- 1960-4: long (nullable = true)
 |-- 1975-2: long (nullable = true)
 |-- 1988-5: long (nullable = true)
 |-- 19926-5: long (nullable = true)
 |-- 19994-3: long (nullable = true)
 |-- 2019-8: long (nullable = true)
 |-- 2028-9: long (nullable = true)
 |-- 20454-5: long (nullable = true)
 |-- 20505-4: long (nullable = true)
 |-- 20565-8: long (nullable = true)
 |-- 20570-8: long (nullable = true)
 |-- 2069-3: long (nullable = true)
 |-- 2075-0: long (nullable = true)
 |-- 2085-9: long (nullable = true)
 |-- 2093-3: long (nullable = true)
 |-- 21

### Chi-Squared Features

Deceased

In [5]:
# Chi-squared feature selection against the `deceased` label: the ten
# top-ranked entries of `features` are written to `selectedFeatures`.
selector = ChiSqSelector(
    numTopFeatures=10,
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="deceased",
)
chiResult = selector.fit(df).transform(df)

In [6]:
# Metric names understood by the multiclass evaluator (`my_eval`).
metrics=['accuracy', 'weightedPrecision', 'weightedRecall', 'f1']
# Metric names understood by the binary evaluator (`my_eval_bin`).
metrics_bin=['areaUnderROC', 'areaUnderPR']

def printPerformance(results, labelCol="deceased"):
    """Print classification metrics and the confusion matrix for `results`.

    Relies on the notebook-level evaluators `my_eval` (multiclass) and
    `my_eval_bin` (binary), which must be defined before this is called.

    Args:
        results: DataFrame holding the label column and a `prediction` column.
        labelCol: name of the ground-truth column used for the confusion matrix.
    """
    # Multiclass metrics must go through `my_eval`: a param map keyed on
    # `my_eval.metricName` does not configure `my_eval_bin`.
    for m in metrics:
        print('{}: {:f}'.format(m, my_eval.evaluate(results, {my_eval.metricName: m})))
    for m in metrics_bin:
        print('{}: {:f}'.format(m, my_eval_bin.evaluate(results, {my_eval_bin.metricName: m})))
    # Collect labels and predictions in one pass so both arrays come from the
    # same rows in the same order.
    pairs = np.array(results.select(labelCol, "prediction").collect()).reshape(-1, 2)
    print("Confusion Matrix:\n", confusion_matrix(pairs[:, 0], pairs[:, 1]))

In [None]:
# Train on the chi-squared selection (`selectedFeatures`); with the default
# `features` column the selector output would never be used.
rf = RandomForestClassifier(numTrees=3, maxDepth=5, labelCol="deceased", featuresCol="selectedFeatures", seed=42, leafCol="leafId")
# Seed the split so the train/test partition is reproducible.
(train, test) = chiResult.randomSplit([0.8, 0.2], seed=42)
# The fitted model inherits featuresCol from the estimator.
model = rf.fit(train)
my_eval = MulticlassClassificationEvaluator(labelCol = 'deceased', predictionCol = 'prediction')
# Area-under-curve metrics are computed from the model's raw scores, not from
# the hard 0/1 predictions.
my_eval_bin = BinaryClassificationEvaluator(labelCol ='deceased', rawPredictionCol = 'rawPrediction')
results = model.transform(test)
print("\n[Performance on Deceased Label Test set]")
printPerformance(results)

Covid-19

In [None]:
# Re-assemble the feature vector and drop rows containing nulls before
# running the selector against the `covid-19` label.
df = assembler.transform(finalDf).na.drop()
# Chi-squared selection: the ten top-ranked entries of `features` are written
# to `selectedFeatures`.
selector = ChiSqSelector(
    numTopFeatures=10,
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="covid-19",
)
chiResult = selector.fit(df).transform(df)

In [None]:
def printPerformance(results, labelCol="covid-19"):
    """Print classification metrics and the confusion matrix for `results`.

    Relies on the notebook-level names `metrics`, `metrics_bin`, `my_eval`
    (multiclass evaluator) and `my_eval_bin` (binary evaluator), which must be
    defined before this is called.

    Args:
        results: DataFrame holding the label column and a `prediction` column.
        labelCol: name of the ground-truth column used for the confusion matrix.
    """
    # Multiclass metrics must go through `my_eval`: a param map keyed on
    # `my_eval.metricName` does not configure `my_eval_bin`.
    for m in metrics:
        print('{}: {:f}'.format(m, my_eval.evaluate(results, {my_eval.metricName: m})))
    for m in metrics_bin:
        print('{}: {:f}'.format(m, my_eval_bin.evaluate(results, {my_eval_bin.metricName: m})))
    # Collect labels and predictions in one pass so both arrays come from the
    # same rows in the same order.
    pairs = np.array(results.select(labelCol, "prediction").collect()).reshape(-1, 2)
    print("Confusion Matrix:\n", confusion_matrix(pairs[:, 0], pairs[:, 1]))


In [None]:
# Train on the chi-squared selection (`selectedFeatures`); with the default
# `features` column the selector output would never be used.
rf = RandomForestClassifier(numTrees=3, maxDepth=5, labelCol="covid-19", featuresCol="selectedFeatures", seed=42, leafCol="leafId")
# Seed the split so the train/test partition is reproducible.
(train, test) = chiResult.randomSplit([0.8, 0.2], seed=42)
# The fitted model inherits featuresCol from the estimator.
model = rf.fit(train)
my_eval = MulticlassClassificationEvaluator(labelCol = 'covid-19', predictionCol = 'prediction')
# Area-under-curve metrics are computed from the model's raw scores, not from
# the hard 0/1 predictions.
my_eval_bin = BinaryClassificationEvaluator(labelCol ='covid-19', rawPredictionCol = 'rawPrediction')
results = model.transform(test)
print("\n[Performance on covid-19 Label Test set]")
printPerformance(results)

Deceased & Covid-19 

In [None]:
# Re-assemble the feature vector and drop rows containing nulls before
# running the selector against the `deceased & covid-19` label.
df = assembler.transform(finalDf).na.drop()
# Chi-squared selection: the ten top-ranked entries of `features` are written
# to `selectedFeatures`.
selector = ChiSqSelector(
    numTopFeatures=10,
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="deceased & covid-19",
)
chiResult = selector.fit(df).transform(df)


In [None]:
def printPerformance(results, labelCol="deceased & covid-19"):
    """Print classification metrics and the confusion matrix for `results`.

    Relies on the notebook-level names `metrics`, `metrics_bin`, `my_eval`
    (multiclass evaluator) and `my_eval_bin` (binary evaluator), which must be
    defined before this is called.

    Args:
        results: DataFrame holding the label column and a `prediction` column.
        labelCol: name of the ground-truth column used for the confusion matrix.
    """
    # Multiclass metrics must go through `my_eval`: a param map keyed on
    # `my_eval.metricName` does not configure `my_eval_bin`.
    for m in metrics:
        print('{}: {:f}'.format(m, my_eval.evaluate(results, {my_eval.metricName: m})))
    for m in metrics_bin:
        print('{}: {:f}'.format(m, my_eval_bin.evaluate(results, {my_eval_bin.metricName: m})))
    # Collect labels and predictions in one pass so both arrays come from the
    # same rows in the same order.
    pairs = np.array(results.select(labelCol, "prediction").collect()).reshape(-1, 2)
    print("Confusion Matrix:\n", confusion_matrix(pairs[:, 0], pairs[:, 1]))


In [None]:
# Train on the chi-squared selection (`selectedFeatures`); with the default
# `features` column the selector output would never be used.
rf = RandomForestClassifier(numTrees=3, maxDepth=5, labelCol="deceased & covid-19", featuresCol="selectedFeatures", seed=42, leafCol="leafId")
# Seed the split so the train/test partition is reproducible.
(train, test) = chiResult.randomSplit([0.8, 0.2], seed=42)
# The fitted model inherits featuresCol from the estimator.
model = rf.fit(train)
my_eval = MulticlassClassificationEvaluator(labelCol = 'deceased & covid-19', predictionCol = 'prediction')
# Area-under-curve metrics are computed from the model's raw scores, not from
# the hard 0/1 predictions.
my_eval_bin = BinaryClassificationEvaluator(labelCol ='deceased & covid-19', rawPredictionCol = 'rawPrediction')
results = model.transform(test)
print("\n[Performance on  deceased & covid-19  Label Test set]")
printPerformance(results)