## Random Forest Classifier

Random Forest learning algorithm for classification. It supports both binary and multiclass labels, as well as both continuous and categorical features

In [1]:
import findspark
findspark.init()
findspark.find()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ConditionFeatures').getOrCreate()

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorSlicer, VectorAssembler, ChiSqSelector, VectorIndexer, UnivariateFeatureSelector, VarianceThresholdSelector
from pyspark.sql.functions import *
import numpy as np
from pyspark.sql.types import IntegerType

### Reading and Merging Data

In [2]:
df = spark.read.option("header",True).csv('../../synthea-sample-data/data/10k_synthea_covid19_csv/conditions.csv').select('PATIENT','Code', 'Description')
deathDf = spark.read.option("header",True).csv('../../synthea-sample-data/data/10k_synthea_covid19_csv/patients.csv').select('Id', 'DEATHDATE')
deadSet = df.join(deathDf, (df.PATIENT == deathDf.Id)).na.drop().drop('Id', 'Code')
labels = spark.read.option("header",True).csv('../FeatureSelection/dfCovid_DeceasedCovid.csv').select('PATIENT', 'covid-19', 'deceased & covid-19')

merged = df.join(deathDf, (df.PATIENT == deathDf.Id), 'left').drop( 'Id')

merged = merged.withColumn('deceased', when(col('DEATHDATE').isNotNull(), 1)).na.fill(0)
merged = merged.join(labels, ('PATIENT'), 'left').dropDuplicates()

In [3]:
groupedDf = merged.groupBy("PATIENT", 'Code').pivot("Code").agg(count("Code").alias("count")).na.fill(0)
merged =merged.select('PATIENT', 'deceased', 'covid-19', 'deceased & covid-19')
finalDf = groupedDf.join(merged, ['PATIENT'], 'left')


In [4]:
cols = list(set(finalDf.columns) - {'PATIENT', 'deceased', 'Code', 'Description', 'covid-19', 'deceased & covid-19'})
assembler = VectorAssembler().setInputCols(cols).setOutputCol('features')
finalDf = finalDf.withColumn("covid-19", finalDf["covid-19"].cast(IntegerType())).withColumn("deceased & covid-19", finalDf["deceased & covid-19"].cast(IntegerType()))
df = assembler.transform(finalDf)
df.printSchema()

root
 |-- PATIENT: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- 10509002: long (nullable = true)
 |-- 109838007: long (nullable = true)
 |-- 110030002: long (nullable = true)
 |-- 124171000119105: long (nullable = true)
 |-- 126906006: long (nullable = true)
 |-- 127013003: long (nullable = true)
 |-- 127295002: long (nullable = true)
 |-- 128613002: long (nullable = true)
 |-- 132281000119108: long (nullable = true)
 |-- 1501000119109: long (nullable = true)
 |-- 1551000119108: long (nullable = true)
 |-- 156073000: long (nullable = true)
 |-- 157141000119108: long (nullable = true)
 |-- 15777000: long (nullable = true)
 |-- 16114001: long (nullable = true)
 |-- 161622006: long (nullable = true)
 |-- 162573006: long (nullable = true)
 |-- 162864005: long (nullable = true)
 |-- 1734006: long (nullable = true)
 |-- 185086009: long (nullable = true)
 |-- 190905008: long (nullable = true)
 |-- 19169002: long (nullable = true)
 |-- 192127007: long (nullable = true)
 |-

### Chi-Squared Features

Deceased

In [5]:
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="deceased")
chiResult = selector.fit(df).transform(df)

In [6]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="deceased", seed=42,leafCol="leafId")
(train, test) = chiResult.randomSplit([0.8, 0.2])
model = rf.fit(train).setFeaturesCol('features')


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "C:\Users\Mirna Elizondo\anaconda3\lib\site-packages\py4j\clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Users\Mirna Elizondo\anaconda3\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Mirna Elizondo\anaconda3\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "C:\Users\Mirna Elizondo\anaconda3\lib\site-packages\py4j\clientserver.py", line 503, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [7]:
rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="deceased", seed=42,leafCol="leafId")
(train, test) = chiResult.randomSplit([0.8, 0.2])
model = rf.fit(train).setFeaturesCol('selectedFeatures')

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

Covid-19

In [None]:
df = assembler.transform(finalDf)
df = df.na.drop()
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="covid-19")
chiResult = selector.fit(df).transform(df)

Deceased & Covid-19 

In [None]:
df = assembler.transform(finalDf)
df = df.na.drop()
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="deceased & covid-19")
chiResult = selector.fit(df).transform(df)
