In [1]:
# Import all necessary libraries and create spark session
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,upper,udf,element_at,explode,regexp_replace,size
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression

import os

spark = SparkSession \
    .builder \
    .appName("FHIR Analytics with Python") \
    .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions") \
    .getOrCreate()
keyspace = "myCatalog.hfs_data"


In [2]:
# create a reference to our Cassandra catalog
spark.conf.set("spark.sql.catalog.myCatalog", "com.datastax.spark.connector.datasource.CassandraCatalog")

# Increase memory for better performance
spark.conf.set("spark.cassandra.input.split.sizeInMB", "67108864")


In [3]:
# define UDFs to pull some data out of our structs
def getMaritalStatus(ms):
    if (ms == None):
        return None
    return ms.text_
gms = udf(getMaritalStatus, StringType())
def getBirthDate(bd):
    return bd[0]
gbd = udf(getBirthDate, DateType())

# a reusable function to get a specific LOINC code and average all for by Patient ID
def getAvgForLoinc(loinc_code, agg_column, df):
    return df.filter(col("LoincCode").like(loinc_code)) \
                         .withColumnRenamed("ValueQuantity", agg_column) \
                         .select(col("Subject"), col(agg_column)) \
                         .groupBy("Subject").agg(F.round(F.avg(col(agg_column)), 3).alias(agg_column))

In [4]:
# Read in the tables we care about
rawPatient = spark.read.table(keyspace + ".patient")
rawReference = spark.read.table(keyspace + ".reference")
rawObservation = spark.read.table(keyspace + ".observation")
rawEncounter = spark.read.table(keyspace + ".encounter")

# An alternative way to read tables, left for reference
#rawReference = spark.read.format("org.apache.spark.sql.cassandra").options(**{"table": "reference","keyspace": "hfs_data"}).load()
#rawObservation = spark.read.format("org.apache.spark.sql.cassandra").options(**{"table": "observation","keyspace": "hfs_data"}).load()


In [5]:
patientDataFrame = rawPatient.select(col("id").alias("PatientId"), gms(col("maritalstatus")).alias("Marital Status"), col("birthdate")["0"].alias("birthdate"), upper(col("gender")).alias("Gender")).withColumn("Age", F.round(F.datediff(F.current_date(), F.to_date(col("birthdate"))) / 365, 1))

In [6]:
patientDataFrame.cache()
patientDataFrame.count()

3735

In [7]:
# select only Patient references and manipulate the column values so they are ready for joining with other tables

referenceDataFrame = rawReference.where(rawReference.reference.like("Patient%")).select(regexp_replace(rawReference.id, "#hidden", "").alias("id"), regexp_replace(rawReference.reference, "Patient/", "").alias("reference"))

In [8]:
referenceDataFrame.cache()
referenceDataFrame.count()

895166

In [9]:
# rename some observation columns AND join with reference table to resolve Patient ID references properly

observationDataFrame = rawObservation.select(col("id").alias("ObservationId"), col("code"), col("component"), col("valuequantity"), col("subject").alias("PatientReferenceId"))
observationDataFrame = observationDataFrame.join(referenceDataFrame, referenceDataFrame.id == observationDataFrame.PatientReferenceId) \
                  .withColumn("id", col("id").cast(StringType())).withColumn("reference", col("reference").cast(StringType())) \
                  .withColumnRenamed('reference', "PatientId") \
                  .drop(col("id")) \
                  .drop(col("PatientReferenceId"))




In [10]:
# Filter and select LOINC information from observation dataframe

observationDataFrame_loinc = observationDataFrame \
    .select(col("PatientId").alias("Subject"), \
            col("ObservationId"), \
            col("code").coding[0].code.alias("LoincCode"), \
            col("valuequantity").value.alias("ValueQuantity"))

loinc_code_list = ["8480-6", "8462-4","29463-7","8302-2","33914-3","2571-8","2085-9","18262-6","2093-3","39156-5","55284-4", "195967001", "233678006"]


observationDataFrame_loinc = observationDataFrame_loinc \
    .filter(col("LoincCode").isin(loinc_code_list))\
    .na.drop()

In [11]:
observationDataFrame_loinc.cache()

DataFrame[Subject: string, ObservationId: string, LoincCode: string, ValueQuantity: decimal(38,18)]

In [12]:
# Select only Body Weight observations from pre-filtered observation data frame

body_weight_df = observationDataFrame_loinc \
    .filter(col("LoincCode").like("%29463-7%")) \
    .withColumnRenamed("ValueQuantity", "Body Weight") \
    .select(col("Subject"), col("Body Weight")) \
    .na.drop() \
    .dropDuplicates()

In [13]:
# Calculate Systolic, Dystolic and total BP, then avg per patient

blood_pressure_df = observationDataFrame.select(col("PatientId").alias("Subject"),
                          "ObservationId",
                          col("component")[0].code.coding[0].code.alias("DBPCode"), \
                          col("component")[0].valuequantity.value.alias("Diastolic Blood Pressure"), \
                          col("component")[1].code.coding[0].code.alias("SBPCode"), \
                          col("component")[1].valuequantity.value.alias("Systolic Blood Pressure")).na.drop()



blood_pressure_df = blood_pressure_df.withColumn("Blood Pressure", \
      F.round(col("Diastolic Blood Pressure")+(col("Systolic Blood Pressure") - col("Diastolic Blood Pressure"))/3))

blood_pressure_avg_df = blood_pressure_df.groupBy(col("Subject")).agg(F.round(F.avg("Diastolic Blood Pressure"), 3).alias("Diastolic BP"), \
                                           F.round(F.avg("Systolic Blood Pressure"), 3).alias("Systolic BP"), \
                                           F.round(F.avg("Blood Pressure"), 3).alias("BP") \
                                          )


In [14]:
# Select all encounters that show as Asthma diagnoses

encounterDataFrame = rawEncounter.filter(F.size(col("reasoncode")) > 0).select(col("subject"), col("reasoncode")[0].coding[0]["code"].alias("Asthma")) \
                                 .withColumn("Asthma", F.when(col("Asthma").isin(["195967001","233678006"]), F.lit(1)).otherwise(F.lit(0)))

encounterDataFrame = encounterDataFrame.join(referenceDataFrame, encounterDataFrame.subject == referenceDataFrame.id) \
                                               .drop("subject", "id") \
                                               .withColumnRenamed("reference", "subject")

encounterDataFrame_asthma = encounterDataFrame.groupBy("subject").agg(F.max(col("Asthma")).alias("Asthma"))

In [15]:
# Calculate avg body weight per patient and join with patient data frame

patient_calc_df = body_weight_df.groupBy("Subject").agg(F.round(F.avg("Body Weight"), 3).alias("Body Weight")) \
                                .join(patientDataFrame, body_weight_df.Subject == patientDataFrame.PatientId) \
                                .drop("Subject")

#print("BW " + str(patient_calc_df.count()))

# Add literal demo info to patients
patient_calc_df = patient_calc_df.dropDuplicates() \
                                 .withColumn("Disease", F.array(F.lit("0"))) \
                                 .withColumn("PostalCode", F.array(F.lit("0")))

# Join BP info by patient
# COMMENTED OUT BECAUSE LASSO
patient_calc_df = patient_calc_df.join(blood_pressure_avg_df, \
                                             blood_pressure_avg_df.Subject == patient_calc_df.PatientId, "left") \
                                       .drop("Subject")

#print("BP " + str(patient_calc_df.count()))
     

# Join asthma info by patient
patient_calc_df = patient_calc_df.join(encounterDataFrame_asthma, encounterDataFrame_asthma.subject == patient_calc_df.PatientId, "left") \
                   .dropDuplicates() \
                   .drop(col("subject"))

#print("Asthma " + str(patient_calc_df.count()))

# Calculate and join avg triglycerides by patient
triglycerides_df = getAvgForLoinc("%2571-8%", "Triglycerides", observationDataFrame_loinc)

patient_calc_df = patient_calc_df.join(triglycerides_df, patient_calc_df.PatientId == triglycerides_df.Subject, "left") \
                                 .drop(col("Subject"))

#print("Triglycerides " + str(patient_calc_df.count()))


# Calculate and join average EGFR by patient
egfrLoincCode = ["88294-4", "33914-3"]
agg_column = "Estimated Glomerular Filtration Rate"

egfr_df = observationDataFrame_loinc.select(col("Subject"), col("LoincCode"), col("ValueQuantity")) \
                                 .filter(col("LoincCode").isin(egfrLoincCode)) \
                                 .withColumn(agg_column, col("ValueQuantity")) \
                                 .groupBy("Subject").agg(F.round(F.avg(col(agg_column))).alias(agg_column))

# COMMENTED OUT BECAUSE LASSO
patient_calc_df = patient_calc_df.join(egfr_df, patient_calc_df.PatientId == egfr_df.Subject, "left") \
                                 .drop(col("Subject"))

#print("EGFR " + str(patient_calc_df.count()))


# COMMENTED OUT BECAUSE LASSO
# Calculate and join avg LDL by patient
ldl_df = getAvgForLoinc("%18262-6%", "Low Density Lipoprotein", observationDataFrame_loinc)

patient_calc_df = patient_calc_df.join(ldl_df, patient_calc_df.PatientId == ldl_df.Subject, "left") \
                   .dropDuplicates().drop(col("Subject"))

#print("LDL " + str(patient_calc_df.count()))


# Calculate and join average HDL by patient
hdl_df = getAvgForLoinc("%2085-9%", "High Density Lipoprotein Cholesterol", observationDataFrame_loinc)

patient_calc_df = patient_calc_df.join(hdl_df, patient_calc_df.PatientId == hdl_df.Subject, "left") \
                   .drop(col("Subject"))

#print("HDL " + str(patient_calc_df.count()))


# Calculate and join average height by patient
height_df = getAvgForLoinc("%8302-2%", "Body Height", observationDataFrame_loinc)

patient_calc_df = patient_calc_df.join(height_df, patient_calc_df.PatientId == height_df.Subject, "left") \
                   .drop(col("Subject"))

#print("Height " + str(patient_calc_df.count()))


#Calculate and join average BMI by patient
bmi_df = getAvgForLoinc("%39156-5%", "BMI", observationDataFrame_loinc)
    
patient_calc_df = patient_calc_df.join(bmi_df, patient_calc_df.PatientId == bmi_df.Subject, "left") \
                   .drop(col("Subject"))

#print("BMI " + str(patient_calc_df.count()))


#Calculate and join average cholesterol by patient
cholesterol_df = getAvgForLoinc("%2093-3%", "Total Cholesterol", observationDataFrame_loinc)

patient_calc_df = patient_calc_df.join(cholesterol_df, patient_calc_df.PatientId == cholesterol_df.Subject, "left") \
                   .drop(col("Subject"))

#print("Cholesterol " + str(patient_calc_df.count()))



In [16]:
asthma_dataset = patient_calc_df.drop("PatientId", "Disease", "PostalCode", "birthdate", "Diagnosed Date", "Marital Status").na.drop()
#asthma_dataset = spark.read.csv("/Users/Harrison/projects/helios/analytics-blog-article/AsthmaDataset.csv", header = True, inferSchema = True).drop("PatientId", "Disease", "PostalCode", "birthdate", "Diagnosed Date", "Birth Date").na.drop()

In [17]:
asthma_dataset.cache()
asthma_dataset.filter(col("Asthma") ==1).count()

1

In [19]:
asthma_dataset.cache()

DataFrame[Body Weight: decimal(38,3), Gender: string, Age: double, Diastolic BP: decimal(38,3), Systolic BP: decimal(38,3), BP: decimal(38,3), Asthma: int, Triglycerides: decimal(38,3), Estimated Glomerular Filtration Rate: decimal(38,0), Low Density Lipoprotein: decimal(38,3), High Density Lipoprotein Cholesterol: decimal(38,3), Body Height: decimal(38,3), BMI: decimal(38,3), Total Cholesterol: decimal(38,3)]

In [20]:
asthma_dataset.show()

+-----------+------+-----+------------+-----------+-------+------+-------------+------------------------------------+-----------------------+------------------------------------+-----------+------+-----------------+
|Body Weight|Gender|  Age|Diastolic BP|Systolic BP|     BP|Asthma|Triglycerides|Estimated Glomerular Filtration Rate|Low Density Lipoprotein|High Density Lipoprotein Cholesterol|Body Height|   BMI|Total Cholesterol|
+-----------+------+-----+------------+-----------+-------+------+-------------+------------------------------------+-----------------------+------------------------------------+-----------+------+-----------------+
|     89.700|  MALE| 81.7|      78.833|    116.167| 91.333|     0|      132.788|                                 119|                 78.340|                              67.913|    175.900|28.990|          172.810|
|     88.725|  MALE| 56.9|      81.750|    119.375| 94.250|     0|      147.112|                                  78|                102

In [21]:
asthma_dataset.count()

94

In [22]:
asthma_dataset.filter(col("Asthma") == 1).show()

+-----------+------+----+------------+-----------+-------+------+-------------+------------------------------------+-----------------------+------------------------------------+-----------+------+-----------------+
|Body Weight|Gender| Age|Diastolic BP|Systolic BP|     BP|Asthma|Triglycerides|Estimated Glomerular Filtration Rate|Low Density Lipoprotein|High Density Lipoprotein Cholesterol|Body Height|   BMI|Total Cholesterol|
+-----------+------+----+------------+-----------+-------+------+-------------+------------------------------------+-----------------------+------------------------------------+-----------+------+-----------------+
|     77.820|FEMALE|38.3|     100.600|    147.300|116.200|     1|      184.088|                                 129|                136.573|                              46.143|    152.700|33.703|          219.530|
+-----------+------+----+------------+-----------+-------+------+-------------+------------------------------------+-----------------------+

In [23]:
trainDF, testDF = asthma_dataset.randomSplit([.8, .2], seed = 42)
print(f"""There are {trainDF.count()} rows in the training set and {testDF.count()} in the test set""")

There are 77 rows in the training set and 17 in the test set


In [24]:
# num_asthma = trainDF.groupBy("Asthma").count().select("count").filter(col("Asthma") == 1).first()[0]
# ratio = num_asthma/trainDF.count()

# trainDF = trainDF.withColumn("weights", F.when(trainDF.Asthma == 1, 1-ratio).otherwise(ratio))
# trainDF.select("Asthma", "weights").show()

In [25]:
categoricalCols = [field for (field, dataType) in trainDF.dtypes if dataType == "string"]
indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]

stringIndexer = StringIndexer(inputCols = categoricalCols, outputCols = indexOutputCols, handleInvalid = "skip")
oheEncoder = OneHotEncoder(inputCols = indexOutputCols, outputCols = oheOutputCols)

numericCols = [field for (field, dataType) in trainDF.dtypes
                if dataType != "string" and field != "Asthma"]

assemblerInputs = oheOutputCols + numericCols

vecAssembler = VectorAssembler(inputCols = assemblerInputs, outputCol = "features")

In [26]:
# Train a model one time, see a baseline using Generalized Linear Regression

from pyspark.ml.regression import GeneralizedLinearRegression

glm = GeneralizedLinearRegression(family = "binomial", labelCol = "Asthma")
glm_pipeline = Pipeline(stages = [stringIndexer, oheEncoder, vecAssembler, glm])

trainDF.show(truncate = False)
glm_model = glm_pipeline.fit(trainDF)
predDF = glm_model.transform(testDF)

#summary = glm_model.stages[-1].summary
# print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
# print("T Values: " + str(summary.tValues))
# print("P Values: " + str(summary.pValues))
# print("Dispersion: " + str(summary.dispersion))
# print("Null Deviance: " + str(summary.nullDeviance))
# print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
# print("Deviance: " + str(summary.deviance))
# print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
# print("AIC: " + str(summary.aic))
# print("Deviance Residuals: ")
# summary.residuals().show()
# print(summary)
predDF.select("features", "Asthma", "prediction", "Gender").filter(col("Asthma") == 1).show()

## Check if we have GenderOHE_FEMALE

+-----------+------+-----+------------+-----------+-------+------+-------------+------------------------------------+-----------------------+------------------------------------+-----------+------+-----------------+
|Body Weight|Gender|Age  |Diastolic BP|Systolic BP|BP     |Asthma|Triglycerides|Estimated Glomerular Filtration Rate|Low Density Lipoprotein|High Density Lipoprotein Cholesterol|Body Height|BMI   |Total Cholesterol|
+-----------+------+-----+------------+-----------+-------+------+-------------+------------------------------------+-----------------------+------------------------------------+-----------+------+-----------------+
|89.700     |MALE  |81.7 |78.833      |116.167    |91.333 |0     |132.788      |119                                 |78.340                 |67.913                              |175.900    |28.990|172.810          |
|88.725     |MALE  |56.9 |81.750      |119.375    |94.250 |0     |147.112      |78                                  |102.081            

In [27]:
# Use CrossValidator to select the best model from 2 possible regParams

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

paramGrid = ParamGridBuilder() \
    .addGrid(glm.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=glm_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(labelCol = "Asthma"),
                          numFolds=2
                         )  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(trainDF)
prediction = cvModel.transform(testDF)
selected = prediction.select("Asthma", "prediction").show()

+------+--------------------+
|Asthma|          prediction|
+------+--------------------+
|     0|6.485247738964128...|
|     0|3.613388537075863...|
|     0|             1.0E-16|
|     0|             1.0E-16|
|     0|             1.0E-16|
|     0|             1.0E-16|
|     0|3.154426051761708...|
|     0|3.801658789252083...|
|     0|             1.0E-16|
|     0|             1.0E-16|
|     0|             1.0E-16|
|     0|             1.0E-16|
|     0|             1.0E-16|
|     0|             1.0E-16|
|     0|             1.0E-16|
|     0|             1.0E-16|
|     0|             1.0E-16|
+------+--------------------+



In [28]:
# Print the summary (need help interpreting this)

trainingSummary = cvModel.bestModel.stages[-1].summary

print(trainingSummary)

Coefficients:
             Feature Estimate    Std Error T Value P Value
         (Intercept)   5.4238 3326550.6240  0.0000  1.0000
      GenderOHE_MALE  -0.0645   62155.5389  0.0000  1.0000
         Body Weight   0.9099   19774.1400  0.0000  1.0000
                 Age   0.0517    9897.9016  0.0000  1.0000
        Diastolic BP   0.5884   44531.4956  0.0000  1.0000
         Systolic BP  -0.4232   28224.0135  0.0000  1.0000
                  BP   0.2588   50941.6582  0.0000  1.0000
       Triglycerides   0.0121    6831.6685  0.0000  1.0000
Estimated Glomeru...   0.2562    3715.7622  0.0001  0.9999
Low Density Lipop...   0.0749   22698.8167  0.0000  1.0000
High Density Lipo...  -0.0294   33295.8135  0.0000  1.0000
         Body Height  -1.8624   20665.8911 -0.0001  0.9999
                 BMI   1.1022   53069.4640  0.0000  1.0000
   Total Cholesterol   0.5459   23485.7555  0.0000  1.0000

(Dispersion parameter for binomial family taken to be 1.0000)
   Null deviance: 10.6746 on 63 degree

In [29]:
# Try a "default" Logistic Regression
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol = "Asthma", featuresCol = "features", maxIter=10)
pipeline = Pipeline(stages = [stringIndexer, oheEncoder, vecAssembler, lr])

# Fit the model
lrModel = pipeline.fit(trainDF)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.stages[-1].coefficients))
print("Intercept: " + str(lrModel.stages[-1].intercept))

print(lrModel.stages[-1].summary)


predDF = lrModel.transform(testDF)

print(predDF.schema.names)

predDF.select("features", "Asthma", "prediction").filter("Asthma > 0").show()

Coefficients: [-2.5318347181054275,-0.04459336258051747,-0.031900724741654306,0.11905140147548993,0.05410605229101334,0.09241761273526654,0.015584702677107482,0.01629516802287645,0.02759625317541442,-0.14901724113786272,-0.22263782774010563,0.3958524860030187,0.00764123815383942]
Intercept: -4.378136616114738
<pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary object at 0x122cf3af0>
['Body Weight', 'Gender', 'Age', 'Diastolic BP', 'Systolic BP', 'BP', 'Asthma', 'Triglycerides', 'Estimated Glomerular Filtration Rate', 'Low Density Lipoprotein', 'High Density Lipoprotein Cholesterol', 'Body Height', 'BMI', 'Total Cholesterol', 'GenderIndex', 'GenderOHE', 'features', 'rawPrediction', 'probability', 'prediction']
+--------+------+----------+
|features|Asthma|prediction|
+--------+------+----------+
+--------+------+----------+



In [30]:
predDF.count()

17

In [31]:
trainDF.na.drop().count()

77

In [32]:
### DON'T REALLY KNOW WHAT THIS DOES - it seems like 'lr.setThreshold' is supposed to improve the prediction ability

# Extract the summary from the returned LogisticRegressionModel instance trained
# in the earlier example
trainingSummary = lrModel.stages[-1].summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').show()
print(bestThreshold)
lr.setThreshold(bestThreshold)

objectiveHistory:
0.06931537170094178
0.06814456134613474
0.059685310050892657
0.028902506543761274
0.02041662641407601
0.012745281897140283
0.007304360091786571
0.0035943083060730845
0.0018256087139155073
0.0009152221830390941
0.0004630651710574936
+--------------------+---+
|                 FPR|TPR|
+--------------------+---+
|                 0.0|0.0|
|                 0.0|1.0|
|0.013157894736842105|1.0|
| 0.02631578947368421|1.0|
|0.039473684210526314|1.0|
| 0.05263157894736842|1.0|
| 0.06578947368421052|1.0|
| 0.07894736842105263|1.0|
| 0.09210526315789473|1.0|
| 0.10526315789473684|1.0|
| 0.11842105263157894|1.0|
| 0.13157894736842105|1.0|
| 0.14473684210526316|1.0|
| 0.15789473684210525|1.0|
| 0.17105263157894737|1.0|
| 0.18421052631578946|1.0|
| 0.19736842105263158|1.0|
| 0.21052631578947367|1.0|
|  0.2236842105263158|1.0|
| 0.23684210526315788|1.0|
+--------------------+---+
only showing top 20 rows

areaUnderROC: 1.0
+--------------+
|max(F-Measure)|
+--------------+
|      

LogisticRegression_477a49cb3856

In [33]:
predDF = lrModel.transform(testDF)

print(predDF.schema.names)

predDF.select("Asthma", "prediction", "probability").filter("Asthma == 1").show()

['Body Weight', 'Gender', 'Age', 'Diastolic BP', 'Systolic BP', 'BP', 'Asthma', 'Triglycerides', 'Estimated Glomerular Filtration Rate', 'Low Density Lipoprotein', 'High Density Lipoprotein Cholesterol', 'Body Height', 'BMI', 'Total Cholesterol', 'GenderIndex', 'GenderOHE', 'features', 'rawPrediction', 'probability', 'prediction']
+------+----------+-----------+
|Asthma|prediction|probability|
+------+----------+-----------+
+------+----------+-----------+



In [34]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(labelCol = "Asthma"),
                          numFolds=2
                         )  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(trainDF)
prediction = cvModel.transform(testDF)
selected = prediction.select("Asthma", "prediction").where(col("Asthma") == 1).show()

+------+----------+
|Asthma|prediction|
+------+----------+
+------+----------+



In [35]:
roc = lrModel.stages[-1].summary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC for untuned model: ' + str(lrModel.stages[-1].summary.areaUnderROC))

roc = cvModel.bestModel.stages[-1].summary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC for BEST model: ' + str(cvModel.bestModel.stages[-1].summary.areaUnderROC))

NameError: name 'plt' is not defined

In [None]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol = "Asthma", featuresCol = "features", maxIter=10, elasticNetParam = 1)
# lr.setThreshold(bestThreshold)
pipeline = Pipeline(stages = [stringIndexer, oheEncoder, vecAssembler, lr])

# Fit the model
lrModel = pipeline.fit(trainDF)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.stages[-1].coefficients))
print("Intercept: " + str(lrModel.stages[-1].intercept))

print(str(lrModel.stages[-1].summary.accuracy))

predDF = lrModel.transform(testDF)

predDF.select("features", "Asthma", "prediction").filter((col("Asthma") ==1)).show()

In [None]:
### DON'T REALLY KNOW WHAT THIS DOES - it seems like 'lr.setThreshold' is supposed to improve the prediction ability

# Extract the summary from the returned LogisticRegressionModel instance trained
# in the earlier example
trainingSummary = cvModel.bestModel.stages[-1].summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
print(maxFMeasure)
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
print(bestThreshold)
#lr.setThreshold(bestThreshold)

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01, 0.05, .3]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(labelCol = "Asthma"),
                          numFolds=2
                         )  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(trainDF)
prediction = cvModel.transform(testDF)
selected = prediction.select("Asthma", "prediction").where((col("Asthma") == 1)).show(100)

In [None]:
roc = lrModel.stages[-1].summary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC for untuned model: ' + str(lrModel.stages[-1].summary.areaUnderROC))

roc = cvModel.bestModel.stages[-1].summary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC for BEST model: ' + str(cvModel.bestModel.stages[-1].summary.areaUnderROC))

In [None]:
print(str(lrModel.stages[-1].summary.truepositive))