In [1]:
# Import all necessary libraries and create spark session
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,upper,udf,element_at,explode,regexp_replace,size
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression


import os

spark = SparkSession \
    .builder \
    .appName("FHIR Analytics with Python") \
    .getOrCreate()
keyspace = "myCatalog.hfs_data"


In [2]:
from sparkdl import DeepImageFeaturizer
from pyspark.ml import image

img_dir = "/Users/Harrison/Downloads/cancer_imagery/"

benign_train = spark.read.format("image").load(img_dir + "train/benign").withColumn("label", F.lit(0))
malignant_train = spark.read.format("image").load(img_dir + "train/malignant").withColumn("label", F.lit(1))

train_df = benign_train.unionAll(malignant_train)

benign_test = spark.read.format("image").load(img_dir + "test/benign").withColumn("label", F.lit(0))
malignant_test = spark.read.format("image").load(img_dir + "test/malignant").withColumn("label", F.lit(1))

test_df = benign_test.unionAll(malignant_test)



In [3]:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
p = Pipeline(stages=[featurizer, lr])


In [4]:
p_model = p.fit(train_df)
predictions = p_model.transform(test_df)

predictions.select("image.origin", "prediction").show(truncate=False)

+-------------------------------------------------------------------------------------------+----------+
|origin                                                                                     |prediction|
+-------------------------------------------------------------------------------------------+----------+
|file:///Users/Harrison/Downloads/cancer_imagery/test/benign/SOB_B_F-14-29960AB-400-003.png |0.0       |
|file:///Users/Harrison/Downloads/cancer_imagery/test/benign/SOB_B_F-14-29960AB-400-004.png |1.0       |
|file:///Users/Harrison/Downloads/cancer_imagery/test/benign/SOB_B_F-14-23060CD-400-007.png |0.0       |
|file:///Users/Harrison/Downloads/cancer_imagery/test/benign/SOB_B_F-14-29960AB-400-011.png |1.0       |
|file:///Users/Harrison/Downloads/cancer_imagery/test/benign/SOB_B_TA-14-3411F-400-012.png  |1.0       |
|file:///Users/Harrison/Downloads/cancer_imagery/test/benign/SOB_B_TA-14-3411F-400-004.png  |0.0       |
|file:///Users/Harrison/Downloads/cancer_imagery/test/b

In [5]:
# p_model.stages[-1].summary.areaUnderROC

# from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# predictionAndLabels = predictions.select("prediction", "label")
# evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# print("Training set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))



IllegalArgumentException: 'Unsupported class file major version 55'

In [7]:
# pipelinePath = "/Users/Harrison/projects/helios/analytics-blog-article/cancer-model"
# p_model.stages[-1].write().overwrite().save(pipelinePath)

ValueError: ('Pipeline write will fail on this pipeline because stage %s of type %s is not MLWritable', 'DeepImageFeaturizer_fe53c03478c9', <class 'sparkdl.transformers.named_image.DeepImageFeaturizer'>)

In [6]:
# from pyspark.ml.classification import LogisticRegressionModel
# p_model = LogisticRegressionModel.load("/Users/Harrison/projects/helios/analytics-blog-article/cancer-model")

IllegalArgumentException: 'Unsupported class file major version 55'

In [14]:
import requests

# Search the Patient resource with family name Chalmers
r = requests.get("http://localhost:8181/fhir/Patient?family=chalmers")

# Store the Patient's FHIR id
patientId = r.json()['entry'][0]['resource']['id']

# Perform a search on the Encounter resource with the subject attribute equal to the patient's FHIR id
r = requests.get("http://localhost:8181/fhir/Encounter?date=ge2021-01-01&subject=" + patientId)

# Store the Encounter's FHIR id - here we use the 0'th Encounter in the search
encounterId = r.json()['entry'][0]['resource']['id']

# Perform search on Specimen that _has a DiagnosticReport with the stored encounterId
r = requests.get('http://localhost:8181/fhir/Specimen?_has:DiagnosticReport:specimen:encounter=' + encounterId)

# Store the Specimen's id
specimenId = r.json()['entry'][0]['resource']['id']

# Perform a search on Media for the matching specimen
r = requests.get('http://localhost:8181/fhir/Media?specimen=' + specimenId)

mediaUrls = map(lambda resource: resource['resource']['content']['url'], r.json()['entry'])

mediaUrls = list(mediaUrls)

test_df = spark.read.format("image").load(mediaUrls[0])

test_df = test_df.unionAll(spark.read.format("image").load(mediaUrls[1]))

test_df = test_df.unionAll(spark.read.format("image").load(mediaUrls[2]))

test_df = test_df.unionAll(spark.read.format("image").load(mediaUrls[3]))

test_df.show()

+--------------------+
|               image|
+--------------------+
|[file:///Users/Ha...|
|[file:///Users/Ha...|
|[file:///Users/Ha...|
|[file:///Users/ha...|
+--------------------+



In [15]:
from sparkdl import DeepImageFeaturizer

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
test_df = featurizer.transform(test_df)
predictions = p_model.transform(test_df)

predictions.cache()

predictions.select("image.origin", "prediction").show(truncate=False)

IllegalArgumentException: 'requirement failed: Input DataFrame cannot already contain a column with name features'

In [None]:
list_predictions = list(map(lambda row: row[0], predictions.select("prediction").limit(4).collect()))

prediction_avg = sum(list_predictions) / len(list_predictions)

print(prediction_avg)

In [None]:
r = requests.get('http://localhost:8181/fhir/DiagnosticReport?encounter=' + encounterId)
diagnosticReportId = r.json()['entry'][0]['resource']['id']

risk_assessment = {
    'resourceType': 'RiskAssessment',
    'status': 'preliminary',
    'subject': {
        'reference': 'Patient/' + patientId
    },
    'encounter': {
        'reference': 'Encounter/' + encounterId
    },
    'reasonReference': {
        'reference': 'DiagnosticReport/' + diagnosticReportId
    },
    'prediction': {
        'probabilityDecimal': prediction_avg
    }
}

requests.post('http://localhost:8181/fhir/RiskAssessment', headers = {'Content-Type': 'application/fhir+json'}, json = risk_assessment)

In [None]:
from pyspark.sql import SparkSession
import sparknlp

spark_nlp = sparknlp.start()

In [None]:
import requests

# TODO turn this into a _has query with patient
r = requests.get("http://localhost:8181/fhir/Patient?family=chalmers")

patientId = r.json()['entry'][0]['resource']['id']

r = requests.get("http://localhost:8181/fhir/Encounter?date=ge2021-01-01&subject=" + patientId)

encounterId = r.json()['entry'][0]['resource']['id']

r = requests.get('http://localhost:8181/fhir/DiagnosticReport?encounter=' + encounterId)

text = r.json()['entry'][0]['resource']['text']['div']
diagnosticReportId = r.json()['entry'][0]['resource']['id']

diagnosticReport = r.json()['entry'][0]['resource']

In [None]:
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords

preprocess_text = text.strip().replace("\n","").replace(",","").replace(".", ". ")

summ_per = summarize(preprocess_text, word_count = 50)

print("original text preprocessed: \n", preprocess_text)

print ("\n\nSummarized text: \n", summ_per)

In [None]:
diagnosticReport['conclusion'] = summ_per
requests.put("http://localhost:8181/fhir/DiagnosticReport/" + diagnosticReportId, headers = {'Content-Type': 'application/fhir+json'}, json = diagnosticReport)