In [2]:
import asyncio
import requests
import datetime
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
async def retrievePatientAgeAndGender(patientId):
    data = requests.get('https://fhir.monash.edu/hapi-fhir-jpaserver/fhir/Patient/' + patientId).json()
    
    # Get patient's age
    birthDate = data['birthDate']
    age = int(datetime.date.today().year) - int(datetime.datetime.strptime(birthDate, '%Y-%m-%d').year)
    
    # Get patient's gender
    gender = data['gender']
    
    return [age, gender]

In [None]:
async def retrievePatientData(patientId):
    # Weight, BMI, smoking, blood pressure
    categoryCodes = ['29463-7','39156-5','72166-2','55284-4']
    categoryValues = [None for _ in range(len(categoryCodes)+1)]
    
    data = requests.get('https://fhir.monash.edu/hapi-fhir-jpaserver/fhir/Observation?patient=' + patientId +
              '&code=29463-7,39156-5,72166-2,55284-4&_sort=-date').json()
    
    if data['total'] == 0:
        return None
    
    pageId = data['id']
    entries = data['entry']
    
    for i in range(len(entries)):
        code = entries[i]['resource']['code']['coding'][0]['code']
        index = categoryCodes.index(code)

        if categoryValues[index] is None:
            # Smoking 
            if index == 2:
                if entries[i]['resource']['valueCodeableConcept']['text'] == 'Never smoker':
                    categoryValues[index] = 0
                else:
                    categoryValues[index] = 1
            # Blood pressure
            elif index == 3:
                # Diastolic
                categoryValues[index] = entries[i]['resource']['component'][0]['valueQuantity']['value']
                # Systolic
                categoryValues[index+1] = entries[i]['resource']['component'][1]['valueQuantity']['value']
            # Body weight and BMI
            else:
                categoryValues[index] = entries[i]['resource']['valueQuantity']['value']
        
        # Check if fields have been completed
        if not None in categoryValues:
            return categoryValues
        
    if not None in categoryValues:
        return categoryValues 
    else:
        return None

In [4]:
def dataProcessing(dataTable):
    # Preprocessing
    # 2 categories, threshold, indices
    bins = (2, 200.0, 1000)
    categories = ['low', 'high']
    # Labels = targets to predict
    # Cuts values in cholesterol column and replace with categories according to bins
    dataTable['Cholesterol'] = pd.cut(dataTable['Cholesterol'], bins=bins, labels=categories)
    dataTable.loc[dataTable['Gender'] == 'male', 'Gender'] = 1
    dataTable.loc[dataTable['Gender'] == 'female', 'Gender'] = 0
    
    # Encode target labels into 0 and 1
    cholesterol_category = LabelEncoder()
    dataTable['Cholesterol'] = cholesterol_category.fit_transform(dataTable['Cholesterol'])
    
    # Separate into feature variables and response variable
    X = dataTable.drop('Cholesterol', axis=1)
    y = dataTable['Cholesterol']
    
    # Split data into training and testing sets
    # Seed with an integer to produce stable results 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Standard Scaling on feature variables for better results
    # Prevents features with higher variance to dominate prediction
    standard_scaler = StandardScaler()
    # Fit: compute mean and standard deviation
    # Transform: perform standardization by centering and scaling
    X_train = standard_scaler.fit_transform(X_train)
    X_test = standard_scaler.transform(X_test)
    
    RFC = RandomForestClassifier()
    RFC.fit(X_train,y_train)
    pred = RFC.predict(X_test)
    
    accuracy = RFC.score(X_test, y_test)
    
    print('Classification Report:')
    print(classification_report(y_test, pred))

    print('\nConfusion Matrix:')
    print(confusion_matrix(y_test, pred))

    print('\nFeature importances:')
    print(list(X.columns))
    print(RFC.feature_importances_)
    
    print('\nMean accuracy: ' + str(accuracy))
    
    return accuracy

In [None]:
patientCount = 0
patientIds = []
patientsData = []
pageId = ''
pageOffset = 50
dataTable = None
accuracy = 0.0

cholesterolObservations = requests.get('https://fhir.monash.edu/hapi-fhir-jpaserver/fhir/Observation?code=2093-3&_sort=-date&_count=50').json()
pageId = cholesterolObservations['id']
entries = cholesterolObservations['entry']

for i in range(len(entries)):
    cholesterolValue = entries[i]['resource']['valueQuantity']['value']

    # Only use data with cholesterol value higher than 180
    if cholesterolValue is None or float(cholesterolValue) < 180.0:
        continue

    # Get patient ID
    patientId = entries[i]['resource']['subject']['reference'][8:]

    # Check for existing patient ID
    if patientId in patientIds:
        continue

    patientIds.append(patientId)
    ageGender = await retrievePatientAgeAndGender(patientId)
    patientData = await retrievePatientData(patientId)

    if patientData is not None:
        patientData = ageGender + patientData + [float(cholesterolValue)]
        patientsData.append(patientData)
        patientCount += 1

while accuracy < 0.80:
    while patientCount < 1000:
        url = 'https://fhir.monash.edu/hapi-fhir-jpaserver/fhir?_getpages=' + pageId + \
        '&_getpagesoffset=' + str(pageOffset) + '&_count=50&_pretty=true&_bundletype=searchset'
        cholesterolObservations = requests.get(url).json()
        
        entries = cholesterolObservations['entry']
        for i in range(len(entries)):
            cholesterolValue = entries[i]['resource']['valueQuantity']['value']

            if cholesterolValue is None or float(cholesterolValue) < 180.0:
                continue

            # Get patient ID
            patientId = entries[i]['resource']['subject']['reference'][8:]

            # Check for existing patient ID
            if patientId in patientIds:
                continue

            patientIds.append(patientId)
            ageGender = await retrievePatientAgeAndGender(patientId)
            patientData = await retrievePatientData(patientId)

            if patientData is not None:
                patientData = ageGender + patientData + [float(cholesterolValue)]
                patientsData.append(patientData)
                patientCount += 1
        
        pageOffset += 50

    if dataTable is None:
        dataTable = pd.DataFrame(patientsData, columns=['Age','Gender','Weight','BMI','Smoking','Diastolic BP','Systolic BP','Cholesterol'])
        dataTable.to_csv('PatientsData_1000.txt', sep='\t', index=False)
    else:
        newRows = pd.DataFrame(patientsData, columns=['Age','Gender','Weight','BMI','Smoking','Diastolic BP','Systolic BP','Cholesterol'])
        dataTable = dataTable.append(newRows, ignore_index=True)
        dataTable.to_csv('PatientsData_2000.txt', sep='\t', index=False)
    
    patientsData = []

    accuracy = dataProcessing(dataTable)

In [8]:
# Reproducing results without server requests
dataTable = pd.read_csv('PatientsData_1000.txt', sep='\t')
accuracy = dataProcessing(dataTable)

Classification Report:
              precision    recall  f1-score   support

           0       0.12      0.04      0.06        49
           1       0.84      0.95      0.89       255

   micro avg       0.80      0.80      0.80       304
   macro avg       0.48      0.49      0.47       304
weighted avg       0.72      0.80      0.75       304


Confusion Matrix:
[[  2  47]
 [ 14 241]]

Feature importances:
['Age', 'Gender', 'Weight', 'BMI', 'Smoking', 'Diastolic BP', 'Systolic BP']
[0.19891076 0.01996407 0.2083376  0.2135792  0.04190462 0.13911488
 0.17818888]

Mean accuracy: 0.7993421052631579


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [9]:
dataTable = pd.read_csv('PatientsData_2000.txt', sep='\t')
accuracy = dataProcessing(dataTable)

Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.07      0.10        61
           1       0.90      0.98      0.94       540

   micro avg       0.89      0.89      0.89       601
   macro avg       0.58      0.52      0.52       601
weighted avg       0.84      0.89      0.85       601


Confusion Matrix:
[[  4  57]
 [ 12 528]]

Feature importances:
['Age', 'Gender', 'Weight', 'BMI', 'Smoking', 'Diastolic BP', 'Systolic BP']
[0.19889117 0.01977301 0.220294   0.21038248 0.02350766 0.14018938
 0.18696229]

Mean accuracy: 0.8851913477537438


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
