# Create a Database of Oncological Entities Based on Unstructured Notes
In this notebook, we create a database of entities from the terms that were extracted from the notes in the previous notebooks.
This database can be used for dashboarding using [Databricks SQL](https://databricks.com/product/databricks-sql).

## 0. Initial configurations

In [0]:
import os
import json
import string
import numpy as np
import pandas as pd


import sparknlp
import sparknlp_jsl
from sparknlp.base import *
from sparknlp.util import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.pretrained import ResourceDownloader

from pyspark.sql import functions as F
from pyspark.ml import Pipeline, PipelineModel

# Configure how pandas renders DataFrames in this notebook.
for option_name, option_value in (
    ('max_colwidth', 100),
    ('display.max_columns', 100),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

# Print the versions of the two Spark NLP libraries that were imported above.
print('sparknlp.version : ', sparknlp.version())
print('sparknlp_jsl.version : ', sparknlp_jsl.version())

# Bare expression so the cell displays `spark`.
# NOTE(review): `spark` is not defined in this notebook; presumably it is
# supplied by the notebook runtime.
spark

In [0]:
# Root folder that the cells below read from and write database tables under.
# Note the trailing slash: later cells concatenate sub-paths onto this value.
delta_path = '/FileStore/HLS/nlp/delta/jsl/'

## 1. Create Temporary Views

In [0]:
# Register every dataset needed by the SQL cells below as a session-scoped
# temporary view. Keys are the view names; values are paths relative to delta_path.
temp_view_sources = {
    'icd10Hcc': 'silver/icd10-hcc-df',
    'bestIcdMapped': 'gold/best-icd-mapped',
    'rxnormRes': 'gold/rxnorm-res-cleaned',
    'rxnormCodeGreedy': 'silver/rxnorm-code-greedy-res',
    'temporalRe': 'silver/temporal-re',
    'bodypartRelationships': 'silver/bodypart-relationships',
    'cpt': 'silver/cpt',
    'assertion': 'silver/assertion',
}

# delta_path is defined with a trailing '/', so joining it with '/silver/...'
# used to produce a double slash ('.../jsl//silver/...'). Strip the trailing
# separator once so the joined paths are well-formed whether or not
# delta_path ends with '/'.
delta_root = delta_path.rstrip('/')

for view_name, relative_path in temp_view_sources.items():
    spark.read.load(f"{delta_root}/{relative_path}").createOrReplaceTempView(view_name)

## 2. Create the Database

In [0]:
# Derive the database identifier and its storage location.
# The snake_case name is turned into CamelCase ('jsl_onc' -> 'JslOnc') for use
# as the database name, while the storage path keeps the snake_case form.
database_name = 'jsl_onc'
name_parts = database_name.split('_')
DatabaseName = ''.join(map(str.capitalize, name_parts))

# Relies on delta_path ending with '/' (no separator is inserted here).
database_path = f"{delta_path}tables/{database_name}"

print(f"{DatabaseName} database tables will be stored in {database_path}")

In [0]:
# Rebuild the database from scratch on every run:
#   1. drop it (CASCADE also drops any tables it contains),
#   2. recreate it at database_path,
#   3. make it the current database for the SQL cells that follow.
drop_statement = f"DROP DATABASE IF EXISTS {DatabaseName} CASCADE;"
create_statement = f"CREATE DATABASE IF NOT EXISTS {DatabaseName} LOCATION '{database_path}'"
use_statement = f"USE {DatabaseName};"

sql(drop_statement)
sql(create_statement)
sql(use_statement)

## 3. Create Tables

In [0]:
%sql
-- Build the Rxnorm_Res table from the rxnormRes temp view.
-- note_id is the MD5 hash of the note's path; drugs is exposed as drug.
CREATE OR REPLACE TABLE Rxnorm_Res AS (
  SELECT
    md5(path) AS note_id,
    path,
    confidence,
    drug_chunk,
    rxnorm_code,
    drugs AS drug
  FROM rxnormRes
);

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Build the CPT table from the cpt temp view, adding note_id = MD5 hash of path.
-- NOTE(review): the source temp view (cpt) and the target table (CPT) differ only
-- by case; confirm that later queries in this session resolve to the intended one.
CREATE OR REPLACE TABLE CPT AS (
  SELECT
    md5(path) AS note_id,
    path,
    confidence,
    chunks,
    entity,
    cpt_code,
    cpt
  FROM cpt
)

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Build the ASSERTION table from the assertion temp view, adding note_id = MD5 hash of path.
-- NOTE(review): the source temp view (assertion) and the target table (ASSERTION) differ
-- only by case; confirm that later queries in this session resolve to the intended one.
CREATE OR REPLACE TABLE ASSERTION AS (
  SELECT
    md5(path) AS note_id,
    path,
    chunk,
    entity,
    assertion
  FROM assertion
)

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Build the TEMPORAL_RE table: every column of the temporalRe temp view,
-- preceded by note_id = MD5 hash of path.
CREATE OR REPLACE TABLE TEMPORAL_RE AS (
  SELECT
    md5(path) AS note_id,
    *
  FROM temporalRe
)

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Build the BEST_ICD table as a straight copy of the bestIcdMapped temp view
-- (unlike the other tables, no note_id column is added here).
CREATE OR REPLACE TABLE BEST_ICD AS (
  SELECT *
  FROM bestIcdMapped
)

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Build the ICD10_HCC table from the icd10Hcc temp view, adding note_id = MD5 hash of path.
CREATE OR REPLACE TABLE ICD10_HCC AS (
  SELECT
    md5(path) AS note_id,
    path,
    confidence,
    final_chunk,
    entity,
    icd10_code,
    icd_codes_names,
    icd_code_billable
  FROM icd10Hcc
)

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Preview the contents of the ICD10_HCC table created above.
SELECT *
FROM ICD10_HCC

note_id,path,confidence,final_chunk,entity,icd10_code,icd_codes_names,icd_code_billable
006f6ca6de0688514d637c5384496b59,dbfs:/FileStore/HLS/nlp/data/mt_onc_50/mt_oncology_15.txt,0.9995,neuropathic pain,Symptom,M792,neuropathic pain,1
006f6ca6de0688514d637c5384496b59,dbfs:/FileStore/HLS/nlp/data/mt_onc_50/mt_oncology_15.txt,0.8508,numbness,Symptom,R202,numbness of skin,1
006f6ca6de0688514d637c5384496b59,dbfs:/FileStore/HLS/nlp/data/mt_onc_50/mt_oncology_15.txt,0.3001,fatigued,Symptom,R53,fatigue,0
006f6ca6de0688514d637c5384496b59,dbfs:/FileStore/HLS/nlp/data/mt_onc_50/mt_oncology_15.txt,0.753,tired,Symptom,R538,feeling tired,0
006f6ca6de0688514d637c5384496b59,dbfs:/FileStore/HLS/nlp/data/mt_onc_50/mt_oncology_15.txt,0.1663,falls asleep while talking but is easily arousable,Symptom,Z7282,difficulty sleeping (finding),0
006f6ca6de0688514d637c5384496b59,dbfs:/FileStore/HLS/nlp/data/mt_onc_50/mt_oncology_15.txt,0.2777,lack of sleep,Symptom,Z723,lack of exercise,1
006f6ca6de0688514d637c5384496b59,dbfs:/FileStore/HLS/nlp/data/mt_onc_50/mt_oncology_15.txt,0.3001,fatigued,Symptom,R53,fatigue,0
006f6ca6de0688514d637c5384496b59,dbfs:/FileStore/HLS/nlp/data/mt_onc_50/mt_oncology_15.txt,0.9993,distress,Symptom,R6889,distress,1
006f6ca6de0688514d637c5384496b59,dbfs:/FileStore/HLS/nlp/data/mt_onc_50/mt_oncology_15.txt,0.0996,anicteric,Symptom,H530,amblyopic,0
006f6ca6de0688514d637c5384496b59,dbfs:/FileStore/HLS/nlp/data/mt_onc_50/mt_oncology_15.txt,0.9951,erythema,Symptom,L539,erythema,1


## License
Copyright / License info of the notebook. Copyright [2021] the Notebook Authors.  The source in this notebook is provided subject to the [Apache 2.0 License](https://spdx.org/licenses/Apache-2.0.html).

## Disclaimers
Databricks Inc. (“Databricks”) does not dispense medical, diagnosis, or treatment advice. This Solution Accelerator (“tool”) is for informational purposes only and may not be used as a substitute for professional medical advice, treatment, or diagnosis. This tool may not be used within Databricks to process Protected Health Information (“PHI”) as defined in the Health Insurance Portability and Accountability Act of 1996, unless you have executed with Databricks a contract that allows for processing PHI, an accompanying Business Associate Agreement (BAA), and are running this notebook within a HIPAA Account.  Please note that if you run this notebook within Azure Databricks, your contract with Microsoft applies.