In [1]:
# These are some commonly used R Packages.  
# The arrow package makes loading data with spark faster. 
library(readr, warn.conflicts = FALSE)
library(arrow, warn.conflicts = FALSE)
library(magrittr, warn.conflicts = FALSE)
library(stringr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(rlang, warn.conflicts = FALSE)
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
library(tidyr, warn.conflicts = FALSE)
library(truveta.notebook.study)
library(sparklyr)
library(ggplot2)
library(reshape2)

In [2]:
# Open the study connection and resolve the latest snapshot of the population.
print("load snapshot")
con <- create_connection()
study <- get_study(con)
# print(study)
# The population below is looked up by title; this id is only stored.
population_id <- "ps-3ormi7swwukuhhu6kcqrqw4mue"
population <- get_population(con, study, title = "PancreaticMainPop")
snapshot <- get_latest_snapshot(con, population)
snapshot
# List the tables available in the snapshot.
tables <- get_tables(con, snapshot)
tables

###### Set the output path

In [3]:
# Resolve the study output directory.
# fs = TRUE is used when reading and writing files locally.

output_path_local <- get_output_path(con, study, fs = TRUE)
output_path_local

In [4]:
#' Render a plot inline in the notebook as a base64-encoded PNG.
#'
#' @param x Plot object handed to `ggplot2::ggsave(plot = )`.
#' @param dpi Resolution forwarded to `ggsave()`; defaults to "screen".
#' @param ... Further arguments forwarded to `ggsave()` (e.g. width, height).
#' @return Whatever `displayHTML()` returns for the generated <img> tag.
display_plot <- \(x, dpi = "screen", ...) {
  file <- tempfile()
  # Remove the temporary PNG even when saving or encoding fails.
  on.exit(unlink(file), add = TRUE)
  # dump as PNG
  ggplot2::ggsave(file, device = "png", plot = x, dpi = dpi, ...)
  # load as base64
  uri <- base64enc::dataURI(file = file, mime = "image/png")
  # display as HTML
  displayHTML(paste0('<img src="', uri, '">'))
}

#### Read the file

In [6]:
# Path of the saved case/control extract inside the local output directory.
t1 <- paste0(output_path_local, "/cases_control_allvar_new.csv.r")
# Load the file into an R data frame and report its row count.
cases_control_allvar_new <- read.csv(t1)
nrow(cases_control_allvar_new)

In [37]:
# Convert the R data frame `pandata_all1` to a Spark DataFrame and register it
# as the temp view "pandata" so SQL text can refer to it by that name.
# NOTE(review): as.DataFrame()/createOrReplaceTempView() look like SparkR
# functions, and SparkR is not among the libraries attached at the top of this
# file - confirm the runtime provides them.
pandata <- as.DataFrame(pandata_all1)
createOrReplaceTempView(pandata, "pandata")

#### Write the file 

In [28]:
# Destination file inside the local output directory.
file_to_write <- paste0(output_path_local, "/temp_data14.csv.r")

# Write `temp_data14` as CSV without the row-name column.
write.csv(temp_data14, file_to_write, row.names = FALSE)

#### Create Final filtered dataset from the MainDataset

In [7]:
# Keep subjects with at least 2 diagnosis encounters (diagEnc90) and at least
# 1 outpatient diagnosis encounter (diagEnc90OutTemp); rows where either count
# is NA are excluded.
ADTTE <- RawMainDataset[
  with(
    RawMainDataset,
    (!is.na(diagEnc90) & diagEnc90 >= 2) &
      (!is.na(diagEnc90OutTemp) & diagEnc90OutTemp >= 1)
  ),
]
nrow(ADTTE)

##### <mark>Create cases and control dataset</mark>

In [13]:
# After adding all variables, run later.
# Build the case and control cohorts from ADTTE and stack them.
# `Indicator` marks the cohort (1 = case, 0 = control). It is assigned with
# `=`: `mutate(Indicator == 1)` only evaluates a comparison and never sets
# the column.
cases <- ADTTE %>%
  mutate(Indicator = 1) %>%
  filter(LivDtFl == "Y")

# Select control: subjects with no liver metastases and death during the
# initial 1 year from the index date and minimum 2 diagnosis encounters after
# the index date.
control <- ADTTE %>%
  mutate(Indicator = 0) %>%
  filter(
    LivDtFl == "N" & FollowUpDiagEnc >= 2 & AVAL > 1 & !is.na(FollowUpDiagEnc)
  )

cases_control_allvar <- rbind(cases, control)
nrow(cases_control_allvar)


#### Other Metastases condition 

In [50]:
# ICD-10-CM code C78 together with its descendants, registered as the view
# "AllMetastasesCode" for use in SQL.
AllMetastasesCode <- codeset(
  con, snapshot, "ICD10CM", "selfAndDescendants",
  "C78"
)

# AllMetastasesSNOMEDCode = codeset(con, snapshot, "SNOMED CT", 'selfAndDescendants', "275266006")

# AllMetastasesCode = rbind(AllMetastasesICDCode,AllMetastasesSNOMEDCode)

create_view(AllMetastasesCode, "AllMetastasesCode")

In [51]:
# Subjects with metastases other than liver metastases.
# For every subject in `data`, two windows relative to PanDt are evaluated:
#   *Base : PanDt - 360 days through PanDt + 30 days
#   *30   : PanDt + 31 days through ADT
# A window's flag is 'Y' when the subject has a matching diagnosis in the
# window and the collated code string is not exactly 'C78.7'; otherwise 'N'.
# Window membership is tested at day granularity (CAST ... AS DATE on both
# sides) so a diagnosis carrying a time-of-day on the last day of a window is
# not dropped from both windows.
# NOTE(review): this presumes OnsetDateTime/RecordedDateTime are timestamps;
# if they are already dates the casts are no-ops.
sql2 <- "

WITH table1 AS   -- Rank diagnosis dates per PersonId & ConceptCode, earliest first
(
    SELECT
        c.PersonId,
        COALESCE(c.OnsetDateTime, c.RecordedDateTime) AS MetDt,
        amc.ConceptCode,
        ROW_NUMBER() OVER (
            PARTITION BY c.PersonId, amc.ConceptCode
            ORDER BY COALESCE(c.OnsetDateTime, c.RecordedDateTime) ASC
        ) AS rn
    FROM condition c
    INNER JOIN ConditionCodeConceptMap ccm
        ON c.CodeConceptMapId = ccm.Id
    INNER JOIN AllMetastasesCode amc
        ON ccm.CodeConceptId = amc.ConceptId
    WHERE ccm.SourceConceptId IN (2703595, 2703594)
),

table2 AS   -- Keep only the earliest occurrence of each ConceptCode per PersonId
(
    SELECT PersonId, MetDt, ConceptCode
    FROM table1
    WHERE rn = 1
),

table3 AS   -- Aggregate concept codes for the same date per PersonId
(
    SELECT PersonId, MetDt, CONCAT_WS(',', COLLECT_SET(ConceptCode)) AS Metastases_Diagnosed_code
    FROM table2
    GROUP BY PersonId, MetDt
),

table4 AS    -- Baseline window: PanDt - 360 days through PanDt + 30 days
(
    SELECT a.*
    FROM table3 a
    INNER JOIN data b
        ON a.PersonId = b.PersonId
    WHERE CAST(a.MetDt AS DATE)
          BETWEEN DATEADD(DAY, -360, CAST(b.PanDt AS DATE))
              AND DATEADD(DAY, 30, CAST(b.PanDt AS DATE))
),

table5 AS     -- Latest diagnosis date per PersonId in the baseline window
(
    SELECT PersonId, MAX(MetDt) AS OthMetDtBase
    FROM table4
    GROUP BY PersonId
),

table6 AS     -- Collate all concept codes per PersonId across the baseline window
(
    SELECT
        PersonId,
        CONCAT_WS(',', COLLECT_SET(Metastases_Diagnosed_code)) AS OthMetCodeBase
    FROM table4
    GROUP BY PersonId
),

OthMetFlBase AS
(
    SELECT
        t5.PersonId,
        t5.OthMetDtBase,
        t6.OthMetCodeBase,
        CASE
            WHEN t6.OthMetCodeBase != 'C78.7' THEN 'Y'
            ELSE 'N'
        END AS OthMetFlBase
    FROM table5 t5
    INNER JOIN table6 t6
        ON t5.PersonId = t6.PersonId
),

table4b AS    -- Post-baseline window: PanDt + 31 days through ADT
(
    SELECT a.*
    FROM table3 a
    INNER JOIN data b
        ON a.PersonId = b.PersonId
    WHERE CAST(a.MetDt AS DATE)
          BETWEEN DATEADD(DAY, 31, CAST(b.PanDt AS DATE))
              AND CAST(b.ADT AS DATE)
),

table5b AS     -- Latest diagnosis date per PersonId in the post-baseline window
(
    SELECT PersonId, MAX(MetDt) AS OthMetDt30
    FROM table4b
    GROUP BY PersonId
),

table6b AS     -- Collate all concept codes per PersonId across the post-baseline window
(
    SELECT
        PersonId,
        CONCAT_WS(',', COLLECT_SET(Metastases_Diagnosed_code)) AS OthMetCode30
    FROM table4b
    GROUP BY PersonId
),

OthMetFl30 AS
(
    SELECT
        t5.PersonId,
        t5.OthMetDt30,
        t6.OthMetCode30,
        CASE
            WHEN t6.OthMetCode30 != 'C78.7' THEN 'Y'
            ELSE 'N'
        END AS OthMetFl30
    FROM table5b t5
    INNER JOIN table6b t6
        ON t5.PersonId = t6.PersonId
),

FinalTbl AS   -- One row per subject in data; flags are NULL when no match
(
    SELECT p.PersonId, a.OthMetFl30, a.OthMetDt30, a.OthMetCode30, b.OthMetDtBase, b.OthMetCodeBase, b.OthMetFlBase
    FROM data p
    LEFT JOIN OthMetFl30 a
        ON p.PersonId = a.PersonId
    LEFT JOIN OthMetFlBase b
        ON p.PersonId = b.PersonId
)

SELECT PersonId, OthMetDt30, OthMetCode30, OthMetDtBase, OthMetCodeBase,
    COALESCE(OthMetFlBase, 'N') AS OthMetFLBase,
    COALESCE(OthMetFl30, 'N') AS OthMetFl30
FROM FinalTbl

"

pan_OthMet <- load_sql_table(con, snapshot, sql2, view_name = "pan_OthMet", output_mode = "sparklyr") %>%
  collect()

In [20]:
display_df(pan_OthMet,5)

In [13]:
# Tally how many rows of the pan_OthMet view carry 'Y' for each liver /
# other-metastasis flag, plus a few flag combinations.
# NOTE(review): the pan_OthMet query visible in this file does not output
# LivDtFl, LivDtFlBase or LivDtFl30 - confirm the view queried here includes
# those columns before relying on this cell.
sql <- 
"
  SELECT 
    COUNT(CASE WHEN LivDtFl = 'Y' THEN 1 END) AS Total_LivDtFl_Y,
    COUNT(CASE WHEN LivDtFlBase = 'Y' THEN 1 END) AS Total_LivDtFlBase_Y,
    COUNT(CASE WHEN LivDtFl30 = 'Y' THEN 1 END) AS Total_LivDtFl30_Y,
    COUNT(CASE WHEN OthMetFlBase = 'Y' THEN 1 END) AS Total_OthMetFlBase_Y,
    COUNT(CASE WHEN OthMetFl30 = 'Y' THEN 1 END) AS Total_OthMetFl30_Y,
    COUNT(CASE WHEN OthMetFlBase = 'Y' AND LivDtFl = 'N' THEN 1 END) AS Total_OthMetBaseY_LivMetN,
    COUNT(CASE WHEN OthMetFl30 = 'Y' AND LivDtFl = 'N' THEN 1 END) AS Total_OthMet30Y_LivMetN,
    COUNT(CASE WHEN (OthMetFl30 = 'Y' OR OthMetFlBase = 'Y') AND LivDtFl = 'N' THEN 1 END) AS Total_OthMet_NoLivM,
    COUNT(CASE WHEN LivDtFl = 'N' THEN 1 END) AS NoLiverMets
  FROM pan_OthMet
"
display_df(load_sql_table(con, snapshot, sql, view_name='count',output_mode = "sparklyr") )

In [None]:
# Subjects with metastases other than liver metastases - variant that reads
# subjects from the `pandata` view.
# Windows relative to PanDt:
#   Baseline : PanDt through PanDt + 30 days
#   PstBase  : PanDt - 360 days through PanDt
# NOTE(review): the "PstBase" columns cover the 360 days *before* PanDt; the
# names are kept as written - confirm the intended label.
# Repairs relative to the earlier draft: commas added between CTEs, CTE column
# aliases aligned with the names selected downstream, the stray second
# closing quote removed, and the trailing semicolon dropped.
sql2 <- "

WITH table1 AS   -- Rank diagnosis dates per PersonId & ConceptCode, earliest first
(
    SELECT
        c.PersonId,
        COALESCE(c.OnsetDateTime, c.RecordedDateTime) AS MetDt,
        amc.ConceptCode,
        ROW_NUMBER() OVER (
            PARTITION BY c.PersonId, amc.ConceptCode
            ORDER BY COALESCE(c.OnsetDateTime, c.RecordedDateTime) ASC
        ) AS rn
    FROM condition c
    INNER JOIN ConditionCodeConceptMap ccm
        ON c.CodeConceptMapId = ccm.Id
    INNER JOIN AllMetastasesCode amc
        ON ccm.CodeConceptId = amc.ConceptId
    WHERE ccm.SourceConceptId IN (2703595, 2703594)
),

table2 AS   -- Keep only the earliest occurrence of each ConceptCode per PersonId
(
    SELECT PersonId, MetDt, ConceptCode
    FROM table1
    WHERE rn = 1
),

table3 AS   -- Aggregate concept codes for the same date per PersonId
(
    SELECT PersonId, MetDt, CONCAT_WS(',', COLLECT_SET(ConceptCode)) AS Metastases_Diagnosed_code
    FROM table2
    GROUP BY PersonId, MetDt
),

table4 AS    -- Baseline window: PanDt through PanDt + 30 days
(
    SELECT a.*
    FROM table3 a
    INNER JOIN pandata b
        ON a.PersonId = b.PersonId
    WHERE (a.MetDt BETWEEN b.PanDt AND DATEADD(day, 30, b.PanDt))
),

table5 AS     -- Latest diagnosis date per PersonId in the baseline window
(
    SELECT PersonId, MAX(MetDt) AS OthMetDtBaseline
    FROM table4
    GROUP BY PersonId
),

table6 AS     -- Collate all concept codes per PersonId across the baseline window
(
    SELECT
        PersonId,
        CONCAT_WS(',', COLLECT_SET(Metastases_Diagnosed_code)) AS OthMetCodeBase
    FROM table4
    GROUP BY PersonId
),

OthMetFLBaseline AS
(
    SELECT
        t5.PersonId,
        t5.OthMetDtBaseline,
        t6.OthMetCodeBase,
        CASE
            WHEN t6.OthMetCodeBase = 'C78.7' THEN 'N'
            ELSE 'Y'
        END AS OthMetFLBaseline
    FROM table5 t5
    INNER JOIN table6 t6
        ON t5.PersonId = t6.PersonId
),

table4b AS    -- Window of the 360 days up to and including PanDt
(
    SELECT a.*
    FROM table3 a
    INNER JOIN pandata b
        ON a.PersonId = b.PersonId
    WHERE (a.MetDt BETWEEN DATEADD(day, -360, b.PanDt) AND b.PanDt)
),

table5b AS     -- Latest diagnosis date per PersonId in that window
(
    SELECT PersonId, MAX(MetDt) AS OthMetDtPstBase
    FROM table4b
    GROUP BY PersonId
),

table6b AS     -- Collate all concept codes per PersonId across that window
(
    SELECT
        PersonId,
        CONCAT_WS(',', COLLECT_SET(Metastases_Diagnosed_code)) AS OthMetCodePstBase
    FROM table4b
    GROUP BY PersonId
),

OthMetFLPstBase AS
(
    SELECT
        t5.PersonId,
        t5.OthMetDtPstBase,
        t6.OthMetCodePstBase,
        CASE
            WHEN t6.OthMetCodePstBase = 'C78.7' THEN 'N'
            ELSE 'Y'
        END AS OthMetFLPstBase
    FROM table5b t5
    INNER JOIN table6b t6
        ON t5.PersonId = t6.PersonId
)

SELECT
    COALESCE(a.PersonId, c.PersonId) AS PersonId,
    a.OthMetDtBaseline, a.OthMetFLBaseline, a.OthMetCodeBase,
    c.OthMetDtPstBase, c.OthMetFLPstBase, c.OthMetCodePstBase
FROM OthMetFLBaseline a
FULL OUTER JOIN OthMetFLPstBase c
    ON a.PersonId = c.PersonId
"

pan_OthMet <- load_sql_table(con, snapshot, sql2, view_name = "pan_OthMet", output_mode = "sparklyr")

#### Combine Other metastases flag

In [22]:
# Attach the columns of `pan_bmi` to `cases_control`, matching on PersonId.
# NOTE(review): the heading above this cell talks about the other-metastases
# flag, but the table joined here is `pan_bmi` - confirm which table is meant.
cases_control <- cases_control %>% left_join(pan_bmi, by="PersonId")

##### _**Procedure/Surgery**_

In [22]:
# CPT codes (with descendants) used as the surgery code set; registered as the
# view "CPTCodePancreatecromy" for use in SQL.
CPTCodePancreatecromy <- codeset(
  con, snapshot, "CPT", "selfAndDescendants",
  "48150", "48152", "48153", "48154", "48155",
  "48140", "48145", "48146", "48999"
)

create_view(CPTCodePancreatecromy, "CPTCodePancreatecromy")

# Other procedure : "47760", "43846", "43266","43274"

In [25]:
# Ignore this cell; run the one below instead.
# Legacy surgery flag: takes each person's first and last procedure date for
# the CPTCodePancreatecromy codes, left-joins them onto `data`, and derives
#   SuegeryFl - 'Y'/'N' depending on where the first surgery date falls
#               relative to PanDt, LivDt and ADT
#   SPOC      - 1..4 naming which of the four WHEN branches matched, else 0
# The column name "SuegeryFl" is spelled this way in the query; other SQL that
# filters on it has to use the same spelling.
sql3 <- "
  WITH table1 AS (
    SELECT 
        PersonId,
        min(StartDateTime) AS SurgeryStartDt, 
        max(StartDateTime) AS SurgeryEndDt
    FROM procedure
    WHERE CodeConceptMapId IN (
        SELECT Id 
        FROM ProcedureCodeConceptMap 
        WHERE CodeConceptId IN (SELECT ConceptId FROM CPTCodePancreatecromy)
    )
    AND StartDateTime IS NOT NULL
    GROUP BY PersonId
),

table2 AS (
  SELECT 
    p.Personid, CAST(s.SurgeryStartDt AS DATE) SurgeryStartDt, CAST(s.SurgeryEndDt AS DATE) SurgeryEndDt,
    CAST(p.PanDt AS DATE) AS PanDt, CAST(p.LivDt AS DATE) LivDt, 
    p.LivDtFl30, p.LivDtFlBase, p.ADT, p.CNSR
  FROM data p 
  LEFT JOIN table1 s
    ON p.PersonId = s.PersonId 
)

  SELECT *, 
    CASE 
        WHEN LivDtFlBase = 'Y' AND SurgeryStartDt >= PanDt THEN 'Y'
        WHEN LivDtFl30 = 'Y' AND SurgeryStartDt BETWEEN LivDt AND ADT THEN 'Y'
        WHEN LivDtFl30 = 'Y' AND SurgeryStartDt BETWEEN PanDt AND DATEADD(day, -1, LivDt) THEN 'Y'
        WHEN LivDtFlBase = 'N' AND LivDtFl30 = 'N' AND SurgeryStartDt BETWEEN PanDt AND ADT THEN 'Y'
        ELSE 'N'
    END AS SuegeryFl, 

    CASE 
        WHEN LivDtFlBase = 'Y' AND SurgeryStartDt >= PanDt THEN 1
        WHEN LivDtFl30 = 'Y' AND SurgeryStartDt BETWEEN LivDt AND ADT THEN 2
        WHEN LivDtFl30 = 'Y' AND SurgeryStartDt BETWEEN PanDt AND DATEADD(day, -1, LivDt) THEN 3
        WHEN LivDtFlBase = 'N' AND LivDtFl30 = 'N' AND SurgeryStartDt BETWEEN PanDt AND ADT THEN 4
        ELSE 0
    END AS SPOC  
  FROM table2

"

pan_surgery <- load_sql_table(con, snapshot, sql3, view_name='pan_surgery',output_mode = "sparklyr")

In [34]:
# Count rows of the pan_surgery view whose SuegeryFl is 'Y'.
# NOTE(review): SuegeryFl is only produced by the legacy surgery query; the
# later query that also writes the pan_surgery view outputs SurgeryFl1 /
# SurgeryFl2 instead - confirm which version of the view is live.
sql <- "
SELECT COUNT(*)
FROM pan_surgery
WHERE SuegeryFl = 'Y'
LIMIT 1
"
display_df(load_sql_table(con, snapshot, sql, view_name='count',output_mode = "sparklyr"))

In [23]:
# Surgery exposure flags, one row per subject in `data` (values 1/0):
#   SurgeryFl1 - risk-factor analysis flag: at least one CPTCodePancreatecromy
#                procedure dated from PanDt - 30 days through the day before
#                LivDt (or the day before ADT when LivDt is NULL)
#   SurgeryFl2 - presence flag: at least one such procedure dated on or after
#                PanDt - 30 days
# The procedure filter lives in its own CTE so the LEFT JOIN keeps every
# subject. Filtering the joined `procedure` columns in WHERE discarded
# subjects without a qualifying procedure instead of returning 0 for them.

sql4 <- "

WITH surg AS (   -- Qualifying surgery procedures, one row per procedure record
    SELECT
        p.PersonId,
        CAST(p.StartDateTime AS DATE) AS SurgeryDt
    FROM procedure p
    WHERE p.CodeConceptMapId IN (
        SELECT Id
        FROM ProcedureCodeConceptMap
        WHERE CodeConceptId IN (SELECT ConceptId FROM CPTCodePancreatecromy)
    )
    AND CAST(p.StartDateTime AS DATE) IS NOT NULL
),

table1 AS (      -- Per-procedure 1/0 indicators; subjects without surgery get 0
    SELECT
        d.PersonId,
        CASE
            WHEN CAST(d.LivDt AS DATE) IS NOT NULL
                 AND s.SurgeryDt BETWEEN DATEADD(DAY, -30, CAST(d.PanDt AS DATE))
                                     AND DATEADD(DAY, -1, CAST(d.LivDt AS DATE))
            THEN 1
            WHEN CAST(d.LivDt AS DATE) IS NULL
                 AND s.SurgeryDt BETWEEN DATEADD(DAY, -30, CAST(d.PanDt AS DATE))
                                     AND DATEADD(DAY, -1, CAST(d.ADT AS DATE))
            THEN 1
            ELSE 0
        END AS SurgeryFl1,
        CASE
            WHEN s.SurgeryDt >= DATEADD(DAY, -30, CAST(d.PanDt AS DATE)) THEN 1 ELSE 0
        END AS SurgeryFl2
    FROM data d
    LEFT JOIN surg s
        ON s.PersonId = d.PersonId
),

table2 AS (
    SELECT PersonId,
           SUM(SurgeryFl1) AS CountSurgFl1,
           SUM(SurgeryFl2) AS CountSurgFl2
    FROM table1
    GROUP BY PersonId
)

SELECT PersonId,
    CASE
       WHEN CountSurgFl1 >= 1 THEN 1 ELSE 0
    END AS SurgeryFl1,
    CASE
       WHEN CountSurgFl2 >= 1 THEN 1 ELSE 0
    END AS SurgeryFl2
FROM table2

"

pan_surgery <- load_sql_table(con, snapshot, sql4, view_name = "pan_surgery", output_mode = "sparklyr") %>%
  collect()

In [20]:
# SurgeryFl1 is coded 1/0 by the query that builds pan_surgery, so count the
# 1s; comparing against "Y" always yielded 0.
sum(pan_surgery$SurgeryFl1 == 1, na.rm = TRUE)

In [9]:
display_df(pan_surgery,10)

##### Frequency of surgery at different stages

In [71]:
# Create surgery groups from each subject's earliest qualifying surgery.
# SurgeryFl values:
#   1 - surgery dated PanDt - 360 days through PanDt - 1 day
#   2 - surgery dated PanDt through PanDt + 90 days
#   3 - surgery dated PanDt + 91 days through ADT
#   N - earliest surgery falls in none of the windows
# Only subjects with at least one qualifying procedure appear in the result
# (the WHERE clause filters on the joined procedure columns).
# The surgery date is compared at day granularity; comparing the raw
# StartDateTime against DATE bounds left records with a time-of-day on a
# window's last day unclassified.
sql3 <- "

WITH table1 AS (
SELECT
    p.PersonId,
    CAST(p.StartDateTime AS DATE) AS SurgeryDt,  -- Converts StartDateTime to Date
    a.LivDtFl,
    CAST(a.PanDt AS DATE) AS PanDt,  CAST(a.LivDt AS DATE) AS LivDt,
    CASE
        WHEN CAST(p.StartDateTime AS DATE) BETWEEN DATEADD(day, -360, CAST(a.PanDt AS DATE))
                               AND DATEADD(day, -1, CAST(a.PanDt AS DATE)) THEN 1
        WHEN CAST(p.StartDateTime AS DATE) BETWEEN CAST(a.PanDt AS DATE)
                               AND DATEADD(day, 90, CAST(a.PanDt AS DATE)) THEN 2
        WHEN CAST(p.StartDateTime AS DATE) BETWEEN DATEADD(day, 91, CAST(a.PanDt AS DATE))
                               AND CAST(a.ADT AS DATE) THEN 3
    END AS SurgeryFl,
    ROW_NUMBER() OVER (
        PARTITION BY p.PersonId
        ORDER BY p.StartDateTime
    ) AS rn
FROM data a
LEFT JOIN procedure p
    ON a.PersonId = p.PersonId
WHERE p.CodeConceptMapId IN (
    SELECT Id
    FROM ProcedureCodeConceptMap
    WHERE CodeConceptId IN (SELECT ConceptId FROM CPTCodePancreatecromy)
)
AND p.StartDateTime IS NOT NULL
)

SELECT PersonId,SurgeryDt,PanDt,LivDtFl, LivDt,
COALESCE(SurgeryFl, 'N') AS SurgeryFl
FROM table1
WHERE rn = 1 -- Selects only the earliest surgery record per PersonId

"
pan_proced <- load_sql_table(con, snapshot, sql3, view_name = "pan_proced", output_mode = "sparklyr")

In [68]:
display_df(pan_proced,15)

In [30]:

# Cross-tabulate the liver-metastasis flags with the radiation flags.
# LivDtFlBase is a "Y"/"N" string flag; RadiationFl1/RadiationFl2 are coded
# 1/0 by the query that builds pan_radiation, so they are compared with 1
# (comparing them with "Y" always counted 0).
tempCount <- cases_control %>%
  select(PersonId, LivDtFlBase, LivDtFl30) %>%
  left_join(pan_radiation, by = "PersonId")

sum(tempCount$LivDtFlBase == "Y" & tempCount$RadiationFl1 == 1, na.rm = TRUE)
sum(tempCount$RadiationFl2 == 1, na.rm = TRUE)

##### **Radiation Therapy**

In [24]:
# CPT codes (with descendants) used as the radiation-therapy code set;
# registered as the view "CPTCodeRadiology" for use in SQL.
CPTCodeRadiology <- codeset(
  con, snapshot, "CPT", "selfAndDescendants",
  "77261", "77262", "77263", "77280",
  "77285", "77290", "77295", "77300",
  "77373", "77385", "77427", "77431",
  "77435", "77469", "77470", "77499"
)

create_view(CPTCodeRadiology, "CPTCodeRadiology")

In [24]:
# Skip this cell and run the next one.
# Legacy radiation flag: takes each person's first and last procedure date for
# the CPTCodeRadiology codes, left-joins them onto `data`, and derives
#   RadiationFl - 'Y'/'N' depending on where the first radiation date falls
#                 relative to PanDt, LivDt and ADT
#   RPOC        - numeric code for the WHEN branch that matched, else 0
# NOTE(review): the 3rd and 4th RPOC branches both return 3, whereas the
# parallel surgery query numbers its branches 1-4 - confirm whether the last
# branch should be 4.
sql4 <- "

  WITH table1 AS (
    SELECT 
        PersonId,
        min(CAST(StartDateTime AS DATE)) AS RadiationStDt, 
        max(CAST(StartDateTime AS DATE)) AS RadiationEndDt
    FROM procedure
    WHERE CodeConceptMapId IN (
        SELECT Id 
        FROM ProcedureCodeConceptMap 
        WHERE CodeConceptId IN (SELECT ConceptId FROM CPTCodeRadiology)
    )
    AND StartDateTime IS NOT NULL
    GROUP BY PersonId
),

table2 AS (
  SELECT 
    p.Personid, CAST(s.RadiationStDt AS DATE) RadiationStDt, CAST(s.RadiationEndDt AS DATE) RadiationEndDt,
    CAST(p.PanDt AS DATE) AS PanDt, CAST(p.LivDt AS DATE) LivDt, 
    p.LivDtFl30, p.LivDtFlBase, p.ADT, p.CNSR
  FROM data p 
  LEFT JOIN table1 s
    ON p.PersonId = s.PersonId 
)

  SELECT *, 
    CASE 
        WHEN LivDtFlBase = 'Y' AND RadiationStDt >= PanDt THEN 'Y'
        WHEN LivDtFl30 = 'Y' AND RadiationStDt >= LivDt THEN 'Y'
        WHEN LivDtFl30 = 'Y' AND RadiationStDt BETWEEN PanDt AND DATEADD(day, -1, LivDt) THEN 'Y'
        WHEN LivDtFlBase = 'N' AND LivDtFl30 = 'N' AND RadiationStDt BETWEEN PanDt AND ADT THEN 'Y'
        ELSE 'N'
    END AS RadiationFl, 

    CASE 
        WHEN LivDtFlBase = 'Y' AND RadiationStDt >= PanDt THEN 1
        WHEN LivDtFl30 = 'Y' AND RadiationStDt >= LivDt THEN 2
        WHEN LivDtFl30 = 'Y' AND RadiationStDt BETWEEN PanDt AND DATEADD(day, -1, LivDt) THEN 3
        WHEN LivDtFlBase = 'N' AND LivDtFl30 = 'N' AND RadiationStDt BETWEEN PanDt AND ADT THEN 3
        ELSE 0
    END AS RPOC  

  FROM table2
"

pan_rad <- load_sql_table(con, snapshot, sql4, view_name='pan_rad',output_mode = "sparklyr")

In [33]:
# Count rows of the pan_rad view whose RadiationFl is 'Y'.
# The query returns a single aggregate row, so the LIMIT has no effect.
sql <- "
SELECT COUNT(*)
FROM pan_rad
WHERE RadiationFl = 'Y'
LIMIT 10
"
display_df(load_sql_table(con, snapshot, sql, view_name='count',output_mode = "sparklyr"))

In [25]:
# Radiation-therapy exposure flags, one row per subject in `data` (values 1/0):
#   RadiationFl1 - risk-factor analysis flag: at least one CPTCodeRadiology
#                  procedure dated from PanDt - 30 days through the day before
#                  LivDt (or the day before ADT when LivDt is NULL)
#   RadiationFl2 - presence flag: at least one such procedure dated on or
#                  after PanDt - 30 days
# The procedure filter lives in its own CTE so the LEFT JOIN keeps every
# subject. Filtering the joined `procedure` columns in WHERE discarded
# subjects without a qualifying procedure instead of returning 0 for them.

sql5 <- "

WITH rad AS (    -- Qualifying radiation procedures, one row per procedure record
    SELECT
        p.PersonId,
        CAST(p.StartDateTime AS DATE) AS RadDt
    FROM procedure p
    WHERE p.CodeConceptMapId IN (
        SELECT Id
        FROM ProcedureCodeConceptMap
        WHERE CodeConceptId IN (SELECT ConceptId FROM CPTCodeRadiology)
    )
    AND CAST(p.StartDateTime AS DATE) IS NOT NULL
),

table1 AS (      -- Per-procedure 1/0 indicators; subjects without radiation get 0
    SELECT
        d.PersonId,
        CASE
            WHEN CAST(d.LivDt AS DATE) IS NOT NULL
                 AND r.RadDt BETWEEN DATEADD(DAY, -30, CAST(d.PanDt AS DATE))
                                 AND DATEADD(DAY, -1, CAST(d.LivDt AS DATE))
            THEN 1
            WHEN CAST(d.LivDt AS DATE) IS NULL
                 AND r.RadDt BETWEEN DATEADD(DAY, -30, CAST(d.PanDt AS DATE))
                                 AND DATEADD(DAY, -1, CAST(d.ADT AS DATE))
            THEN 1
            ELSE 0
        END AS RadFl1,
        CASE
            WHEN r.RadDt >= DATEADD(DAY, -30, CAST(d.PanDt AS DATE)) THEN 1 ELSE 0
        END AS RadFl2
    FROM data d
    LEFT JOIN rad r
        ON r.PersonId = d.PersonId
),

table2 AS (
    SELECT PersonId, SUM(RadFl1) AS CountRadFl1, SUM(RadFl2) AS CountRadFl2
    FROM table1
    GROUP BY PersonId
)

SELECT PersonId,
    CASE
       WHEN CountRadFl1 >= 1 THEN 1 ELSE 0
    END AS RadiationFl1,
    CASE
       WHEN CountRadFl2 >= 1 THEN 1 ELSE 0
    END AS RadiationFl2
FROM table2
"

pan_radiation <- load_sql_table(con, snapshot, sql5, view_name = "pan_radiation", output_mode = "sparklyr") %>%
  collect()

In [19]:
# RadiationFl1 is coded 1/0 by the query that builds pan_radiation, so count
# the 1s; comparing against "Y" always yielded 0.
sum(pan_radiation$RadiationFl1 == 1, na.rm = TRUE)

In [15]:
# Show the first 10 rows of pan_rad.
# NOTE(review): pan_rad comes from the cell marked "skip and run next"; the
# table built by the current radiation query is pan_radiation - confirm which
# one is meant here.
display_df(pan_rad,10)

##### **Medications**

In [26]:
#' Flag chemotherapy exposure per subject for one medication code set.
#'
#' Collects medication dates (from 2018-01-01 onward) for the given codes from
#' MedicationAdministration, MedicationRequest, MedicationDispense and claims,
#' de-duplicates them to distinct (PersonId, date) pairs, and joins them onto
#' the `data` view. Two 1/0 flags are returned per subject:
#'   <med_name>Fl1 - risk-factor analysis flag: at least 2 distinct medication
#'                   dates from PanDt - 30 days through the day before LivDt
#'                   (or the day before ADT when LivDt is NULL)
#'   <med_name>Fl2 - presence flag: at least 2 distinct medication dates on or
#'                   after PanDt - 30 days
#'
#' Uses `con` and `snapshot` from the calling environment and (re)registers
#' the view "concept_code" on every call.
#'
#' NOTE(review): the `claim` CTE joins Claim to ClaimLine on PersonId and
#' ClaimLine.ClaimId to ClaimLineCodes.ClaimLineId - verify these keys against
#' the schema; the join is left exactly as written.
#'
#' @param codes Code set (as returned by `codeset()`) to register as the
#'   "concept_code" view.
#' @param med_name Prefix for the output flag columns. It is spliced into the
#'   SQL text, so it must be a single plain identifier (letters, digits,
#'   underscore; not starting with a digit).
#' @return Collected table with PersonId, <med_name>Fl1 and <med_name>Fl2.
chemo_flag <- function(codes = "codes", med_name = "name") {
  # Validate before touching the connection: med_name becomes part of the SQL.
  if (!is.character(med_name) || length(med_name) != 1L || is.na(med_name) ||
      !grepl("^[A-Za-z_][A-Za-z0-9_]*$", med_name)) {
    stop("`med_name` must be a single valid SQL identifier", call. = FALSE)
  }

  create_view(codes, "concept_code")

  sql <- "

WITH adm AS
(
    SELECT m.PersonId, CAST(m.StartDateTime AS DATE) AS StartDateTime
    FROM MedicationAdministration m
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap
          WHERE CodeConceptId IN (SELECT ConceptId FROM concept_code)
    )
    AND m.StartDateTime IS NOT NULL
    AND m.StartDateTime >= '2018-01-01'
),

req AS
(
    SELECT m.PersonId, CAST(m.StartDateTime AS DATE) AS StartDateTime
    FROM MedicationRequest m
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap
          WHERE CodeConceptId IN (SELECT ConceptId FROM concept_code)
    )
    AND m.StartDateTime IS NOT NULL
    AND m.StartDateTime >= '2018-01-01'
),

disp AS
(
    SELECT m.PersonId, CAST(m.DispenseDateTime AS DATE) AS StartDateTime
    FROM MedicationDispense m
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap
          WHERE CodeConceptId IN (SELECT ConceptId FROM concept_code)
    )
    AND m.DispenseDateTime IS NOT NULL
    AND m.DispenseDateTime >= '2018-01-01'
),

-- add claim line
claim AS
(
  SELECT c.PersonId, CAST(c.ServiceBeginDate AS DATE) AS StartDateTime
   FROM Claim c
  INNER JOIN ClaimLine cl ON c.PersonId = cl.PersonId  -- Ensuring PersonId matches
  INNER JOIN ClaimLineCodes clc ON cl.ClaimId = clc.ClaimLineId  -- Ensure ClaimId is used for joining
  INNER JOIN concept_code cc ON clc.CodeConceptId = cc.ConceptId  -- Ensuring CodeConceptId matches ConceptId
  WHERE c.ServiceBeginDate IS NOT NULL
   AND c.ServiceBeginDate >= '2018-01-01'
),

all_tbl AS
(
    SELECT * FROM adm
    UNION ALL
    SELECT * FROM req
    UNION ALL
    SELECT * FROM disp
    UNION ALL
    SELECT * FROM claim
),

filter_tb1 AS   -- distinct medication dates per person
(
    SELECT PersonId, StartDateTime
    FROM all_tbl
    GROUP BY PersonId, StartDateTime
),

filter_tb2 AS   -- one row per subject and medication date, with 1/0 indicators
(
  SELECT d.PersonId, CAST(d.PanDt AS DATE) AS PanDt, CAST(d.LivDt AS DATE) LivDt, d.ADT,p.StartDateTime,
  CASE
   WHEN CAST(d.LivDt AS DATE) IS NOT NULL AND CAST(p.StartDateTime AS DATE) BETWEEN DATEADD(DAY, -30, CAST(d.PanDt AS DATE)) AND DATEADD(Day,-1,CAST(d.LivDt AS DATE))
    THEN 1
   WHEN CAST(d.LivDt AS DATE) IS NULL AND CAST(p.StartDateTime AS DATE) BETWEEN DATEADD(DAY, -30, CAST(d.PanDt AS DATE)) AND DATEADD(Day,-1,CAST(d.ADT AS DATE))
    THEN 1
   ELSE 0
    END AS Fl1,
  CASE
   WHEN CAST(p.StartDateTime AS DATE) >=  DATEADD(DAY, -30, CAST(d.PanDt AS DATE)) THEN 1 ELSE 0 END AS Fl2
  FROM data d
    LEFT JOIN filter_tb1 p
      ON d.PersonId = p.PersonId
),

filter_tb3 AS (
  SELECT PersonId,SUM(Fl1) AS CountFl1, SUM(Fl2) AS CountFl2
  FROM filter_tb2
  GROUP BY PersonId
)

  SELECT PersonId,
    CASE
       WHEN  CountFl1 >= 2 THEN 1 ELSE 0
    END AS %sFl1,
        CASE
       WHEN  CountFl2 >= 2 THEN 1 ELSE 0
    END AS %sFl2
  FROM filter_tb3
"
  # The two %s placeholders are the prefixes of the output flag columns.
  sql1 <- sprintf(sql, med_name, med_name)

  load_sql_table(con, snapshot, query = sql1, output_mode = "sparklyr") %>%
    collect()
}

#### Skip this part 


In [27]:
# RxNorm code sets (each code with its descendants), one per chemotherapy
# medication.
Fluorouracil <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "4492")
Oxaliplatin <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "32592")
Leucovorin <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "6313")
Irinotecan <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "51499")
Gemcitabine <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "12574")
Cisplatin <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "2555")
Capecitabine <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "194000")

In [29]:
# RxNorm code sets (each code with its descendants), one per chemotherapy
# medication; each call also passes a view_name matching the variable name.
Fluorouracil <- codeset(
  con, snapshot, "RxNorm", "selfAndDescendants", "4492",
  view_name = "Fluorouracil"
)
Oxaliplatin <- codeset(
  con, snapshot, "RxNorm", "selfAndDescendants", "32592",
  view_name = "Oxaliplatin"
)
Leucovorin <- codeset(
  con, snapshot, "RxNorm", "selfAndDescendants", "6313",
  view_name = "Leucovorin"
)
Irinotecan <- codeset(
  con, snapshot, "RxNorm", "selfAndDescendants", "51499",
  view_name = "Irinotecan"
)
Gemcitabine <- codeset(
  con, snapshot, "RxNorm", "selfAndDescendants", "12574",
  view_name = "Gemcitabine"
)
Cisplatin <- codeset(
  con, snapshot, "RxNorm", "selfAndDescendants", "2555",
  view_name = "Cisplatin"
)
Capecitabine <- codeset(
  con, snapshot, "RxNorm", "selfAndDescendants", "194000",
  view_name = "Capecitabine"
)


##### Call the function to create chemo flag for each chemotherapy medication

In [28]:
# Build one flag table per chemotherapy medication with chemo_flag().
# med_name is the prefix of the flag columns in each result, so every table
# contributes its own <name>Fl1 / <name>Fl2 pair. A progress line is printed
# after each call.
Fluorouracil_tb <- chemo_flag(codes = Fluorouracil,med_name = "Fluorouracil")
print("Finished flagging: Fluorouracil_tb")
Oxaliplatin_tb <- chemo_flag(codes = Oxaliplatin,med_name = "Oxaliplatin")
print("Finished flagging: Oxaliplatin_tb")
Leucovorin_tb <- chemo_flag(codes = Leucovorin,med_name = "Leucovorin")
print("Finished flagging: Leucovorin_tb")
Irinotecan_tb <- chemo_flag(codes = Irinotecan,med_name = "Irinotecan")
print("Finished flagging: Irinotecan_tb")
Gemcitabine_tb <- chemo_flag(codes = Gemcitabine,med_name = "Gemcitabine")
print("Finished flagging: Gemcitabine_tb")
Cisplatin_tb <- chemo_flag(codes = Cisplatin,med_name = "Cisplatin")
print("Finished flagging: Cisplatin_tb")
Capecitabine_tb <- chemo_flag(codes = Capecitabine,med_name = "Capecitabine")
print("Finished flagging: Capecitabine_tb")


In [37]:
# Combine all the chemo medication tables, along with radiation therapy and
# surgery, into one exposure table.
# Every table is left-joined on PersonId so all PersonId entries of the base
# dataset (cases_control) are retained; subjects missing from a joined table
# get NA in that table's columns.
# NOTE(review): `indicator` is selected in lower case here, while the
# case/control cell in this file spells the column `Indicator` - confirm that
# cases_control carries the lower-case name.

exposure <- cases_control %>% select(PersonId, STARTDT, ADT, CNSR, indicator) %>%
    left_join(Fluorouracil_tb, by = "PersonId") %>%
    left_join(Oxaliplatin_tb, by = "PersonId") %>%
    left_join(Leucovorin_tb, by = "PersonId") %>%
    left_join(Irinotecan_tb, by = "PersonId") %>%
    left_join(Gemcitabine_tb, by = "PersonId") %>%
    left_join(Cisplatin_tb, by = "PersonId") %>%
    left_join(Capecitabine_tb, by = "PersonId") %>%
    left_join(pan_radiation, by = "PersonId") %>%
    left_join(pan_surgery, by = "PersonId")

In [96]:
# Display the final merged dataset
display_df(exposure,5)

In [94]:
# Derive regimen-level flags (1/0) from the per-medication flags.
# Columns without a suffix are built from the *Fl1 flags, columns ending in
# "2" from the *Fl2 flags:
#   FOLFIRINOX   - Fluorouracil plus at least one of Oxaliplatin, Leucovorin,
#                  Irinotecan
#   Gemc_Cis_Cap - Gemcitabine plus at least one of Cisplatin, Capecitabine
#   Gemc_Mono    - Gemcitabine with neither Cisplatin nor Capecitabine
#   Chemo        - any of the three regimen flags above
exposure2 <- exposure %>%
  mutate(
    FOLFIRINOX = ifelse(FluorouracilFl1 == 1 & (OxaliplatinFl1 == 1 | LeucovorinFl1 == 1 | IrinotecanFl1 == 1), 1, 0),
    FOLFIRINOX2 = ifelse(FluorouracilFl2 == 1 & (OxaliplatinFl2 == 1 | LeucovorinFl2 == 1 | IrinotecanFl2 == 1), 1, 0),
    Gemc_Cis_Cap = ifelse(GemcitabineFl1 == 1 & (CisplatinFl1 == 1 | CapecitabineFl1 == 1), 1, 0),
    Gemc_Cis_Cap2 = ifelse(GemcitabineFl2 == 1 & (CisplatinFl2 == 1 | CapecitabineFl2 == 1), 1, 0),
    Gemc_Mono = ifelse(GemcitabineFl1 == 1 & CisplatinFl1 == 0 & CapecitabineFl1 == 0, 1, 0),
    Gemc_Mono2 = ifelse(GemcitabineFl2 == 1 & CisplatinFl2 == 0 & CapecitabineFl2 == 0, 1, 0),
    Chemo = ifelse(FOLFIRINOX == 1 | Gemc_Cis_Cap == 1 | Gemc_Mono == 1, 1, 0),
    Chemo2 = ifelse(FOLFIRINOX2 == 1 | Gemc_Cis_Cap2 == 1 | Gemc_Mono2 == 1, 1, 0)
  )


In [95]:
# Normalise flag columns to integers:
#   1. columns whose values are all "N" or "Y" become 1L for "Y" and 0L
#      otherwise;
#   2. every numeric column is then coerced with as.integer().
exposure <- exposure2 %>%
  mutate(across(where(~ all(. %in% c("N", "Y"))), ~ as.integer(. == "Y"))) %>%
  mutate(across(where(is.numeric), as.integer))

#### Count

In [97]:
# Print how many subjects carry each exposure flag (value 1); NAs are ignored.
cat("FluorouracilFl1:", sum(exposure$FluorouracilFl1 == 1, na.rm = TRUE), "\n")
cat("OxaliplatinFl1:", sum(exposure$OxaliplatinFl1 == 1, na.rm = TRUE), "\n")
cat("LeucovorinFl1:", sum(exposure$LeucovorinFl1 == 1, na.rm = TRUE), "\n")
cat("IrinotecanFl1:", sum(exposure$IrinotecanFl1 == 1, na.rm = TRUE), "\n")
cat("GemcitabineFl1:", sum(exposure$GemcitabineFl1 == 1, na.rm = TRUE), "\n")
cat("CisplatinFl1:", sum(exposure$CisplatinFl1 == 1, na.rm = TRUE), "\n")
cat("CapecitabineFl1:", sum(exposure$CapecitabineFl1 == 1, na.rm = TRUE), "\n")
cat("SurgeryFl1:", sum(exposure$SurgeryFl1 == 1, na.rm = TRUE), "\n")
cat("RadiationFl1:", sum(exposure$RadiationFl1 == 1, na.rm = TRUE), "\n")
cat("FOLFIRINOX:", sum(exposure$FOLFIRINOX == 1, na.rm = TRUE), "\n")
cat("Gemc_Cis_Cap:", sum(exposure$Gemc_Cis_Cap == 1, na.rm = TRUE), "\n")
cat("Gemc_Mono:", sum(exposure$Gemc_Mono == 1, na.rm = TRUE), "\n")
# The combined flag is named `Chemo`; `exposure$Chemo1` does not exist, so the
# previous lookup returned NULL and the count was always 0.
cat("Any group (folfi../gemC../gemMoni):", sum(exposure$Chemo == 1, na.rm = TRUE), "\n")

### Run from here for chemotherapy

In [43]:
# Advanced chemotherapy code set: the RxNorm codes listed below, each with its
# descendants.
AdvChemoMed <- codeset(
  con, snapshot, "RXNORM", "selfAndDescendants",
  "1001405", "1001433", "1045456", "105585", "105586", "1093280", "1191138", "1193331", "1193339", "1193343",
  "1193347", "1193351", "1437968", "1437969", "1441402", "1441411", "1441416", "1441422", "1536484", "1543547",
  "1544378", "1544385", "1544387", "1544389", "1544395", "1544397", "1544403", "1655956", "1655959", "1655960",
  "1655967", "1655968", "1660004", "1660009", "1718589", "1719000", "1719003", "1719005", "1720735", "1720960",
  "1720975", "1720977", "1726097", "1726271", "1726276", "1726319", "1726324", "1726333", "1726492", "1726673",
  "1726676", "1728072", "1728077", "1731338", "1731340", "1731355", "1732182", "1732186", "1734340", "1734917",
  "1734919", "1734921", "1736776", "1736781", "1736784", "1736786", "1736854", "1740864", "1740865", "1740894",
  "1740898", "1740900", "1747179", "1747185", "1747192", "1790095", "1790097", "1790099", "1790100", "1790103",
  "1791493", "1791498", "1791500", "1791588", "1791593", "1791597", "1791599", "1791701", "1791736", "1796419",
  "1796424", "1797528", "1799416", "1799424", "1805001", "1805007", "1860480", "1860485", "1860619", "1861411",
  "1863343", "1863354", "1872062", "1918045", "1921592", "1942743", "1946772", "197323", "197422", "197462",
  "197687", "197797", "197894", "197895", "197896", "197919", "197931", "197988", "197989", "198269", "1992545",
  "199315", "1998783", "1999308", "2002002", "200327", "200328", "200342", "200343", "200344", "239177",
  "239178", "239179", "239180", "240416", "240573", "240754", "240906", "249364", "253113", "283475",
  "283510", "283511", "283671", "308725", "309012", "309013", "309311", "309650", "310194", "310248",
  "310351", "311487", "311625", "312199", "313209", "313210", "313211", "313213", "314167", "317160",
  "485246", "486419", "583214", "597195", "603566", "636631", "700883", "700885", "747193", "747195",
  "828706", "829926", "992166", "4492", "32592", "51499", "12574", "2555", "6313", "194000"
)

In [44]:
AdvChemoMed_tb <- chemo_flag(codes = AdvChemoMed,med_name = "AdvChemoMed")
print("Finished flagging")

In [68]:
sum(AdvChemoMed_tb$AdvChemoMedFl1 == 1, na.rm=TRUE)

#### Other Medication

In [16]:
# Medication code sets (RxNorm, each code with its descendants).
# Tricyclic antidepressants, one code set per ingredient:
Nortriptyline <- codeset(
  con, snapshot, "RxNorm", "selfAndDescendants",
  "7531", "198045", "198046", "198047", "312036", "317136"
)
trimipramine <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "10834")
protriptyline <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "8886")
imipramine <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "5691")
doxepin <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "3638")
desipramine <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "3247")
clomipramine <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "2597")
amoxapine <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "722")
amitriptyline <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "704")

# Stack the individual code sets into one tricyclic-antidepressant set.
tricyclic_antidepressant <- rbind(
  Nortriptyline, trimipramine, protriptyline, imipramine, doxepin,
  desipramine, clomipramine, amoxapine, amitriptyline
)

venlafaxine <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "39786")
metronidazole <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "6922")

In [17]:
# RxNorm code sets (each listed code plus its descendants) for the individual
# SNRI drugs.
desvenlafaxine <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "734064")
duloxetine <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "72625")
levomilnacipran <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "1433212")
venlafaxine <- codeset(con, snapshot, "RxNorm", "selfAndDescendants", "39786")

# Stack the four code sets into one SNRI code set.
snri <- rbind(
  desvenlafaxine,
  duloxetine,
  levomilnacipran,
  venlafaxine
)

In [40]:
# Load the statin codes from the "codes" variable of the /definitions/statin
# prose definition.
Statin <- codeset_from_prose(
  con, snapshot,
  url = "/definitions/statin",
  variable_name = "codes"
)

In [38]:
# Medication exposure flag.
#
# Registers `codes` as the temp view `concept_code`, then collects every
# administration, request and dispense date whose medication code maps to a
# concept in that view. For each person in the `pandata` view it counts the
# distinct medication dates that fall between 360 days before and 30 days
# after PanDt, and sets the flag to 1 when there are at least two such dates.
#
# Args:
#   codes:    code set passed to create_view(); the query reads its
#             ConceptId column.
#   med_name: prefix of the output flag column; the column is "<med_name>Fl".
#
# Returns the collected query result with columns PersonId and <med_name>Fl.
# Relies on the globals `con` and `snapshot` and on the `pandata` view.
med_flag <- function(codes = "codes", med_name = "name") {
  create_view(codes, "concept_code")

  # `%s` near the end of the template is replaced by `med_name`.
  query_template <- "

WITH adm AS 
(
    SELECT m.PersonId, CAST(m.StartDateTime AS DATE) AS StartDateTime
    FROM MedicationAdministration m 
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap 
          WHERE CodeConceptId IN (SELECT ConceptId FROM concept_code)        
    ) 
    AND m.StartDateTime IS NOT NULL
   -- AND m.StartDateTime >= '2018-01-01'
),

req AS 
(
    SELECT m.PersonId, CAST(m.StartDateTime AS DATE) AS StartDateTime
    FROM MedicationRequest m 
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap 
          WHERE CodeConceptId IN (SELECT ConceptId FROM concept_code)
    ) 
    AND m.StartDateTime IS NOT NULL
   -- AND m.StartDateTime >= '2018-01-01'
),

disp AS 
(
    SELECT m.PersonId, CAST(m.DispenseDateTime AS DATE) AS StartDateTime
    FROM MedicationDispense m 
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap 
          WHERE CodeConceptId IN (SELECT ConceptId FROM concept_code)
    ) 
    AND m.DispenseDateTime IS NOT NULL
   -- AND m.DispenseDateTime >= '2018-01-01'

),

all_tbl AS 
(
    SELECT * FROM adm
    UNION ALL
    SELECT * FROM req
    UNION ALL
    SELECT * FROM disp
),

filter_tb1 AS  -- unique PersonId and date (one row per PersonId and date)
(
    SELECT PersonId, StartDateTime
    FROM all_tbl
    GROUP BY PersonId, StartDateTime
),

filter_tb2 as
(
  SELECT d.PersonId, CAST(d.PanDt AS DATE) AS PanDt, CAST(d.LivDt AS DATE) as LivDt, d.ADT,p.StartDateTime,
  CASE
   WHEN CAST(p.StartDateTime AS DATE) BETWEEN DATEADD(day,-360,CAST(d.PanDt AS DATE)) AND DATEADD(day,30,CAST(d.PanDt AS DATE))
    THEN 1
    ELSE 0
    END AS Fl1
  FROM pandata d
    LEFT JOIN filter_tb1 p
      ON d.PersonId = p.PersonId
),

filter_tb3 as
(
  SELECT PersonId,SUM(Fl1) AS CountFl1
  FROM filter_tb2
  GROUP BY PersonId
)

  SELECT PersonId, 
    CASE 
       WHEN  CountFl1 >= 2 THEN 1 ELSE 0  
    END AS %sFl
  FROM filter_tb3
"

  flag_query <- sprintf(query_template, med_name)
  remote_flags <- load_sql_table(
    con, snapshot,
    query = flag_query,
    output_mode = "sparklyr"
  )

  # Collect the query result into the local R session.
  collect(remote_flags)
}

In [18]:
# Flag each medication with med_flag() and attach the flags to the analysis
# data by PersonId.
# NOTE(review): `notriptylin` is not defined in the visible part of this
# notebook (the RxNorm code set defined earlier is named `Nortriptyline`) —
# confirm which object is intended before running this cell.
notriptylin_tb <- med_flag(codes = notriptylin, med_name = "notriptylin")
venlafaxine_tb <- med_flag(codes = venlafaxine, med_name = "venlafaxine")
metronidazole_tb <- med_flag(codes = metronidazole, med_name = "metronidazole")

cases_control_allvar <- cases_control_allvar %>%
  left_join(notriptylin_tb, by = "PersonId") %>%
  left_join(venlafaxine_tb, by = "PersonId") %>%
  left_join(metronidazole_tb, by = "PersonId")

# Keep the flag table under its own name. Assigning it back to
# `tricyclic_antidepressant` would replace the code set, and any later
# med_flag(codes = tricyclic_antidepressant) call would then receive a flag
# table (no ConceptId column) instead of codes.
tricyclic_antidepressant_tb <- med_flag(
  codes = tricyclic_antidepressant,
  med_name = "tricyclic_antidepressant"
)
cases_control_allvar_new <- cases_control_allvar_new %>%
  left_join(tricyclic_antidepressant_tb, by = "PersonId")

In [42]:
# Attach `Statin` to the population table by PersonId.
# NOTE(review): the join expects `Statin` to carry a PersonId column, i.e. to
# be a med_flag() result rather than a code set — confirm that the disabled
# call below was run earlier in the session.
# Statin <- med_flag(codes = Statin, med_name = "Statin")
pandata_all2 <- left_join(pandata_all1, Statin, by = "PersonId")

In [18]:
# Number of rows in `data` whose Statin value equals 1 (NA values are ignored).
sum(data[["Statin"]] == 1, na.rm = TRUE)

In [17]:
# Add the SNRI and tricyclic-antidepressant flags to the analysis data.
#
# The flag tables get their own names (`*_tb`) so the `snri` and
# `tricyclic_antidepressant` code sets are not overwritten, and each flag is
# added only when its column is not already present. This keeps the cell safe
# to re-run: a second left_join would otherwise create duplicated
# `.x` / `.y` flag columns.
if (!"snriFl" %in% names(cases_control_allvar_new)) {
  snri_tb <- med_flag(codes = snri, med_name = "snri")
  cases_control_allvar_new <- cases_control_allvar_new %>%
    left_join(snri_tb, by = "PersonId")
}

if (!"tricyclic_antidepressantFl" %in% names(cases_control_allvar_new)) {
  tricyclic_antidepressant_tb <- med_flag(
    codes = tricyclic_antidepressant,
    med_name = "tricyclic_antidepressant"
  )
  cases_control_allvar_new <- cases_control_allvar_new %>%
    left_join(tricyclic_antidepressant_tb, by = "PersonId")
}

# Persons without a match in the flag table get NA after the left join, so
# ignore NA when counting (same as the other flag counts in this notebook).
sum(cases_control_allvar_new$snriFl == 1, na.rm = TRUE)

In [33]:
# Preview conc_med; 10 is passed as the second argument of display_df().
display_df(conc_med, 10)

In [34]:
# Print, for each concomitant-medication flag column of conc_med, how many
# rows are "Y" (NA values are ignored).
invisible(local({
  for (flag_col in c("notriptylinFl1", "venlafaxineFl1", "metronidazoleFl1")) {
    cat(
      paste0(flag_col, ":"),
      sum(conc_med[[flag_col]] == "Y", na.rm = TRUE),
      "\n"
    )
  }
}))


In [20]:
# Count the rows of tempconversiontbl_1741624600 with venlafaxineFl1 = 'Y'
# and display the result.
sql <- " 
SELECT 
    COUNT(CASE WHEN venlafaxineFl1 = 'Y' THEN 1 END) AS count
FROM tempconversiontbl_1741624600
"
load_sql_table(
  con, snapshot, sql,
  view_name = "count",
  output_mode = "sparklyr"
) %>%
  display_df()


#### Different Approach for Medication

In [24]:
# Capecitabine exposure table (one row per person).
#
# For each of the three medication sources (MedicationAdministration,
# MedicationRequest, MedicationDispense) the query:
#   1. keeps records whose code maps to a concept in the `Capecitabine` view,
#      with a non-null date on or after 2018-01-01 (filtered_*1);
#   2. keeps only persons with at least 2 distinct dates within that same
#      source (filtered_*2);
#   3. returns the dated rows of those persons (adm / req / disp).
# The three sources are then stacked, reduced to the earliest (MedStart) and
# latest (MedLast) date per person, labelled with Med_name = 'Capecitabine',
# and inner-joined to the `data` view to add PanDt, LivDt, LivDtFl30 and
# LivDtFlBase.
# NOTE(review): the ">= 2 distinct dates" rule is applied per source table, not
# on the combined records — confirm this is the intended definition.
sql <- "

WITH filtered_adm1 AS 
(
    SELECT m.PersonId, CAST(m.StartDateTime AS DATE) AS StartDateTime
    FROM MedicationAdministration m 
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap 
          WHERE CodeConceptId IN (SELECT ConceptId FROM Capecitabine)        
    ) 
    AND m.StartDateTime IS NOT NULL
    AND m.StartDateTime >= '2018-01-01'
),

filtered_adm2 AS 
(
    SELECT PersonId
    FROM filtered_adm1
    GROUP BY PersonId
    HAVING COUNT(DISTINCT StartDateTime) >= 2
),
adm as
(
SELECT a.PersonId,a.StartDateTime
FROM filtered_adm1 a
INNER JOIN filtered_adm2 b
ON a.Personid=b.PersonId
),

filtered_req1 AS 
(
    SELECT m.PersonId, CAST(m.StartDateTime AS DATE) AS StartDateTime
    FROM MedicationRequest m 
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap 
          WHERE CodeConceptId IN (SELECT ConceptId FROM Capecitabine)
    ) 
    AND m.StartDateTime IS NOT NULL
    AND m.StartDateTime >= '2018-01-01'
),
filtered_req2 AS 
(
    SELECT PersonId
    FROM filtered_req1
    GROUP BY PersonId
    HAVING COUNT(DISTINCT StartDateTime) >= 2
),

req as
(
SELECT a.PersonId,a.StartDateTime
FROM filtered_req1 a
INNER JOIN filtered_req2 b
ON a.Personid=b.PersonId
),

filtered_disp1 AS 
(
    SELECT m.PersonId, CAST(m.DispenseDateTime AS DATE) AS StartDateTime
    FROM MedicationDispense m 
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap 
          WHERE CodeConceptId IN (SELECT ConceptId FROM Capecitabine)
    ) 
    AND m.DispenseDateTime IS NOT NULL
    AND m.DispenseDateTime >= '2018-01-01'
),

filtered_disp2 AS 
(
    SELECT PersonId
    FROM filtered_disp1
    GROUP BY PersonId
    HAVING COUNT(DISTINCT StartDateTime) >= 2
),

disp AS 
(
SELECT a.PersonId,a.StartDateTime
FROM filtered_disp1 a
INNER JOIN filtered_disp2 b
ON a.Personid=b.PersonId
),

all_tbl AS 
(
    SELECT * FROM adm
    UNION ALL
    SELECT * FROM req
    UNION ALL
    SELECT * FROM disp
),

all_tbl1 AS 
(
    SELECT PersonId, 
           MIN(StartDateTime) AS MedStart, 
           MAX(StartDateTime) AS MedLast, 
           'Capecitabine' AS Med_name
    FROM all_tbl
    GROUP BY PersonId
)

SELECT m.*, CAST(p.PanDt AS DATE) AS PanDt, CAST(p.LivDt AS DATE) LivDt, p.LivDtFl30, p.LivDtFlBase
FROM data p
INNER JOIN all_tbl1 m
    ON p.PersonId = m.PersonId
"

# Run the query; 'Capecitabine_tbl' is passed as the view name so the combined
# medication query can select from it.
Capecitabine_tbl <- load_sql_table(con, snapshot, sql, view_name = 'Capecitabine_tbl', output_mode = "sparklyr")


In [48]:
# Combine the seven per-drug medication tables into one table and derive two
# exposure columns.
#   comb_tbl: the per-drug tables stacked with UNION ALL.
#   comb_pan: adds ADT from the `data` view by PersonId.
#   Exposure: 'Y' when MedStart falls in the window selected by the
#             LivDtFlBase / LivDtFl30 flags (on/after PanDt; PanDt..LivDt;
#             or PanDt..ADT when both flags are 'N'), otherwise 'N'.
#   APOC:     1, 2 or 3 identifying which of those three rules matched,
#             0 when none did.
# The result is ordered by PersonId, Med_Name, MedStart.
sql <- "
WITH comb_tbl AS (
    SELECT * FROM Fluorouracil_tbl
    UNION ALL
    SELECT * FROM Oxaliplatin_tbl
    UNION ALL
    SELECT * FROM Leucovorin_tbl
    UNION ALL
    SELECT * FROM Irinotecan_tbl
    UNION ALL
    SELECT * FROM Gemcitabine_tbl
    UNION ALL
    SELECT * FROM Cisplatin_tbl
    UNION ALL
    SELECT * FROM Capecitabine_tbl
),

comb_pan as 
(
  SELECT a.*,b.ADT
  FROM comb_tbl a
   LEFT JOIN data b
    ON a.PersonId = b.PersonId  
)

SELECT *, 
    CASE 
        WHEN LivDtFlBase = 'Y' AND MedStart >= PanDt THEN 'Y'
        WHEN LivDtFl30 = 'Y' AND MedStart BETWEEN PanDt AND LivDt THEN 'Y'
        WHEN LivDtFlBase = 'N' AND LivDtFl30 = 'N' AND MedStart BETWEEN PanDt AND ADT THEN 'Y'
        ELSE 'N'
    END AS Exposure,  -- Added comma here

    CASE 
        WHEN LivDtFlBase = 'Y' AND MedStart >= PanDt THEN 1
        WHEN LivDtFl30 = 'Y' AND MedStart BETWEEN PanDt AND LivDt THEN 2
        WHEN LivDtFlBase = 'N' AND LivDtFl30 = 'N' AND MedStart BETWEEN PanDt AND ADT THEN 3
        ELSE 0
    END AS APOC  -- No comma needed at the end

FROM comb_pan
ORDER BY PersonId, Med_Name, MedStart

"
# Run the query; 'chemo_med' is passed as the view name.
chemo_med <- load_sql_table(con, snapshot, sql, view_name = 'chemo_med', output_mode = "sparklyr")

In [73]:
# Add an `expo` column to chemo_med: 'Y' when the first medication date is on
# or before ADT, otherwise 'N'.
# The ELSE branch must be the quoted string 'N'; an unquoted N is read by SQL
# as a column reference and the query fails to resolve it.
sql <- "
SELECT *,
CASE
WHEN MedStart <= ADT THEN 'Y' ELSE 'N' END AS expo
FROM chemo_med
"
count <- display_df(load_sql_table(con, snapshot, sql, view_name = 'count', output_mode = "sparklyr"))


In [None]:
# Collect the chemo_med result into a local R object so it can be reshaped to
# one column per medication in the next step.
library(data.table)
chemo_medR <- chemo_med %>% collect()

In [30]:
# Reshape to one row per person and one column per medication, holding that
# medication's Exposure value. A person without a row for a medication gets NA
# in that medication's column.
dt_wide <- dcast(chemo_medR, PersonId ~ Med_name, value.var = "Exposure")

# Derive the regimen flags.
# `%in% "Y"` is used instead of `== "Y"` so an NA cell counts as "not exposed".
# With `==`, NA propagated through the logical expression and the flags came
# out as NA rather than "N" whenever a drug column was missing for the person.
# The set of persons flagged "Y" is unchanged.
# NOTE(review): Gemc_Mono still requires Cisplatin and Capecitabine to be NA
# (no record at all), so a recorded "N" for either drug yields "N" — confirm
# this is the intended monotherapy definition.
pan_chemo <- dt_wide %>% mutate(
    FOLFIRINOX = ifelse(
        Fluorouracil %in% "Y" &
            (Oxaliplatin %in% "Y" | Leucovorin %in% "Y" | Irinotecan %in% "Y"),
        "Y", "N"
    ),
    Gemc_Cis_Cap = ifelse(
        Gemcitabine %in% "Y" & (Cisplatin %in% "Y" | Capecitabine %in% "Y"),
        "Y", "N"
    ),
    Gemc_Mono = ifelse(
        Gemcitabine %in% "Y" & is.na(Cisplatin) & is.na(Capecitabine),
        "Y", "N"
    ),
    Chemo = ifelse(
        FOLFIRINOX == "Y" | Gemc_Cis_Cap == "Y" | Gemc_Mono == "Y",
        "Y", "N"
    )
)

In [31]:
# Total number of subjects whose Chemo flag is "Y" (NA values are ignored).
sum(pan_chemo[["Chemo"]] == "Y", na.rm = TRUE)

#### Add Claim Line

In [None]:
### trial
# Count, per person, the claim-line codes that match a concept in the
# `medication_code` view.
# The final SELECT reads from the `claimline` CTE defined just above it; the
# CTE was previously defined but never used (the query read from
# `female_claimline`, which this cell does not create).
sql <- "

WITH claimline as
(SELECT cl.PersonId, clc.CodeConceptId from 
ClaimLine cl join ClaimLineCodes clc on cl.Id = clc.ClaimLineId
WHERE clc.CodeConceptId in
(SELECT ConceptId 
FROM medication_code))

SELECT PersonId, count(c.CodeConceptId) as count
from claimline c
group by PersonId
" 

temp <- load_sql_table(con, snapshot, sql, view_name = "temp", output_mode = "sparklyr") %>%
  collect()

# Show the result this cell just produced.
display_df(temp)

In [31]:
# Sample (10 rows) of persons with a claim line whose code matches a concept
# in the `Fluorouracil` view.
# ClaimLineCodes.ClaimLineId is joined to ClaimLine.Id, matching the join used
# by the other claim-line query in this notebook. The previous condition
# compared it with ClaimLine.ClaimId, the identifier of the parent claim.
# NOTE(review): confirm the key columns against the data model documentation.
sql <- "
SELECT cl.PersonId
FROM ClaimLine cl
INNER JOIN ClaimLineCodes clc ON cl.Id = clc.ClaimLineId  -- ClaimLineId references ClaimLine.Id
INNER JOIN Fluorouracil cc ON clc.CodeConceptId = cc.ConceptId  -- Ensuring CodeConceptId matches ConceptId
LIMIT 10
"
temp <- load_sql_table(con, snapshot, sql, view_name = "temp", output_mode = "sparklyr")

display_df(temp, 10)

### Comorbidities

##### Disease Code

In [14]:
# Comorbidity code sets. Every call requests the listed codes plus their
# descendants; code sets from several vocabularies are stacked with rbind().

# Chronic kidney disease
CKD <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "E13.22", "E11.22", "E08.22", "E09.22", "N18", "I12", "I13")

# Type 2 diabetes mellitus
T2DM <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "E11")

# Hepatitis B and C, combined into one code set
HepatitisB <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "B16.0", "B16.1", "B16.2", "B16.9", "B17.0", "B18.0", "B18.1", "B19.10", "B19.11")
HepatitisC <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "B17.1", "B17.10", "B17.11", "B18.2", "B19.2", "B19.20", "B19.21")
Hepatitis <- rbind(HepatitisB, HepatitisC)

# Chronic liver disease
CLD <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "K70.0", "K70.2", "K70.30", "K70.31", "K70.40", "K70.41", "K70.9", "K71.0", "K71.10", "K71.11", "K71.3", "K71.4", "K71.50", "K71.51", "K71.7", "K72.10", "K72.11", "K73.0", "K73.1", "K73.2", "K73.8", "K73.9", "K74.0", "K74.00", "K74.01", "K74.02", "K74.1", "K74.2", "K74.3", "K74.4", "K74.5", "K74.60", "K74.69", "K76.1", "K76.2", "K76.3", "K76.5", "K76.6", "K76.7")

# Hyperlipidemia: ICD-10-CM plus SNOMED CT
Hyperlipidemia1 <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "E78.0", "E78.00", "E78.01", "E78.1", "E78.2", "E78.3", "E78.4", "E78.41", "E78.49", "E78.5")
Hyperlipidemia2 <- codeset(con, snapshot, "SNOMED CT", "selfAndDescendants", "13644009", "33513003", "34349009", "34528009", "55822004", "129589009", "129590000", "129591001", "190774002", "238040008", "238076009", "238077000",
  "238078005", "238079002", "238080004", "238081000", "238082007", "238083002", "238084008", "238085009", "238087001", "238088006", "238089003", "267432004",
  "267433009", "267434003", "267435002", "299465007", "302870006", "397915002", "398036000", "402473001", "402474007", "402475008", "402725005", "402726006",
  "402727002", "402785008", "402786009", "402787000", "403827000", "403828005", "403829002", "403830007", "403831006", "426161002", "445261005", "767133009",
  "773649005", "773726000", "701000119103", "1571000119104", "15771000119109", "114831000119107", "137931000119102", "137941000119106")
Hyperlipidemia <- rbind(Hyperlipidemia1, Hyperlipidemia2)

# Hypertension: ICD-10-CM, ICD-9-CM and SNOMED CT
Hypertension1 <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "I10", "I11", "I12", "I13", "I15", "I16")
Hypertension2 <- codeset(con, snapshot, "ICD9CM", "selfAndDescendants", "401", "402", "403", "404", "405")
Hypertension3 <- codeset(con, snapshot, "SNOMED CT", "selfAndDescendants", "38341003", "10725009", "1078301000112109", "111438007", "1201005", "123799005", "123800009", "14973001", "169465000", "194783001", "194785008",
  "194788005", "194791005", "199008003", "26078007", "28119000", "31992008", "371125006", "39018007", "39727004", "427889009", "428575007", "429457004", "461301000124109", "46481004", "48146000", "48552006", "56218007", "57684003",
  "59621000", "59720008", "65518004", "73410007", "74451002", "762463000", "78975002", "89242004", "284981000119102", "284991000119104", "704667004", "71701000119105", "71421000119105", "397748008", "449759005", "443482000")
Hypertension <- rbind(Hypertension1, Hypertension2, Hypertension3)

# Obstructive sleep apnea
osa <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "G47.3", "G47.30", "G47.33", "G47.39")

COPD <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "J41", "J42", "J43", "J44")

Anxiety <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "F41.0", "F41.1", "F41.3", "F41.8", "F41.9")

Depression <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "F01.51", "F32.0", "F32.1", "F32.2", "F32.3", "F32.4", "F32.5", "F32.89", "F32.9", "F33", "F34.1", "F34.81", "F34.89", "F43.21", "F43.23", "F53.0", "F53.1")

GingivitisPeriodontal <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "K05")

Ischemic_Heart_Disease <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "I23.0", "I23.1", "I23.2", "I23.3", "I23.4", "I23.5", "I23.6", "I23.8", "I24.0", "I24.8", "I24.9", "I25")

# Obesity: ICD-10-CM plus SNOMED CT, combined into one code set.
# The relationship argument is spelled "selfAndDescendants", exactly as in
# every other codeset() call in this notebook; these two calls previously used
# "SelfAndDescendants", which only works if the argument is case-insensitive.
Obesity <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "E66.01", "E66.09", "E66.1", "E66.2", "E66.3", "E66.8", "E66.9")
obesity_snomed <- codeset(con, snapshot, "SNOMED CT", "selfAndDescendants", "162863004", "162864005", "238131007", "238136002", "408512008", "414915002", "414916001", "415530009", "83911000119104")
obesity_codes <- rbind(Obesity, obesity_snomed)

# Abdominal pain: SNOMED CT, ICD-10-CM and ICD-9-CM
abdominal_pain1 <- codeset(con, snapshot, "SNOMED CT", "selfAndDescendants", "21522001", "102614006", "54586004", "116290004", "111985007", "83132003", "285388000", "285387005", "79922009", "14700001000004102", "139313005", "301717006", "439469002", "304542004", "314212008", "162042000", "162046002", "102613000", "247358007", "207223006", "139317006", "139326009", "139327000", "139328005", "207205003", "247351001", "207229005", "207220009", "163216002", "163217006", "2174009", "1119218004", "1119217009", "247362001")
abdominal_pain2 <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "R10.9", "R10.30", "R10.10", "R10.84", "R10.11", "R10.31", "R10.32", "R10.8")
abdominal_pain3 <- codeset(con, snapshot, "ICD9CM", "selfAndDescendants", "789.06", "789.00", "789.07", "789.09", "789.01", "789.03", "789.04", "789.02", "789.0")
abdominal_pain <- rbind(abdominal_pain1, abdominal_pain2, abdominal_pain3)

# Gastroesophageal reflux disease: SNOMED CT and ICD-10-CM
Gastroesophageal_refluxdisease1 <- codeset(con, snapshot, "SNOMED CT", "selfAndDescendants", "235595009")
Gastroesophageal_refluxdisease2 <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "K21.01", "K21.00", "K21.9")
Gastroesophageal_refluxdisease <- rbind(Gastroesophageal_refluxdisease1, Gastroesophageal_refluxdisease2)

# Dyspnea: SNOMED CT and ICD-10-CM
Dyspnea1 <- codeset(con, snapshot, "SNOMED CT", "selfAndDescendants", "49233005", "139201002", "161941007", "161946002", "248548009", "267036007", "870535009", "60845006", "207057006", "158379001", "389324009", "390057009", "297216006", "161937008")
Dyspnea2 <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "R06.00", "R06.0", "R06.09", "J95.87", "R06.02")
Dyspnea <- rbind(Dyspnea1, Dyspnea2)

# Anemia: ICD-10-CM and SNOMED CT
Anemia1 <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "D50.0", "D50.1", "D50.8", "D50.9", "D55.0", "D55.1", "D55.21", "D55.29", "D55.3", "D55.8", "D55.9", "D58.0", "D58.8", "D58.9", "D59.0", "D59.10", "D59.11", "D59.12", "D59.13", "D59.19", "D59.2", "D59.3", "D59.4", "D59.5", "D59.6", "D59.8", "D59.9", "D63.0", "D63.1", "D63.8", "O99.011", "O99.012", "O99.013", "O99.019", "O99.02", "D51.0", "D51.3", "D51.8", "D51.9", "D53.9", "D62", "D64.81", "D64.89", "D64.9", "O90.81", "O99.01")
Anemia2 <- codeset(con, snapshot, "SNOMED CT", "selfAndDescendants", "862001", "2694001", "2835000", "3272007", "3571004", "3978000", "4854004", "4939006", "4984008", "5300004", "5315003", "5430006", "5603006", "6398009", "6659005", "9434008", "10205009", "10564005", "10619002", "11491000", "11503009", "11781007", "12189000", "12238007", "12907000", "14087004", "14126008", "14379009", "14514008", "15276008", "15332004", "16645003", "18323000", "18637002", "18662002", "21412009", "21914002", "22098000", "22347002", "22438006", "22933009", "24620004", "24661004", "24962009", "24975009", "25251008", "25266006", "25443007", "26333003", "26944003", "27342004", "27366005", "27798002", "28147001", "29177004", "29551000", "30418008", "30575002", "31206006", "31820007", "32094009", "32648007", "33491002", "33905008", "34194007", "34247008", "34629009", "34852006", "34925000", "35703006", "35778001", "36568005", "36919001", "37272000", "37370005", "38689004", "38911009", "38970002", "40387008", "41387000", "41462006", "41614006", "41841004", "42461002", "42484009", "43707008", "44206008", "44288006", "44452003", "44641000", "44666001", "44910003", "45098004", "46737006", "46760003", "47516005", "47526003", "47844003", "48553001", "48580008", "49284006", "49472006", "49708008", "50253007", "51071000", "51667002", "52212006", "52413004", "52565000", "53165003", "54698001", "55995005", "59106005", "59644002", "60138009", "60164003", "60504009", "60805002", "61261009", "62268000", "62389006", "62403005", "62871001", "66309005", "66612000", "67894009", "68361004", "68700003", "69574002", "69981004", "71855000", "72501006", "73891003", "74703006", "74789008", "76366001", "77413008", "77607006", "77663007", "78209002", "78677008", "78908001", "78997000", "79035003", "80126007", "80875006", "80963002", "81711008", "82003006", "82430007", "82895008", "82980005", "83414005", "84027009", "85570009", "85649008", "85746008", "86225009", "86325007", "86448001", "86859003", "87522002",
  "87806008", "87810006", "90175006", "91217009", "91411007", "105599000", "109996008", "109998009", "110000005", "111407006", "111571009", "111574001", "111575000", "111576004", "111577008", "111579006", "111581008", "115963009", "127045008", "127049002", "127050002")
Anemia <- rbind(Anemia1, Anemia2)

# FHOMND: ICD-10-CM Z80.0 and SNOMED CT 430813008
FHOMND1 <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "Z80.0")
FHOMND2 <- codeset(con, snapshot, "SNOMED CT", "selfAndDescendants", "430813008")
FHOMND <- rbind(FHOMND1, FHOMND2)


In [10]:
# Code sets for the other cancers (breast, colorectal, lung, stomach,
# esophageal, liver). Each call requests the listed codes plus descendants.

# Breast cancer
BreastCan <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "C50")

# Colorectal cancer
ColorecCan <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "C18", "C19", "C20", "C21")

# Lung cancer: ICD-10-CM plus SNOMED CT
LungCanicd <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "C33", "C34")
LungCanSnomed <- codeset(con, snapshot, "SNOMED CT", "selfAndDescendants", "93880001", "363358000")
LungCan <- rbind(LungCanicd, LungCanSnomed)

# Stomach cancer
StomachCan <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "C16")

# Esophageal cancer
EsophaCan <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "C15")

# Liver cancer
LiverCan <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "C22")

# One combined code set covering all of the cancers above.
Cancer <- rbind(
  ColorecCan, BreastCan, LungCan, StomachCan, EsophaCan, LiverCan
)


In [11]:
# Comorbidity flag.
#
# Registers `codes` as the temp view `concept_code`, finds each person's
# earliest condition date (OnsetDateTime, falling back to RecordedDateTime)
# among conditions whose code maps to a concept in that view, and flags every
# person in the `data` view with 1 when that date is on or before PanDt plus
# 30 days, otherwise 0.
#
# Args:
#   codes:        code set passed to create_view(); the query joins on its
#                 ConceptId column.
#   disease_name: name of the output flag column.
#
# Returns the collected query result with columns PersonId and <disease_name>.
# Relies on the globals `con` and `snapshot` and on the `data` view.
# NOTE(review): the SourceConceptId filter (2703595, 2703594) is not explained
# anywhere in view — confirm what these two concept ids select.
disease_flag <- function(codes = "codes", disease_name = "name") {
  create_view(codes, "concept_code")

  # `%s` at the end of the template is replaced by `disease_name`.
  query_template <- "
   WITH cond1 as 
   (
     SELECT 
        PersonId, 
        min(COALESCE(OnsetDateTime, RecordedDateTime)) AS CondStart
    FROM condition c
    INNER JOIN ConditionCodeConceptMap ccm 
    ON c.CodeConceptMapId = ccm.Id
    INNER JOIN concept_code pcc 
    ON ccm.CodeConceptId = pcc.ConceptId
    WHERE ccm.SourceConceptId IN (2703595, 2703594)
    AND (OnsetDateTime IS NOT NULL OR RecordedDateTime IS NOT NULL) 
    GROUP BY PersonId
  ),
  
  cond2 AS 
  (
    SELECT p.PersonId,CAST(c.CondStart AS DATE) AS CondStart, CAST(p.PanDt AS DATE) AS PanDt
    FROM data p
    LEFT JOIN cond1 c
     ON p.PersonId = c.PersonId
  )

  SELECT PersonId, 
    CASE 
    WHEN CondStart IS NOT NULL AND CondStart <= DATEADD(day, 30, PanDt) 
         THEN 1 ELSE 0 
    END AS %s
  FROM cond2
"

  flag_query <- sprintf(query_template, disease_name)
  remote_flags <- load_sql_table(
    con, snapshot,
    query = flag_query,
    output_mode = "sparklyr"
  )

  # Collect the query result into the local R session.
  collect(remote_flags)
}

In [19]:
# Flag each disease separately with disease_flag(). Every call produces a
# table with PersonId and one 0/1 column named after `disease_name`; the
# print() after each call reports progress.
df_CKD <- disease_flag(codes = CKD, disease_name = "CKD")
print("Finished flagging the disease: CKD")

df_T2DM <- disease_flag(codes = T2DM, disease_name = "T2DM")
print("Finished flagging the disease: T2DM")

df_Hepatitis <- disease_flag(codes = Hepatitis, disease_name = "Hepatitis")
print("Finished flagging the disease: Hepatitis")

df_CLD <- disease_flag(codes = CLD, disease_name = "CLD")
print("Finished flagging the disease: CLD")

df_Hypertension <- disease_flag(codes = Hypertension, disease_name = "Hypertension")
print("Finished flagging the disease: Hypertension")

df_Hyperlipidemia <- disease_flag(codes = Hyperlipidemia, disease_name = "Hyperlipidemia")
# The progress message now names the disease that was just flagged (it was a
# garbled leftover from another disease name).
print("Finished flagging the disease: Hyperlipidemia")

df_osa <- disease_flag(codes = osa, disease_name = "osa")
print("Finished flagging the disease: osa")

df_COPD <- disease_flag(codes = COPD, disease_name = "COPD")
print("Finished flagging the disease: COPD")

df_Anxiety <- disease_flag(codes = Anxiety, disease_name = "Anxiety")
print("Finished flagging the disease: Anxiety")

df_Ischemic_Heart_Disease <- disease_flag(codes = Ischemic_Heart_Disease, disease_name = "Ischemic_Heart_Disease")
print("Finished flagging the disease: Ischemic_Heart_Disease")

df_Depression <- disease_flag(codes = Depression, disease_name = "Depression")
print("Finished flagging the disease: Depression")

df_Obesity_codes <- disease_flag(codes = obesity_codes, disease_name = "Obesity_codes")
print("Finished flagging the disease: Obesity_codes")

df_GingivitisPeriodontal <- disease_flag(codes = GingivitisPeriodontal, disease_name = "GingivitisPeriodontal")
print("Finished flagging the disease: GingivitisPeriodontal")

df_Cancer <- disease_flag(codes = Cancer, disease_name = "Cancer")
print("Finished flagging the disease: Cancer")

df_abdominal_pain <- disease_flag(codes = abdominal_pain, disease_name = "abdominal_pain")
print("Finished flagging the disease: abdominal_pain")

df_Gastroesophageal_refluxdisease <- disease_flag(codes = Gastroesophageal_refluxdisease, disease_name = "Gastroesophageal_refluxdisease")
print("Finished flagging the disease: Gastroesophageal_refluxdisease")

df_Dyspnea <- disease_flag(codes = Dyspnea, disease_name = "Dyspnea")
print("Finished flagging the disease: Dyspnea")

df_Anemia <- disease_flag(codes = Anemia, disease_name = "Anemia")
print("Finished flagging the disease: Anemia")

df_FHOMND <- disease_flag(codes = FHOMND, disease_name = "FHOMND")
print("Finished flagging the disease: FHOMND")

In [None]:
# Build the comorbidity table: start from the five base columns of
# cases_control and left-join every disease flag table by PersonId, so all
# PersonId rows of the base dataset are retained.
# The list order fixes the column order of the result, which the positional
# comorbidScore calculation (columns 6:23) relies on — keep it unchanged.
comorbidity <- Reduce(
  function(merged, flag_tb) left_join(merged, flag_tb, by = "PersonId"),
  list(
    df_CKD,
    df_T2DM,
    df_Hepatitis,
    df_CLD,
    df_Hypertension,
    df_Hyperlipidemia,
    df_osa,
    df_COPD,
    df_Anxiety,
    df_Ischemic_Heart_Disease,
    df_Depression,
    df_Obesity_codes,
    df_GingivitisPeriodontal,
    df_Gastroesophageal_refluxdisease,
    df_abdominal_pain,
    df_Dyspnea,
    df_Anemia,
    df_FHOMND,
    df_Cancer
  ),
  cases_control %>% select(PersonId, STARTDT, ADT, CNSR, indicator)
)

In [28]:
# Preview the merged comorbidity table; 10 is passed as the second argument
# of display_df().
display_df(comorbidity, 10)

In [16]:
# Temporary step — do not run later.
# Replaces the old Hyperlipidemia / Hypertension flags and comorbidScore with
# the columns of comorbidity_temp (left join by PersonId), then recomputes the
# score as the row sum of columns 6 to 23.
# NOTE(review): the score uses column positions, so it depends on the column
# order after the join — verify that 6:23 covers the intended flags.
comorbidity <- comorbidity %>%
  select(-Hyperlipidemia, -Hypertension, -comorbidScore) %>%
  left_join(comorbidity_temp, by = "PersonId")

comorbidity[["comorbidScore"]] <- rowSums(comorbidity[, 6:23], na.rm = TRUE)

In [19]:
# Print, for each comorbidity flag column, how many rows equal 1 (NA values
# are ignored). Names are the column names in `comorbidity`; values are the
# labels to print.
invisible(local({
  flag_labels <- c(
    CKD = "CKD",
    T2DM = "T2DM",
    Hepatitis = "Hepatitis",
    CLD = "CLD",
    Hypertension = "Hypertension",
    Hyperlipidemia = "Hyperlipidemia",
    osa = "OSA",
    COPD = "COPD",
    Anxiety = "Anxiety",
    Ischemic_Heart_Disease = "Ischemic Heart Disease",
    Depression = "Depression",
    Obesity_codes = "Obesity",
    abdominal_pain = "Abdominal Pain",
    Gastroesophageal_refluxdisease = "Gastroesophageal Reflux Disease",
    Dyspnea = "Dyspnea",
    Anemia = "Anemia",
    FHOMND = "Family History of Malignant Neoplasm of Digestive Organ"
  )
  for (flag_col in names(flag_labels)) {
    cat(
      paste0(flag_labels[[flag_col]], ":"),
      sum(comorbidity[[flag_col]] == 1, na.rm = TRUE),
      "\n"
    )
  }
}))

In [None]:
# Comorbidity score: row sum of columns 6 to 23 of `comorbidity`, ignoring NA.
# NOTE(review): the columns are selected by position, so the score depends on
# the current column order — confirm that 6:23 covers the intended flags.
comorbidity$comorbidScore <- rowSums(comorbidity[, 6:23], na.rm = TRUE)

In [16]:
# Sex-stratified logistic regressions of `indicator` on BMI_Group:
# one model fitted on the rows with Sex == "Male", one on Sex == "Female".
dataM <- filter(cases_control, Sex == "Male")
dataF <- filter(cases_control, Sex == "Female")

modelM <- glm(
  indicator ~ BMI_Group,
  family = binomial(link = "logit"),
  data = dataM
)
modelF <- glm(
  indicator ~ BMI_Group,
  family = binomial(link = "logit"),
  data = dataF
)

In [36]:
# Keep the rows with OthMetFLBase == "Y" and LivDtFlBase == "N", recode
# LivDtFl to 1 ("Y") / 0 (anything else), and fit a logistic regression of
# that outcome on AdvChemoMedFl1.
tempdata <- com_med %>%
  filter(OthMetFLBase == "Y" & LivDtFlBase == "N") %>%
  mutate(LivDtFlTemp = ifelse(LivDtFl == "Y", 1, 0))

glm(
  LivDtFlTemp ~ AdvChemoMedFl1,
  family = binomial(link = "logit"),
  data = tempdata
)

## Correlation

In [8]:
# List the column names of cases_control_allvar.
colnames(cases_control_allvar)

In [23]:
# Medication table: treatment flags from `exposure`, restricted to persons
# present in AdvChemoMed_tb (inner join), then attached to every PersonId of
# cases_control (left join).
tempmed <- exposure %>%
  select(PersonId, FOLFIRINOX, Gemc_Cis_Cap, Gemc_Mono, RadiationFl1, SurgeryFl1) %>%
  inner_join(AdvChemoMed_tb, by = "PersonId")
med <- cases_control %>%
  select(PersonId) %>%
  left_join(tempmed, by = "PersonId")

# Comorbidity table: drop columns 2 to 5 of `comorbidity`, then attach the
# remaining columns to the selected cases_control columns by PersonId.
tempcom <- comorbidity[, -c(2:5)]
com <- cases_control %>%
  select(PersonId, LivDtFlBase, LivDtFl30, LivDtFl, OthMetFLBase, OthMetFl30, AllMetDt) %>%
  left_join(tempcom, by = "PersonId")

In [24]:
# Combine the medication and comorbidity tables by PersonId (inner join) and
# report the number of rows.
com_med <- inner_join(med, com, by = "PersonId")
nrow(com_med)

In [125]:
# Spearman correlation between the medication and comorbidity variables.
# Integer columns are converted to numeric first; the first column of com_med
# is dropped before correlating.
com_med <- com_med %>%
  mutate(across(where(is.integer), as.numeric))
com_med1 <- com_med[, -1]

# "pairwise" is the abbreviated form of "pairwise.complete.obs".
a <- cor(com_med1, method = "spearman", use = "pairwise")

# Drop the columns whose name starts with "Has" and round to 2 decimals.
cordata <- as.data.frame(a) %>%
  select(-starts_with("Has")) %>%
  round(2)

##### **Final Analysis Dataset**

In [90]:
# Create the `exposure` factor from the three treatment flags, checked in
# priority order: 1 = chemotherapy, 2 = surgery, 3 = radiation, 0 = none.
# The flags are compared with the number 1.
cases_control_allvar_new <- cases_control_allvar_new %>%
  mutate(
    exposure = as.factor(
      ifelse(
        AdvChemoMedFl1 == 1, 1,
        ifelse(
          SurgeryFl1 == 1, 2,
          ifelse(RadiationFl1 == 1, 3, 0)
        )
      )
    )
  )

In [54]:
# Recode the two-level treatment flags to numeric 1/0.
#
# "Y" (or a value already equal to 1) becomes 1, NA stays NA, everything else
# becomes 0. Accepting 1 as well as "Y" makes the cell safe to re-run: the
# previous `ifelse(x == "Y", 1, 0)` turned every value into 0 when the column
# had already been converted, or was coded 1/0 to begin with.
binary_flag_cols <- c(
  "AdvChemoMedFl1", "AdvChemoMedFl2",
  "SurgeryFl1", "SurgeryFl2",
  "RadiationFl1", "RadiationFl2"
)

cases_control_allvar_new[binary_flag_cols] <- lapply(
  cases_control_allvar_new[binary_flag_cols],
  function(flag) {
    # `%in%` compares as character here, so both "Y" and numeric 1 match.
    ifelse(is.na(flag), NA_real_, ifelse(flag %in% c("Y", 1), 1, 0))
  }
)

In [46]:
# Convert every categorical variable to a factor and every numeric variable
# to class numeric. Variables that are not columns of
# cases_control_allvar_new are reported with message() and skipped.

# Comorbidity flags
comorbidity <- c(
  "CKD", "T2DM", "Hepatitis", "Hypertension", "CLD", "Hyperlipidemia", "osa",
  "COPD", "Anxiety", "Ischemic_Heart_Disease", "Depression", "Obesity_codes",
  "Cancer", "Gastroesophageal_refluxdisease", "abdominal_pain", "Dyspnea",
  "Anemia", "FHOMND"
)

# Exposure variables
Exposure1 <- c(
  "RadiationFl1", "SurgeryFl1", "AdvChemoMedFl1",
  "RadiationFl2", "SurgeryFl2", "AdvChemoMedFl2", "exposure"
)
Exposure2 <- c(
  "Nortriptyline", "Venlafaxine", "Metronidazole",
  "tricyclic_antidepressant", "SNRI"
)

# Demographic and other variables
DemographicOth <- c(
  "Age_Group", "Ethnicity", "Race", "Sex", "MaritalStatus", "BMI_Group",
  "OthMetFlBase", "PanLocation", "DistantMetFlBase", "LympNodeFlBase"
)

# Lab variables (defined here but not converted below).
# NOTE(review): "TotalBilirubin" is listed twice.
lab <- c(
  "DirectBilirubin", "TotalBilirubin", "ALP", "Albumin", "TotalBilirubin",
  "AST", "ALT", "TotalProtein", "ProthrombinT", "SerumLipase"
)

vars_cat <- c(comorbidity, Exposure1, Exposure2, DemographicOth)
vars_num <- c("AgeAtDiagnosis", "BMI")

# Categorical variables -> factor
for (col_name in vars_cat) {
  if (!(col_name %in% names(cases_control_allvar_new))) {
    message("Variable not found (categorical): ", col_name)
    next
  }
  cases_control_allvar_new[[col_name]] <-
    as.factor(cases_control_allvar_new[[col_name]])
}

# Numeric variables -> numeric
for (col_name in vars_num) {
  if (!(col_name %in% names(cases_control_allvar_new))) {
    message("Variable not found (numeric): ", col_name)
    next
  }
  cases_control_allvar_new[[col_name]] <-
    as.numeric(cases_control_allvar_new[[col_name]])
}


In [93]:
# Set the reference (first) level of each categorical covariate. After every relevel()
# the distinct values of the column are displayed as a quick visual check.
# relevel() stops with an error when `ref` is not an existing level of the factor.
# NOTE(review): other cells of this notebook label the age groups "Age 45-60" / "Age 60-75" /
# "Age >= 75" — confirm that "Age 45-64" is a level of Age_Group when this cell is run.
cases_control_allvar_new$Age_Group <- relevel(cases_control_allvar_new$Age_Group, ref = "Age 45-64")
unique(factor(cases_control_allvar_new$Age_Group))

cases_control_allvar_new$Ethnicity <- relevel(cases_control_allvar_new$Ethnicity, ref = "Not Hispanic or Latino")
unique(factor(cases_control_allvar_new$Ethnicity))

cases_control_allvar_new$MaritalStatus <- relevel(cases_control_allvar_new$MaritalStatus, ref = "Married")
unique(factor(cases_control_allvar_new$MaritalStatus))

cases_control_allvar_new$Race <- relevel(cases_control_allvar_new$Race, ref = "White")
unique(factor(cases_control_allvar_new$Race))

cases_control_allvar_new$Sex <- relevel(cases_control_allvar_new$Sex, ref = "Female")
unique(factor(cases_control_allvar_new$Sex))

cases_control_allvar_new$BMI_Group <- relevel(cases_control_allvar_new$BMI_Group, ref = "Normal")
unique(factor(cases_control_allvar_new$BMI_Group))

cases_control_allvar_new$PanLocation <- relevel(cases_control_allvar_new$PanLocation, ref = "Body/Tail")
unique(factor(cases_control_allvar_new$PanLocation))

# Exposure: "0" (no chemo / surgery / radiation) is the reference
cases_control_allvar_new$exposure <- relevel(cases_control_allvar_new$exposure, ref = "0")
unique(factor(cases_control_allvar_new$exposure))

In [91]:
# Fix the order of the factor levels (the first level listed is the reference).
# factor(levels = ...) turns every value that is not listed into NA, so each vector
# must contain all labels that occur in the data.
# NOTE(review): "Age 66-84" leaves a gap after "Age 45-64" — confirm the label matches the data.
cases_control_allvar_new$Age_Group <- factor(cases_control_allvar_new$Age_Group, levels = c("Age 45-64", "Age 66-84", "Age 85+"))

cases_control_allvar_new$BMI_Group <- factor(cases_control_allvar_new$BMI_Group, 
      levels = c("Normal","Underweight","Overweight", "Obese(Class I)", "Obese(Class II)", "Obese(Class III)", "UNK"))

cases_control_allvar_new$exposure <- factor(cases_control_allvar_new$exposure, levels = c("0", "1", "2","3"))

In [26]:
# Give the medication flags readable names, then derive Age_Group from AgeAtDiagnosis
# (< 45, 45-60, 60-75, >= 75; a missing age gives a missing group).
cases_control_allvar_new <- cases_control_allvar_new %>%
  rename(
    Nortriptyline = notriptylinFl1,
    Venlafaxine = venlafaxineFl1,
    Metronidazole = metronidazoleFl1,
    tricyclic_antidepressant = tricyclic_antidepressantFl1,
    SNRI = snriFl1
  ) %>%
  mutate(
    Age_Group = case_when(
      AgeAtDiagnosis < 45 ~ "Age < 45",
      AgeAtDiagnosis < 60 ~ "Age 45-60",
      AgeAtDiagnosis < 75 ~ "Age 60-75",
      AgeAtDiagnosis >= 75 ~ "Age >= 75"
    )
  )

In [49]:
# List the expected variables that are not columns of cases_control_allvar
missing_vars_cat <- Filter(function(v) !(v %in% colnames(cases_control_allvar)), vars_cat)
missing_vars_num <- Filter(function(v) !(v %in% colnames(cases_control_allvar)), vars_num)

print(missing_vars_cat)
print(missing_vars_num)

In [54]:
#Patient Characteristics
library(dplyr)
library(tidyr)
library(knitr)

data <- cases_control_allvar_new

#' Frequency table of one column
#'
#' @param df Data frame holding the column.
#' @param var Name of the column, as a single string.
#' @return A table with the columns `Category` (the distinct values of `var`),
#'   `n` (row count) and `Percentage` (share of all rows, rounded to 2 decimals).
summary_table <- function(df, var) {
  counts <- count(df, Category = .data[[var]])
  mutate(counts, Percentage = round(n / sum(n) * 100, 2))
}

# One frequency table per categorical variable
age_summary <- data %>% summary_table("Age_Group")
sex_summary <- data %>% summary_table("Sex")
ethnicity_summary <- data %>% summary_table("Ethnicity")
race_summary <- data %>% summary_table("Race")
marital_summary <- data %>% summary_table("MaritalStatus")
bmi_summary <- data %>% summary_table("BMI_Group")
bmi_impute_summary <- data %>% summary_table("BMI_GROUP_impute")
location_summary <- data %>% summary_table("PanLocation")
state_summary <- data %>% summary_table("StateOrProvince")

# Stack the tables; the list names become the `Variable` column
final_summary <- bind_rows(
  list(
    Age_Group = age_summary,
    Sex = sex_summary,
    Ethnicity = ethnicity_summary,
    Race = race_summary,
    MaritalStatus = marital_summary,
    PanLocation = location_summary,
    BMI_Group = bmi_summary,
    BMI_GROUP_impute = bmi_impute_summary,
    StateOrProvince = state_summary
  ),
  .id = "Variable"
) %>%
  select(Variable, Category, n, Percentage) %>%
  arrange(Variable)

# Render the combined table
kable(final_summary, caption = "Patient Characteristics Summary for Pancreatic Cancer Subject")


In [84]:
display_df(cases_control_allvar_new, 10)

In [75]:
# Drop the treatment, exposure and other-metastasis columns listed below.
vars_to_remove <- c(
  "SurgeryFl1", "SurgeryFl2",
  "RadiationFl1", "RadiationFl2",
  "AdvChemoMedFl1", "AdvChemoMedFl2",
  "exposure",
  "OthMetDt30", "OthMetCode30",
  "OthMetDtBase", "OthMetCodeBase",
  "OthMetFLBase", "OthMetFl30"
)

# any_of() ignores names that are not present, so the cell can be re-run after the
# columns are already gone (all_of() would stop with an error). Absent names are
# reported so that a misspelled column name does not go unnoticed.
already_absent <- setdiff(vars_to_remove, colnames(cases_control_allvar_new))
if (length(already_absent) > 0) {
  message("Columns not present, nothing to drop: ", paste(already_absent, collapse = ", "))
}

cases_control_allvar_new <- cases_control_allvar_new %>% select(-any_of(vars_to_remove))


In [77]:
# Lymph-node flag per row of the `data` view: 'Y' when LympNodeDt lies between
# 365 days before and 30 days after PanDt, otherwise 'N' (the ELSE branch also
# catches rows where the comparison is NULL, e.g. a missing LympNodeDt).
# The query result is collected into a local R data frame.
sql <- "SELECT PersonId,
  CASE 
    WHEN LympNodeDt BETWEEN DATEADD(DAY, -365, PanDt) AND DATEADD(DAY, 30, PanDt)
    THEN 'Y'
    ELSE 'N'
  END AS LympNodeFlBase
FROM data
"
LympNodeFlBase <- load_sql_table(con,snapshot, sql, view_name = "LympNodeFlBase", output_mode = "sparklyr") %>% collect

In [76]:
# Left-join the per-person treatment / metastasis tables onto the analysis
# table, one after another, matching on PersonId.
cases_control_allvar_new <- Reduce(
  function(joined, tbl) left_join(joined, tbl, by = "PersonId"),
  list(pan_radiation, pan_surgery, AdvChemoMed_tb, pan_OthMet, LympNodeFlBase),
  cases_control_allvar_new
)

In [80]:
# Normalise the capitalisation of the flag name: OthMetFLBase -> OthMetFlBase.
# rename() stops with an error when OthMetFLBase is not a column (e.g. when the cell is run twice).
cases_control_allvar_new <- cases_control_allvar_new %>% rename( OthMetFlBase= OthMetFLBase)

In [81]:
# Distant metastasis flag: "Y" when the lymph-node flag or the other-metastasis
# flag is "Y", "N" when neither is; NA when that cannot be decided because of a
# missing flag.
cases_control_allvar_new <- mutate(
  cases_control_allvar_new,
  DistantMetFlBase = case_when(
    LympNodeFlBase == "Y" | OthMetFlBase == "Y" ~ "Y",
    LympNodeFlBase != "Y" & OthMetFlBase != "Y" ~ "N"
  )
)

In [82]:
# Replace missing surgery / radiation flags with 0
for (flag in c("SurgeryFl1", "SurgeryFl2", "RadiationFl1", "RadiationFl2")) {
  is_unset <- is.na(cases_control_allvar_new[[flag]])
  cases_control_allvar_new[[flag]][is_unset] <- 0
}

In [64]:
library(dplyr)

# Fill missing surgery / radiation flags with 0 and store the flags as integers
cases_control_allvar_new <- cases_control_allvar_new %>%
  mutate(across(
    all_of(c("SurgeryFl1", "SurgeryFl2", "RadiationFl1", "RadiationFl2")),
    ~ as.integer(replace_na(.x, 0))
  ))


In [63]:
# Derive Age_Group from AgeAtDiagnosis: < 45, 45-60, 60-75, >= 75 (NA stays NA)
cases_control_allvar_new <- cases_control_allvar_new %>%
  mutate(
    Age_Group = case_when(
      AgeAtDiagnosis < 45 ~ "Age < 45",
      AgeAtDiagnosis < 60 ~ "Age 45-60",
      AgeAtDiagnosis < 75 ~ "Age 60-75",
      AgeAtDiagnosis >= 75 ~ "Age >= 75"
    )
  )

In [9]:
# Build the analysis extract with dummy subject ids.

# Comorbidity flags plus the comorbidity score
comorbidity <- c("CKD", "T2DM", "Hepatitis", "Hypertension", "CLD", "Hyperlipidemia", "osa", "COPD", "Anxiety", "Ischemic_Heart_Disease", "Depression", "Obesity_codes","Cancer",
"Gastroesophageal_refluxdisease", "abdominal_pain","Dyspnea","Anemia","FHOMND", "comorbidScore")

# Exposure
Exposure1 <- c("RadiationFl1", "SurgeryFl1","AdvChemoMedFl1","RadiationFl2", "SurgeryFl2","AdvChemoMedFl2","exposure")
Exposure2 <- c("Nortriptyline","Venlafaxine", "Metronidazole", "tricyclic_antidepressant", "SNRI")

# Demographic and other
DemographicOth <- c("Age_Group", "AgeAtDiagnosis","BMI_imputed", "Ethnicity", "Race", "Sex", "MaritalStatus", "Region","OthMetFlBase", "PanLocation", "BMI_GROUP_impute")

# Lab variables ("TotalBilirubin" was listed twice; the duplicate is removed)
lab <- c("DirectBilirubin","TotalBilirubin","ALP","Albumin","AST","ALT","TotalProtein","ProthrombinT", "SerumLipase")

# Analysis variables
analysis_var <- c("AVAL", "CNSR","PARAM", "indicator", "PanEncClass", "FollowUpDiagEnc","DiffPanDthDays", "PrimaryDiagnosisConceptId", 
"OthMetFl30", "OthMetCodeBase", "OthMetCode30", "OnsetDtFLPan","LivDtFl30","LivDtFlBase", "LivDtFl","diagEnc90", "diagEnc30", "diagEnc90Out", "diagEnc30Out", "pounds_final")

# Sort by PersonId, number the rows as subj0001, subj0002, ... and keep only the
# listed variables plus the new subject_id (PersonId itself is not kept).
# all_of() marks the character vectors as external selections: passing them bare
# is deprecated in tidyselect and is ambiguous if a column shares the vector's name.
Pandata_temp <- cases_control_allvar_new %>%
  arrange(PersonId) %>%
  mutate(subject_id = sprintf("subj%04d", row_number())) %>%
  select(
    all_of(c(comorbidity, Exposure1, Exposure2, DemographicOth, analysis_var)),
    subject_id
  )

In [83]:
colnames(cases_control_allvar_new) 

### Temporary analysis: Dr Eran

In [6]:
# Code set for low molecular weight heparin: the three RxNorm codes below together
# with their descendants
LMWH <- codeset(
  con, snapshot, "RXNORM", "selfAndDescendants",
  "67108", "67109", "69646"
)

In [25]:
#' Flag persons with repeated records of a medication before their ADT date
#'
#' Registers `codes` as the view `concept_code`, gathers the dates of all
#' administrations, requests and dispenses whose concept is in that view,
#' reduces them to distinct (PersonId, date) pairs and left-joins them onto the
#' `data` view. A date counts when it is at least one day before `ADT`; the flag
#' is 1 when the counted dates sum to two or more, otherwise 0 (this includes
#' persons without any matching medication record).
#'
#' Relies on the globals `con` and `snapshot` and on the existing view `data`.
#' NOTE(review): the sum is taken over the rows of `data`, so this presumably
#' expects one row per PersonId in `data` — confirm.
#'
#' @param codes Code set passed to `create_view()`; the SQL reads its `ConceptId` column.
#' @param med_name Single string used to name the flag column (`<med_name>Fl`).
#'   It is spliced into the SQL text, so it must be a plain identifier
#'   (letters, digits and underscores, not starting with a digit).
#' @return The collected query result: `PersonId` and the 1/0 column `<med_name>Fl`.
med_flag <- function(codes = "codes",med_name = "name")
{
  # Fail fast with a readable message instead of a cryptic SQL syntax error
  if (!is.character(med_name) || length(med_name) != 1L || is.na(med_name) ||
      !grepl("^[A-Za-z_][A-Za-z0-9_]*$", med_name)) {
    stop("`med_name` must be a single identifier-like string (letters, digits, underscore).",
         call. = FALSE)
  }

  create_view(codes,"concept_code")

  sql <- "

WITH adm AS 
(
    SELECT m.PersonId, CAST(m.StartDateTime AS DATE) AS StartDateTime
    FROM MedicationAdministration m 
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap 
          WHERE CodeConceptId IN (SELECT ConceptId FROM concept_code)        
    ) 
    AND m.StartDateTime IS NOT NULL
),

req AS 
(
    SELECT m.PersonId, CAST(m.StartDateTime AS DATE) AS StartDateTime
    FROM MedicationRequest m 
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap 
          WHERE CodeConceptId IN (SELECT ConceptId FROM concept_code)
    ) 
    AND m.StartDateTime IS NOT NULL
),

disp AS 
(
    SELECT m.PersonId, CAST(m.DispenseDateTime AS DATE) AS StartDateTime
    FROM MedicationDispense m 
    WHERE m.CodeConceptMapId IN (
          SELECT Id FROM MedicationCodeConceptMap 
          WHERE CodeConceptId IN (SELECT ConceptId FROM concept_code)
    ) 
    AND m.DispenseDateTime IS NOT NULL
),

all_tbl AS 
(
    SELECT * FROM adm
    UNION ALL
    SELECT * FROM req
    UNION ALL
    SELECT * FROM disp
),

filter_tb1 AS  -- unique PersonId and date (one row per PersonId and date)
(
    SELECT PersonId, StartDateTime
    FROM all_tbl
    GROUP BY PersonId, StartDateTime
),

filter_tb2 AS
(
  SELECT d.PersonId, CAST(d.PanDt AS DATE) AS PanDt, CAST(d.LivDt AS DATE) as LivDt, d.ADT,p.StartDateTime,
  CASE
   WHEN CAST(p.StartDateTime AS DATE) <= DATEADD(Day,-1,CAST(d.ADT AS DATE))
    THEN 1   
  ELSE 0
    END AS Fl1
  FROM data d
    LEFT JOIN filter_tb1 p
      ON d.PersonId = p.PersonId
),

filter_tb3 as
(
  SELECT PersonId,SUM(Fl1) AS CountFl1
  FROM filter_tb2
  GROUP BY PersonId
)

  SELECT PersonId, 
    CASE 
       WHEN  CountFl1 >= 2 THEN 1 ELSE 0  
    END AS %sFl
  FROM filter_tb3
"
  # %s in the last SELECT is replaced by the validated medication name
  sql1 <- sprintf(sql,med_name)

  load_sql_table(con,snapshot, query = sql1, output_mode = "sparklyr" ) %>% collect()
}

In [26]:
# Build one flag table per medication class with med_flag().
# NOTE(review): `snri` and `tricyclic_antidepressant` are overwritten here — the code set
# passed as `codes` is replaced by the returned flag table, so re-running this cell without
# recreating the code sets first hands a flag table to med_flag() instead of a code set.
Heparin <- med_flag(codes = LMWH,med_name = "Heparin")
snri <- med_flag(codes = snri,med_name = "snri")
tricyclic_antidepressant <- med_flag(codes = tricyclic_antidepressant,med_name = "tricyclic_antidepressant")

In [28]:
# Add the three medication flag tables to the analysis table (left joins on PersonId,
# in the order SNRI, tricyclic antidepressant, heparin)
new_med_data <- Reduce(
  function(joined, flags) left_join(joined, flags, by = "PersonId"),
  list(snri, tricyclic_antidepressant, Heparin),
  cases_control_allvar_new
)

In [8]:
# Number of persons flagged (flag == 1) for each medication; NA flags are ignored.
# The heparin count is taken from the joined analysis table (new_med_data), whereas the
# TAD and SNRI counts are taken from the standalone flag tables returned by med_flag().
# NOTE(review): the two sources can cover different sets of persons — confirm this is intended.
cat("HeparinFl:", sum(new_med_data$HeparinFl == 1, na.rm=TRUE),"\n")
cat("TAD:", sum(tricyclic_antidepressant$tricyclic_antidepressantFl == 1, na.rm=TRUE),"\n")
cat("SNRI:", sum(snri$snriFl == 1, na.rm=TRUE),"\n")

In [10]:
# Multivariable Cox proportional-hazards models for time AVAL with event `indicator`.
#library(glmnet)
#library(rms)
library(survival)

# Subset used for the second model: rows with LivDtFl == "N" or LivDtFl30 == "Y".
# NOTE(review): the original remark reads "Exclude baseline LM" — confirm this condition
# is the intended definition of that exclusion.
filter_data <- new_med_data %>% filter(LivDtFl == "N" | LivDtFl30 == "Y")

# The first two models are fitted on the full (unfiltered) table
dataset <- new_med_data

# Intercept-only (null) model on the full table
intercept_only <- coxph(Surv(AVAL,indicator)~1,data = dataset)

# Model 1: demographics, baseline disease characteristics, comorbidities and the
# heparin flag, fitted on the full table
cox_full1 <- coxph(Surv(AVAL,indicator)~ AgeAtDiagnosis + Ethnicity  + Sex + Race + OthMetFlBase + PanLocation + BMI_imputed +
CKD + T2DM + Hepatitis + CLD + Hypertension + osa +  Hyperlipidemia + COPD + Anxiety + Ischemic_Heart_Disease + Depression + Obesity_codes + Cancer +
Gastroesophageal_refluxdisease + abdominal_pain + Dyspnea + Anemia + FHOMND + HeparinFl, data = dataset)

# Model 2: the same covariates plus the treatment flags (radiation, surgery, chemo),
# fitted on the filtered subset (filter_data)
cox_full2 <- coxph(Surv(AVAL,indicator)~ AgeAtDiagnosis + Ethnicity  + Sex + Race + OthMetFlBase + PanLocation + BMI_imputed +
CKD + T2DM + Hepatitis + CLD + Hypertension + osa +  Hyperlipidemia + COPD + Anxiety + Ischemic_Heart_Disease + Depression + 
Obesity_codes + Cancer + Gastroesophageal_refluxdisease + abdominal_pain + Dyspnea + Anemia + FHOMND + RadiationFl1 + SurgeryFl1 + 
AdvChemoMedFl1 + HeparinFl, data = filter_data)


In [12]:
table(Exposure = new_med_data$HeparinFl,Event = new_med_data$indicator)

In [13]:
table(Exposure = filter_data$HeparinFl,Event = filter_data$indicator)

In [14]:
cox_full2

In [15]:
cox_full1

In [14]:
display_df(Pandata_pt2)

#### UNIVARIATE Analysis

In [64]:
library(survival)
cox2 <- coxph(Surv(AVAL,indicator) ~ StatinFl, data = pandataF1)
#cox1 <- coxph(Surv(AVAL,indicator) ~ HeparinFl, data = filter_data)
cox2

In [65]:

# Univariate logistic regression of `indicator` on the statin flag.
# family = binomial() is required for a logistic model: without it glm() uses its
# default gaussian() family and fits an ordinary linear model.
# NOTE: the object is named log2 like base::log2(); calls such as log2(8) still work
# because R looks for a function when a name is called.
log2 <- glm(indicator ~ StatinFl, family = binomial(), data = pandataF1)
#cox1 <- coxph(Surv(AVAL,indicator) ~ HeparinFl, data = filter_data)
log2

In [58]:
#data <- collect(temp_all)

# Chi-square test of each categorical variable against `indicator`
# (compares the incidence of metastases between the levels of each variable).

#data <- comorbidity
dataset_name <- pandata2

#data <- cases_control

# Variables to test (alternative sets are kept commented out)
#categorical_vars <- c("Age_Group", "Ethnicity", "Race", "Sex", "MaritalStatus","BMI_Group")
#categorical_vars <- c("FOLFIRINOX1", "Gemc_Cis_Cap1", "Gemc_Mono1", "Chemo1", "RadiationFl1", "SurgeryFl1")
#categorical_vars <- c("AdvChemoMedFl1","AdvChemoMedFl2")
categorical_vars <- c("notriptylinFl1","venlafaxineFl1", "metronidazoleFl1")

#categorical_vars <- c("CKD", "T2DM", "Hepatitis", "CLD", "Hypertension", "osa", "COPD", "Anxiety", "Ischemic_Heart_Disease", "Depression", "Obesity_codes", "Obesity_codes", "GingivitisPeriodontal")

# Run the test for each variable
results <- lapply(categorical_vars, function(var) {

  # Drop "Unknown" (rows where the variable is NA are dropped as well), then drop
  # every level that has fewer than 6 records
  filtered_data <- dataset_name %>%
    filter(.data[[var]] != "Unknown") %>%
    group_by(across(all_of(var))) %>%
    filter(n() >= 6) %>%
    ungroup()

  # factor() drops the levels that were just filtered out. For a factor column
  # table() would otherwise keep them as all-zero rows, which makes the expected
  # counts 0 and the statistic NaN.
  contingency_table <- table(factor(filtered_data[[var]]),
                             factor(filtered_data$indicator))

  # chisq.test() treats a one-row or one-column table as a goodness-of-fit
  # problem, which is not the test wanted here: report NA instead.
  if (nrow(contingency_table) < 2 || ncol(contingency_table) < 2) {
    warning("Chi-square test skipped for '", var,
            "': fewer than two levels left after filtering.", call. = FALSE)
    return(list(variable = var,
                contingency_table = contingency_table,
                chi_sq_stat = NA_real_,
                p_value = NA_real_,
                expected = NULL))
  }

  test <- chisq.test(contingency_table)

  list(variable = var,
       contingency_table = contingency_table,
       chi_sq_stat = test$statistic,
       p_value = test$p.value,
       expected = test$expected)
})

# Print results
for (result in results) {
  cat("Variable:", result$variable, "\n")
  print(result$contingency_table)
  cat("Chi-sq statistic:", result$chi_sq_stat, "\n")
  cat("P-value:", result$p_value, "\n")
  cat("Expected frequencies:\n")
  print(result$expected)
  cat("\n=======================\n")
}


In [39]:
library(dplyr)
library(survival)

# Univariate Cox proportional-hazards screen: one model per candidate variable,
# fitted on the rows of cases_control_allvar with LivDtFl30 == "Y".
dataset_name <- cases_control_allvar %>% filter(LivDtFl30 == "Y")

# Candidate variables (alternative sets are kept commented out)
categorical_vars <- c("AgeAtDiagnosis","Age_Group", "Ethnicity", "Race", "Sex", "MaritalStatus", "BMI_Group", "BMI","OthMetFLBase", "PanLocation", 
"CKD", "T2DM", "Hepatitis", "Hypertension", "CLD", "Hyperlipidemia", "osa", "COPD", "Anxiety", "Ischemic_Heart_Disease", "Depression", "Obesity_codes","Cancer",
"Gastroesophageal_refluxdisease", "abdominal_pain","Dyspnea","Anemia","FHOMND","comorbidScore", "RadiationFl1", "SurgeryFl1","AdvChemoMedFl1")

#categorical_vars <- c("FluorouracilFl1","GemcitabineFl1","OxaliplatinFl1","LeucovorinFl1","IrinotecanFl1","CisplatinFl1","CapecitabineFl1","FOLFIRINOX1", "Gemc_Cis_Cap1", "Gemc_Mono1", "Chemo1", "RadiationFl1", "SurgeryFl1")
#categorical_vars <- c("FOLFIRINOX", "Gemc_Cis_Cap", "Gemc_Mono","RadiationFl1", "SurgeryFl1","AdvChemoMedFl1","AdvChemoMedFl2")
#categorical_vars <- c("notriptylinFl1","venlafaxineFl1", "metronidazoleFl1")

#Demographic and other
#categorical_vars <- c("AgeAtDiagnosis","Age_Group", "Ethnicity", "Race", "Sex", "MaritalStatus", "BMI_Group", "BMI","OthMetFLBase", "PanLocation")

#lab variables
#categorical_vars <- c("DirectBilirubin","TotalProtein","AlkalinePhosphatase","LiverFunctionPanel","defAlbumin","TotalBilirubin","AsparateAminoTra","AlanAminTra", "GammaglutamylTransferase", "ProthrombinT", "SerumLipase", "SerumPanAmylase")

# Fit one Cox model per variable and collect one result row per coefficient.
# Changes against the earlier version of this cell:
#  * the effect estimate is exp(coef), i.e. a hazard ratio, and is labelled as such
#    (it used to be labelled "Log_Odds_Ratio");
#  * columns are picked by name, not by position, so the result stays correct if
#    summary() adds columns (e.g. robust standard errors);
#  * every coefficient (every non-reference level of a factor) is reported, not
#    only the first one.
results <- lapply(categorical_vars, function(var) {
  model <- coxph(as.formula(paste("Surv(AVAL,indicator)~", var)),
                 data = dataset_name)

  coef_summary <- summary(model)$coefficients
  p_values <- unname(coef_summary[, "Pr(>|z|)"])

  data.frame(
    Variable = var,
    Level = rownames(coef_summary),
    Hazard_Ratio = round(unname(coef_summary[, "exp(coef)"]), 3),
    P_Value = round(p_values, 3),
    Interpretation = ifelse(p_values < 0.05, "Significant", "Not Significant"),
    row.names = NULL
  )
})

# Combine results into a single table
results_df <- bind_rows(results)

# Print results
print(results_df)

# Optionally, export to CSV
# write.csv(results_df, "cox_univariate_summary.csv", row.names = FALSE)


In [14]:
# Install rms only when it is missing (avoids a reinstall on every run), then attach it
if (!requireNamespace("rms", quietly = TRUE)) {
  install.packages("rms")
}
library(rms)


In [138]:
# Summary table of chi-square tests: each categorical variable against `indicator`
# (compares the incidence of metastases between the levels of each variable).
dataset <- cases_control

# Variables to test (alternative sets are kept commented out)
categorical_vars <- c("Age_Group", "Ethnicity", "Race", "Sex", "MaritalStatus","BMI_Group")
#categorical_vars <- c("CKD", "T2DM", "Hepatitis", "CLD", "Hypertension", "osa", "COPD", "Anxiety", "Ischemic_Heart_Disease", "Depression", "Obesity_codes", "GingivitisPeriodontal")

#categorical_vars <- c("FOLFIRINOX1", "Gemc_Cis_Cap1", "Gemc_Mono1", "Chemo1", "RadiationFl1", "SurgeryFl1")
#categorical_vars <- c("notriptylinFl1","venlafaxineFl1","metronidazolelF1")

# One result row per variable: statistic, p-value and a 5 % significance label
results <- lapply(categorical_vars, function(var) {
  test <- chisq.test(table(dataset[[var]], dataset$indicator))

  data.frame(
    variable = var,
    chi_sq_stat = round(test$statistic, 3),
    p_value = round(test$p.value, 3),
    interpretation = ifelse(test$p.value < 0.05, "Significant", "Not Significant")
  )
})

# Stack the rows and show the table
results_df <- bind_rows(results)
print(results_df)


##### <mark><u>_**Function**_</u></mark>

In [None]:
# Reminder list of frequently used inspection calls: NA count per column, table
# preview, row count, sum.
# NOTE(review): this appears to be a template rather than a runnable cell —
# display_df() and nrow() are written without their data argument, and data_no_na
# is not defined in the visible part of the notebook.
colSums(is.na(data_no_na))
display_df()
nrow()
sum()

### <mark>**Trial**</mark>

In [44]:
# Keep the rows of pandata_all2 that either have LivDtFl == "Y" with AVAL <= 1, or have
# a non-zero CNSR with AVAL >= 1. filter() drops rows for which the condition is NA.
# NOTE(review): no data sets are combined in this statement — confirm the intended
# inclusion rule and the unit of AVAL.
pandataF1 <- pandata_all2 %>% filter((LivDtFl == "Y" & AVAL <= 1) | (CNSR != 0 & AVAL >= 1))

In [63]:
table(StatinExp=pandataF1$StatinFl, LivMetFl = pandataF1$LivDtFl)

In [56]:
colnames(pandataF1)

In [23]:
# Working copy of Pandata_pt2 plus the variable groups used to set column classes
pandataT <- Pandata_pt2
comorbidity <- c("CKD", "T2DM", "Hepatitis", "Hypertension", "CLD", "Hyperlipidemia", "osa", "COPD", "Anxiety", "Ischemic_Heart_Disease", "Depression", "Obesity_codes","Cancer",
                 "Gastroesophageal_refluxdisease", "abdominal_pain","Dyspnea","Anemia","FHOMND")
Demo_cat <- c("Age_Group", "Ethnicity", "Race", "Sex", "MaritalStatus", "Region","PanLocation", "BMI_GROUP_impute","DistantMetFlBase")
Demo_cont <- c("AgeAtDiagnosis","BMI_imputed")
Exposure1 <- c("RadiationFl1", "SurgeryFl1","AdvChemoMedFl1")

vars_cat <- c(Demo_cat,comorbidity, Exposure1)

# Expected categorical variables that are not columns of pandataT
missing_vars <- vars_cat[!vars_cat %in% colnames(pandataT)]
print(missing_vars)

# Convert only the columns that exist. Assigning as.factor() / as.numeric() of a
# missing column (NULL) to a data frame stops with a "replacement has 0 rows"
# error, which would abort the cell right after the missing names were printed.
for (var in vars_cat) {
  if (var %in% colnames(pandataT)) {
    pandataT[[var]] <- as.factor(pandataT[[var]])
  }
}
for (var in Demo_cont) {
  if (var %in% colnames(pandataT)) {
    pandataT[[var]] <- as.numeric(pandataT[[var]])
  } else {
    message("Variable not found (numeric): ", var)
  }
}

# Set reference levels, collapse PanLocation and fix the level order for pandataT.
# After each relevel() the distinct values are displayed as a visual check.
# relevel() stops with an error when `ref` is not an existing level.
pandataT$Age_Group <- relevel(pandataT$Age_Group, ref = "Age 45-60")
unique(factor(pandataT$Age_Group))

pandataT$Ethnicity <- relevel(pandataT$Ethnicity, ref = "Not Hispanic or Latino")
unique(factor(pandataT$Ethnicity))

pandataT$MaritalStatus <- relevel(pandataT$MaritalStatus, ref = "Married")
unique(factor(pandataT$MaritalStatus))

pandataT$Race <- relevel(pandataT$Race, ref = "White")
unique(factor(pandataT$Race))

pandataT$Sex <- relevel(pandataT$Sex, ref = "Female")
unique(factor(pandataT$Sex))

pandataT$BMI_GROUP_impute <- relevel(as.factor(pandataT$BMI_GROUP_impute), ref = "Normal")
unique(factor(pandataT$BMI_GROUP_impute))

# Collapse the tumour location: Body and Tail -> "Body/Tail"; the three listed minor
# sites -> "Other"; every other value is kept as text.
# NOTE(review): "Other Parth" looks like a typo — confirm against the raw PanLocation values.
pandataT <- pandataT %>%
  mutate(PanLocation = case_when(
    PanLocation %in% c("Body", "Tail") ~ "Body/Tail",
    PanLocation %in% c("Other Parth", "Overlapping Sites", "Duct") ~ "Other",
    TRUE ~ as.character(PanLocation)  # Keep remaining values unchanged
  ))
pandataT$PanLocation <- relevel(as.factor(pandataT$PanLocation), ref = "Body/Tail")
unique(factor(pandataT$PanLocation))

# Rename the outcome column Indicator -> indicator.
# rename() errors when `Indicator` is absent, e.g. when this cell is run a second time.
pandataT <- pandataT %>% rename("indicator" = "Indicator")

# Fix the level order (first level = reference). factor(levels = ...) turns every value
# that is not listed into NA — e.g. an "Age < 45" group, "Endocrine" location or a
# Sex other than Female/Male, if such values are present.
pandataT$Age_Group <- factor(pandataT$Age_Group, levels = c("Age 45-60", "Age 60-75", "Age >= 75"))
pandataT$BMI_GROUP_impute <- factor(pandataT$BMI_GROUP_impute,
                                    levels = c("Normal", "Underweight", "Overweight",
                                               "Obese(Class I)", "Obese(Class II)",
                                               "Obese(Class III)", "UNK"))

pandataT$Sex <- factor(pandataT$Sex, levels = c("Female", "Male"))
pandataT$Ethnicity <- factor(pandataT$Ethnicity, levels = c("Not Hispanic or Latino", "Hispanic or Latino", "Unknown"))
pandataT$Race <- factor(pandataT$Race, levels = c("White", "Black", "Other Race", "Unknown"))
pandataT$PanLocation <- factor(pandataT$PanLocation, levels = c("Body/Tail", "Head", "Other", "Unspecified"))

In [27]:

# Variable groups used to set the column classes of cases_control_allvar_new
comorbidity <- c("CKD", "T2DM", "Hepatitis", "Hypertension", "CLD", "Hyperlipidemia", "osa", "COPD", "Anxiety", "Ischemic_Heart_Disease", "Depression", "Obesity_codes","Cancer",
                 "Gastroesophageal_refluxdisease", "abdominal_pain","Dyspnea","Anemia","FHOMND")
Demo_cat <- c("Age_Group", "Ethnicity", "Race", "Sex", "MaritalStatus", "Region",
              "PanLocation", "BMI_GROUP_impute")
Demo_cont <- c("AgeAtDiagnosis","BMI_imputed")
Exposure1 <- c("RadiationFl1", "SurgeryFl1","AdvChemoMedFl1")

vars_cat <- c(Demo_cat,comorbidity, Exposure1)

# Expected categorical variables that are not columns of the analysis table
missing_vars <- vars_cat[!vars_cat %in% colnames(cases_control_allvar_new)]
print(missing_vars)

# Convert only the columns that exist. Assigning as.factor() / as.numeric() of a
# missing column (NULL) to a data frame stops with a "replacement has 0 rows"
# error, which would abort the cell right after the missing names were printed.
for (var in vars_cat) {
  if (var %in% colnames(cases_control_allvar_new)) {
    cases_control_allvar_new[[var]] <- as.factor(cases_control_allvar_new[[var]])
  }
}
for (var in Demo_cont) {
  if (var %in% colnames(cases_control_allvar_new)) {
    cases_control_allvar_new[[var]] <- as.numeric(cases_control_allvar_new[[var]])
  } else {
    message("Variable not found (numeric): ", var)
  }
}

# Set reference levels, collapse PanLocation and fix the level order for
# cases_control_allvar_new. After each relevel() the distinct values are displayed
# as a visual check. relevel() stops with an error when `ref` is not an existing level.
cases_control_allvar_new$Age_Group <- relevel(cases_control_allvar_new$Age_Group, ref = "Age 45-60")
unique(factor(cases_control_allvar_new$Age_Group))

cases_control_allvar_new$Ethnicity <- relevel(cases_control_allvar_new$Ethnicity, ref = "Not Hispanic or Latino")
unique(factor(cases_control_allvar_new$Ethnicity))

cases_control_allvar_new$MaritalStatus <- relevel(cases_control_allvar_new$MaritalStatus, ref = "Married")
unique(factor(cases_control_allvar_new$MaritalStatus))

cases_control_allvar_new$Race <- relevel(cases_control_allvar_new$Race, ref = "White")
unique(factor(cases_control_allvar_new$Race))

cases_control_allvar_new$Sex <- relevel(cases_control_allvar_new$Sex, ref = "Female")
unique(factor(cases_control_allvar_new$Sex))

cases_control_allvar_new$BMI_GROUP_impute <- relevel(as.factor(cases_control_allvar_new$BMI_GROUP_impute), ref = "Normal")
unique(factor(cases_control_allvar_new$BMI_GROUP_impute))

# Collapse the tumour location: Body and Tail -> "Body/Tail"; the three listed minor
# sites -> "Other"; every other value is kept as text.
# NOTE(review): "Other Parth" looks like a typo — confirm against the raw PanLocation values.
cases_control_allvar_new <- cases_control_allvar_new %>%
  mutate(PanLocation = case_when(
    PanLocation %in% c("Body", "Tail") ~ "Body/Tail",
    PanLocation %in% c("Other Parth", "Overlapping Sites", "Duct") ~ "Other",
    TRUE ~ as.character(PanLocation)  # Keep remaining values unchanged
  ))
cases_control_allvar_new$PanLocation <- relevel(as.factor(cases_control_allvar_new$PanLocation), ref = "Body/Tail")
unique(factor(cases_control_allvar_new$PanLocation))

# The outcome column is not renamed for this table (statement kept for reference)
#cases_control_allvar_new <- cases_control_allvar_new %>% rename("indicator" = "Indicator")

# Fix the level order (first level = reference). factor(levels = ...) turns every value
# that is not listed into NA — e.g. an "Age < 45" group, "Endocrine" location or a
# Sex other than Female/Male, if such values are present.
cases_control_allvar_new$Age_Group <- factor(cases_control_allvar_new$Age_Group, levels = c("Age 45-60", "Age 60-75", "Age >= 75"))
cases_control_allvar_new$BMI_GROUP_impute <- factor(cases_control_allvar_new$BMI_GROUP_impute,
                                    levels = c("Normal", "Underweight", "Overweight",
                                               "Obese(Class I)", "Obese(Class II)",
                                               "Obese(Class III)", "UNK"))

cases_control_allvar_new$Sex <- factor(cases_control_allvar_new$Sex, levels = c("Female", "Male"))
cases_control_allvar_new$Ethnicity <- factor(cases_control_allvar_new$Ethnicity, levels = c("Not Hispanic or Latino", "Hispanic or Latino", "Unknown"))
cases_control_allvar_new$Race <- factor(cases_control_allvar_new$Race, levels = c("White", "Black", "Other Race", "Unknown"))
cases_control_allvar_new$PanLocation <- factor(cases_control_allvar_new$PanLocation, levels = c("Body/Tail", "Head", "Other", "Unspecified"))


In [28]:
#pandata_all <- bind_rows(cases_control_allvar_new,pandataT)
#sum(is.na(pandataT$PanLocation))

# Exclude rows with missing or "Unknown" sex and rows with missing or "Endocrine"
# tumour location.
# The earlier condition `Sex != "Unknown" | !is.na(Sex)` was TRUE for every
# non-missing value (including "Unknown"), so it only removed NA rows; the same
# applied to PanLocation. Each exclusion is now a separate condition, which
# filter() combines with AND.
pandata_all1 <- pandata_all %>%
  filter(
    !is.na(Sex), Sex != "Unknown",
    !is.na(PanLocation), PanLocation != "Endocrine"
  )

# Readable outcome label: 0 = Control, 1 = Cases
pandata_all1 <- pandata_all1 %>%
  mutate(
    indicator_name = factor(indicator, levels = c(0, 1), labels = c("Control", "Cases"))
  )


In [36]:
nrow(pandataF1)

In [59]:
# Logistic regression of `indicator` on pandataF1 with AIC-based stepwise selection.

# Null model (intercept only): the starting point of the search
log_intercept_only <- glm(indicator ~ 1, family = binomial(), data = pandataF1)

# Full model: its formula is the upper scope of the search (demographics, baseline
# disease characteristics, comorbidities and the statin flag)
log_full1 <- glm(indicator ~ AgeAtDiagnosis + Ethnicity  + Sex + Race  + DistantMetFlBase + PanLocation + BMI_imputed +
                   CKD + T2DM + Hepatitis + CLD + Hypertension + osa +  Hyperlipidemia + COPD + Anxiety + Ischemic_Heart_Disease + Depression + Obesity_codes + Cancer +
                   Gastroesophageal_refluxdisease + abdominal_pain + Dyspnea + Anemia + FHOMND + StatinFl,
                 family = binomial(), data = pandataF1)

# Stepwise selection in both directions, starting from the null model; trace = 0
# suppresses the per-step log.
# NOTE(review): step() needs every candidate model to use the same rows; if covariates
# contain NA it can stop with "number of rows in use has changed" — confirm the data are complete.
step_both_model <- step(log_intercept_only, direction='both', scope=formula(log_full1), trace=0)

In [60]:
step_both_model

In [61]:
# Exponentiate a coefficient to read it as a ratio (about 1.23).
# NOTE(review): 0.2048 looks hand-copied from a model printout — confirm which term it
# belongs to; exp(coef(step_both_model)) would give all ratios without retyping.
exp(0.2048)

In [51]:
str(pandataF1)

In [None]:
# Variable groups for the analysis. The combined vectors are assembled from the
# smaller groups so that each name is typed only once.

# Comorbidity
comorbidity <- c(
  "CKD", "T2DM", "Hepatitis", "Hypertension", "CLD", "Hyperlipidemia", "osa",
  "COPD", "Anxiety", "Ischemic_Heart_Disease", "Depression", "Obesity_codes",
  "Cancer", "Gastroesophageal_refluxdisease", "abdominal_pain", "Dyspnea",
  "Anemia", "FHOMND"
)

# Exposure
Exposurea <- c(
  "FluorouracilFl1", "GemcitabineFl1", "OxaliplatinFl1", "LeucovorinFl1",
  "IrinotecanFl1", "CisplatinFl1", "CapecitabineFl1", "FOLFIRINOX1",
  "Gemc_Cis_Cap1", "Gemc_Mono1", "Chemo1", "RadiationFl1", "SurgeryFl1"
)
Exposureb <- c(
  "FOLFIRINOX", "Gemc_Cis_Cap", "Gemc_Mono", "RadiationFl1", "SurgeryFl1",
  "AdvChemoMedFl1", "AdvChemoMedFl2"
)

Exposure1 <- c("RadiationFl1", "SurgeryFl1", "AdvChemoMedFl1", "exposure")
Exposure2 <- c("notriptylinFl1", "venlafaxineFl1", "metronidazoleFl1")

# Demographic and other
DemographicOth <- c(
  "AgeAtDiagnosis", "Age_Group", "Ethnicity", "Race", "Sex", "MaritalStatus",
  "Region", "BMI_Group", "BMI", "BMI_GROUP_impute", "BMI_imputed",
  "OthMetFLBase", "PanLocation"
)
TempVar <- c("Race_old", "BMI_Group", "BMI", "PanLocation_old")

# Lab variables
lab_all <- c(
  "DirectBilirubin", "TotalBilirubin", "ALP", "Albumin", "TotalBilirubin",
  "AST", "ALT", "TotalProtein", "ProthrombinT", "SerumLipase", "SerumPanAmylase"
)

# Everything except the lab variables
all_wo_lb <- c(
  "AgeAtDiagnosis", "Age_Group", "Ethnicity", "Race", "Sex", "MaritalStatus",
  "BMI_Group", "BMI", "OthMetFLBase", "PanLocation",
  comorbidity,
  "comorbidScore", "RadiationFl1", "SurgeryFl1", "AdvChemoMedFl1"
)

# Full list: non-lab variables followed by the lab variables
all <- c(all_wo_lb, lab_all)

In [11]:
# Data-driven check of the most frequent conditions in the cohort: counts the
# distinct persons per condition code concept and keeps the 30 most common
# (the query's LIMIT is 30).

sql <-"
    SELECT ccm.CodeConceptId, COUNT(DISTINCT c.PersonId) AS PersonCount
    FROM condition c
     INNER JOIN ConditionCodeConceptMap ccm 
        ON c.CodeConceptMapId = ccm.Id
    GROUP BY CodeConceptId 
    ORDER BY PersonCount DESC
    LIMIT 30
"
# Run the query against the snapshot (passing 'topCond' as the view name and
# requesting sparklyr output), then display the result.
topCond <- load_sql_table(con, snapshot, sql, view_name='topCond',output_mode = "sparklyr") 
display_df(topCond)

#### Patient Characteristics

In [137]:
library(dplyr)
library(tidyr)
library(knitr)

# Patient-characteristics table: one summary per categorical variable,
# stacked into a single table and rendered with kable().
data <- cases_control

# Column names must match the data exactly. The marital-status column is
# spelled "MaritalStatus" (no space) everywhere else in this notebook, so the
# previous "Marital Status" entry could not match it.
categorical_vars <- c("Age_Group", "Sex", "Ethnicity", "Race", "BMI_Group", "MaritalStatus", "OthMetFlBase")

# Generate summaries using lapply; summary_table() is defined elsewhere in the
# notebook. Each result is tagged with the variable it describes.
summaries <- lapply(categorical_vars, function(var) {
  summary_table(data, var) %>% mutate(Variable = var)
})

# Combine all summaries into a final summary table
final_summary <- bind_rows(summaries) %>%
  select(Variable, Category, n, Percentage) %>%
  arrange(Variable)

# Display the final summary table
kable(final_summary, caption = "Patient Characteristics Summary for Pancreatic Cancer Subject")

In [7]:
#' Decodes the top level *ConceptId columns within the given data frame and derives new names (without the ConceptId suffix).
#' Unless `columns` is given, every column ending with `ConceptId` whose type is integer or floating point is assumed to be a concept column.
#' Names are looked up in the `Concept` table (ConceptId -> ConceptName) with a left join, so ids without a match decode to a missing value.
#' @param df the data frame to decode (R data.frame, sparklyr tbl_spark or SparkR SparkDataFrame)
#' @param drop_concepts whether to drop *ConceptId columns (default true)
#' @param columns the optional list of columns to decode; when given, exactly these columns are decoded, whatever their name or type
#' @return the enhanced data frame, with each decoded column placed where its id column was (right after it when the id column is kept)
decode_concepts <- \(df, drop_concepts = TRUE, columns = NULL) {
    stopifnot(inherits(df, c("data.frame", "tbl_spark", "SparkDataFrame")))

    # Numeric type labels. class() and SparkR::coltypes() report R type names,
    # where floating point columns are "numeric" (never "double");
    # sparklyr::sdf_schema() reports Spark names such as "DoubleType".
    numeric_types <- c('integer', 'double', 'numeric', 'IntegerType', 'FloatType', 'DoubleType')

    # TRUE when `col` must be decoded: either explicitly requested through
    # `columns`, or detected by its name suffix and numeric type.
    should_decode <- \(col, dtype) {
        if(!is.null(columns)) {
            col %in% columns
        } else {
            grepl("ConceptId$", col) && dtype %in% numeric_types
        }
    }

    # Names already taken; grows as decoded columns are added.
    column_names <- colnames(df)

    # Name of the decoded column: the id column name without its `ConceptId`
    # suffix, extended with "Name" until it no longer clashes.
    target_col <- \(col) {
        name <- sub("ConceptId$", "", col)
        if (name == col && drop_concepts) {
            # No suffix to strip and the id column is dropped anyway:
            # decode in place under the same name.
            return(name)
        }
        while (name %in% column_names) {
            name <- paste0(name, "Name")
        }
        name
    }

    # Unused temporary column name derived from `col`.
    safe_col <- \(col) {
        while (col %in% column_names) {
            col <- paste0(col, "_tmp")
        }
        col
    }

    # Output column order, built while walking the input columns.
    final_order <- c()

    # dplyr implementation of one decode step (local data frames and tbl_spark).
    decode_dplyr <- \(df, col, name, concepts) {
        if (col == name) {
            # In-place decode: move the ids to a temporary column, join the
            # names in under the original column name, then drop the ids.
            tmp_col <- safe_col(col)
            concept_x <- concepts |> dplyr::rename({{tmp_col}} := ConceptId, {{name}} := ConceptName)
            df <- df |> dplyr::rename({{tmp_col}} := {{col}}) |> dplyr::left_join(concept_x, by=tmp_col) |>
                dplyr::select(-tidyselect::all_of(tmp_col))
        } else {
            df <- df |> dplyr::left_join(concepts |> dplyr::rename({{col}} := ConceptId, {{name}} := ConceptName), by=col)
            if (drop_concepts) {
                df <- df |> dplyr::select(-tidyselect::all_of(col))
            }
        }
        df
    }

    if (inherits(df, "data.frame")) {
        concepts <- NULL
        for (col in colnames(df)) {
            dtype <- class(df[[col]])[1]  # Get the data type of the column
            if (should_decode(col, dtype)) {
                name <- target_col(col)
                column_names <- c(column_names, name)

                if (is.null(concepts)) {
                    # Lazy lookup: only query Spark when something is decoded
                    concepts <- SparkR::sql("SELECT ConceptId, ConceptName FROM Concept") |> SparkR::collect()
                }
                df <- decode_dplyr(df, col, name, concepts)
                if (!drop_concepts) {
                    final_order <- c(final_order, col)
                }
                final_order <- c(final_order, name)
            } else {
                final_order <- c(final_order, col)
            }
        }
        df <- df[final_order]
    } else if (inherits(df, "tbl_spark")) {
        concepts_s <- sparklyr::sdf_sql(sparklyr::spark_connection(df), "SELECT ConceptId, ConceptName FROM Concept") |> sparklyr::sdf_persist()
        info <- dplyr::bind_rows(sparklyr::sdf_schema(df))
        # seq_len() keeps the loop empty for a table without columns
        for (i in seq_len(nrow(info))) {
            col <- info$name[i]
            if (should_decode(col, info$type[i])) {
                name <- target_col(col)
                column_names <- c(column_names, name)
                df <- decode_dplyr(df, col, name, concepts_s)
                if (!drop_concepts) {
                    final_order <- c(final_order, col)
                }
                final_order <- c(final_order, name)
            } else {
                final_order <- c(final_order, col)
            }
        }
        # Reorder with a dplyr verb, which is what a lazy Spark table supports
        df <- df |> dplyr::select(tidyselect::all_of(final_order))
    } else {
        concepts_s <- SparkR::sql("SELECT ConceptId, ConceptName FROM Concept") |> SparkR::cache()
        info <- tibble::tibble(name=colnames(df), type=SparkR::coltypes(df))
        for (i in seq_len(nrow(info))) {
            col <- info$name[i]
            if (should_decode(col, info$type[i])) {
                name <- target_col(col)
                column_names <- c(column_names, name)
                if (col == name) {
                    # In-place decode through a temporary id column; the
                    # suffixed key copies left by merge() are dropped.
                    tmp_col <- safe_col(col)
                    tmp_s <- concepts_s |> SparkR::withColumnRenamed("ConceptId", tmp_col) |> SparkR::withColumnRenamed("ConceptName", name)
                    df <- df |> SparkR::withColumnRenamed(col, tmp_col) |>
                        SparkR::merge(tmp_s, by=tmp_col, all.x = TRUE, all.y = FALSE) |> SparkR::drop(paste0(tmp_col, c("_x", "_y")))
                } else {
                    tmp_s <- concepts_s |> SparkR::withColumnRenamed("ConceptId", col) |> SparkR::withColumnRenamed("ConceptName", name)
                    df <- df |> SparkR::merge(tmp_s, by=col, all.x = TRUE, all.y = FALSE) |> SparkR::drop(paste0(col, "_y"))
                    if (drop_concepts) {
                        df <- df |> SparkR::drop(paste0(col, "_x"))
                    } else {
                        final_order <- c(final_order, col)
                        df <- df |> SparkR::withColumnRenamed(paste0(col, "_x"), col)
                    }
                }
                final_order <- c(final_order, name)
            } else {
                final_order <- c(final_order, col)
            }
        }
        df <- df |> SparkR::select(final_order)
    }
    df

}

#### Exploratory analysis for Demographic details

In [12]:
library(ggplot2)
library(reshape2)

# Long format: the five demographic columns are stacked into a `variable` /
# `value` pair so that each one can be drawn in its own facet.
long_data <- melt(
  pancreatic_final_data,
  measure.vars = c("Age_Group", "Sex", "MaritalStatus", "Ethnicity", "Race")
)

# One bar chart per variable, with the count printed above every bar.
# The plot is stored in `plot`; this cell does not display it.
plot <- ggplot(data = long_data, mapping = aes(x = value)) +
  geom_bar(color = "black", fill = "skyblue") +
  geom_text(aes(label = ..count..), stat = "count", size = 3, vjust = -0.5) +
  facet_wrap(~ variable, ncol = 2, scales = "free") +
  labs(title = "Histograms of Categorical Variables", x = "Category", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

In [None]:
# Demographics of the rows whose time_to_metastasis is not 0.
# NOTE(review): the original comment labels this subset "no liver met" while
# another cell calls the same filter the "liver metastases group" -- confirm
# which reading of time_to_metastasis != 0 is intended.
No_LivMet<- pancreatic_final_data %>% filter(time_to_metastasis != 0)
#LivMet <- pancreatic_final_data %>% filter(time_to_metastasis == 0)

library(ggplot2)
library(reshape2)

# Melt the dataset into a long format.
# The age-group column is "Age_Group" (capitalised), as in the other cells that
# use pancreatic_final_data; melt() stops on unknown measure variables, so the
# previous lower-case "age_group" could not work.
long_data <- melt(No_LivMet, measure.vars = c("Age_Group","Sex", "MaritalStatus", "Ethnicity", "Race"))

# Create a bar chart for each variable, with the count above every bar
plot <- ggplot(long_data, aes(x = value)) +
  geom_bar(fill = "skyblue", color = "black") +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5, size = 3) +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  labs(
    title = "Histograms of Categorical Variables",
    x = "Category",
    y = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Display the plot
plot

In [14]:
# Compare time_to_metastasis across the levels of each categorical variable,
# leaving out rows whose level is "Unknown".

# Keep the rows with a non-zero time_to_metastasis.
# NOTE(review): the original comment calls this the "liver metastases group";
# confirm that a non-zero time is the intended definition.
data <- pancreatic_final_data %>% filter(time_to_metastasis != 0)

# List of categorical variables to analyze
categorical_vars <- c("MaritalStatus", "Age_Group", "Ethnicity", "Race", "Sex")

# One result per variable: the per-level summary plus the test that fits the
# number of levels (ANOVA + Tukey HSD for more than 2, t-test for exactly 2).
results <- lapply(categorical_vars, function(var) {
  # Filter out "Unknown" levels (filter() also drops rows where var is NA)
  filtered_data <- data %>%
    filter(.data[[var]] != "Unknown")

  # Summarize statistics for each group
  summary_table <- filtered_data %>%
    group_by(.data[[var]]) %>%
    summarize(
      mean_time = mean(time_to_metastasis, na.rm = TRUE),
      median_time = median(time_to_metastasis, na.rm = TRUE),
      sd_time = sd(time_to_metastasis, na.rm = TRUE),
      n = n()
    ) %>%
    ungroup()

  # TukeyHSD() only recognises factor terms, so a character column would make
  # the post-hoc step fail. factor() also drops levels that are no longer
  # present (such as "Unknown"), so nlevels() counts the groups really tested.
  filtered_data[[var]] <- factor(filtered_data[[var]])
  num_levels <- nlevels(filtered_data[[var]])
  formula <- as.formula(paste("time_to_metastasis ~", var))

  # Perform the appropriate test
  if (num_levels > 2) {
    # ANOVA for variables with more than 2 levels
    aov_result <- aov(formula, data = filtered_data)
    list(
      variable = var,
      summary_table = summary_table,
      test_type = "ANOVA",
      test_result = summary(aov_result),
      pairwise_comparisons = TukeyHSD(aov_result)  # Post-hoc pairwise comparisons
    )
  } else if (num_levels == 2) {
    # t-test for variables with exactly 2 levels
    list(
      variable = var,
      summary_table = summary_table,
      test_type = "t-test",
      test_result = t.test(formula, data = filtered_data)
    )
  } else {
    # Fewer than 2 levels: nothing to test, but keep the summary so the
    # report below can still show it.
    list(
      variable = var,
      summary_table = summary_table,
      message = "Insufficient levels for testing"
    )
  }
})

# Print results
for (result in results) {
  cat("Variable:", result$variable, "\n")
  print(result$summary_table)

  if (!is.null(result$message)) {
    cat(result$message, "\n")
  } else if (result$test_type == "ANOVA") {
    cat("Test Type: ANOVA\n")
    print(result$test_result)
    cat("Pairwise Comparisons:\n")
    print(result$pairwise_comparisons)
  } else if (result$test_type == "t-test") {
    cat("Test Type: t-test\n")
    print(result$test_result)
  }

  cat("\n=======================\n")
}


In [26]:
# Distribution of DiffPanDthDays (days between diagnosis and death), grouped
# into fixed ranges.

# Range edges. Intervals are closed on the right, so the first bin holds every
# value <= 0 and the second holds values in (0, 1].
bins <- c(-Inf, 0, 1, 7, 30, 90, 360, 722, Inf)

# One label per interval
labels <- c(
  "0 Days", "1 Day", "2-7 Days", "8-30 Days",
  "31-90 Days", "91-360 Days", "361-722 Days", "723+ Days"
)

# Assign every record to its range and count the records per range
days_bins <- cut(
  x = TempData1$DiffPanDthDays,
  breaks = bins,
  labels = labels,
  right = TRUE
)
freq_table <- table(days_bins)
print(freq_table)

# Bar chart of the counts, with the count printed above every bar
library(ggplot2)
ggplot(as.data.frame(freq_table), aes(x = days_bins, y = Freq)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Histogram of Number of Days of Death After Diagnosis",
    x = "Time Ranges",
    y = "Frequency"
  ) +
  geom_text(aes(label = Freq), size = 5, vjust = -0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


In [8]:
%%sparkr
# Person Table
# Query text selecting every column of the Person table. This cell only stores
# the text in `sql`; it does not run the query.
sql <- "SELECT *
FROM Person p
"

In [41]:
# Write the current `plot` object to R_plot.png in the working directory.
ggsave(filename = "./R_plot.png", plot = plot)

In [45]:
# Resolve and show the study's local artifacts directory.
artifacts_path_local <- get_artifacts_path(con, study, fs = TRUE)
artifacts_path_local

# Save `df` to the artifacts store under the given path.
# NOTE(review): `df` is not defined in this part of the notebook -- confirm it
# holds the intended data frame before running.
path <- file.path("/public/mydata")
save_artifacts_data(con, study, df, path)


In [36]:
# Local artifacts directory of the study
artifacts_path_local <- get_artifacts_path(con, study, fs = TRUE)

In [None]:
# Save `df` to the artifacts store under study_data/sampler.
# NOTE(review): `df` is not defined in this part of the notebook.
path <- file.path("study_data/sampler")
save_artifacts_data(con, study, df, path)

In [37]:
# Path of the directory to be created.
# Note: the spark artifacts path is used here because the directory is meant
# to be handled with the spark fs utilities.
new_dir <- paste0(artifacts_path_spark, "/my_data")
new_dir

In [7]:
# Column names of the case/control data set
colnames(cases_control_allvar_new)

In [8]:
# Keep the pan_demo1 persons that are not part of cases_control_allvar_new.
known_person <- pan_demo1$PersonId %in% cases_control_allvar_new$PersonId
temp_data <- pan_demo1[!known_person, , drop = FALSE]
rm(known_person)

In [78]:
# Derive the age group at the time of pancreatic diagnosis.
# Conditions are checked top to bottom, so each one only needs its upper
# bound; a missing age matches none of them and yields NA.
temp_data9 <- temp_data9 %>%
  mutate(
    Age_Group = case_when(
      AgeAtDiagnosis < 45 ~ "Age < 45",
      AgeAtDiagnosis < 60 ~ "Age 45-60",
      AgeAtDiagnosis < 75 ~ "Age 60-75",
      AgeAtDiagnosis >= 75 ~ "Age >= 75"
    )
  )

In [11]:
# Map each state to a region; the result is stored as a factor.
# States that are not listed (and missing values) fall into "Unknown".
temp_data <- temp_data %>%
  mutate(
    Region = case_when(
      StateOrProvince %in% c(
        "Connecticut", "Maine", "Massachusetts", "New Hampshire",
        "Rhode Island", "Vermont", "New Jersey", "New York", "Pennsylvania"
      ) ~ "Northeast",
      StateOrProvince %in% c(
        "Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin", "Iowa",
        "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota",
        "South Dakota"
      ) ~ "Midwest",
      StateOrProvince %in% c(
        "Delaware", "Florida", "Georgia", "Maryland", "North Carolina",
        "South Carolina", "Virginia", "West Virginia", "Alabama", "Kentucky",
        "Mississippi", "Tennessee", "Arkansas", "Louisiana", "Oklahoma",
        "Texas"
      ) ~ "South",
      StateOrProvince %in% c(
        "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico",
        "Utah", "Wyoming", "Alaska", "California", "Hawaii", "Oregon",
        "Washington"
      ) ~ "West",
      TRUE ~ "Unknown"
    ),
    Region = as.factor(Region)
  )

In [12]:
# Keep the original race under Race_old and derive a coarser Race from it:
# White, Black, Other Race (several groups merged) and Unknown.
# Any other value, including a missing one, becomes NA.
temp_data <- temp_data %>%
  rename(Race_old = Race) %>%
  mutate(
    Race = case_when(
      Race_old == "White" ~ "White",
      Race_old == "Black or African American" ~ "Black",
      Race_old %in% c(
        "Asian",
        "American Indian or Alaska Native",
        "Native Hawaiian or Other Pacific Islander",
        "Other Race"
      ) ~ "Other Race",
      Race_old == "Unknown" ~ "Unknown"
    )
  )


In [15]:
# Drop the current PanLocation column and put PanLocation_old back under the
# PanLocation name.
temp_data14 <- temp_data13 %>%
  select(-PanLocation) %>%
  rename(PanLocation = PanLocation_old)

In [70]:
# Comorbidity: left-join every per-disease table onto the collected cohort
# data by PersonId. The joins run in list order, so any duplicated column
# names receive the same .x/.y suffixes as a chained sequence of joins.
temp_data1 <- Reduce(
  function(joined, disease_tbl) left_join(joined, disease_tbl, by = "PersonId"),
  list(
    df_CKD,
    df_T2DM,
    df_Hepatitis,
    df_CLD,
    df_Hypertension,
    df_Hyperlipidemia,
    df_osa,
    df_COPD,
    df_Anxiety,
    df_Ischemic_Heart_Disease,
    df_Depression,
    df_Obesity_codes,
    df_GingivitisPeriodontal,
    df_Gastroesophageal_refluxdisease,
    df_abdominal_pain,
    df_Dyspnea,
    df_Anemia,
    df_FHOMND,
    df_Cancer
  ),
  temp_data_r
)

In [71]:
# Attach the AdvChemoMed_tb, pan_radiation and pan_surgery tables by PersonId.
temp_data2 <- left_join(temp_data1, AdvChemoMed_tb, by = "PersonId")
temp_data2 <- left_join(temp_data2, pan_radiation, by = "PersonId")
temp_data2 <- left_join(temp_data2, pan_surgery, by = "PersonId")

In [72]:
# Other metastases
temp_data3 <- left_join(temp_data2, pan_OthMet, by = "PersonId")

In [73]:
# Indicator variable: 1 when LivDtFl is "Y", 0 when it is "N", NA otherwise.
temp_data4 <- temp_data3 %>%
  mutate(
    Indicator = case_when(
      LivDtFl == "Y" ~ 1,
      LivDtFl == "N" ~ 0
    )
  )

In [53]:
# ICD10CM code C78 with all of its descendants (described in the original
# notes as liver, colon, gastric and abdomen metastases), registered as the
# view 'AllMetastasesCode'.
AllMetastasesCode <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "C78")
create_view(AllMetastasesCode, "AllMetastasesCode")

# Alternative kept for reference: combine with the SNOMED CT codes.
#AllMetastasesSNOMEDCode = codeset(con, snapshot, "SNOMED CT", 'selfAndDescendants',"275266006")
#AllMetastasesCode = rbind(AllMetastasesICDCode,AllMetastasesSNOMEDCode)

# ICD10CM lymph node codes (unspecified, intra-abdominal, multiple regions)
# with their descendants, registered as the view 'LymphNodeAllCodes'.
LymphNodeAllCodes <- codeset(
  con, snapshot, "ICD10CM", "selfAndDescendants",
  "C77.8", "C77.5", "C77.1", "C77.2", "C77.9"
)
create_view(LymphNodeAllCodes, "LymphNodeAllCodes")

# Alternative kept for reference: combine with the SNOMED CT codes.
# Lymph_Node_SNOMED_codes = codeset(con, snapshot, "SNOMED CT",'selfAndDescendants',"94410007","94519005", "94336001", "94351005", "94397007", "94392001","303201005","94466007")
#LymphNodeAllCodes = rbind(Lymph_Node_ICD_codes,Lymph_Node_SNOMED_codes)

In [55]:
# Select subject with any Metastases Code and Lymph Node Codes 
# For every person in `data`, the query returns the earliest date of a
# condition coded in AllMetastasesCode (AllMetDt) and the earliest date of a
# condition coded in LymphNodeAllCodes (LympNodeDt).
#  - The condition date is OnsetDateTime, falling back to RecordedDateTime;
#    rows with neither date are ignored.
#  - ROW_NUMBER() orders each person's rows by that date and RowNum = 1 keeps
#    the first one.
#  - Only code mappings with SourceConceptId 2703595 or 2703594 are used
#    (NOTE(review): meaning of these two ids is not visible here -- confirm).
#  - Both date tables are LEFT JOINed, so persons without a match keep NULL dates.
#  - Three of the CTEs are written without the AS keyword.
sql6 <- " 
WITH AllMetastases as
(
    SELECT 
        PersonId, 
        COALESCE(OnsetDateTime, RecordedDateTime) AS AllMetDt, 
        ROW_NUMBER() OVER (PARTITION BY PersonId ORDER BY COALESCE(OnsetDateTime, RecordedDateTime)) AS RowNum
    FROM condition c
    WHERE c.CodeConceptMapId IN (
        SELECT Id 
        FROM ConditionCodeConceptMap 
        WHERE CodeConceptId IN (SELECT ConceptId FROM AllMetastasesCode)
        AND (SourceConceptId = 2703595 OR SourceConceptId = 2703594))
    AND (OnsetDateTime IS NOT NULL OR RecordedDateTime IS NOT NULL)  
),

AllMetastases1
(
   SELECT PersonId,AllMetDt
   FROM AllMetastases a
   WHERE a.RowNum = 1
),

LympNode
(
    SELECT 
        PersonId, 
        COALESCE(OnsetDateTime, RecordedDateTime) AS LympNodeDt, 
        ROW_NUMBER() OVER (PARTITION BY PersonId ORDER BY COALESCE(OnsetDateTime, RecordedDateTime)) AS RowNum
    FROM condition c
    WHERE c.CodeConceptMapId IN (
        SELECT Id 
        FROM ConditionCodeConceptMap 
        WHERE CodeConceptId IN (SELECT ConceptId FROM LymphNodeAllCodes)
        AND (SourceConceptId = 2703595 OR SourceConceptId = 2703594))
    AND (OnsetDateTime IS NOT NULL OR RecordedDateTime IS NOT NULL)  
),

LympNode1
(
   SELECT PersonId,LympNodeDt
   FROM LympNode a
   WHERE a.RowNum = 1
)

SELECT
    p.PersonId,b.AllMetDt,c.LympNodeDt
FROM 
    data p
    LEFT JOIN AllMetastases1 b
        ON p.PersonId = b.PersonId
    LEFT JOIN LympNode1 c
        ON p.PersonId = c.PersonId
"
# Run the query and collect the result into a local data frame.
lymp_met <- load_sql_table(con, snapshot, sql6, view_name='lymp_met',output_mode = "sparklyr") %>% collect()

In [74]:
# Replace the existing AllMetDt / LympNodeDt columns with the freshly computed
# dates from lymp_met.
temp_data4 <- temp_data4 %>% select(-AllMetDt, -LympNodeDt)

temp_data5 <- left_join(temp_data4, lymp_met, by = "PersonId")

In [58]:
# Encounters with a derived EncounterType label.
# The CASE assigns the first matching label in the order Emergency, Inpatient,
# LabImaging, Outpatient, Virtual, and 'Other' when no rule matches; several
# type/class ids appear in more than one rule, so the order matters.
# Rows are kept only when ClassConceptId is not 1067555, StartDateTime is set
# and StatusConceptId is not one of the listed ids.
# NOTE(review): the meaning of the individual concept ids is not visible
# here -- confirm against the Concept table.
sql <-
"
 SELECT *,
    CASE 
	    WHEN (TypeConceptId in (3059272,1065290,1065342) and ClassConceptId in (1065217, 1065225)) then 'Emergency'
      WHEN (TypeConceptId in (3059272,1065297,3059289,1065290,1065342,1065307) and ClassConceptId in (1065215,1065220)) then 'Inpatient'
      WHEN (TypeConceptId in (1065310,3059277,1065310,1065286) and ClassConceptId in (2649591,1065216,1067561,1065227,1065226,1065220,0,1065217,1065225)) then 'LabImaging'
      WHEN (TypeConceptId in (3059271,3059272,2649591,1067557,1065280,1065333,1065318,1065323,1065342,1065330,1065337) and ClassConceptId in (1065216,1065227,1065225)) then 'Outpatient'
      WHEN (TypeConceptId in (3059263,3059301,1067555,3059265,3059272,2649591,3059264) and ClassConceptId in (1065230,1065216,0)) then 'Virtual'
      else 'Other'
    end as EncounterType
  FROM Encounter 
  where ClassConceptId !=1067555 and StartDateTime is not NULL
    and StatusConceptId not in (1067555,2983200,2506595, 2983199,2506590, 1065206)
"
# Run the query, passing 'encounterF' as the view name; a later query in this
# notebook reads from a table/view of that name.
encounterF <- load_sql_table(con, snapshot, sql, view_name='encounterF',output_mode = "sparklyr")

In [59]:
# Step 1: All condition
# Step 2: All Encounter with Encounter Id in Condition Table
#
# all_cond holds the distinct (person, encounter, condition code) triples for
# code mappings with SourceConceptId 2703595 or 2703594. The main query
# inner-joins them to encounterF on the encounter id and keeps encounters that
# start on or after 2018-01-01, so each output row is one encounter/condition
# code pair.

sql3 <-
"
WITH all_cond as 
(
    SELECT DISTINCT c.Personid, c.EncounterId, cm.CodeConceptId
    FROM condition c JOIN ConditionCodeConceptMap cm 
    ON c.CodeConceptMapId = cm.Id
    WHERE cm.SourceConceptId = 2703595 OR cm.SourceConceptId = 2703594
)

SELECT 
    e.PersonId, 
    e.StartDateTime as DiagEncStart,
    e.ClassConceptId as EncClass, 
    e.TypeConceptId as EncType,
    e.Id, e.EncounterType,
    c.CodeConceptId, 
    c.EncounterId
FROM 
    encounterF e
INNER JOIN 
    all_cond c
ON 
    e.Id = c.EncounterId
WHERE 
    e.StartDateTime IS NOT NULL 
    AND e.StartDateTime >= '2018-01-01'

"
# Run the query, passing 'all_enc' as the view name; the follow-up query in
# this notebook reads from a table/view of that name.
all_enc <- load_sql_table(con, snapshot, sql3, view_name='all_enc',output_mode = "sparklyr")

In [60]:
# Get Follow-up Diagnosis encounter after PanDt and before Analysis date:
# per person, count the distinct diagnosis-encounter start times that fall
# between PanDt and ADT (both bounds included by BETWEEN).
# Because of the INNER JOIN, persons without any such encounter are absent
# from the result rather than counted as 0.
sql7 <-
"
    SELECT p.PersonId, COUNT(DISTINCT e.DiagEncStart) AS FollowUpDiagEnc
    FROM data p
    INNER JOIN all_enc e
        ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart BETWEEN p.PanDt AND p.ADT
    GROUP BY p.PersonId

"
# Run the query and collect the counts into a local data frame.
followup <- load_sql_table(con, snapshot, sql7, view_name='followup',output_mode = "sparklyr") %>% collect()

In [75]:
# Replace the existing FollowUpDiagEnc column with the recomputed counts.
# Persons without a row in `followup` get NA after the left join.
temp_data5 <- select(temp_data5, -FollowUpDiagEnc)

temp_data6 <- left_join(temp_data5, followup, by = "PersonId")

In [76]:

# Make sure both date columns are of class Date
temp_data6 <- temp_data6 %>%
  mutate(across(c(LympNodeDt, PanDt), as.Date))

# Baseline lymph node flag: "Y" when the lymph node date lies from 365 days
# before to 30 days after PanDt (bounds included); "N" otherwise, including
# when either date is missing.
temp_data7 <- temp_data6 %>%
  mutate(
    LympNodeFlBase = if_else(
      LympNodeDt >= (PanDt - days(365)) & LympNodeDt <= (PanDt + days(30)),
      "Y",
      "N",
      missing = "N"
    )
  )

# Distant metastasis flag: "Y" when either baseline flag is "Y", "N" when
# neither is; NA when that cannot be decided because OthMetFlBase is missing.
temp_data8 <- temp_data7 %>%
  mutate(
    DistantMetFlBase = case_when(
      LympNodeFlBase == "Y" | OthMetFlBase == "Y" ~ "Y",
      !(LympNodeFlBase == "Y" | OthMetFlBase == "Y") ~ "N"
    )
  )

In [77]:
# Surgery and radiation flags: treat a missing flag as 0 and store the flags
# as integers.
temp_data9 <- temp_data8 %>%
  mutate(
    across(
      c(SurgeryFl1, SurgeryFl2, RadiationFl1, RadiationFl2),
      \(flag) as.integer(replace_na(flag, 0))
    )
  )

In [81]:
# Hypertension and Hyperlipidemia each exist twice (.x / .y suffixes): drop
# the .y copies and give the .x copies their plain names back.
temp_data10 <- temp_data9 %>%
  select(-Hypertension.y, -Hyperlipidemia.y) %>%
  rename(Hypertension = Hypertension.x, Hyperlipidemia = Hyperlipidemia.x)

In [7]:
# BMI
# For each person, pick the BMI measurement closest in time to PanDt and
# classify it.
#  - temp:  BMI results with a value between 12 and 90, a date in 2018 or
#           later and a unit of measurement.
#  - temp1: adds the unit name from the Concept table.
#  - temp2: measurements within 360 days before or after PanDt, with the
#           absolute distance in days. The WHERE clause filters on the joined
#           measurement, so persons without a measurement in that window are
#           dropped despite the LEFT JOIN.
#  - temp3: BMI category plus a per-person rank by distance (rn = 1 is the
#           closest measurement).
sql <-
"
WITH temp as (
    SELECT PersonId, EffectiveDateTime, NormalizedValueUOMConceptId, NormalizedValueNumeric
    FROM SearchResult_defTrBodyMassIndexBmi 
        WHERE (NormalizedValueNumeric IS NOT NULL AND NormalizedValueNumeric >= 12 AND NormalizedValueNumeric <= 90)
        AND (EffectiveDateTime IS NOT NULL AND year(EffectiveDateTime) >= 2018) 
        AND NormalizedValueUOMConceptId IS NOT NULL
),

temp1 as (
  SELECT t.*, c.ConceptName as UnitOfMeasurement
  from temp t join Concept c on t.NormalizedValueUOMConceptId = c.ConceptId
),

temp2 AS (
    SELECT  p.*, t.EffectiveDateTime AS BMIDate, t.NormalizedValueNumeric AS BMI,t.UnitOfMeasurement, 
      -- Calculate absolute difference in days
      ABS(DATEDIFF(t.EffectiveDateTime, p.PanDt)) AS days
    FROM data p 
    LEFT JOIN temp1 t ON t.PersonId = p.PersonId
    WHERE t.EffectiveDateTime BETWEEN DATEADD(day, -360, p.PanDt) AND DATEADD(day, 360, p.PanDt)
),

temp3 as (
 SELECT PersonId,BMI, BMIDate,
  CASE 
    WHEN BMI < 18.5 THEN 'Underweight'
    WHEN BMI >= 18.5 AND BMI < 25 THEN 'Normal'
    WHEN BMI >= 25 AND BMI < 30 THEN 'Overweight'
    WHEN BMI >= 30 AND BMI < 35 THEN 'Obese(Class I)'
    WHEN BMI >= 35 AND BMI < 40 THEN 'Obese(Class II)' 
    WHEN BMI >= 40 THEN 'Obese(Class III)'
  END AS BMI_Group,
  ROW_NUMBER() OVER (PARTITION BY PersonId ORDER BY days) AS rn
 FROM temp2
)

SELECT PersonId,BMI, BMIDate,BMI_Group FROM temp3 WHERE rn=1
"
# Run the query and collect the result into a local data frame.
bmi_temp <- load_sql_table(con, snapshot, sql, view_name='bmi_temp',output_mode = "sparklyr") %>% collect()

In [12]:
# Add the BMI columns; persons without a row in bmi_temp get NA.
temp_data11 <- left_join(temp_data10, bmi_temp, by = "PersonId")

In [24]:
# ======================================
# BMI IMPUTATION WITH MICE, STEP BY STEP
# ======================================

library(mice)

# ------------------------------
# Step 1: predictor variables (comorbidity flags plus demographics and BMI)
# Note: this redefines the global `comorbidity` and `DemographicOth` vectors.
# ------------------------------
comorbidity <- c(
  "CKD", "T2DM", "Hepatitis", "Hypertension", "CLD", "Hyperlipidemia",
  "osa", "COPD", "Anxiety", "Ischemic_Heart_Disease", "Depression",
  "Obesity_codes", "Gastroesophageal_refluxdisease", "abdominal_pain",
  "Dyspnea", "Anemia", "FHOMND"
)
DemographicOth <- c("AgeAtDiagnosis", "Ethnicity", "Race", "Sex", "BMI")
predictors_for_imputation <- c(comorbidity, DemographicOth)

# ------------------------------
# Step 2: keep only those columns for the imputation
# ------------------------------
imp_data <- select(temp_data11, all_of(predictors_for_imputation))

# ------------------------------
# Step 3: five imputations, predictive mean matching, five iterations.
# mice() is also given its own seed (500) in addition to set.seed(123).
# ------------------------------
set.seed(123)
imp_result <- mice(imp_data, m = 5, method = "pmm", maxit = 5, seed = 500)

# ------------------------------
# Step 4: take the first completed data set and store its BMI, rounded to two
# decimals, as BMI_imputed
# ------------------------------
completed_data <- complete(imp_result, 1)
temp_data11[["BMI_imputed"]] <- round(completed_data[["BMI"]], digits = 2)

In [25]:
# BMI categories.
#  - BMI_GROUP_impute: category of the imputed BMI. Conditions are checked top
#    to bottom, so each one only needs its upper bound; a missing BMI_imputed
#    is labelled "Unknown".
#  - BMI_Group_original: the category that came with the measured BMI.
#  - BMI_Group: same as the original, with "Obese(Class III)" merged into
#    "Obese(Class II)".
temp_data12 <- temp_data11 %>%
  mutate(
    BMI_GROUP_impute = case_when(
      BMI_imputed < 18.5 ~ "Underweight",
      BMI_imputed < 25 ~ "Normal",
      BMI_imputed < 30 ~ "Overweight",
      BMI_imputed < 35 ~ "Obese(Class I)",
      BMI_imputed < 40 ~ "Obese(Class II)",
      BMI_imputed >= 40 ~ "Obese(Class III)",
      TRUE ~ "Unknown"
    )
  ) %>%
  rename(BMI_Group_original = BMI_Group) %>%
  mutate(
    BMI_Group = ifelse(
      BMI_Group_original == "Obese(Class III)",
      "Obese(Class II)",
      BMI_Group_original
    )
  )

In [16]:
# Column groups that make up the analysis data set.

# Comorbidity flags
comorbidity <- c("CKD", "T2DM", "Hepatitis", "Hypertension", "CLD", "Hyperlipidemia", "osa", "COPD", "Anxiety", "Ischemic_Heart_Disease", "Depression", "Obesity_codes","Cancer",
"Gastroesophageal_refluxdisease", "abdominal_pain","Dyspnea","Anemia","FHOMND")

#Exposure
Exposure1 <- c("RadiationFl1", "SurgeryFl1","AdvChemoMedFl1","RadiationFl2", "SurgeryFl2","AdvChemoMedFl2")

#Demographic and other
DemographicOth <- c("Age_Group", "AgeAtDiagnosis","BMI_imputed", "Ethnicity", "Race", "Sex", "MaritalStatus", "Region","OthMetFlBase", "PanLocation", "BMI_GROUP_impute", "DistantMetFlBase", "LympNodeFlBase")

#lab variables (defined for reference; not part of the selection below)
# NOTE(review): "TotalBilirubin" is listed twice.
lab <- c("DirectBilirubin","TotalBilirubin","ALP","Albumin","TotalBilirubin","AST","ALT","TotalProtein","ProthrombinT", "SerumLipase")

#Analysis variables
analysis_var <- c("AVAL", "CNSR","PARAM", "Indicator", "PanEncClass", "FollowUpDiagEnc","DiffPanDthDays", "PrimaryDiagnosisConceptId", 
"OthMetFl30", "OthMetCodeBase", "OthMetCode30", "OnsetDtFLPan","LivDtFl30","LivDtFlBase", "LivDtFl","diagEnc90", "diagEnc30", "diagEnc90Out", "diagEnc30Out")

# Build the analysis subset: rows ordered by PersonId, columns in group order.
# all_of() marks the names as coming from external character vectors, so a
# data column that happened to be called e.g. "comorbidity" cannot be picked
# by mistake, and the deprecated bare-vector form of select() is avoided.
Pandata_pt2 <- temp_data14 %>%
  arrange(PersonId) %>%
  select(all_of(c(comorbidity, Exposure1, DemographicOth, analysis_var)))

In [52]:
library(dplyr)

# Combine lung cancer codes: collect both code sets, then stack them
# row-wise into one table.
LungCanicd <- collect(LungCanicd)
LungCanSnomed <- collect(LungCanSnomed)
LungCan <- rbind(LungCanicd, LungCanSnomed)

In [14]:
# Flag the disease "Cancer" from the `Cancer` code set (disease_flag() is
# defined elsewhere in the notebook) and attach the result by PersonId.
df_Cancer <- disease_flag(disease_name = "Cancer", codes = Cancer)
print("Finished flagging the disease: Cancer")
temp_data13 <- left_join(temp_data12, df_Cancer, by = "PersonId")

In [46]:
# Collect the Spark-side `temp_data` into a local R data frame.
temp_data_r <- SparkR::collect(temp_data)


In [44]:
# Display temp_data12.
display_df(temp_data12)

In [17]:
# List the column names of temp_data14.
colnames(temp_data14)

In [18]:
# Show the structure (column names and types) of the analysis subset.
str(Pandata_pt2)

In [26]:
# Count the rows with LivDtFl equal to "Y"; the result is NA if LivDtFl
# contains missing values.
sum(temp_data14$LivDtFl == "Y")