# Initialize Truveta SDK

In [1]:
# These are some commonly used R Packages.  
# The arrow package makes loading data with spark faster. 
library(readr, warn.conflicts = FALSE)
library(arrow, warn.conflicts = FALSE)
library(magrittr, warn.conflicts = FALSE)
library(stringr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(rlang, warn.conflicts = FALSE)
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
library(tidyr, warn.conflicts = FALSE)
library(truveta.notebook.study)
library(sparklyr)
library(ggplot2)
library(reshape2)
library(survival)

In [2]:
# Connect to the platform, look up the study, and resolve the latest snapshot
# of the "PancreaticMainPop" population.
print("load snapshot")
con <- create_connection()
study <- get_study(con)
# print(study)

# Population id kept for reference; this cell looks the population up by title.
population_id <- "ps-3ormi7swwukuhhu6kcqrqw4mue"
population <- get_population(con, study, title = "PancreaticMainPop")

snapshot <- get_latest_snapshot(con, population)
snapshot

# List the tables contained in the snapshot.
tables <- get_tables(con, snapshot)
tables

Set the Output Path

#### Get your working directory

In [3]:
# use fs = true when reading and writing files locally

# Ask the SDK for this study's output location and echo it in the notebook.
# NOTE(review): the exact effect of `fs = TRUE` is defined by
# get_output_path(); per the comment above it selects a local-file path.
output_path_local <- get_output_path(con, study, fs = TRUE)
output_path_local

##### Code to get plots 

In [4]:
# Render a plot inline in the notebook.
#
# The plot is written to a temporary PNG file, encoded as a base64 data URI,
# and passed to displayHTML() inside an <img> tag.
#
# Args:
#   x:   plot object, forwarded to ggplot2::ggsave() as `plot`.
#   dpi: resolution forwarded to ggsave(); defaults to "screen".
#   ...: additional arguments forwarded to ggplot2::ggsave().
#
# Returns whatever displayHTML() returns.
display_plot <- \(x, dpi = "screen", ...) {
  file <- tempfile(fileext = ".png")
  # Always remove the temporary file, even when saving or encoding fails.
  on.exit(unlink(file), add = TRUE)
  # dump as PNG
  ggplot2::ggsave(file, device = "png", plot = x, dpi = dpi, ...)
  # load as base64
  uri <- base64enc::dataURI(file = file, mime = "image/png")
  # display as HTML
  displayHTML(paste0('<img src="', uri, '">'))
}

In [5]:
# Path of the previously saved analysis dataset inside the output folder.
t1 <- paste0(output_path_local, "/RawMainDataset.csv.r")

# Load the file into a base R data frame.
RawMainDataset <- read.csv(file = t1)
#nrow(AdvChemoMed_tb)
##display_df(cases_control,10)

In [32]:
# Convert R DataFrame to Spark DataFrame
# The resulting Spark DataFrame is registered as the temporary view "pandata"
# so later SQL strings can reference it by that name (replacing any existing
# view with the same name).
temp_data <- as.DataFrame(pan_liv_final1b)
createOrReplaceTempView(temp_data, "pandata")

In [48]:
# Destination file inside the study output folder.
file_to_write <- paste0(output_path_local, "/RawMainDataset.csv.r")

# Persist the cleaned dataset as CSV, without a row-name column.
write.csv(RawMainDataset, file = file_to_write, row.names = FALSE)

#### 1. Get Pancreatic Condition Patients

In [10]:
# Check the sourceconcept codes
# Counts the rows of ConditionCodeConceptMap per SourceConceptId and displays
# the result, so the source-concept ids used in later filters can be inspected.
sql <- "
    SELECT SourceConceptId, count (*) as cnt FROM ConditionCodeConceptMap
    group by SourceConceptId
    "
source_id_cond_tbl <- load_sql_table(con, snapshot,sql)
display_df(source_id_cond_tbl)

##### Filter Encounter and Condition table


In [11]:
# Build the filtered encounter view "encounterF".
# - Keeps Encounter rows with a non-null StartDateTime, ClassConceptId other
#   than 1067555, and StatusConceptId outside the excluded list.
# - Adds EncounterType, derived from the (TypeConceptId, ClassConceptId) pair.
#   The CASE branches are evaluated in order, so a row matching several rules
#   receives the first matching label; anything unmatched becomes 'Other'.
# NOTE(review): 1065310 appears twice in the 'LabImaging' type list; the
# duplicate has no effect on an IN list.
# NOTE(review): the meaning of the individual concept ids is not visible here.
sql <-
"
 SELECT *,
    CASE 
	    WHEN (TypeConceptId in (3059272,1065290,1065342) and ClassConceptId in (1065217, 1065225)) then 'Emergency'
      WHEN (TypeConceptId in (3059272,1065297,3059289,1065290,1065342,1065307) and ClassConceptId in (1065215,1065220)) then 'Inpatient'
      WHEN (TypeConceptId in (1065310,3059277,1065310,1065286) and ClassConceptId in (2649591,1065216,1067561,1065227,1065226,1065220,0,1065217,1065225)) then 'LabImaging'
      WHEN (TypeConceptId in (3059271,3059272,2649591,1067557,1065280,1065333,1065318,1065323,1065342,1065330,1065337) and ClassConceptId in (1065216,1065227,1065225)) then 'Outpatient'
      WHEN (TypeConceptId in (3059263,3059301,1067555,3059265,3059272,2649591,3059264) and ClassConceptId in (1065230,1065216,0)) then 'Virtual'
      else 'Other'
    end as EncounterType
  FROM Encounter 
  where ClassConceptId !=1067555 and StartDateTime is not NULL
    and StatusConceptId not in (1067555,2983200,2506595, 2983199,2506590, 1065206)
"
encounterF <- load_sql_table(con, snapshot, sql, view_name='encounterF',output_mode = "sparklyr")

#### Create view for pancreatic cancer code

In [12]:
# Marked "no use" by the author: combined ICD-10-CM + SNOMED CT code set for
# pancreatic cancer, registered as the view "pancreatic_cond_codes".
pancreatic_icd10_codes <- codeset(
  con, snapshot, "ICD10CM", "selfAndDescendants", "C25"
)

pancreatic_snomed_codes <- codeset(
  con, snapshot, "SNOMED CT", "selfAndDescendants",
  "429000004", "363418001", "93938001", "94459006", "255088001"
)

# Stack both code sets and expose them to SQL.
pancreatic_cond_codes <- rbind(pancreatic_icd10_codes, pancreatic_snomed_codes)
create_view(pancreatic_cond_codes, "pancreatic_cond_codes")

In [13]:
# ICD-10-CM only: C25 and its descendants.
pancreatic_icd10_codes <- codeset(
  con, snapshot, "ICD10CM", "selfAndDescendants", "C25"
)

# Register the ICD-only set under the view name "pancreatic_cond_codes".
create_view(pancreatic_icd10_codes, "pancreatic_cond_codes")

In [8]:
# Show the R object `pancreatic_cond_codes` (the rbind of ICD-10-CM and SNOMED
# code sets). NOTE(review): the SQL view of the same name may instead hold the
# ICD-only set, depending on which create_view() cell ran last.
display_df(pancreatic_cond_codes)

##### 1.1 Get the patient details from condition table for the pancreatic codes

In [14]:
#All data tab 1
# Build the view "pan_cond": one row per person, holding the earliest
# pancreatic condition record.
# - all_pan_cond: condition rows whose mapped concept is in the
#   pancreatic_cond_codes view and whose SourceConceptId is 2703595 or 2703594.
#   PanDt is OnsetDateTime, falling back to RecordedDateTime; OnsetDtFLPan
#   records which of the two was used. Rows are numbered per person by PanDt.
# - count_personR: number of qualifying condition rows per person.
# - Final SELECT: keeps RowNum = 1 and derives PanLocation from the code.
# NOTE(review): ROW_NUMBER() orders by PanDt only, so when a person has several
# rows with the same earliest PanDt, which one becomes RowNum = 1 (and hence
# which EncounterId / PanLocationCode is kept) is not determined by the query.
# NOTE(review): PrimaryDiagnosisConceptId is selected in the CTE but is not
# part of the final SELECT.
sql1 <- "
WITH all_pan_cond as 
 (
    SELECT 
        PersonId, 
        COALESCE(OnsetDateTime, RecordedDateTime) AS PanDt, 
        EncounterId, PrimaryDiagnosisConceptId,pcc.ConceptCode,
        CASE
            WHEN OnsetDateTime IS NOT NULL THEN 'Y'
            ELSE 'N'
        END AS OnsetDtFLPan,
        ROW_NUMBER() OVER (PARTITION BY PersonId ORDER BY COALESCE(OnsetDateTime, RecordedDateTime)) AS RowNum
    FROM condition c
    INNER JOIN ConditionCodeConceptMap ccm 
    ON c.CodeConceptMapId = ccm.Id
    INNER JOIN pancreatic_cond_codes pcc 
    ON ccm.CodeConceptId = pcc.ConceptId
    WHERE ccm.SourceConceptId IN (2703595, 2703594)
    AND (OnsetDateTime IS NOT NULL OR RecordedDateTime IS NOT NULL) 

 ),

 count_personR as
  (
    SELECT 
        PersonId,
        count(*) as Personrecords
        FROM all_pan_cond
        GROUP BY PersonId
  )

SELECT 
  a.PersonId, a.PanDt,  a.EncounterId,  a.OnsetDtFLPan,  a.ConceptCode AS PanLocationCode, b.Personrecords,
  CASE
    WHEN a.ConceptCode = 'C25.0' THEN 'Head'
    WHEN a.ConceptCode = 'C25.1' OR a.ConceptCode = 'C25.2' THEN 'Body/Tail'
    WHEN a.ConceptCode = 'C25.3' OR a.ConceptCode = 'C25.7' OR a.ConceptCode = 'C25.8' THEN 'Other Parts'
    WHEN a.ConceptCode = 'C25.4' THEN 'Endocrine'
    ELSE 'Unspecified'
  END AS PanLocation
FROM all_pan_cond a
LEFT JOIN count_personR b 
  ON a.PersonId = b.PersonId
WHERE a.RowNum = 1;


"
pan_cond <- load_sql_table(con, snapshot, sql1, view_name='pan_cond',output_mode = "sparklyr")

# Author's note on an updated grouping of the location labels (not applied in
# the SQL above):
# updated logic
# PanLocation_old %in% c("Body", "Tail") ~ "Body/Tail",
# PanLocation_old %in% c("Duct", "Other Parts", "Overlapping Sites") ~ "Other"


In [21]:
# Preview pan_cond (second argument presumably limits the rows shown; confirm
# against display_df()).
display_df(pan_cond,20)

In [15]:
## pancreatic_final
# Build the view "Pan_Encounter".
# - pancreatic2019_20: persons from pan_cond whose first pancreatic diagnosis
#   date (PanDt) falls in calendar years 2019 through 2021.
#   The upper bound is written as `< '2022-01-01'` rather than
#   `<= '2021-12-31'`: PanDt is a date-time, and comparing it with the literal
#   '2021-12-31' would compare against midnight and silently drop diagnoses
#   recorded later on 31 December 2021.
# - deathR: earliest death date per person (DeathDateTime, falling back to
#   RecordedDateTime) with DTHFL = 'Y'.
# - Pan_death: cohort left-joined to deathR, so DTHDT / DTHFL are NULL for
#   persons without a death record.
# - Final SELECT: adds class, type and derived EncounterType of the diagnosis
#   encounter from the encounterF view (NULL when the encounter is not there).
sql2 <- 
"
 WITH pancreatic2019_20 as 
 (
    SELECT  *
    FROM pan_cond
    WHERE  PanDt >= '2019-01-01'
      AND PanDt < '2022-01-01'
 ),

deathR as
 (
   SELECT 
    PersonId, 
    min(COALESCE(DeathDateTime, RecordedDateTime)) AS DTHDT,
    'Y' AS DTHFL
   FROM 
    PersonDeathFact d
   GROUP BY 
    PersonId
 ),

 Pan_death as
  (
   SELECT 
     p.*, d.DTHDT, d.DTHFL
   FROM pancreatic2019_20 p LEFT JOIN deathR d ON p.PersonId=d.PersonId

  )


SELECT p.*, e.ClassConceptId as PanEncClass, e.TypeConceptId as PanEncType, e.EncounterType as PanEncounterType
FROM Pan_death p 
LEFT JOIN EncounterF e 
   ON p.EncounterId = e.Id
 
"

Pan_Encounter <- load_sql_table(con, snapshot, sql2, view_name='Pan_Encounter',output_mode = "sparklyr")

In [None]:
# Preview the Pan_Encounter table.
display_df(Pan_Encounter)

### **Diagnosis Encounter**

In [16]:
# Step 1: All condition
# Step 2: All Encounter with Encounter Id in Condition Table
#
# Build the view "all_enc": encounters from encounterF that carry at least one
# condition record.
# - all_cond: distinct (Personid, EncounterId, CodeConceptId) triples from the
#   condition table, restricted to SourceConceptId 2703595 or 2703594.
# - Final SELECT: inner join to encounterF on encounter id, keeping encounters
#   that start on or after 2018-01-01. Because the join is per condition code,
#   an encounter appears once for every distinct code recorded on it.

sql3 <-
"
WITH all_cond as 
(
    SELECT DISTINCT c.Personid, c.EncounterId, cm.CodeConceptId
    FROM condition c JOIN ConditionCodeConceptMap cm 
    ON c.CodeConceptMapId = cm.Id
    WHERE cm.SourceConceptId = 2703595 OR cm.SourceConceptId = 2703594
)

SELECT 
    e.PersonId, 
    e.StartDateTime as DiagEncStart,
    e.ClassConceptId as EncClass, 
    e.TypeConceptId as EncType,
    e.Id, e.EncounterType,
    c.CodeConceptId, 
    c.EncounterId
FROM 
    encounterF e
INNER JOIN 
    all_cond c
ON 
    e.Id = c.EncounterId
WHERE 
    e.StartDateTime IS NOT NULL 
    AND e.StartDateTime >= '2018-01-01'

"
all_enc <- load_sql_table(con, snapshot, sql3, view_name='all_enc',output_mode = "sparklyr")

In [17]:
# Count the diagnosis encounter for each combination
#
# Build the view "all_pan_diag": Pan_Encounter plus per-person counts of
# distinct diagnosis-encounter start times (from all_enc) before diagnosis.
# - diagenctotal:      encounters strictly before PanDt, plus the earliest one.
# - diagenc90 / 30:    encounters strictly earlier than PanDt minus 90 / 30
#                      days (DATE_SUB(p.PanDt, n)).
# - diagenc90Out / 30Out: same windows, restricted to EncounterType =
#                      'Outpatient'.
# - diagenc90OutTemp / 30OutTemp: same windows, restricted by EncType list only
#                      (the type ids of the 'Outpatient' rule, without the
#                      class condition).
# - diagEncDates:      first and last StartDateTime per person over all of
#                      encounterF (not limited to diagnosis encounters).
# All aggregates are LEFT JOINed, so a person without qualifying encounters
# gets NULL (not 0) in the corresponding column.
# NOTE(review): the windows select encounters *older* than 90 / 30 days before
# PanDt, not encounters *within* the last 90 / 30 days — confirm this is the
# intended definition.
sql4 <-
"
WITH diagenctotal as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagenctotal, min(e.DiagEncStart) as FirstDiagEncDt
    FROM Pan_Encounter p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < p.PanDt
    GROUP BY e.PersonId
),

diagenc90 as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagEnc90
    FROM Pan_Encounter p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < DATE_SUB(p.PanDt,90)
    GROUP BY e.PersonId
),

diagenc30 as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagEnc30
    FROM Pan_Encounter p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < DATE_SUB(p.PanDt,30)
    GROUP BY e.PersonId
),

diagenc90Out as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagEnc90Out
    FROM Pan_Encounter p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < DATE_SUB(p.PanDt,90) AND e.EncounterType = 'Outpatient'
    GROUP BY e.PersonId
),

diagenc30Out as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagEnc30Out
    FROM Pan_Encounter p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < DATE_SUB(p.PanDt,30) AND e.EncounterType = 'Outpatient'
    GROUP BY e.PersonId
),

diagenc90OutTemp as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagEnc90OutTemp
    FROM Pan_Encounter p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < DATE_SUB(p.PanDt,90)
        AND EncType in (3059271,3059272,2649591,1067557,1065280,1065333,1065318,1065323,1065342,1065330,1065337) 
    GROUP BY e.PersonId
),

diagenc30OutTemp as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagEnc30OutTemp
    FROM Pan_Encounter p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < DATE_SUB(p.PanDt,30) 
        AND EncType in (3059271,3059272,2649591,1067557,1065280,1065333,1065318,1065323,1065342,1065330,1065337) 
    GROUP BY e.PersonId
),

diagEncDates as
(
    SELECT PersonId, min(StartDateTime) as FirstEncDt, max(StartDateTime) as LastEncDt
    FROM EncounterF e
    GROUP BY PersonId
)

SELECT p.*, h.diagenctotal, h.FirstDiagEncDt, a.diagEnc90, b.diagEnc30, c.diagEnc90Out, d.diagEnc30Out, e.FirstEncDt, e.LastEncDt, f.diagEnc90OutTemp, g.diagEnc30OutTemp
FROM Pan_Encounter p
LEFT JOIN diagenctotal h
      ON p.PersonId=h.PersonId
LEFT JOIN diagenc90 a 
     ON p.PersonId=a.PersonId
LEFT JOIN diagenc30 b 
     ON p.PersonId=b.PersonId
LEFT JOIN diagenc90Out c 
     ON p.PersonId=c.PersonId
LEFT JOIN diagenc30Out d 
     ON p.PersonId=d.PersonId
LEFT JOIN diagEncDates e 
     ON p.PersonId=e.PersonId
LEFT JOIN diagenc90OutTemp f 
     ON p.PersonId=f.PersonId
LEFT JOIN diagenc30OutTemp g 
     ON p.PersonId=g.PersonId

"
all_pan_diag <- load_sql_table(con, snapshot, sql4, view_name='all_pan_diag',output_mode = "sparklyr") 

In [None]:
# Preview all_pan_diag (second argument presumably limits the rows shown).
display_df(all_pan_diag,10)

#####  Liver metastases

In [None]:
# Marked "Don't Use" by the author: liver-metastases code set combining
# ICD-10-CM and SNOMED CT, registered as the view "liver_mets_codes".

liver_mets_icd_codes <- codeset(
  con, snapshot, "ICD10CM", "selfAndDescendants", "C78.7"
)

liver_mets_SNOMED_codes <- codeset(
  con, snapshot, "SNOMED CT", "selfAndDescendants",
  "94381002", "1251486007"
)

# Stack both vocabularies and expose them to SQL.
liver_mets_codes <- rbind(liver_mets_icd_codes, liver_mets_SNOMED_codes)

create_view(liver_mets_codes, "liver_mets_codes")

In [13]:
# Liver-metastases condition codes, ICD-10-CM only (C78.7 and descendants).

liver_mets_icd_codes <- codeset(
  con, snapshot, "ICD10CM", "selfAndDescendants", "C78.7"
)

# Register the ICD-only set under the view name "liver_mets_codes".
create_view(liver_mets_icd_codes, "liver_mets_codes")

In [19]:
# Build the view "pan_liv_final": all_pan_diag plus liver-metastasis info.
# - all_livermets_cond: condition rows whose code is in liver_mets_codes
#   (SourceConceptId 2703595 or 2703594). LivDt is OnsetDateTime, falling back
#   to RecordedDateTime; OnsetDtFLLiv records which one was used.
# - final_liv_cond: earliest such row per person (RowNum = 1).
# - liv_enc: adds class / type / derived EncounterType of that row's encounter
#   from all_enc; DISTINCT collapses the repeated all_enc rows per encounter.
# - liv_diag: LivMetN = number of distinct diagnosis-encounter start times in
#   all_enc that carry a liver-metastasis code.
# NOTE(review): the CTEs after the first are written without the AS keyword
# (`name ( ... )`); confirm the SQL engine in use accepts that form.
# NOTE(review): ROW_NUMBER() orders by LivDt only, so ties on the earliest date
# are broken arbitrarily.
sql5 <- "
WITH all_livermets_cond as 
 (
    SELECT 
        PersonId, 
        COALESCE(OnsetDateTime, RecordedDateTime) AS LivDt, 
        EncounterId, 
        CASE
            WHEN OnsetDateTime IS NOT NULL THEN 'Y'
            ELSE 'N'
        END AS OnsetDtFLLiv,
        ROW_NUMBER() OVER (PARTITION BY PersonId ORDER BY COALESCE(OnsetDateTime, RecordedDateTime)) AS RowNum
    FROM condition c
    WHERE c.CodeConceptMapId IN (
        SELECT Id 
        FROM ConditionCodeConceptMap 
        WHERE CodeConceptId IN (SELECT ConceptId FROM liver_mets_codes)
        AND (SourceConceptId = 2703595 OR SourceConceptId = 2703594))
    AND (OnsetDateTime IS NOT NULL OR RecordedDateTime IS NOT NULL)  
 ),

final_liv_cond
 (
   SELECT *
   FROM all_livermets_cond a
   WHERE a.RowNum = 1
 ),

liv_enc
(
   SELECT 
    DISTINCT a.*, b.EncClass as LivClassEnc, b.EncType as LivTypeEnc, b.EncounterType as LivEncounterType
   FROM final_liv_cond a
   LEFT JOIN all_enc b 
       ON a.EncounterId=b.Id
),

liv_diag
  (    
        SELECT PersonId, count(distinct DiagEncStart) as LivMetN
        FROM all_enc e
        WHERE e.CodeConceptId IN
        (SELECT ConceptId FROM liver_mets_codes)
        GROUP BY PersonId
  )

SELECT 
   p.*,l1.LivDt, l1.OnsetDtFLLiv, l1.LivClassEnc, l1.LivTypeEnc, l1.LivEncounterType, l2.LivMetN
FROM all_pan_diag p
    LEFT JOIN liv_enc l1
        ON p.PersonId = l1.PersonId
    LEFT JOIN liv_diag l2
        ON p.PersonId = l2.PersonId
"

pan_liv_final <- load_sql_table(con, snapshot, sql5, view_name='pan_liv_final',output_mode = "sparklyr")

##### Overall Count of subjects

In [20]:
# Count distinct persons in PrimDiagId whose PrimaryDiagnosisConceptId is not
# 1200405, and display the single-row result.
# NOTE(review): no cell in the visible notebook creates a view or table named
# PrimDiagId — confirm it exists before running.
# NOTE(review): the aggregate already returns one row, so LIMIT 1 has no effect.
sql <-
"
 SELECT count(distinct PersonId) as TotalPtsCount
 FROM PrimDiagId
 WHERE PrimaryDiagnosisConceptId != 1200405
 LIMIT 1
"
count <- load_sql_table(con, snapshot, sql, view_name='count',output_mode = "sparklyr")
display_df(count)

In [27]:
# Number of rows in pan_liv_final.
# pan_liv_final is loaded with output_mode = "sparklyr", i.e. as a lazy remote
# table; nrow() on a lazy table returns NA because the row count is unknown
# until the query runs. count() + pull() executes the count and also works on
# a local data frame.
pan_liv_final %>% dplyr::count() %>% dplyr::pull(n)

#### Metastases related to Pancreatic Cancer

In [20]:
# ICD-10-CM C78 and descendants (per the author: liver, colon, gastric and
# abdomen metastases), registered as the view "AllMetastasesCode".

AllMetastasesCode <- codeset(
  con, snapshot, "ICD10CM", "selfAndDescendants", "C78"
)

#AllMetastasesSNOMEDCode = codeset(con, snapshot, "SNOMED CT", 'selfAndDescendants',"275266006")

#AllMetastasesCode = rbind(AllMetastasesICDCode,AllMetastasesSNOMEDCode)

create_view(AllMetastasesCode, "AllMetastasesCode")

# ICD-10-CM lymph-node codes (per the author: unspecified, intra-abdominal,
# multiple regions), registered as the view "LymphNodeAllCodes".

# Lymph_Node_SNOMED_codes = codeset(con, snapshot, "SNOMED CT",'selfAndDescendants',"94410007","94519005", "94336001", "94351005", "94397007", "94392001","303201005","94466007")

LymphNodeAllCodes <- codeset(
  con, snapshot, "ICD10CM", "selfAndDescendants",
  "C77.8", "C77.5", "C77.1", "C77.2", "C77.9"
)

#LymphNodeAllCodes = rbind(Lymph_Node_ICD_codes,Lymph_Node_SNOMED_codes)

create_view(LymphNodeAllCodes, "LymphNodeAllCodes")

In [11]:
# Inspect the metastases code set (the lymph-node preview is disabled).
#display_df(LymphNodeAllCodes,10)
display_df(AllMetastasesCode)

In [None]:
# Select subject with any Metastases Code and Lymph Node Codes 
#
# Build the view "pan_liv_final1": pan_liv_final plus
# - AllMetDt:   earliest date (OnsetDateTime, else RecordedDateTime) of any
#               condition whose code is in the AllMetastasesCode view;
# - LympNodeDt: earliest date of any condition whose code is in the
#               LymphNodeAllCodes view.
# Both are LEFT JOINed, so persons without such a condition keep NULL dates.
# NOTE(review): the CTEs after the first omit the AS keyword; confirm the SQL
# engine in use accepts that form.
sql6 <- " 
WITH AllMetastases as
(
    SELECT 
        PersonId, 
        COALESCE(OnsetDateTime, RecordedDateTime) AS AllMetDt, 
        ROW_NUMBER() OVER (PARTITION BY PersonId ORDER BY COALESCE(OnsetDateTime, RecordedDateTime)) AS RowNum
    FROM condition c
    WHERE c.CodeConceptMapId IN (
        SELECT Id 
        FROM ConditionCodeConceptMap 
        WHERE CodeConceptId IN (SELECT ConceptId FROM AllMetastasesCode)
        AND (SourceConceptId = 2703595 OR SourceConceptId = 2703594))
    AND (OnsetDateTime IS NOT NULL OR RecordedDateTime IS NOT NULL)  
),

AllMetastases1
(
   SELECT PersonId,AllMetDt
   FROM AllMetastases a
   WHERE a.RowNum = 1
),

LympNode
(
    SELECT 
        PersonId, 
        COALESCE(OnsetDateTime, RecordedDateTime) AS LympNodeDt, 
        ROW_NUMBER() OVER (PARTITION BY PersonId ORDER BY COALESCE(OnsetDateTime, RecordedDateTime)) AS RowNum
    FROM condition c
    WHERE c.CodeConceptMapId IN (
        SELECT Id 
        FROM ConditionCodeConceptMap 
        WHERE CodeConceptId IN (SELECT ConceptId FROM LymphNodeAllCodes)
        AND (SourceConceptId = 2703595 OR SourceConceptId = 2703594))
    AND (OnsetDateTime IS NOT NULL OR RecordedDateTime IS NOT NULL)  
),

LympNode1
(
   SELECT PersonId,LympNodeDt
   FROM LympNode a
   WHERE a.RowNum = 1
)

SELECT
    p.*,b.AllMetDt,c.LympNodeDt
FROM 
    pan_liv_final p
    LEFT JOIN AllMetastases1 b
        ON p.PersonId = b.PersonId
    LEFT JOIN LympNode1 c
        ON p.PersonId = c.PersonId
"
pan_liv_final1 <- load_sql_table(con, snapshot, sql6, view_name='pan_liv_final1',output_mode = "sparklyr")

##### Trial Primary diagnosis Flag as Yes count

In [27]:
# Attach PrimaryDiagnosisConceptId of the first pancreatic condition record to
# pan_liv_final1 and collect the result into R as pan_liv_final1a.
# - table1: pancreatic condition rows (same filters as the pan_cond query),
#   numbered per person by date ascending, then PrimaryDiagnosisConceptId
#   descending as tie-breaker.
# - table2: the first row per person.
# - Final SELECT: LEFT JOIN on PersonId and PanDt, so the concept id is NULL
#   when the cohort row's PanDt differs from the date selected here.
sql<-
"
WITH table1 AS 
   (
SELECT 
    PersonId, 
    COALESCE(OnsetDateTime, RecordedDateTime) AS PanDt, 
    PrimaryDiagnosisConceptId,
    ROW_NUMBER() OVER (
        PARTITION BY PersonId 
        ORDER BY COALESCE(OnsetDateTime, RecordedDateTime) ASC, PrimaryDiagnosisConceptId DESC
    ) AS RowNum
FROM condition c
INNER JOIN ConditionCodeConceptMap ccm 
    ON c.CodeConceptMapId = ccm.Id
INNER JOIN pancreatic_cond_codes pcc 
    ON ccm.CodeConceptId = pcc.ConceptId
WHERE ccm.SourceConceptId IN (2703595, 2703594)
  AND (OnsetDateTime IS NOT NULL OR RecordedDateTime IS NOT NULL) 

  ),
  
 table2 AS
 (
    SELECT *
    FROM table1
    WHERE RowNum=1
 )
   
   SELECT p.*,a.PrimaryDiagnosisConceptId
   FROM pan_liv_final1 p
   LEFT JOIN table2 a
    ON p.PersonId = a.PersonId AND p.PanDt=a.PanDt

"
pan_liv_final1a <- load_sql_table(con, snapshot, sql, view_name='pan_liv_final1a',output_mode = "sparklyr") %>% collect()

In [28]:
# DON'T RUN This
# Perform transformation
# No need for renaming and dropping while running second time
#
# Derives the time-to-event analysis variables on the collected cohort:
#   DiffPanDthDays  days from PanDt to death (NA when no death date)
#   LivDtFl30       "Y" when liver metastasis is dated more than 30 days after PanDt
#   LivDtFl         "Y" when liver metastasis is dated on or after PanDt
#   LivDtFlBase     "Y" when liver metastasis is within PanDt .. PanDt + 30 days
#   PARAM           days from PanDt to LivDt (NA when no LivDt)
#   STARTDT         PanDt as a Date
#   CNSR            0 = liver-metastasis event, 2 = death, 1 = otherwise
#   ADT             analysis date chosen according to CNSR
#   AVAL            (ADT - STARTDT + 1 day) expressed in years, 2 decimals
# Later entries of this mutate() use columns created by earlier entries
# (CNSR uses LivDtFl; ADT uses CNSR; AVAL uses ADT and STARTDT), so the order
# of the arguments matters.

pan_liv_final1b <- pan_liv_final1a %>%
  mutate(
    
    DiffPanDthDays = ifelse(
      !is.na(DTHDT), as.numeric(as.Date(DTHDT) - as.Date(PanDt)), NA
                           ),  # Days between DTHDT and PanDt

    LivDtFl30 = ifelse(
      !is.na(LivDt) & as.Date(LivDt) > as.Date(PanDt) + 30, "Y","N"
                      ),  # Flag if LivDt > PanDt + 30

    LivDtFl = ifelse(
      !is.na(LivDt) & as.Date(LivDt) >= as.Date(PanDt), "Y", "N"
                    ) , # Flag if LivDt >= PanDt

    LivDtFlBase = ifelse(
      !is.na(LivDt) & as.Date(LivDt) >= as.Date(PanDt) & as.Date(LivDt) <= as.Date(PanDt) + 30, "Y", "N"
                    ) , # Flag if PanDt <= LivDt <= PanDt + 30

    PARAM = as.numeric(ifelse(!is.na(LivDt), as.Date(LivDt) - as.Date(PanDt), NA)), 
    # Time in days between liver metastases and pancreatic diagnosis date

    STARTDT = as.Date(PanDt), # Copy of PanDt

    CNSR = ifelse(
      !is.na(LivDtFl) & LivDtFl == "Y", 0, 
      ifelse(
        !is.na(DTHFL) & DTHFL == "Y", 2, 
        1
      )
    ), # Censoring indicator: 0 = liver mets, 2 = death, 1 = censored

    # Analysis date: event date for CNSR 0 / 2, otherwise the earlier of the
    # last encounter date and the fixed cut-off 2024-11-30.
    ADT = case_when(
      CNSR == 0 ~ as.Date(LivDt),
      CNSR == 2 ~ as.Date(DTHDT),
      TRUE ~ pmin(as.Date(LastEncDt), as.Date("2024-11-30"), na.rm = TRUE)
    ), 

    # Analysis Date

    # Time in years from StartDate (PanDt) to the analysis date, inclusive
    AVAL = round(as.numeric(ADT - as.Date(STARTDT) + 1) / 365.25, 2)
  )

display_df(pan_liv_final1b,10)

#### Follow up Diagnosis encounter

In [33]:
# Get Follow-up Diagnosis encounter after PanDt and before Analysis date
# Create indicator variable for cases(1) and control(0)
#
# - table1: per person, the number of distinct diagnosis-encounter start times
#   (all_enc) between PanDt and the analysis date ADT, both ends inclusive.
# - Final SELECT: every row of the "pandata" view plus FollowUpDiagEnc (NULL
#   when the person has no follow-up encounter) and Indicator, which is 1 when
#   LivDtFl = 'Y', 0 when LivDtFl = 'N', and NULL otherwise.
# The flag is compared with `=`: `IS` only accepts NULL / TRUE / FALSE /
# DISTINCT FROM in Spark SQL, so `IS 'Y'` is not a valid predicate.
sql7 <-
"
WITH table1 AS (
    SELECT e.PersonId, COUNT(DISTINCT e.DiagEncStart) AS FollowUpDiagEnc
    FROM pandata p
    INNER JOIN all_enc e
        ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart BETWEEN p.PanDt AND p.ADT
    GROUP BY e.PersonId
)

SELECT 
    p.*,
    t1.FollowUpDiagEnc,
    CAST(
        CASE 
            WHEN p.LivDtFl = 'Y' THEN 1
            WHEN p.LivDtFl = 'N' THEN 0
        END AS INT
    ) AS Indicator
FROM pandata p
LEFT JOIN table1 t1
    ON p.PersonId = t1.PersonId

"
pan_liv_final2 <- load_sql_table(con, snapshot, sql7, view_name='pan_liv_final2',output_mode = "sparklyr") %>% collect()

In [34]:
# Row count of the collected (local) pan_liv_final2 table.
nrow(pan_liv_final2)

###### Check the dataset carefully: the counts do not match after data cleaning (possible issue with the creation of AllMetDt and LivDt).

#### **Add Demographics**
Apply the demographic filter.

In [35]:
# Fetch the snapshot's demographics table.
df <- get_demographics(con, snapshot)

# remove collect and select
# Convert to a sparklyr table, bring it into R, and keep the analysis columns.
demo_df1 <- sparkr_df_to_sparklyr_df(con, df, display_msg = TRUE) %>%
  collect() %>%
  select(
    PersonId, BirthDateTime, Sex, Ethnicity, Race,
    StateOrProvince, PostalOrZipCode, MaritalStatus
  )

# Left-join demographics onto the pancreatic cancer cohort by PersonId.
pan_demo <- merge(pan_liv_final2, demo_df1, by = "PersonId", all.x = TRUE)
#nrow(pan_demo)

In [36]:
# Derive age at the time of pancreatic diagnosis and the age group.
#
# AgeAtDiagnosis is the number of completed years between BirthDateTime and
# PanDt. floor() is used instead of round(): rounding would report a patient
# aged 44 years 7 months as 45 and let them pass age-based inclusion filters.
#
# Age_Group is derived once, with cut-points that agree with the labels
# (previously it was computed twice with conflicting boundaries, and age 65
# ended up labelled "Age 66-84"). Label values are unchanged. A missing age
# yields a missing group.
pan_demo <- pan_demo %>%
  mutate(
    AgeAtDiagnosis = floor(
      as.numeric(as.Date(PanDt) - as.Date(BirthDateTime)) / 365.25
    ),
    Age_Group = case_when(
      AgeAtDiagnosis < 45 ~ "Age < 45",
      AgeAtDiagnosis <= 65 ~ "Age 45-65",
      AgeAtDiagnosis <= 84 ~ "Age 66-84",
      AgeAtDiagnosis >= 85 ~ "Age >= 85"
    )
  )

In [37]:
# Keep the source race value as Race_old and derive a collapsed Race column.
# Values not listed in any branch become NA.
pan_demo <- pan_demo %>%
  rename("Race_old" = "Race") %>%
  mutate(
    Race = case_when(
      Race_old == "White" ~ "White",
      Race_old == "Black or African American" ~ "Black",
      Race_old %in% c(
        "Asian",
        "American Indian or Alaska Native",
        "Native Hawaiian or Other Pacific Islander",
        "Other Race"
      ) ~ "Other Race",
      Race_old == "Unknown" ~ "Unknown"
    )
  )

colnames(pan_demo)

In [38]:
# Create a separate Region variable.
#
# Maps StateOrProvince (full state names) to the four US Census regions and
# stores the result as a factor. "District of Columbia" is included in the
# South, as in the Census definition; previously it fell through to "Unknown".
# Any value not listed (including missing values) is labelled "Unknown".
pan_demo <- pan_demo %>%
  mutate(Region = as.factor(case_when(
    StateOrProvince %in% c("Connecticut", "Maine", "Massachusetts", "New Hampshire",
                           "Rhode Island", "Vermont", "New Jersey", "New York", "Pennsylvania") ~ "Northeast",
    StateOrProvince %in% c("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin",
                           "Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska",
                           "North Dakota", "South Dakota") ~ "Midwest",
    StateOrProvince %in% c("Delaware", "District of Columbia", "Florida", "Georgia", "Maryland",
                           "North Carolina", "South Carolina", "Virginia", "West Virginia",
                           "Alabama", "Kentucky", "Mississippi", "Tennessee",
                           "Arkansas", "Louisiana", "Oklahoma", "Texas") ~ "South",
    StateOrProvince %in% c("Arizona", "Colorado", "Idaho", "Montana", "Nevada",
                           "New Mexico", "Utah", "Wyoming", "Alaska", "California",
                           "Hawaii", "Oregon", "Washington") ~ "West",
    TRUE ~ "Unknown" # Default case for states not listed
  )))

# View result
#head(cases_control_allvar)


#### Data Cleaning

In [8]:
# Data issue: keep only subjects aged 45 or older at diagnosis
# (rows with a missing age are dropped as well).
pan_demo1 <- filter(pan_demo, AgeAtDiagnosis >= 45)
nrow(pan_demo1)

In [9]:
# Drop subjects whose death date is on or before PanDt; subjects without a
# death date are kept.
pan_demo1a <- pan_demo1 %>%
  filter(is.na(DTHDT) | as.Date(DTHDT) > as.Date(PanDt))
nrow(pan_demo1a)

In [10]:
# Drop subjects whose first occurrence of any metastasis (including liver
# metastasis) is dated before PanDt; subjects without a metastasis date are kept.
pan_demo1b <- pan_demo1a %>%
  filter(is.na(AllMetDt) | as.Date(AllMetDt) >= as.Date(PanDt))
nrow(pan_demo1b)

In [42]:
# Remove censored subjects (CNSR == 1) whose analysis date (last encounter
# date) is before the pancreatic cancer start date.
# The dates are compared directly instead of testing AVAL < 0: AVAL adds one
# day and is rounded to two decimals of a year, so an analysis date one or two
# days before the start date produced AVAL == 0 and was not removed.
pan_demo1c <- pan_demo1b %>%
  filter(!(as.Date(ADT) < as.Date(STARTDT) & CNSR == 1))
nrow(pan_demo1c)

In [43]:
# Final cohort: exclude subjects with Unknown sex and subjects whose
# pancreatic location is Endocrine (rows with missing values in either
# column are excluded too).
RawMainDataset <- filter(
  pan_demo1c,
  Sex != "Unknown",
  PanLocation != "Endocrine"
)

# <mark>**Trial**</mark>

In [None]:
# Select control : Subjects with No Liver metastases or death during the whole study period of (2years from the date of PanDt)
#
# Trial cell: builds a control group and stacks it under `cases`.
# NOTE(review): `data` and `cases` are not assigned anywhere in the visible
# notebook; they must exist in the session before this cell runs.
# NOTE(review): the header above mentions "or death" and "2 years", but the
# filter below only requires a missing LivDt, and the later thresholds use
# 1 year — confirm which definition is intended.

control <- data %>% filter(is.na(LivDt))
control$indicator <- as.numeric(0)

# Convert R DataFrame to Spark DataFrame
# and register it as the temporary view "control1" for the SQL below.
PanData <- as.DataFrame(control)
createOrReplaceTempView(PanData, "control1")

# TotalEnc: Number of diagnosis encounter between PanDt and Analysis Date (Death/LastEnc)
sql8 <-
"
    SELECT e.PersonId, count(distinct e.DiagEncStart) as TotalEnc
    FROM control1 p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE (e.DiagEncStart >= p.PanDt AND e.DiagEncStart <= p.ADT)
    GROUP BY e.PersonId
"

Follow_Up <- load_sql_table(con, snapshot, sql8, view_name='Follow_Up',output_mode = "sparklyr") %>% collect()

# merge Pan data with Follow-up
# (left join: controls without any follow-up encounter get TotalEnc = NA)
control2 <- merge(control,Follow_Up,by="PersonId",all.x=TRUE)

# Control with followup of atleast 1 years and minimum 2 diagnosis
# (rows with missing TotalEnc or AVAL are excluded explicitly)
control3 <- control2[control2$TotalEnc>=2 & control2$AVAL > 1 & !is.na(control2$TotalEnc) & !is.na(control2$AVAL),]

# Subjects who are alive till 1 years from PanDt
library(lubridate)

# Keep controls that either did not die (CNSR != 2) or died more than one year
# after PanDt.
# NOTE(review): adding years(1) to a 29 February date may give NA — confirm.
control4 <- control3[control3$CNSR != 2 | as.Date(control3$DTHDT) > as.Date(control3$PanDt) + years(1), ]
# Drops the 40th column by position — presumably TotalEnc, so that the columns
# line up with `cases` for rbind(); TODO confirm, a positional drop breaks
# silently if the column layout changes.
control4 <- control4[,-40]

cases_control <- rbind(cases,control4) 
nrow(cases_control)


In [35]:
# Summary statistics
# of follow-up time (AVAL, in years) for `cases`.
# NOTE(review): `cases` is not assigned in the visible notebook.
summary(cases$AVAL)  # Gives Min, 1st Quartile, Median, Mean, 3rd Quartile, and Max

In [36]:
# Spread of follow-up time (AVAL) among `cases`.
IQR(cases$AVAL)      # Computes the Interquartile Range (IQR)

In [39]:
# Boxplot of follow-up time (AVAL, in years) for the control group.
# The default y-axis is suppressed (yaxt = "n") and replaced by custom ticks
# every 0.5 years over the full plotted range; previously the custom ticks
# stopped at 2.5 although ylim extends to 5.5, and were drawn on top of the
# default axis.
boxplot(control$AVAL,
        main = "Time in years(Pandt to last analysis date for control)",
        ylab = "AVAL",
        ylim = c(0, 5.5),
        yaxt = "n")

# Custom Y-axis tick marks covering the whole ylim range
axis(2, at = seq(0, 5.5, by = 0.5))

In [40]:
# Summary statistics
# of follow-up time (AVAL, in years) for the control group.
summary(control$AVAL)  # Gives Min, 1st Quartile, Median, Mean, 3rd Quartile, and Max

In [41]:
# Spread of follow-up time (AVAL) in the control group.
IQR(control$AVAL)      # Computes the Interquartile Range (IQR)

In [51]:
# Report how many control candidates in control2 satisfy each combination of
# a minimum number of diagnosis encounters (2 or 3) and a minimum follow-up
# (AVAL greater than 1, 2 or 3 years). Rows with a missing TotalEnc or AVAL
# are not counted.
# The six lines are generated in a loop so the item numbers run 1..6; the
# hand-written version repeated the numbers 2 and 3.
control_thresholds <- expand.grid(min_enc = 2:3, min_years = 1:3)
for (item in seq_len(nrow(control_thresholds))) {
  min_enc <- control_thresholds$min_enc[item]
  min_years <- control_thresholds$min_years[item]
  n_subjects <- sum(
    control2$TotalEnc >= min_enc & control2$AVAL > min_years,
    na.rm = TRUE
  )
  cat(
    paste0(
      item, ". subjects with atleast ", min_enc,
      " diagnosis encounters and a follow-up period of atleast ",
      min_years, " years:"
    ),
    n_subjects, "\n"
  )
}

#### Data Driven Approach to select top medication or condition in the snapshot cohort

In [7]:
# Nortriptyline: RxNorm code set (listed codes and their descendants).
# codeset() returns the data frame and, through view_name, also registers it
# as a SparkSQL view.
notriptylin_treatment_codes <- codeset(
  con, snapshot, "RxNorm", "selfAndDescendants",
  "198045", "198046", "198047", "312036", "317136",
  view_name = "notriptylin_treatment_codes"
)

display_df(notriptylin_treatment_codes)

In [5]:
# Data Driven Approach to check the top medications in the cohort
# Ranks medication concepts in MedicationAdministration by the number of
# distinct persons and keeps the 10 most frequent (LIMIT 10).

#medication_dataset <- MedicationAdministration
#medication_dataset <- MedicationDispense
#medication_dataset <- MedicationRequest

sql <-"
SELECT mcc.CodeConceptId, COUNT(DISTINCT m.PersonId) AS PersonCount
FROM MedicationAdministration m 
INNER JOIN MedicationCodeConceptMap mcc 
    ON m.CodeConceptMapId = mcc.Id
GROUP BY mcc.CodeConceptId
ORDER BY PersonCount DESC
LIMIT 10

"
topDrug <- load_sql_table(con, snapshot, sql, view_name='topDrug',output_mode = "sparklyr") 
display_df(topDrug)

In [7]:
# Data Driven Approach to check the top conditions in the cohort
# Ranks condition concepts by the number of distinct persons and keeps the
# 10 most frequent (LIMIT 10).

sql <-"
    SELECT ccm.CodeConceptId, COUNT(DISTINCT c.PersonId) AS PersonCount
    FROM condition c
     INNER JOIN ConditionCodeConceptMap ccm 
        ON c.CodeConceptMapId = ccm.Id
    GROUP BY CodeConceptId 
    ORDER BY PersonCount DESC
    LIMIT 10
"
topCond <- load_sql_table(con, snapshot, sql, view_name='topCond',output_mode = "sparklyr") 
display_df(topCond)

In [None]:
# Data Driven Approach to check top 5 medication in the cohort
#
# Builds (but does not run) a query returning the 5 most frequent medication
# codes, by distinct person count, for each of the three medication tables.
# Fixes over the earlier draft:
# - the R assignment `medication_dataset <- MedicationAdministration` referred
#   to an R object that is never defined; it is kept commented out like the
#   other candidates;
# - `GROUP BY ConceptCode PersonCount` was a syntax error (and an aggregate
#   cannot be a grouping key); each CTE now groups by the code only;
# - the WITH clause had no main query; a UNION ALL over the three CTEs was
#   added, tagged with the table each row comes from.
# NOTE(review): the executed top-10 cell selects mcc.CodeConceptId from
# MedicationCodeConceptMap; confirm that the map also exposes ConceptCode.
#medication_dataset <- MedicationAdministration
#medication_dataset <- MedicationDispense
#medication_dataset <- MedicationRequest

sql <-"
WITH adm AS 
(
    SELECT mcc.ConceptCode, COUNT(DISTINCT m.PersonId) AS PersonCount
    FROM MedicationAdministration m 
     INNER JOIN MedicationCodeConceptMap mcc 
        ON m.CodeConceptMapId = mcc.Id
    GROUP BY mcc.ConceptCode
    ORDER BY PersonCount DESC
    LIMIT 5
),

Disp AS 
(
    SELECT mcc.ConceptCode, COUNT(DISTINCT m.PersonId) AS PersonCount
    FROM MedicationDispense m 
     INNER JOIN MedicationCodeConceptMap mcc 
        ON m.CodeConceptMapId = mcc.Id
    GROUP BY mcc.ConceptCode
    ORDER BY PersonCount DESC
    LIMIT 5
),

Req AS
(
    SELECT mcc.ConceptCode, COUNT(DISTINCT m.PersonId) AS PersonCount
    FROM MedicationRequest m 
     INNER JOIN MedicationCodeConceptMap mcc 
        ON m.CodeConceptMapId = mcc.Id
    GROUP BY mcc.ConceptCode
    ORDER BY PersonCount DESC
    LIMIT 5
)

SELECT 'MedicationAdministration' AS SourceTable, ConceptCode, PersonCount FROM adm
UNION ALL
SELECT 'MedicationDispense' AS SourceTable, ConceptCode, PersonCount FROM Disp
UNION ALL
SELECT 'MedicationRequest' AS SourceTable, ConceptCode, PersonCount FROM Req

"

In [None]:
# Temporary check of the LastEnc date issue: for a fixed list of persons,
# compare the earliest/latest condition dates with the earliest/latest
# encounter start dates.

# Persons under investigation; the same list filters both CTEs.
temp_person_ids <- c(
  "c8f21ab0-854d-3215-7edc-f86318febb67",
  "cb0e997b-c279-4481-09d4-ca7fb12188eb",
  "e30f780e-15a7-14ca-2bc0-92203794434c",
  "ed89a453-3b35-fa56-af8b-65047eebfd7c",
  "18474cdf-9794-60a1-1ea2-7c51116734b5",
  "2cfbe1b5-7823-3c35-0d03-240de7404328",
  "3182469f-b840-e68e-0ad3-f46a43299026"
)
# Quoted, comma-separated form for the SQL IN (...) lists.
temp_person_id_list <- paste0("'", temp_person_ids, "'", collapse = ", ")

# The second CTE needs the AS keyword (`enc AS (...)`), otherwise the
# statement does not parse.
# NOTE(review): `encounterF` is kept as written; confirm it is an existing
# view and not a typo for `encounter`.
sql_temp <- paste0("
WITH cond AS
(
    SELECT PersonId,
           MIN(COALESCE(OnsetDateTime, RecordedDateTime)) AS CondMin,
           MAX(COALESCE(OnsetDateTime, RecordedDateTime)) AS CondMax
    FROM condition
    WHERE PersonId IN (", temp_person_id_list, ")
    GROUP BY PersonId
),

enc AS
(
    SELECT PersonId, MIN(StartDateTime) AS EncMin, MAX(StartDateTime) AS EncMax
    FROM encounterF
    WHERE PersonId IN (", temp_person_id_list, ")
    GROUP BY PersonId
)

SELECT a.PersonId, a.CondMin, a.CondMax, b.EncMin, b.EncMax
FROM cond a
LEFT JOIN enc b
    ON a.PersonId = b.PersonId
")

TempEncCond <- load_sql_table(
  con, snapshot, sql_temp,
  view_name = "TempEncCond",
  output_mode = "sparklyr"
)

#### <u>_Frequency check_</u>

In [44]:
data <- RawMainDataset

# Print how many rows of `df` have at least 1, 3 and 5 encounters in `column`.
#
# df          data frame holding the encounter-count columns
# column      name of the encounter-count column (string)
# description text placed between the threshold and "Encounters" in the label
# window_days number shown in the "Pandt - <n>D" part of the label
#
# Missing counts are ignored (na.rm = TRUE). Returns the three counts
# invisibly, named by threshold.
report_enc_counts <- function(df, column, description, window_days) {
  thresholds <- c(1, 3, 5)
  counts <- vapply(
    thresholds,
    \(k) sum(df[[column]] >= k, na.rm = TRUE),
    integer(1)
  )
  for (i in seq_along(thresholds)) {
    # Only the ">= 1" line ends with a colon in the established report text.
    suffix <- if (thresholds[i] == 1) ":" else ""
    label <- paste0(
      "Patients with >= ", thresholds[i], " ", description,
      " Encounters where Enc is < Pandt - ", window_days, "D", suffix
    )
    cat(label, counts[i], "\n")
  }
  invisible(stats::setNames(counts, thresholds))
}

# All diagnosis encounters, 90-day and 30-day columns.
report_enc_counts(data, "diagEnc90", "Diagnosis", 90)
report_enc_counts(data, "diagEnc30", "Diagnosis", 30)

# Outpatient diagnosis encounters.
report_enc_counts(data, "diagEnc90Out", "Outpatients Diagnosis", 90)
report_enc_counts(data, "diagEnc30Out", "Outpatients Diagnosis", 30)

# Outpatient "only type" diagnosis encounters (label wording differs between
# the 90-day and 30-day lines and is kept as-is).
report_enc_counts(data, "diagEnc90OutTemp", "Outpatients Diagnosis(Only Type)", 90)
report_enc_counts(data, "diagEnc30OutTemp", "Outpatients(Only Type) Diagnosis", 30)

In [45]:
# Attrition-style counts on the full raw dataset. Every count uses
# sum(..., na.rm = TRUE), so a row whose combined condition evaluates to NA
# (e.g. a missing PanEncounterType alongside TRUE conditions) is not counted.
data <- RawMainDataset

cat("Total pts in Full data :", nrow(data), "\n")

local({
  dx_2_plus <- data$diagEnc90 >= 2
  dx_3_plus <- data$diagEnc90 >= 3
  outpt_1_plus <- data$diagEnc90OutTemp >= 1
  not_emergency <- data$PanEncounterType != "Emergency"

  cat("Total Final patient with >= 2 diagnosis encounter:",
      sum(dx_2_plus, na.rm = TRUE), "\n")

  cat("Total Final patient with >= 2 diagnosis encounter & remove patient with emergency pancreatic diagnosis:",
      sum(dx_2_plus & not_emergency, na.rm = TRUE), "\n")

  cat("Total Final patient with >= 2 diagnosis encounter & >= 1 outpatient diagnosis encounter visit:",
      sum(dx_2_plus & outpt_1_plus, na.rm = TRUE), "\n")

  cat("Total Final patient with >= 3 diagnosis encounter & >= 1 outpatient diagnosis encounter visit:",
      sum(dx_3_plus & outpt_1_plus, na.rm = TRUE), "\n")

  cat("Total Final patient with >= 2 diagnosis encounter & >= 1 outpatient diagnosis encounter visit and remove patient with emergency pancreatic diagnosis:",
      sum(dx_2_plus & outpt_1_plus & not_emergency, na.rm = TRUE), "\n")

  cat("Total Final patient with >= 3 diagnosis encounter & >= 1 outpatient diagnosis encounter visit and remove patient with emergency pancreatic diagnosis:",
      sum(dx_3_plus & outpt_1_plus & not_emergency, na.rm = TRUE), "\n")
})

cat("Count of patient with Liver metastases diagnosis >=1 after 2018(All data):", sum(data$LivMetN >= 1, na.rm = TRUE), "\n")

cat("Count of patient with Liver metastases diagnosis >=2 after 2018(All data):", sum(data$LivMetN >= 2, na.rm = TRUE), "\n")

cat("Count of patients with Liver metastases at Baseline(All data):", sum(data$LivDtFlBase == "Y", na.rm = TRUE), "\n")

cat("Count of patients with Liver metastases after 30 days of Pancreatic cancer(All data):", sum(data$LivDtFl30 == "Y", na.rm = TRUE), "\n")

cat("Count of patients with Liver metastases anytime on or after Pancreatic cancer(All data):", sum(data$LivDtFl == "Y", na.rm = TRUE), "\n")

# na.rm must be the logical TRUE, not the string "TRUE".
cat("Patients with death Record(All data):", sum(data$DTHFL == "Y", na.rm = TRUE), "\n")


In [19]:
# Subjects with >= 2 diagnosis encounters, >= 1 outpatient diagnosis encounter
# and a non-emergency pancreatic encounter type. A missing value in any of
# the three columns excludes the row.
temp <- local({
  src <- pan_liv_final1a
  dx_ok <- !is.na(src$diagEnc90) & src$diagEnc90 >= 2
  outpt_ok <- !is.na(src$diagEnc90OutTemp) & src$diagEnc90OutTemp >= 1
  type_ok <- !is.na(src$PanEncounterType) & src$PanEncounterType != "Emergency"
  src[dx_ok & outpt_ok & type_ok, ]
})

In [20]:
# Summary counts for `temp` (>= 2 diagnosis encounters, >= 1 outpatient
# diagnosis encounter, non-emergency pancreatic diagnosis).
cat("Total subject with diag enc >= 2 and OutDiagEnc >=1 and no emergency Diagnosis:", nrow(temp), "\n")

cat("Count of patients with Liver metastases at Baseline:", sum(temp$LivDtFlBase == "Y", na.rm = TRUE), "\n")

cat("Count of patients with Liver metastases after 30 days of Pancreatic cancer:", sum(temp$LivDtFl30 == "Y", na.rm = TRUE), "\n")

cat("Count of patients with Liver metastases anytime on or after Pancreatic cancer:", sum(temp$LivDtFl == "Y", na.rm = TRUE), "\n")

# na.rm must be the logical TRUE, not the string "TRUE".
cat("Patients with death Record:", sum(temp$DTHFL == "Y", na.rm = TRUE), "\n")

# Death on or after the liver-metastases date; both dates must be present.
# NOTE(review): LivDt and DTHDT are compared as stored (no as.Date()) — confirm
# they share a comparable type/format.
cat(
  "Patients with death Record after liver metastases:",
  sum(
    temp$LivDt <= temp$DTHDT & !(is.na(temp$DTHDT)) & !(is.na(temp$LivDt)) &
      temp$LivDtFl == "Y",
    na.rm = TRUE
  ),
  "\n"
)

In [46]:
# Same summary on the raw dataset, without the emergency-type exclusion.
temp2 <- RawMainDataset
# Rows with >= 2 diagnosis encounters and >= 1 outpatient diagnosis encounter;
# missing values in either column exclude the row.
temp3 <- temp2[
  temp2$diagEnc90 >= 2 & !is.na(temp2$diagEnc90) &
    temp2$diagEnc90OutTemp >= 1 & !is.na(temp2$diagEnc90OutTemp),
]

# NOTE(review): this prints the row count of the whole of temp2; confirm the
# "anymetdt" wording of the label matches how RawMainDataset was built.
cat("Count of patients with either anymetdt missing or less than Pandt:", nrow(temp2), "\n")
cat("Total subject with diag enc >= 2 and OutDiagEnc >=1:", nrow(temp3), "\n")
cat("Count of patients with Liver metastases at Baseline:", sum(temp3$LivDtFlBase == "Y", na.rm = TRUE), "\n")

cat("Count of patients with Liver metastases after 30 days of Pancreatic cancer:", sum(temp3$LivDtFl30 == "Y", na.rm = TRUE), "\n")

cat("Count of patients with Liver metastases anytime on or after Pancreatic cancer:", sum(temp3$LivDtFl == "Y", na.rm = TRUE), "\n")

# na.rm must be the logical TRUE, not the string "TRUE".
cat("Patients with death Record:", sum(temp3$DTHFL == "Y", na.rm = TRUE), "\n")

# Death on or after the liver-metastases date; both dates must be present.
cat(
  "Patients with death Record after liver metastases:",
  sum(
    temp3$LivDt <= temp3$DTHDT & !(is.na(temp3$DTHDT)) & !(is.na(temp3$LivDt)) &
      temp3$LivDtFl == "Y",
    na.rm = TRUE
  ),
  "\n"
)


In [26]:
# Number of temp3 rows whose PrimaryDiagnosisConceptId equals 1200405
# (rows with a missing id are not counted).
length(which(temp3$PrimaryDiagnosisConceptId == 1200405))

In [27]:
# Number of temp3 rows whose pancreatic encounter type is "Emergency"
# (missing types are not counted).
cat(
  "emergency pancreatic cancer diagnosis:",
  length(which(temp3$PanEncounterType == "Emergency"))
)

In [None]:
# Bar chart of liver-metastases diagnosis timing relative to the pancreatic
# diagnosis date, for the subjects in AnalysisData1.
# AnalysisData1 must already be defined when this cell runs. The category
# factor is assigned to LivMetCount only, so AnalysisData1 is left untouched.
LivMetCount <- local({
  has_liv <- !is.na(AnalysisData1$LivDt)
  liv_dt <- as.Date(AnalysisData1$LivDt)
  pan_dt <- as.Date(AnalysisData1$PanDt)

  # Categories are checked from latest to earliest; a row with LivDt present
  # but an NA date comparison (e.g. missing PanDt) stays NA.
  factor(
    ifelse(
      has_liv & liv_dt > pan_dt + 30, "LivDt > PanDt + 30D",
      ifelse(
        has_liv & liv_dt <= pan_dt + 30 & liv_dt > pan_dt, "PanDt < LivDt <= PanDt + 30D",
        ifelse(
          has_liv & liv_dt == pan_dt, "LivDt = PanDt",
          ifelse(has_liv & liv_dt < pan_dt, "LivDt < PanDt", "LivDt Missing")
        )
      )
    ),
    # Fixed levels so empty categories still appear in the table and plot.
    levels = c(
      "LivDt > PanDt + 30D",
      "PanDt < LivDt <= PanDt + 30D",
      "LivDt = PanDt",
      "LivDt < PanDt",
      "LivDt Missing"
    )
  )
})

# Count occurrences for each category
LivMetCount_table <- table(LivMetCount)

# One colour per category, in level order
colors <- c("skyblue", "orange", "lightgreen", "pink", "blue")

# Draw the bars; barplot() returns the bar midpoints used for the labels
bar_positions <- barplot(
  LivMetCount_table,
  col = colors,
  main = "Histogram of Liver Metastases Diagnosis Date",
  xlab = "Groups",
  ylab = "Count of Subjects",
  ylim = c(0, max(LivMetCount_table) + 5) # head-room for the count labels
)

# Add counts above each bar
text(bar_positions, LivMetCount_table + 1, labels = LivMetCount_table, col = "black", cex = 0.8)

# Add legend in the upper-right corner
legend("topright", legend = names(LivMetCount_table), fill = colors, title = "Groups")


### Objective 2 Analysis of Diagnoses stage

In [None]:
# Step 4: subjects with >= 3 diagnosis encounters and >= 1 outpatient
# diagnosis encounter. Missing counts exclude the row.
AnalysisData1 <- local({
  dx_ok <- !is.na(pan_liv_final2$diagEnc90) & pan_liv_final2$diagEnc90 >= 3
  outpt_ok <- !is.na(pan_liv_final2$diagEnc90OutTemp) &
    pan_liv_final2$diagEnc90OutTemp >= 1
  pan_liv_final2[dx_ok & outpt_ok, ]
})
nrow(AnalysisData1)

##### Distribution of OnsetDt and RecordedDt

In [None]:

# First pancreatic condition row per person (ROW_NUMBER() = 1 when ordered by
# COALESCE(OnsetDateTime, RecordedDateTime)), restricted to rows that have an
# OnsetDateTime, with both the onset and the recorded timestamps returned.
sql <- "
    SELECT 
        PersonId, OnsetDateTime, RecordedDateTime, RowNum
    FROM(
        SELECT PersonId, OnsetDateTime, RecordedDateTime,
        ROW_NUMBER() OVER (PARTITION BY PersonId ORDER BY COALESCE(OnsetDateTime, RecordedDateTime)) AS RowNum
    FROM condition c
    WHERE c.CodeConceptMapId IN (
        SELECT Id 
        FROM ConditionCodeConceptMap 
        WHERE CodeConceptId IN (SELECT ConceptId FROM pancreatic_cond_codes)  
    AND (SourceConceptId = 2703595 OR SourceConceptId = 2703594))
    AND OnsetDateTime IS NOT NULL 
    )
    subquery
    WHERE RowNum = 1
"
# Run the query and pull the result into a local data frame.
DiffOnsetRecData <- collect(
  load_sql_table(
    con, snapshot, sql,
    view_name = "DiffOnsetRecData",
    output_mode = "sparklyr"
  )
)

In [None]:
# Restrict to AnalysisData1 subjects and compute the onset-minus-recorded
# difference in days; rows where the difference is NA are dropped.
DiffOnsetRecData <- DiffOnsetRecData %>%
  filter(PersonId %in% AnalysisData1$PersonId) %>%
  mutate(
    DiffOnsetRecord = as.numeric(as.Date(OnsetDateTime) - as.Date(RecordedDateTime))
  ) %>%
  filter(!is.na(DiffOnsetRecord))
nrow(DiffOnsetRecData)

In [None]:
# Preview DiffOnsetRecData (second argument 10 — presumably the row limit).
display_df(DiffOnsetRecData,10)

In [None]:
# Mean, median, maximum and minimum of the onset-minus-recorded difference
# (days), printed on one tab-separated line.
with(
  DiffOnsetRecData,
  cat(
    "Mean Difference:", mean(DiffOnsetRecord), "\t",
    "Median Difference:", median(DiffOnsetRecord), "\t",
    "Maximum difference", max(DiffOnsetRecord), "\t",
    "Minimum Difference", min(DiffOnsetRecord)
  )
)

In [None]:

# Summary statistics of the onset-minus-recorded difference (days).
local({
  gap_days <- DiffOnsetRecData$DiffOnsetRecord
  cat(
    "Mean Difference:", mean(gap_days), "\t",
    "Median Difference:", median(gap_days), "\t",
    "Maximum Difference:", max(gap_days), "\t",
    "Minimum Difference:", min(gap_days)
  )
})


In [None]:
# Three views of the onset-minus-recorded difference: histogram (1-unit bins),
# density curve and box plot. Each plot is a top-level expression.

# Histogram
ggplot(DiffOnsetRecData, aes(x = DiffOnsetRecord)) +
  geom_histogram(fill = "skyblue", color = "black", binwidth = 1) +
  theme_minimal() +
  labs(x = "Time Difference", y = "Count", title = "Distribution of Time Difference")

# Density
ggplot(DiffOnsetRecData, aes(x = DiffOnsetRecord)) +
  geom_density(alpha = 0.5, fill = "lightblue") +
  theme_minimal() +
  labs(x = "Time Difference", y = "Density", title = "Density Plot of Time Difference")

# Box plot
ggplot(DiffOnsetRecData, aes(y = DiffOnsetRecord)) +
  geom_boxplot(color = "black", fill = "lightgreen") +
  theme_minimal() +
  labs(y = "Time Difference", title = "Box Plot of Time Difference")

In [None]:
# Source of the first pancreatic diagnosis date, by the OnsetDtFLPan flag:
# "Y" rows are reported as onset date, "N" rows as recorded date-time.
# Rows with a missing flag are in neither count.
cat(
  "pts with pancreatic first diagnosis code comes from onsetDate",
  length(which(AnalysisData1$OnsetDtFLPan == "Y")), "\n"
)
cat(
  "pts with pancreatic first diagnosis code comes from RecordeddateTime",
  length(which(AnalysisData1$OnsetDtFLPan == "N")), "\n"
)


In [None]:
# Total liver-metastases subjects in AnalysisData1, counted as rows with a
# non-missing LivDt.
sum(!is.na(AnalysisData1$LivDt))

<mark>Histogram</mark>

In [None]:
# Counts of subjects whose code date falls on/before PanDt + 30 days (i.e.
# before the pancreatic diagnosis or within the following 30 days). Rows with
# a missing date are not counted.
local({
  window_end <- as.Date(AnalysisData1$PanDt) + 30

  # Any metastases code in the window (stage 4)
  cat(
    "Liver, Colon, Gastric, Abdomen Metastases:",
    sum(as.Date(AnalysisData1$AllMetDt) <= window_end, na.rm = TRUE), "\n"
  )

  # Lymph node code in the window (stage 3)
  cat(
    "Lymp Node:",
    sum(as.Date(AnalysisData1$LympNodeDt) <= window_end, na.rm = TRUE), "\n"
  )
})

# Either code in the window (stage 4 + stage 3), kept for reference:
#cat("Stage 3 + Stage 4:", sum((as.Date(AnalysisData1$LympNodeDt) <= as.Date(AnalysisData1$PanDt) + 30) | (as.Date(AnalysisData1$AllMetDt) <= as.Date(AnalysisData1$PanDt) + 30), na.rm=TRUE))

In [None]:
# Subjects with NO lymph-node or metastases code on/before PanDt + 30 days
# whose liver metastases date is more than 30 days after PanDt.
local({
  window_end <- as.Date(AnalysisData1$PanDt) + 30

  # `%in% TRUE` turns an NA comparison (missing date) into FALSE. Without it,
  # a subject with no lymph-node date gives !(NA | FALSE) = NA and is dropped
  # from the count even though no early code exists.
  node_early <- (as.Date(AnalysisData1$LympNodeDt) <= window_end) %in% TRUE
  met_early <- (as.Date(AnalysisData1$AllMetDt) <= window_end) %in% TRUE
  liv_late <- (as.Date(AnalysisData1$LivDt) > window_end) %in% TRUE

  # The label states what is counted: the complement of stage 3/4 near
  # diagnosis, restricted to late liver metastases.
  cat(
    "No Stage 3/Stage 4 code by PanDt + 30D and LivDt > PanDt + 30D:",
    sum(!(node_early | met_early) & liv_late)
  )
})

#### Remove Subject with Pancreatic patient Diagnosis Type as "Emergency"

In [None]:
# Step 5: >= 3 diagnosis encounters, >= 1 outpatient diagnosis encounter, and
# the first pancreatic encounter type is not "Emergency". Rows with a missing
# encounter type are kept; rows with missing encounter counts are dropped.
AnalysisData2 <- local({
  dx_ok <- !is.na(pan_liv_final2$diagEnc90) & pan_liv_final2$diagEnc90 >= 3
  outpt_ok <- !is.na(pan_liv_final2$diagEnc90OutTemp) &
    pan_liv_final2$diagEnc90OutTemp >= 1
  is_emergency <- pan_liv_final2$PanEncounterType %in% "Emergency"
  pan_liv_final2[dx_ok & outpt_ok & !is_emergency, ]
})
nrow(AnalysisData2)

In [None]:
# Categorise each AnalysisData2 subject by when the liver-metastases date
# falls relative to the pancreatic diagnosis date, then draw a bar chart.
LivMetCount <- local({
  has_liv <- !is.na(AnalysisData2$LivDt)
  liv_dt <- as.Date(AnalysisData2$LivDt)
  pan_dt <- as.Date(AnalysisData2$PanDt)

  # Start from NA: a row with LivDt present but an NA date comparison
  # (e.g. missing PanDt) keeps NA.
  timing <- rep(NA_character_, length(has_liv))
  timing[!has_liv] <- "LivDt Missing"
  timing[which(has_liv & liv_dt < pan_dt)] <- "LivDt < PanDt"
  timing[which(has_liv & liv_dt == pan_dt)] <- "LivDt = PanDt"
  timing[which(has_liv & liv_dt > pan_dt & liv_dt <= pan_dt + 30)] <-
    "PanDt < LivDt <= PanDt + 30D"
  timing[which(has_liv & liv_dt > pan_dt + 30)] <- "LivDt > PanDt + 30D"

  # Fixed levels so that empty categories still appear.
  factor(
    timing,
    levels = c(
      "LivDt > PanDt + 30D",
      "PanDt < LivDt <= PanDt + 30D",
      "LivDt = PanDt",
      "LivDt < PanDt",
      "LivDt Missing"
    )
  )
})

# Frequency of each category
LivMetCount_table <- table(LivMetCount)

# One colour per category, in level order
colors <- c("skyblue", "orange", "lightgreen", "pink", "blue")

# barplot() returns the bar midpoints, reused below to place the labels
bar_positions <- barplot(
  LivMetCount_table,
  main = "Histogram of Liver Metastases Date for Analysis Data 2",
  xlab = "Groups",
  ylab = "Count of Subjects",
  col = colors,
  ylim = c(0, max(LivMetCount_table) + 5) # head-room for the count labels
)

# Count label just above each bar
text(bar_positions, LivMetCount_table + 1, labels = LivMetCount_table, col = "black", cex = 0.8)

# Legend in the upper-right corner
legend("topright", legend = names(LivMetCount_table), fill = colors, title = "Groups")


In [None]:
# Staging-related counts for AnalysisData2, then a labelled bar chart.
df <- AnalysisData2

condition_counts <- local({
  pan_dt <- as.Date(df$PanDt)
  window_end <- pan_dt + 30
  met_dt <- as.Date(df$AllMetDt)
  node_dt <- as.Date(df$LympNodeDt)
  death_dt <- as.Date(df$DTHDT)

  # Count TRUE values; NA comparisons (missing dates) are ignored.
  n_true <- function(flag) sum(flag, na.rm = TRUE)

  data.frame(
    Condition = c(
      "Presence of metastatic cancer Date <= PanDt + 30D",
      "Presence of Lymph Node <= PanDt + 30D",
      "Lymph node or metastatic code reasonably near PanDt",
      "Pts with any metastases after Panreatic diagnosis",
      "Death without any metastases"
    ),
    Count = c(
      n_true(met_dt <= window_end),
      n_true(node_dt <= window_end),
      n_true(met_dt <= window_end | node_dt <= window_end),
      n_true(met_dt > window_end),
      n_true(is.na(df$AllMetDt) & death_dt > pan_dt)
    )
  )
})

# Bar per condition with the count printed above it; legend at the bottom.
table2_2 <- ggplot(condition_counts, aes(x = Condition, y = Count, fill = Condition)) +
  geom_col() +
  geom_text(aes(label = Count), size = 5, vjust = -0.5) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  ) +
  labs(
    title = "Histogram of Patient Conditions (analysis data 2)",
    x = "Condition",
    y = "Count",
    fill = "Condition Type"
  )


In [None]:
# Render table2_2 through the notebook's display_plot() helper with width 8 and height 9.
display_plot(table2_2 , width = 8, height = 9)

In [None]:
# Show the condition/count table behind the chart.
condition_counts

In [None]:
    # Patients with a metastatic cancer date (AllMetDt present) or a death date after PanDt.
    # A row with no AllMetDt and a missing DTHDT/PanDt evaluates to NA and is not counted.
    sum(!is.na(AnalysisData2$AllMetDt) | as.Date(AnalysisData2$DTHDT) > as.Date(AnalysisData2$PanDt), na.rm=TRUE)

#### Time-To-Event data

In [None]:
# Time-to-event analysis data: keep subjects with no liver metastases date
# (censored) or with LivDtFl30 == "Y" (liver metastases reasonably after the
# pancreatic diagnosis, the event).
# `%in% "Y"` is FALSE for a missing flag; with `==`, an NA flag on a row that
# has a LivDt would yield an NA row index and insert an all-NA row.
# NOTE(review): this filter does not remove subjects who died on or before
# the pancreatic cancer date; that has to happen elsewhere if required.
AnalysisData3 <- AnalysisData2[
  AnalysisData2$LivDtFl30 %in% "Y" | is.na(AnalysisData2$LivDt),
]
nrow(AnalysisData3)

In [None]:
# Step 6.2 (data issue): subjects whose death date is on or before PanDt.
# na.rm = TRUE is required: subjects without a death date give NA, and a
# single NA would make the whole sum NA.
sum(as.Date(AnalysisData3$DTHDT) <= as.Date(AnalysisData3$PanDt), na.rm = TRUE)

In [None]:
# Staging-related counts for the time-to-event data (AnalysisData3), then a
# labelled bar chart.
df <- AnalysisData3

condition_counts <- local({
  pan_dt <- as.Date(df$PanDt)
  window_end <- pan_dt + 30
  met_dt <- as.Date(df$AllMetDt)
  node_dt <- as.Date(df$LympNodeDt)
  death_dt <- as.Date(df$DTHDT)
  liv_dt <- as.Date(df$LivDt)

  # Count TRUE values; NA comparisons (missing dates) are ignored.
  n_true <- function(flag) sum(flag, na.rm = TRUE)

  data.frame(
    Condition = c(
      "Presence of metastatic cancer Date <= PanDt + 30D",
      "Presence of Lymph Node Date <= PanDt + 30D",
      "Lymph node or metastatic code reasonably near PanDt",
      "Pts with any metastases after Panreatic diagnosis",
      "Death without any metastases",
      "Liver Metastases Diagnosis"
    ),
    Count = c(
      n_true(met_dt <= window_end),
      n_true(node_dt <= window_end),
      n_true(met_dt <= window_end | node_dt <= window_end),
      n_true(met_dt > window_end),
      n_true(is.na(df$AllMetDt) & death_dt > pan_dt),
      n_true(liv_dt > window_end)
    )
  )
})

# Bar per condition with the count printed above it; legend at the bottom.
table3_1 <- ggplot(condition_counts, aes(x = Condition, y = Count, fill = Condition)) +
  geom_col() +
  geom_text(aes(label = Count), size = 5, vjust = -0.5) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  ) +
  labs(
    title = "Histogram of Patient Conditions(time-to-event data)",
    x = "Condition",
    y = "Count",
    fill = "Condition Type"
  )


In [None]:
    # Patients with a metastatic cancer date (AllMetDt present), or with no AllMetDt and a death date after PanDt.
    # Rows where neither part can be evaluated (NA) are not counted.
    sum(!is.na(AnalysisData3$AllMetDt) | (is.na(AnalysisData3$AllMetDt)  & as.Date(AnalysisData3$DTHDT) > as.Date(AnalysisData3$PanDt)), na.rm=TRUE)

In [None]:
# Render table3_1 through the notebook's display_plot() helper with width 10 and height 10.
display_plot(table3_1 , width = 10, height = 10)

<mark>Graph</mark>

In [None]:
# Number of AnalysisData3 subjects with a non-missing death date.
sum(!is.na(AnalysisData3$DTHDT))

In [None]:
# Inspect subjects with a negative AVAL. which() drops NA comparisons, so a
# missing AVAL does not add an all-NA row to the preview.
display_df(
  AnalysisData3[
    which(AnalysisData3$AVAL < 0),
    c("PersonId", "PanDt", "DTHDT", "CNSR", "ADT", "AVAL")
  ],
  10
)

In [None]:
# Time to liver metastases.
# AVAL is the analysis time (plotted in years) and CNSR == 0 is coded as the
# event; records with non-positive or missing time are removed.
trial <- AnalysisData3 %>%
  filter(AVAL > 0) %>%
  transmute(time_year = AVAL, event = ifelse(CNSR == 0, 1, 0))

# Kaplan-Meier estimate, no covariates
S_met <- survfit(Surv(trial$time_year, trial$event) ~ 1)

plot(
  S_met,
  main = "Kaplan-Meier Curve: Time to Liver Metastases",
  xlab = "Time (Years)",
  xlim = c(0, 5)
)

In [None]:
# Kaplan-Meier estimates at times 1 to 5 (years on the time_year scale) for the current S_met.
summary(S_met, times = c(1, 2, 3, 4, 5))

In [None]:
# Time to death.
# Same layout as the liver-metastases analysis, but CNSR == 2 is coded as the
# event. Records with non-positive or missing time are removed.
trial <- AnalysisData3 %>%
  filter(AVAL > 0) %>%
  transmute(time_year = AVAL, event = as.numeric(CNSR == 2))

# Kaplan-Meier estimate, no covariates
S_met <- survfit(Surv(trial$time_year, trial$event) ~ 1)

plot(
  S_met,
  main = "Kaplan-Meier Curve: Time to Death",
  xlab = "Time (Years)",
  xlim = c(0, 5)
)

In [None]:
# Kaplan-Meier estimates at times 1 to 5 (years on the time_year scale) for the current S_met.
summary(S_met, times = c(1, 2, 3, 4, 5))

In [None]:
# Time to death or liver metastases (composite event).
# CNSR == 0 or CNSR == 2 is coded as the event; records with non-positive or
# missing time are removed.
trial <- AnalysisData3 %>%
  filter(AVAL > 0) %>%
  transmute(time_year = AVAL, event = as.numeric(CNSR == 2 | CNSR == 0))

# Kaplan-Meier estimate for the composite endpoint, no covariates
S_met <- survfit(Surv(trial$time_year, trial$event) ~ 1)

plot(
  S_met,
  main = "Kaplan-Meier Curve: Time to Death/Liver Metastases",
  xlab = "Time (Years)",
  ylab = "Survival Probability",
  xlim = c(0, 5)
)

In [None]:
# Kaplan-Meier estimates at times 1 to 5 (years on the time_year scale) for the current S_met.
summary(S_met, times = c(1, 2, 3, 4, 5))

In [None]:
# Median of DiffPanDthDays in AnalysisData3, ignoring missing values.
# NOTE(review): this was labelled "median Time to Liver metastases", but the
# column name suggests days from pancreatic diagnosis to death — confirm
# which quantity is intended.
median(AnalysisData3$DiffPanDthDays,na.rm=TRUE)
#plot(trial$time_year)

In [None]:
# Count subjects whose PanDt is on/before LivDt, or whose LivDt is missing.
# NOTE(review): `data` is whichever data frame was last assigned to that name,
# and the two dates are compared as stored (no as.Date()) — confirm both.
cat("Subjects with PanDt before Livdt or LivDt missing are considered:",sum(data$PanDt <= data$LivDt | is.na(data$LivDt), na.rm=TRUE))

In [None]:
# Preview data2 (second argument 20 — presumably the row limit).
# NOTE(review): data2 is not created in this part of the notebook; it must exist before this cell runs.
display_df(data2,20)

In [None]:
# Time-to-liver-metastases inputs taken from data2:
# time_year is AVAL; event is 1 when LivDtFl is "Y", 0 otherwise
# (NA where the flag is missing).
time_year <- data2$AVAL
event <- as.numeric(data2$LivDtFl == "Y")

In [None]:
# Check for negative values in AVAL. which() drops NA comparisons, so a
# missing AVAL does not add an all-NA row to the preview.
display_df(data2[which(data2$AVAL < 0), ], 20)

In [None]:
# Distribution of the analysis time, then a TRUE/FALSE tally of negative values.
summary(time_year)
table(time_year < 0)  # Check if there are negative values

In [None]:
# Worked example on `lung` (presumably the demo dataset shipped with the
# survival package), not on study data: a Kaplan-Meier plot with confidence
# band and risk table, the estimate at 365.25 days, and a formatted 1-year
# survival table.
# NOTE(review): survfit2(), ggsurvfit(), add_confidence_interval(),
# add_risktable() and tbl_survfit() are not provided by the libraries loaded
# at the top of this notebook (presumably ggsurvfit and gtsummary) — confirm
# they are attached before running.
survfit2(Surv(time, status) ~ 1, data = lung) %>% 
  ggsurvfit() +
  labs(
    x = "Days",
    y = "Overall survival probability"
    ) + 
  add_confidence_interval() +
  add_risktable()

  # Survival estimate at one year (365.25 days)
  summary(survfit(Surv(time, status) ~ 1, data = lung), times = 365.25)

  # Same one-year estimate as a formatted table
  survfit(Surv(time, status) ~ 1, data = lung) %>% 
  tbl_survfit(
    times = 365.25,
    label_header = "**1-year survival (95% CI)**"
  )

In [None]:
# Kaplan-Meier on data2: time is DiffPanDthDays converted to years and the
# event is LivDtFl == "Y". Records with non-positive or missing time are
# removed.
# NOTE(review): DiffPanDthDays looks like days from pancreatic diagnosis to
# death, while the plot is titled "Time to Liver Metastases" — confirm the
# time variable.
trial1 <- data2 %>%
  mutate(
    time_year = as.numeric(DiffPanDthDays / 365.25),
    event = ifelse(LivDtFl == "Y", 1, 0)
  ) %>%
  filter(time_year > 0) %>%
  select(time_year, event)

# Kaplan-Meier estimate, no covariates
S_met <- survfit(Surv(trial1$time_year, trial1$event) ~ 1)
summary(S_met, times = c(1, 2, 3, 4, 5))
plot(
  S_met,
  main = "Kaplan-Meier Curve: Time to Liver Metastases",
  xlab = "Time (Years)",
  ylab = "Survival Probability"
)

In [None]:
# Row count of `trial`.
# NOTE(review): the preceding cell builds `trial1`; `trial` comes from an earlier cell — confirm which one is meant.
nrow(trial)

Check the possible reasons for the low metastases count

In [None]:
# Same selection as the step-5 analysis set, applied to pan_liv_final4_3:
# >= 3 diagnosis encounters, >= 1 outpatient diagnosis encounter, and the
# pancreatic encounter type is not "Emergency" (missing types are kept).
Temp_AnalysisData2 <- local({
  src <- pan_liv_final4_3
  dx_ok <- !is.na(src$diagEnc90) & src$diagEnc90 >= 3
  outpt_ok <- !is.na(src$diagEnc90OutTemp) & src$diagEnc90OutTemp >= 1
  is_emergency <- src$PanEncounterType %in% "Emergency"
  src[dx_ok & outpt_ok & !is_emergency, ]
})
nrow(Temp_AnalysisData2)

In [None]:
# First 10 rows of Temp_AnalysisData2.
head(Temp_AnalysisData2,10)

In [None]:
# Time-to-event analysis data: keep subjects with no liver metastases date
# (censored) or with LivDtFl30 == "Y" (liver metastases reasonably after the
# pancreatic diagnosis, the event).
# `%in% "Y"` is FALSE for a missing flag; with `==`, an NA flag on a row that
# has a LivDt would yield an NA row index and insert an all-NA row.
# NOTE(review): this filter does not remove subjects who died on or before
# the pancreatic cancer date; that has to happen elsewhere if required.
Temp_AnalysisData3 <- Temp_AnalysisData2[
  Temp_AnalysisData2$LivDtFl30 %in% "Y" | is.na(Temp_AnalysisData2$LivDt),
]
nrow(Temp_AnalysisData3)

In [None]:
# Sex breakdown of Temp_AnalysisData3 and of liver-metastases / any-metastases
# incidence. All counts ignore rows where the condition is NA.
data <- Temp_AnalysisData3

local({
  is_female <- data$Sex == "Female"
  is_male <- data$Sex == "Male"
  pan_dt <- as.Date(data$PanDt)
  liv_on_after <- as.Date(data$LivDt) >= pan_dt
  met_on_after <- as.Date(data$AllMetDt) >= pan_dt
  met_after_30 <- as.Date(data$AllMetDt) > pan_dt + 30

  # Number of male and female subjects. na.rm = TRUE keeps the counts usable
  # when Sex is missing for some subjects (otherwise the sum is NA).
  cat("Female:", sum(is_female, na.rm = TRUE), "\n")
  cat("Male:", sum(is_male, na.rm = TRUE), "\n")

  # Liver metastases on or after PanDt, overall and by sex
  cat("Analysis Data 3 Total LM:", sum(liv_on_after, na.rm = TRUE), "\n")
  cat("LM in F:", sum(is_female & liv_on_after, na.rm = TRUE), "\n")
  cat("LM in M:", sum(is_male & liv_on_after, na.rm = TRUE), "\n")

  # Any metastases on or after PanDt, overall and by sex
  cat("Analysis Data 3 Total Metastases:", sum(met_on_after, na.rm = TRUE), "\n")
  cat("Metastases in Female:", sum(is_female & met_on_after, na.rm = TRUE), "\n")
  cat("Metastases in Male:", sum(is_male & met_on_after, na.rm = TRUE), "\n")

  # Any metastases more than 30 days after PanDt, overall and by sex
  cat("Analysis Data 3 Total Metastases after 30 d of PanDt:", sum(met_after_30, na.rm = TRUE), "\n")
  cat("Metastases in Female:", sum(is_female & met_after_30, na.rm = TRUE), "\n")
  cat("Metastases in Male:", sum(is_male & met_after_30, na.rm = TRUE), "\n")
})

In [None]:
# Median follow-up time (AVAL) in Temp_AnalysisData3, ignoring missing values.
# The value is computed directly from the column, so the `data` data frame is
# not overwritten as a side effect.
median(Temp_AnalysisData3$AVAL, na.rm = TRUE)

In [None]:
# Overall survival vs time-to-event survival: curve for the restricted
# ("Localized") set Temp_AnalysisData3.
# NOTE(review): the event is CNSR == 2 or CNSR == 0 (a composite), while the
# plot title says "Time to Death" — confirm the intended event definition.
# Records with non-positive or missing time are removed.
trial1 <- Temp_AnalysisData3 %>%
  filter(AVAL > 0) %>%
  transmute(time_year = AVAL, event = as.numeric(CNSR == 2 | CNSR == 0))

# Kaplan-Meier estimate, no covariates
S_met <- survfit(Surv(trial1$time_year, trial1$event) ~ 1)

plot(
  S_met,
  main = "Localized: Time to Death",
  xlab = "Time (Years)",
  ylab = "Survival Probability",
  xlim = c(0, 5)
)



In [None]:
# Same curve for the full pan_liv_final4_3 set ("Overall").
# NOTE(review): the event is CNSR == 2 or CNSR == 0 (a composite), while the
# plot title says "Time to Death" — confirm the intended event definition.
# Records with non-positive or missing time are removed.
trial <- pan_liv_final4_3 %>%
  filter(AVAL > 0) %>%
  transmute(time_year = AVAL, event = as.numeric(CNSR == 2 | CNSR == 0))

# Kaplan-Meier estimate, no covariates
S_met <- survfit(Surv(trial$time_year, trial$event) ~ 1)

plot(
  S_met,
  main = "Overall: Time to Death",
  xlab = "Time (Years)",
  ylab = "Survival Probability",
  xlim = c(0, 5)
)

In [None]:
# Kaplan-Meier estimates at times 1 to 5 (years on the time_year scale) for the current S_met.
summary(S_met, times = c(1, 2, 3, 4, 5))

In [None]:
# Distribution of days from pancreatic diagnosis to death (DiffPanDthDays),
# grouped into time ranges.

# Bin edges; intervals are right-closed, so the first bin is (-Inf, 0] and
# therefore also collects any negative day counts under the "0 Days" label.
bins <- c(-Inf, 0, 1, 7, 30, 90, 360, 722, Inf)

# One label per interval, in the same order as the edges
labels <- c(
  "0 Days", "1 Day", "2-7 Days", "8-30 Days",
  "31-90 Days", "91-360 Days", "361-722 Days", "723+ Days"
)

# Assign each subject to a bin (missing day counts stay NA)
days_bins <- cut(
  Temp_AnalysisData2$DiffPanDthDays,
  breaks = bins,
  labels = labels,
  right = TRUE
)

# Frequency per bin, printed as a table
freq_table <- table(days_bins)
print(freq_table)

# Bar chart of the frequencies
library(ggplot2)
ggplot(data = as.data.frame(freq_table), aes(x = days_bins, y = Freq)) +
  geom_col(fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Histogram of Number of Days of Death After Diagnosis",
    x = "Time Ranges",
    y = "Frequency"
  )


In [None]:
# End the notebook session (run last; presumably releases the session's resources).
stop_session()