# Initialize Truveta SDK

In [1]:
# These are some commonly used R Packages.  
# The arrow package makes loading data with spark faster. 
library(readr, warn.conflicts = FALSE)
library(arrow, warn.conflicts = FALSE)
library(magrittr, warn.conflicts = FALSE)
library(stringr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(rlang, warn.conflicts = FALSE)
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
library(tidyr, warn.conflicts = FALSE)
library(truveta.notebook.study)
library(sparklyr)
library(ggplot2)
library(reshape2)
library(survival)
library(dplyr)
library(tidyr)
library(knitr)

In [2]:
# Open a connection, resolve the study and the study population, and select
# the population's latest snapshot. `con`, `study` and `snapshot` are reused
# by the cells below.
print("load snapshot")
con <- create_connection()
study <- get_study(con)
#print(study)
# The population id is hard-coded for this study.
population <- get_population(con, id='p-an3klyy7kz2u7hq3fngqv7v63i')
snapshot <- get_latest_snapshot(con, population)
# Display the snapshot that was selected.
snapshot

In [3]:
# get list of tables from the snapshot
tables <- get_tables(con, snapshot)

In [None]:
tables

In [4]:
# Get the study's output (working) directory.
# Use fs = TRUE when reading and writing files locally.
output_path_local <- get_output_path(con, study, fs = TRUE)
# Display the resolved path.
output_path_local

In [31]:
# Load the saved ADSL dataset from the output directory into an R data frame.
t1 <- paste0(output_path_local, "/ADSL.csv")
ADSL <- read.csv(t1)

In [65]:
# Save FinalPatient as a CSV (no row names) in the output directory.
file_to_write <- paste0(output_path_local, "/FinalPatient.csv")
write.csv(FinalPatient, file_to_write, row.names = FALSE)

In [12]:
# Convert the R data frame to a Spark DataFrame and register it as the temp
# view "ADSL_sql" so that SQL cells can select from it.
# NOTE(review): as.DataFrame()/createOrReplaceTempView() look like SparkR
# functions, but only sparklyr is attached at the top of this notebook —
# confirm SparkR is available in the session.
ADSL_sql <- as.DataFrame(ADSL)
createOrReplaceTempView(ADSL_sql, "ADSL_sql") 

In [None]:
# Check the source concept codes: count ConditionCodeConceptMap rows per
# SourceConceptId.
sql <- "
    SELECT SourceConceptId, count (*) as cnt FROM ConditionCodeConceptMap
    group by SourceConceptId
    "
source_id_cond_tbl <- load_sql_table(con, snapshot,sql)
#display_df(source_id_cond_tbl)

In [None]:
# All encounters, labelled with a derived EncounterType.
# The CASE branches are evaluated in order, so an encounter that matches
# several Type/Class combinations gets the first matching label (Emergency,
# then Inpatient, LabImaging, Outpatient, Virtual); everything else is 'Other'.
# Rows with ClassConceptId 1067555 or with a StatusConceptId in the excluded
# list are dropped. The result is requested under the view name 'encounterF',
# which later SQL cells join against.
# NOTE(review): 1065310 appears twice in the LabImaging TypeConceptId list —
# confirm whether a different concept id was intended.
sql <-
"
 SELECT *,
    CASE 
	    WHEN (TypeConceptId in (3059272,1065290,1065342) and ClassConceptId in (1065217, 1065225)) then 'Emergency'
      WHEN (TypeConceptId in (3059272,1065297,3059289,1065290,1065342,1065307) and ClassConceptId in (1065215,1065220)) then 'Inpatient'
      WHEN (TypeConceptId in (1065310,3059277,1065310,1065286) and ClassConceptId in (2649591,1065216,1067561,1065227,1065226,1065220,0,1065217,1065225)) then 'LabImaging'
      WHEN (TypeConceptId in (3059271,3059272,2649591,1067557,1065280,1065333,1065318,1065323,1065342,1065330,1065337) and ClassConceptId in (1065216,1065227,1065225)) then 'Outpatient'
      WHEN (TypeConceptId in (3059263,3059301,1067555,3059265,3059272,2649591,3059264) and ClassConceptId in (1065230,1065216,0)) then 'Virtual'
      else 'Other'
    end as EncounterType
  FROM Encounter 
  WHERE ClassConceptId !=1067555 
        AND StatusConceptId NOT IN (2506591,2983200,2983199,2506590)
"
encounterF <- load_sql_table(con, snapshot, sql, view_name='encounterF',output_mode = "sparklyr")

In [14]:
# Build the MCI codeset from ICD-10-CM, SNOMED CT and ICD-9-CM (each requested
# with 'selfAndDescendants') and stack the three results into one table.
mci_icd10_codes <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "G31.84")

mci_snomed_codes <- codeset(
  con, snapshot, "SNOMED CT", "selfAndDescendants",
  "386805003", "836301008"
)

mci_icd9_codes <- codeset(con, snapshot, "ICD9CM", "selfAndDescendants", "331.83")

mci_codes <- rbind(mci_icd10_codes, mci_snomed_codes, mci_icd9_codes)

# Expose the combined codeset to SQL as the view 'mci_codes'.
create_view(mci_codes, "mci_codes")

In [None]:
# MCI patients: one row per person describing the earliest MCI diagnosis.
#   tb1 - distinct condition rows whose code is in the mci_codes view,
#         filtered on the listed source, verification-status and
#         clinical-status concept ids, and requiring at least one of
#         OnsetDateTime / RecordedDateTime.
#   tb2 - number of tb1 rows per person (MCITotalDiagC).
#   tb3 - tb1 inner-joined to encounterF and ranked per person by
#         COALESCE(RecordedDateTime, OnsetDateTime); MCIRecordedDtFl is 'Y'
#         when RecordedDateTime was available.
# The final SELECT keeps RowNum = 1 and attaches MCITotalDiagC.
# NOTE(review): MCITotalDiagC is counted before the CategoryConceptId
# exclusion and the encounterF inner join applied in tb3, so it can include
# records that are not eligible to be the index diagnosis — confirm intent.
# NOTE(review): rows that tie on the ordering timestamp make the RowNum = 1
# pick arbitrary.
sql1 <- "

WITH tb1 as 
(
    SELECT DISTINCT c.PersonId, c.EncounterId, c.RecordedDateTime, c.OnsetDateTime, c.PrimaryDiagnosisConceptId, c.CategoryConceptId
    FROM condition c
     INNER JOIN ConditionCodeConceptMap ccm 
      ON c.CodeConceptMapId = ccm.Id
     INNER JOIN mci_codes cc 
      ON ccm.CodeConceptId = cc.ConceptId
     WHERE ccm.SourceConceptId IN (2703595, 2703594)
      AND (c.OnsetDateTime IS NOT NULL OR c.RecordedDateTime IS NOT NULL) 
      AND c.VerificationStatusConceptId NOT IN (1065198,1065195)
      AND c.ClinicalStatusConceptId NOT IN (1065184,1065178,1065183)
),

-- count the total number of MCI diagnosis records for each PersonId 
tb2 as
(
    SELECT PersonId, count(*) as MCITotalDiagC
    FROM tb1
    GROUP BY PersonId
),

tb3 as 
(
 SELECT 
  c.PersonId, c.EncounterId as MCIEncounterId, c.PrimaryDiagnosisConceptId as MCIPDId,
  COALESCE(RecordedDateTime,OnsetDateTime) as MCIDT,
  CASE
    WHEN RecordedDateTime IS NOT NULL THEN 'Y'
      ELSE 'N'
    END AS MCIRecordedDtFl,
  ROW_NUMBER() OVER (PARTITION BY c.PersonId ORDER BY COALESCE(RecordedDateTime,OnsetDateTime)) AS RowNum,
  e.ClassConceptId as MCIEncClassId, 
  e.TypeConceptId as MCIEncTypeId, 
  e.EncounterType as MCIEncounterType
  FROM tb1 c
     INNER JOIN encounterF e ON e.Id = c.EncounterId
  WHERE c.CategoryConceptId NOT IN (1065172,1065174) -- Remove Medical History/ Problem List
)  
    
 SELECT t3.*, t2.MCITotalDiagC
   FROM tb3 t3
   JOIN tb2 t2 ON t3.PersonId = t2.PersonId
   WHERE t3.RowNum = 1
"
# Run the query and request the result under the view name 'MCI'.
MCI <- load_sql_table(con, snapshot, sql1, view_name='MCI',output_mode = "sparklyr")

In [None]:
%%sql
SELECT count(*) FROM MCI;

In [23]:
# AD condition codes: ICD-10-CM, ICD-9-CM and SNOMED CT codesets (each
# requested with 'selfAndDescendants'), stacked into one table.
ad_icd10_codes <- codeset(con, snapshot, "ICD10CM", "selfAndDescendants", "G30")

ad_icd9_codes <- codeset(con, snapshot, "ICD9CM", "selfAndDescendants", "331.0")

ad_SNOMED_codes <- codeset(con, snapshot, "SNOMED CT", "selfAndDescendants", "26929004")

AD_codes <- rbind(ad_icd10_codes, ad_icd9_codes, ad_SNOMED_codes)

# Expose the combined codeset to SQL as the view 'AD_codes'.
create_view(AD_codes, "AD_codes")

In [None]:
# AD patients: one row per person describing the earliest AD diagnosis.
#   tb1 - distinct condition rows whose code is in the AD_codes view, with the
#         same source / verification-status / clinical-status filters as the
#         MCI query and at least one of OnsetDateTime / RecordedDateTime.
#   tb2 - number of tb1 rows per person (ADTotalDiagC).
#   tb3 - tb1 inner-joined to encounterF and to concept (for the category
#         name), ranked per person by COALESCE(RecordedDateTime, OnsetDateTime).
# The final SELECT keeps RowNum = 1 and attaches ADTotalDiagC.
# NOTE(review): the INNER JOIN to concept on CategoryConceptId drops condition
# rows whose CategoryConceptId has no match (including NULL); those rows are
# still counted in ADTotalDiagC — confirm a LEFT JOIN was not intended.
sql2 <- "

WITH tb1 as 
(
    SELECT DISTINCT c.PersonId, c.EncounterId, c.RecordedDateTime, c.OnsetDateTime, c.PrimaryDiagnosisConceptId, c.CategoryConceptId
    FROM condition c
     INNER JOIN ConditionCodeConceptMap ccm 
      ON c.CodeConceptMapId = ccm.Id
     INNER JOIN AD_codes cc 
      ON ccm.CodeConceptId = cc.ConceptId
     WHERE ccm.SourceConceptId IN (2703595, 2703594)
      AND (c.OnsetDateTime IS NOT NULL OR c.RecordedDateTime IS NOT NULL) 
      AND c.VerificationStatusConceptId NOT IN (1065198,1065195)
      AND c.ClinicalStatusConceptId NOT IN (1065184,1065178,1065183)
),

-- count the total number of AD diagnosis records for each PersonId 
tb2 as
(
    SELECT PersonId, count(*) as ADTotalDiagC
    FROM tb1
    GROUP BY PersonId
),

tb3 as 
(
 SELECT 
  c.PersonId, c.EncounterId as ADEncounterId, c.PrimaryDiagnosisConceptId as ADPDId, c1.ConceptName as ADCondCategory,
  COALESCE(RecordedDateTime,OnsetDateTime) as ADDT,
  CASE
    WHEN RecordedDateTime IS NOT NULL THEN 'Y'
      ELSE 'N'
    END AS ADRecordedDtFl,
  ROW_NUMBER() OVER (PARTITION BY c.PersonId ORDER BY COALESCE(RecordedDateTime,OnsetDateTime)) AS RowNum,
  e.ClassConceptId as ADEncClassId, 
  e.TypeConceptId as ADEncTypeId, 
  e.EncounterType as ADEncounterType
  FROM tb1 c
     INNER JOIN encounterF e ON e.Id = c.EncounterId
     INNER JOIN concept c1 ON c.CategoryConceptId = c1.ConceptId
)
    
 SELECT t3.*, t2.ADTotalDiagC
   FROM tb3 t3
   JOIN tb2 t2 ON t3.PersonId = t2.PersonId
   WHERE t3.RowNum = 1
"
# Run the query and request the result under the view name 'AD'.
AD <- load_sql_table(con, snapshot, sql2, view_name='AD',output_mode = "sparklyr")

In [None]:
%%sql
SELECT count(*) FROM AD;

In [None]:
# Diagnosis encounters around the MCI index date.
# Step 1 (all_cond): all condition rows passing the source / verification /
#                    clinical-status filters.
# Step 2 (all_enc):  encounters from encounterF whose Id appears in all_cond
#                    and that have a StartDateTime.
# The remaining CTEs count distinct encounter start times per person relative
# to MCIDT, and EncDates gives each person's first/last encounter of any kind.
# Every count is LEFT JOINed onto MCI, so a person with no qualifying
# encounter gets NULL (not 0) for that count.
#
# The one-year look-back uses 365 days so that it agrees with the 365-day
# history requirement applied when the cohort is filtered (it previously used
# 360 days).
# NOTE(review): diagEnc90 / diagEnc30 (and the *Out variants) count encounters
# that started MORE than 90 / 30 days before MCIDT, not encounters inside the
# 90 / 30-day window — confirm which definition is intended.

sql3 <-
"
WITH all_cond as 
(
    SELECT DISTINCT c.Personid, c.EncounterId, ccm.CodeConceptId
    FROM condition c 
     INNER JOIN ConditionCodeConceptMap ccm 
      ON c.CodeConceptMapId = ccm.Id
     WHERE ccm.SourceConceptId IN (2703595, 2703594)
      AND c.VerificationStatusConceptId NOT IN (1065198,1065195)
      AND c.ClinicalStatusConceptId NOT IN (1065184,1065178,1065183)
),

all_enc as 
(
 SELECT 
    c.PersonId, c.EncounterId, e.StartDateTime as DiagEncStart, e.EncounterType
 FROM encounterF e
  INNER JOIN all_cond c
   ON e.Id = c.EncounterId
 WHERE e.StartDateTime IS NOT NULL 
),

-- Number of diagnosis encounters before MCI diagnosis, and the earliest one
diagencbefmci as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagencbefmci, min(e.DiagEncStart) as FirstDiagEncDt
    FROM MCI p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < p.MCIDT
    GROUP BY e.PersonId
),

-- Number of Diagnosis encounter 90 days before MCI diagnosis
diagenc90 as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagEnc90
    FROM MCI p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < DATE_SUB(p.MCIDt,90)
    GROUP BY e.PersonId
),

-- Number of Diagnosis encounter 30 days before MCI diagnosis
diagenc30 as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagEnc30
    FROM MCI p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < DATE_SUB(p.MCIDT,30)
    GROUP BY e.PersonId
),

-- Number of Outpatients Diagnosis encounter 90 days before MCI diagnosis
diagenc90Out as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagEnc90Out
    FROM MCI p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < DATE_SUB(p.MCIDT,90) AND e.EncounterType = 'Outpatient'
    GROUP BY e.PersonId
),

-- Number of Outpatient Diagnosis encounter 30 days before MCI diagnosis
diagenc30Out as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagEnc30Out
    FROM MCI p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart < DATE_SUB(p.MCIDT,30) AND e.EncounterType = 'Outpatient'
    GROUP BY e.PersonId
),

-- Number of diagnosis encounters after MCI diagnosis, and the latest one
diagencaftrmci as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagencaftrmci, max(e.DiagEncStart) as LastDiagEncDt
    FROM MCI p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart > p.MCIDT
    GROUP BY e.PersonId
),

-- Number of diagnosis encounter 1 year (365 days) before MCI diagnosis

diagnenc1year as
(
    SELECT e.PersonId, count(distinct e.DiagEncStart) as diagnenc1year
    FROM MCI p
    INNER JOIN all_enc e
    ON p.PersonId = e.PersonId
    WHERE e.DiagEncStart >= DATE_SUB(p.MCIDT,365) AND e.DiagEncStart < p.MCIDT
    GROUP BY e.PersonId
),

-- First and last encounter of any type per person
EncDates as
(
    SELECT PersonId, min(StartDateTime) as FirstEncDt, max(StartDateTime) as LastEncDt
    FROM EncounterF e
    GROUP BY PersonId
)

SELECT p.*, h.FirstDiagEncDt, f.LastDiagEncDt, e.FirstEncDt, e.LastEncDt, h.diagencbefmci, f.diagencaftrmci, g.diagnenc1year, a.diagEnc90, b.diagEnc30, c.diagEnc90Out, d.diagEnc30Out   
FROM MCI p
LEFT JOIN diagencbefmci h
      ON p.PersonId=h.PersonId
LEFT JOIN diagenc90 a 
     ON p.PersonId=a.PersonId
LEFT JOIN diagenc30 b 
     ON p.PersonId=b.PersonId
LEFT JOIN diagenc90Out c 
     ON p.PersonId=c.PersonId
LEFT JOIN diagenc30Out d 
     ON p.PersonId=d.PersonId
LEFT JOIN EncDates e 
     ON p.PersonId=e.PersonId
LEFT JOIN diagencaftrmci f 
     ON p.PersonId=f.PersonId
LEFT JOIN diagnenc1year g
     ON p.PersonId=g.PersonId
"
MCIDiagEnc <- load_sql_table(con, snapshot, sql3, view_name='MCIDiagEnc',output_mode = "sparklyr")

In [None]:
# Get the death results: one row per person in PersonDeathFact with the
# earliest COALESCE(DeathDateTime, RecordedDateTime) as DTHDT and a constant
# DTHFL = 'Y'. Persons absent from PersonDeathFact do not appear here at all.

sql6 <- "
   SELECT 
    PersonId, 
    min(COALESCE(DeathDateTime, RecordedDateTime)) AS DTHDT,
    'Y' AS DTHFL
   FROM 
    PersonDeathFact d
   GROUP BY 
    PersonId
"
death_tb <- load_sql_table(con,snapshot,sql6, view_name='death_tb', output_mode = "sparklyr")

In [None]:
#Count total subjects
%%sql
SELECT COUNT(DISTINCT PersonId)
FROM death_tb

#### Dementia

In [11]:
# Dementia codes: load the "dementia" variable from the /definitions/dementia
# definition and expose it to SQL as the view 'othdementia'.
othdementia <- codeset_from_prose(
  con, snapshot,
  url = "/definitions/dementia",
  variable_name = "dementia"
)
create_view(othdementia, "othdementia")

In [13]:
# Other-dementia patients: one row per person with the earliest dementia
# diagnosis date, the number of dementia diagnosis records, and a flag saying
# whether any of those records was a primary diagnosis.
#   tb1 - distinct condition rows whose code is in the othdementia view, with
#         PrimaryDiagnosisConceptId scored 1 (1200406), 0 (1200405), else NULL.
#   tb2 - per person: record count and sum of the primary-diagnosis score.
#   tb3 - per person ranking by COALESCE(RecordedDateTime, OnsetDateTime).
# OthDemPrimDiagFl is 'Y' when the summed score is >= 1. It used to test
# "= 1", which returned NULL for anyone with two or more primary dementia
# diagnoses and disagreed with the AD / MCI primary-diagnosis flags, which use
# "!= 0". The flag stays NULL when no record carried a Yes/No value.
# The result is collected into a local data frame.
sql7 <- "
WITH tb1 as 
(
    SELECT DISTINCT c.PersonId, c.EncounterId, c.RecordedDateTime, c.OnsetDateTime, c.CategoryConceptId, cc.ConceptCode as OthDementiaCode,
    cc.ConceptName as OthDementiaName,
    CASE
        WHEN c.PrimaryDiagnosisConceptId = 1200406 THEN 1  -- “Yes”
        WHEN c.PrimaryDiagnosisConceptId = 1200405 THEN 0  -- “No”
        ELSE NULL
      END AS PrimDiagnosisNum
    FROM condition c
     INNER JOIN ConditionCodeConceptMap ccm 
      ON c.CodeConceptMapId = ccm.Id
     INNER JOIN othdementia cc 
      ON ccm.CodeConceptId = cc.ConceptId
      WHERE ccm.SourceConceptId IN (2703595, 2703594)
      AND (c.OnsetDateTime IS NOT NULL OR c.RecordedDateTime IS NOT NULL) 
      AND c.VerificationStatusConceptId NOT IN (1065198,1065195)
      AND c.ClinicalStatusConceptId NOT IN (1065184,1065178,1065183)
),

-- count the total number of Other Dementia diagnosis records for each PersonId 
tb2 as
(
    SELECT PersonId, count(*) as DementiaTotalDiagC, sum(PrimDiagnosisNum) as  OthDemPrimDiagnosisSum 
    FROM tb1
    GROUP BY PersonId
),

tb3 as 
(
 SELECT 
  c.PersonId, c.EncounterId as DementiaEncounterId, 
  COALESCE(RecordedDateTime,OnsetDateTime) as OthDementiaDt,
  CASE
    WHEN RecordedDateTime IS NOT NULL THEN 'Y'
      ELSE 'N'
    END AS OthDementiaRecordedDtFl,
  ROW_NUMBER() OVER (PARTITION BY c.PersonId ORDER BY COALESCE(RecordedDateTime,OnsetDateTime)) AS RowNum
  FROM tb1 c
)  
    
 SELECT t3.*, t2.DementiaTotalDiagC, 
    CASE
      WHEN t2.OthDemPrimDiagnosisSum >= 1 THEN 'Y'
      WHEN t2.OthDemPrimDiagnosisSum = 0 THEN 'N'
      ELSE NULL                                    
    END AS OthDemPrimDiagFl
   FROM tb3 t3
   JOIN tb2 t2 ON t3.PersonId = t2.PersonId
   WHERE t3.RowNum = 1
"
OthDementia <- load_sql_table(con, snapshot, sql7, view_name='OthDementia',output_mode = "sparklyr") %>% collect()


In [14]:
summary(OthDementia)

In [15]:
# Merge all MCI patients with dementia: the left join keeps every ADSL1 row
# and adds the OthDementia columns (minus the helper RowNum) by PersonId.
ADSL2 <- ADSL1 %>%
  left_join(OthDementia %>% select(-RowNum), by = "PersonId")
# Row count after the join.
nrow(ADSL2)

### Additional Updates

In [None]:
# Derive age at the time of the MCI diagnosis and bucket it into age groups.
# Age is expressed in completed years: the day difference between MCIDT and
# BirthDateTime is divided by 365.25 and truncated with floor(). Rounding to
# the nearest integer would report a 64.6-year-old as 65 and let them pass an
# "AgeAtDiagnosis >= 65" inclusion filter.
# Rows with a missing age keep a missing Age_Group.
ADSL1 <- ADSL1 %>%
  mutate(
    AgeAtDiagnosis = floor(
      as.numeric(as.Date(MCIDT) - as.Date(BirthDateTime)) / 365.25
    ),
    Age_Group = case_when(
      is.na(AgeAtDiagnosis) ~ NA_character_,
      AgeAtDiagnosis < 65 ~ "Age < 65",
      AgeAtDiagnosis < 75 ~ "Age 65-74",
      AgeAtDiagnosis < 85 ~ "Age 75-84",
      TRUE ~ "Age 85+"
    )
  )

In [None]:
# Create a separate Region variable.
# StateOrProvince is mapped to the four US Census regions; anything not listed
# (territories, missing or unrecognised values) becomes "Unknown".
# "District of Columbia" is listed under South, where the Census Bureau places
# it; it previously fell through to "Unknown".
# NOTE(review): assumes the source spells it "District of Columbia", like the
# fully spelled-out state names — confirm against the data.

# Define region mapping
ADSL1 <- ADSL1 %>%
  mutate(Region = as.factor(case_when(
    StateOrProvince %in% c("Connecticut", "Maine", "Massachusetts", "New Hampshire", 
                           "Rhode Island", "Vermont", "New Jersey", "New York", "Pennsylvania") ~ "Northeast",
    StateOrProvince %in% c("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin", 
                           "Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", 
                           "North Dakota", "South Dakota") ~ "Midwest",
    StateOrProvince %in% c("Delaware", "District of Columbia", "Florida", "Georgia", "Maryland",
                           "North Carolina", "South Carolina", "Virginia", "West Virginia",
                           "Alabama", "Kentucky", "Mississippi", "Tennessee",
                           "Arkansas", "Louisiana", "Oklahoma", "Texas") ~ "South",
    StateOrProvince %in% c("Arizona", "Colorado", "Idaho", "Montana", "Nevada", 
                           "New Mexico", "Utah", "Wyoming", "Alaska", "California", 
                           "Hawaii", "Oregon", "Washington") ~ "West",
    TRUE ~ "Unknown" # Default case for states not listed
  )))

In [12]:
# Derive the time-to-event variables on ADSL:
#   AdFlBase     - "Y" when the AD date falls within 30 days on or after MCIDT.
#   CNSR         - 1 = AD diagnosis present, 2 = death flagged (no AD),
#                  0 = neither (censored).
#   ADT          - analysis date: AD date, death date, or the earlier of the
#                  last encounter date and the 2025-05-29 cut-off.
#   AVAL         - years from MCIDT to ADT (inclusive of the start day).
#   ADTotalDiagC - missing counts become 0.
ADSL1 <- ADSL %>%
  mutate(
    AdFlBase = ifelse(
      !is.na(ADDT) & as.Date(MCIDT) <= as.Date(ADDT) & as.Date(ADDT) <= as.Date(MCIDT) + 30,
      "Y", "N"
    ),

    CNSR = ifelse(
      !is.na(ADDT), 1, 
      ifelse(
        !is.na(DTHFL) & DTHFL == "Y", 2, 
        0 )), # Event/censoring indicator (AD takes precedence over death)

    ADT = case_when(
      CNSR == 1 ~ as.Date(ADDT),
      CNSR == 2 ~ as.Date(DTHDT),
      # na.rm = TRUE: a missing LastEncDt falls back to the cut-off date
      TRUE ~ pmin(as.Date(LastEncDt), as.Date("2025-05-29"), na.rm = TRUE)
    ),  # Analysis Date

    # Time from the start date (MCIDT) to the analysis date, in years
    AVAL = round(as.numeric(ADT - as.Date(MCIDT) + 1) / 365.25, 2),

    ADTotalDiagC = ifelse(is.na(ADTotalDiagC), 0, ADTotalDiagC)
  )

  # TTA = ifelse(is.na(ADDT), NA, round(as.numeric(as.Date(ADDT) - as.Date(MCIDT) + 1) / 365.25, 2)),
  #STUDYENDT = coalesce(pmin(as.Date(LastEncDt), as.Date("2025-05-29"), na.rm = TRUE),as.Date("2025-05-29")),
  #FollowupTm = round(as.numeric((STUDYENDT - as.Date(MCIDT)) + 1) / 30.44, 2)

In [19]:
# Data filter: apply the cohort inclusion/exclusion criteria to ADSL.
# filter() drops rows where a condition evaluates to NA, so a row with a
# missing value in an unguarded criterion (e.g. FirstDiagEncDt) is excluded.
# NOTE(review): ADSL2 is also assigned by the dementia-merge cell; this cell
# overwrites it starting from ADSL — confirm the intended run order.

ADSL2 <- ADSL %>% 
  filter(
    # Aged 65 or older at MCI diagnosis, with a known sex
    AgeAtDiagnosis >= 65,
    Sex != "Unknown",
    # MCI diagnosed between 2018-01-01 and 2024-12-31 (inclusive)
    as.Date(MCIDT) >= as.Date("2018-01-01"),
    as.Date(MCIDT) <= as.Date("2024-12-31"),
    # No AD diagnosis dated before the MCI diagnosis
    (is.na(ADDT) | (!is.na(ADDT) & as.Date(ADDT) >= as.Date(MCIDT))),
    # At least 365 days between the first diagnosis encounter and MCI
    round(as.numeric(as.Date(MCIDT) - as.Date(FirstDiagEncDt) + 1), 2) >= 365,
    # At least two diagnosis encounters in the year before MCI
    (!is.na(diagnenc1year) & diagnenc1year >= 2),
    # Not recorded as dead on or before the MCI date
    (is.na(DTHFL) | as.Date(DTHDT) > as.Date(MCIDT)),
    # No other-dementia diagnosis dated before the MCI diagnosis
    (is.na(OthDementiaDt) | (!is.na(OthDementiaDt) & as.Date(OthDementiaDt) >= as.Date(MCIDT)))
  )
nrow(ADSL2)

In [None]:
# Convert the filtered R data frame (ADSL2) to a Spark DataFrame and register
# it as the temp view "ADSL_sql", replacing any earlier view of that name.
# NOTE(review): as.DataFrame()/createOrReplaceTempView() look like SparkR
# functions, but only sparklyr is attached at the top of this notebook —
# confirm SparkR is available in the session.
ADSL_sql <- as.DataFrame(ADSL2)
createOrReplaceTempView(ADSL_sql, "ADSL_sql")

#### Read the approved AD drugs: 

In [12]:
# Build a per-person exposure table for one medication (or medication class).
#
# Arguments:
#   codes    Codeset table with a ConceptId column; it is registered as the
#            view "med_codes" and used to select medication code mappings.
#   med_name Prefix of the flag column in the result (<med_name>Fl). It is
#            spliced into the SQL text as a column alias, so it must be a
#            single string that is a plain identifier (letters, digits, "_").
#
# Returns a collected data frame with one row per person who has at least one
# qualifying dispense or request:
#   DSSTDT, TotNumDisRecDrug, TotalDaysSupply, DISSDAYSDRUG  - dispense summary
#   RQSTDT, TotNumReqRecDrug, REQDAYSDRUG                    - request summary
#   <med_name>Fl - 1 when there is >= 1 dispense date, or no dispense but
#                  >= 2 request dates; otherwise 0.
#
# Uses the session-level `con` and `snapshot` objects. The views "med_codes"
# and "tb" are requested under fixed names on every call.
med_flag <- function(codes = "codes", med_name = "name") {
  # Reject anything that is not a bare identifier before it reaches the SQL.
  name_ok <- is.character(med_name) &&
    length(med_name) == 1L &&
    !is.na(med_name) &&
    grepl("^[A-Za-z_][A-Za-z0-9_]*$", med_name)
  if (!name_ok) {
    stop(
      "med_name must be a single string containing only letters, digits ",
      "and underscores, and must not start with a digit",
      call. = FALSE
    )
  }

  create_view(codes, "med_codes")

  sql <- "
 
WITH map AS (
    SELECT mcm.Id, cc.*
    FROM MedicationCodeConceptMap mcm 
    JOIN Concept cc ON mcm.CodeConceptId = cc.ConceptId
    WHERE mcm.CodeConceptId IN (SELECT ConceptId FROM med_codes)
),
dis1 AS (
    SELECT DISTINCT d.PersonId, d.DispenseDateTime, d.DaysSupply
    FROM MedicationDispense d
    INNER JOIN map mcm ON mcm.Id = d.CodeConceptMapId
    WHERE StatusConceptId NOT IN (2989063, 2989065, 2989060, 2989064)
),
dis2 AS (
    SELECT PersonId, 
           MIN(DispenseDateTime) AS DSSTDT, 
           MAX(DispenseDateTime) AS DSENDT,
           COUNT(DISTINCT DispenseDateTime) AS TotNumDisRecDrug, 
           SUM(DaysSupply) AS TotalDaysSupply
    FROM dis1
    GROUP BY PersonId
),
req1 AS (
    SELECT DISTINCT r.PersonId, r.AuthoredOnDateTime
    FROM MedicationRequest r
    INNER JOIN map mcm ON mcm.Id = r.CodeConceptMapId
    WHERE StatusConceptId NOT IN (2989063, 2989065, 2989060, 2989064)
),
req2 AS (
    SELECT PersonId, 
           MIN(AuthoredOnDateTime) AS RQSTDT,
           MAX(AuthoredOnDateTime) AS RQENDT,
           COUNT(DISTINCT AuthoredOnDateTime) AS TotNumReqRecDrug
    FROM req1
    GROUP BY PersonId
)
SELECT 
    COALESCE(d.PersonId, r.PersonId) AS PersonId,
    d.DSSTDT, d.TotNumDisRecDrug, d.TotalDaysSupply, 
    CASE WHEN d.DSSTDT IS NOT NULL AND d.DSENDT IS NOT NULL 
         THEN DATEDIFF(d.DSENDT, d.DSSTDT) END AS DISSDAYSDRUG,
    r.RQSTDT, r.TotNumReqRecDrug,
    CASE WHEN r.RQSTDT IS NOT NULL AND r.RQENDT IS NOT NULL 
         THEN DATEDIFF(r.RQENDT, r.RQSTDT) END AS REQDAYSDRUG,
    CASE 
        WHEN COALESCE(d.TotNumDisRecDrug, 0) >= 1 THEN 1
        WHEN COALESCE(d.TotNumDisRecDrug, 0) = 0 AND COALESCE(r.TotNumReqRecDrug, 0) >= 2 THEN 1
        WHEN COALESCE(d.TotNumDisRecDrug, 0) = 0 AND COALESCE(r.TotNumReqRecDrug, 0) < 2 THEN 0
        ELSE NULL
    END AS %sFl
FROM dis2 d
FULL OUTER JOIN req2 r ON d.PersonId = r.PersonId

"
  # The template has exactly one %s: the flag column alias.
  sql1 <- sprintf(sql, med_name)

  tb <- load_sql_table(con,snapshot, query = sql1, view_name = 'tb', output_mode = "sparklyr" ) %>% collect()

  return(tb)

}

In [13]:
# Namzaric: load its codeset from the approved-AD-drug definition and build
# the per-person exposure table with the NamzaricFl flag.
Namzaric <- codeset_from_prose(
  con, snapshot,
  url = "/definitions/approved-ad-drug",
  variable_name = "Namzaric"
)
Namzarictbl <- med_flag(codes = Namzaric, med_name = "Namzaric")

In [23]:
# Anti-amyloid antibodies: stack the donanemab and lecanemab codesets and
# build one exposure table with the AntiamyloidFl flag.
donanemab <- codeset_from_prose(
  con, snapshot,
  url = "/definitions/approved-ad-drug",
  variable_name = "donanemab"
)
lecanemab <- codeset_from_prose(
  con, snapshot,
  url = "/definitions/approved-ad-drug",
  variable_name = "lecanemab"
)
Anemab <- rbind(donanemab, lecanemab)
AntiamyloidTbl <- med_flag(Anemab, "Antiamyloid")

In [41]:
# Cholinesterase inhibitors: stack the donepezil, rivastigmine and galantamine
# codesets and build one exposure table with the CholinesteraseFl flag.
donepezil <- codeset_from_prose(
  con, snapshot,
  url = "/definitions/approved-ad-drug",
  variable_name = "donepezil"
)
rivastigmine <- codeset_from_prose(
  con, snapshot,
  url = "/definitions/approved-ad-drug",
  variable_name = "rivastigmine"
)
galantamine <- codeset_from_prose(
  con, snapshot,
  url = "/definitions/approved-ad-drug",
  variable_name = "galantamine"
)

Cholinesterase <- rbind(donepezil, rivastigmine, galantamine)
CholinesteraseTbl <- med_flag(Cholinesterase, "Cholinesterase")

In [42]:
# Memantine: load its codeset and build the exposure table with MemantineFl.
memantine <- codeset_from_prose(
  con, snapshot,
  url = "/definitions/approved-ad-drug",
  variable_name = "memantine"
)
MemantineTbl <- med_flag(memantine, "Memantine")

In [48]:
#quality check
table(MemantineTbl$MemantineFl)

In [49]:
# Join ADSL with the approved-drug exposure flags (one flag column per drug
# class); people without a row in a drug table get 0 for that flag.
#
# The anti-amyloid table is AntiamyloidTbl and its flag column is
# AntiamyloidFl (that is what med_flag(Anemab, "Antiamyloid") produces). The
# join previously referred to an `antiamyloid` object / `antiamyloidFl`
# column that this notebook never creates. The column is renamed to
# antiamyloidFl here because the downstream cells use that spelling.
#
# Re-running this cell on an ADSL that already has the flag columns makes
# left_join() add .x/.y suffixes, so run it once per freshly loaded ADSL.
cols_to_replace <- c("MemantineFl", "CholinesteraseFl", "NamzaricFl", "antiamyloidFl")

ADSL <- ADSL %>%
  left_join(MemantineTbl %>% select(PersonId, MemantineFl), by = "PersonId") %>%
  left_join(CholinesteraseTbl %>% select(PersonId, CholinesteraseFl), by = "PersonId") %>%
  left_join(Namzarictbl %>% select(PersonId, NamzaricFl), by = "PersonId") %>%
  left_join(
    AntiamyloidTbl %>% select(PersonId, antiamyloidFl = AntiamyloidFl),
    by = "PersonId"
  ) %>%
  mutate(across(all_of(cols_to_replace), ~ replace_na(as.integer(.), 0L)))

In [51]:
table(Cholinesterase = ADSL$CholinesteraseFl,Memantine = ADSL$MemantineFl )

In [58]:
#sum(ADSL$CholinesteraseFl == 1 & ADSL$MemantineFl == 1 & ADSL$ADTotalDiagC == 0)
#sum(ADSL$NamzaricFl == 1 & ADSL$ADTotalDiagC == 0)
sum((ADSL$CholinesteraseFl == 1 | ADSL$NamzaricFl == 1 | ADSL$MemantineFl == 1 | ADSL$antiamyloidFl == 1) & ADSL$ADTotalDiagC == 1)

In [59]:
sum(ADSL$ADTotalDiagC == 1)

In [None]:
# Temporary
# Combine both drug datasets, summarise dispensing per person, and link the
# drug records to the AD / dementia cohort.
# NOTE(review): AD_Drug_tbl and new_AD_Drug_tbl are not created anywhere in
# the visible part of this notebook — confirm where they come from.

# Keep the dispense columns for rows that have a drug name.
# (AD_Drug_tbl$DrugName is the full original column; this works only because
# select() keeps the row count and order unchanged.)
AD_Drug_tbl_temp <- AD_Drug_tbl %>% select(PersonId, DSSTDT, DSENDT, TotNumDisRec, TotDaysSupply, DrugName, DISSDAYSDRUG) %>%
                    filter(!is.na(AD_Drug_tbl$DrugName))
                    
combined_AD_Drug_tbl <- bind_rows(AD_Drug_tbl_temp, new_AD_Drug_tbl)

# Summarize per person
# DISSDAYSDRUG is the span in days between the first and last dispense date.
AD_Drug_tbl_final <- combined_AD_Drug_tbl %>%
  group_by(PersonId) %>%
  summarise(
    DSSTDT = min(DSSTDT, na.rm = TRUE),
    DSENDT = max(DSENDT, na.rm = TRUE),
    TotNumDisRec = sum(TotNumDisRec, na.rm = TRUE),
    TotDaysSupply = sum(TotDaysSupply, na.rm = TRUE),
    DISSDAYSDRUG = as.numeric(difftime(max(DSENDT, na.rm = TRUE), min(DSSTDT, na.rm = TRUE), units = "days"))
  )

# NOTE(review): the summarise() above returns only PersonId and the five
# summary columns, so DrugName and DrugName1 do not exist in
# AD_Drug_tbl_final and the two counts below cannot reflect drug exposure —
# confirm which table these checks were meant to use.
cat("Both type of drugs are exposed",sum(!is.na(AD_Drug_tbl_final$DrugName1) & !is.na(AD_Drug_tbl_final$DrugName)),"\n")
cat("Only antiamyloid exposed:",sum(!is.na(AD_Drug_tbl_final$DrugName1)),"\n")
dataset <- ADSLTemp

# Step 1: Keep only drug records where PersonId is in `dataset`
AD_Drug_tbl1 <- AD_Drug_tbl %>%
  filter(PersonId %in% dataset$PersonId)

# Step 2: Subset `dataset` to AD patients or dementia patients
# (non-missing ADDT or OthDementiaDt)
cohort <- dataset %>%
  filter(!is.na(ADDT) | !is.na(OthDementiaDt)) %>%
  select(PersonId, MCIDT, ADDT, ADEncounterId, ADEncounterType, 
         ADTotalDiagC, ADCondCategory, OthDementiaDt, DementiaTotalDiagC, OthDemPrimDiagFl)

# Step 3: Full join to get everyone with either AD/dementia or drug exposure
AD_Drug_tbl2 <- full_join(AD_Drug_tbl1, cohort, by = "PersonId")

#### Create Outcome variables

In [35]:
# Outcome coding (the first matching rule wins):
#   0 - no AD diagnosis date
#   1 - AD date present and either >= 2 AD diagnosis records, or exactly 1
#       record plus a flag for any approved AD drug class
#   2 - AD date present but neither rule above applies
ADSL <- ADSL %>% 
        mutate(Outcome = case_when(is.na(ADDT) ~ 0,
                         !is.na(ADDT) & ADTotalDiagC >=2 ~ 1,
                         !is.na(ADDT) & ADTotalDiagC ==1 & (NamzaricFl == 1 | antiamyloidFl == 1 | CholinesteraseFl == 1 | MemantineFl == 1) ~ 1,
                         TRUE ~ 2))

In [40]:
#Quality check,
cat("Total AD:",sum(ADSL$Outcome == 1),"\n")
cat("Total Non-AD:",sum(ADSL$Outcome == 0),"\n")
cat("Total ND:",sum(ADSL$Outcome == 2),"\n")

### Count Inpatient and Outpatient MCI Diagnosis

In [15]:
# Count inpatient and outpatient MCI diagnosis encounters per person and
# attach the counts to the ADSL_sql view.
#   tb1 - distinct (PersonId, EncounterId) pairs with an MCI code, after the
#         usual source / status filters and the CategoryConceptId exclusion.
#   tb2 - joins the raw encounter table (not encounterF) and sets a 0/1
#         inpatient and a 0/1 outpatient indicator per encounter.
#   tb3 - sums the indicators per person.
# The LEFT JOIN leaves both counts NULL for people without a row in tb3.
# NOTE(review): this reuses the name sql1, which the MCI patient query also
# uses — the earlier query text is overwritten in the session.
sql1 <- "

-- Get the subjects with MCI codes from Condition table
WITH tb1 as 
(
    SELECT DISTINCT c.PersonId, c.EncounterId
    FROM condition c
     INNER JOIN ConditionCodeConceptMap ccm 
      ON c.CodeConceptMapId = ccm.Id
     INNER JOIN mci_codes cc 
      ON ccm.CodeConceptId = cc.ConceptId
     WHERE ccm.SourceConceptId IN (2703595, 2703594)
      AND (c.OnsetDateTime IS NOT NULL OR c.RecordedDateTime IS NOT NULL) 
      AND c.VerificationStatusConceptId NOT IN (1065198,1065195)
      AND c.ClinicalStatusConceptId NOT IN (1065184,1065178,1065183)
      AND c.CategoryConceptId NOT IN (1065172,1065174)
),

-- merge with encounter table
-- Create Flag for Inpatient and Outpatient Encounters

tb2 as 
(
 SELECT c.PersonId, 
 CASE WHEN e.ClassConceptId in (1065223, 1065215, 1065220, 1065225) then 1 else 0 END AS MCIInpEncCnt,
 CASE WHEN e.TypeConceptId in (1065323,1065280) OR e.ClassConceptId in (1065216, 1065227) then 1 else 0 END AS MCIOutEncCnt

  FROM tb1 c
     INNER JOIN encounter e ON e.Id = c.EncounterId
  WHERE e.StatusConceptId NOT IN (2506591,2983200,2983199,2506590)
),

-- Count the number of outpatient and inpatient MCI encounter
tb3 as
(
    SELECT PersonId, sum(MCIInpEncCnt) as MCIInpEncCnt, sum(MCIOutEncCnt) as MCIOutEncCnt
    FROM tb2
    GROUP BY PersonId
)

SELECT a.*, b.MCIInpEncCnt, b.MCIOutEncCnt
FROM ADSL_sql a
  LEFT JOIN tb3 b
  ON a.PersonId=b.PersonId
"
# Collect the joined result into a local data frame.
ADSL1_1 <- load_sql_table(con, snapshot, sql1, view_name='ADSL1_1',output_mode = "sparklyr") %>% collect()

### Primary Diagnosis Flags

In [28]:
# Primary-diagnosis flags for AD and MCI, one row per person in ADSL_sql.
# Each condition record is scored 1 (PrimaryDiagnosisConceptId 1200406),
# 0 (1200405) or NULL, and the scores are summed per person:
#   flag 'Y'  - sum is non-zero (at least one primary diagnosis)
#   flag 'N'  - sum is 0
#   flag NULL - no scored record (SUM over only NULLs is NULL) or no
#               matching record at all
# NOTE(review): "PrimaryDiagnosisConceptId NOT IN (...)" also removes rows
# whose PrimaryDiagnosisConceptId is NULL, because the comparison evaluates
# to NULL in SQL — confirm that is intended.
sql <- "

/* STEP 1: filter and score the condition records of interest */
WITH ad1 AS (
  SELECT
      c.PersonId,
      CASE
        WHEN c.PrimaryDiagnosisConceptId = 1200406 THEN 1  -- “Yes”
        WHEN c.PrimaryDiagnosisConceptId = 1200405 THEN 0  -- “No”
        ELSE NULL
      END AS PrimDiagnosisNum
  FROM condition c
  JOIN ConditionCodeConceptMap ccm ON c.CodeConceptMapId = ccm.Id
  JOIN AD_codes cc ON ccm.CodeConceptId = cc.ConceptId
  WHERE ccm.SourceConceptId IN (2703595, 2703594)
    AND (c.OnsetDateTime IS NOT NULL OR c.RecordedDateTime IS NOT NULL)
    AND c.VerificationStatusConceptId NOT IN (1065198,1065195)
    AND c.ClinicalStatusConceptId NOT IN (1065184,1065178,1065183)
    AND c.PrimaryDiagnosisConceptId NOT IN (1067561,1067557)
),

/* STEP 2: aggregate to one row per subject for AD */
ad2 AS (
  SELECT
      PersonId,
      SUM(PrimDiagnosisNum) as ADPrimDiagnosisSum
  FROM ad1
  GROUP BY PersonId
),

/* STEP 3: filter and score MCI condition records */
mci1 AS (
  SELECT
      c.PersonId,
      CASE
        WHEN c.PrimaryDiagnosisConceptId = 1200406 THEN 1  -- “Yes”
        WHEN c.PrimaryDiagnosisConceptId = 1200405 THEN 0  -- “No”
        ELSE NULL
      END AS PrimDiagnosisNum
  FROM condition c
  JOIN ConditionCodeConceptMap ccm ON c.CodeConceptMapId = ccm.Id
  JOIN mci_codes cc ON ccm.CodeConceptId = cc.ConceptId
  WHERE ccm.SourceConceptId IN (2703595, 2703594)
    AND (c.OnsetDateTime IS NOT NULL OR c.RecordedDateTime IS NOT NULL)
    AND c.VerificationStatusConceptId NOT IN (1065198,1065195)
    AND c.ClinicalStatusConceptId NOT IN (1065184,1065178,1065183)
    AND c.PrimaryDiagnosisConceptId NOT IN (1067561,1067557)
),

/* STEP 4: aggregate to one row per subject for MCI */
mci2 AS (
  SELECT
      PersonId,
      SUM(PrimDiagnosisNum) as MCIPrimDiagnosisSum
  FROM mci1
  GROUP BY PersonId
)

-- STEP 5: attach flags
SELECT
    p.PersonId, p.ADDT, p.MCIDT,
    CASE
      WHEN a.ADPrimDiagnosisSum != 0 THEN 'Y'
      WHEN a.ADPrimDiagnosisSum = 0 THEN 'N'
      ELSE NULL
    END AS ADPrimDiagFl,
    CASE
      WHEN b.MCIPrimDiagnosisSum != 0 THEN 'Y'
      WHEN b.MCIPrimDiagnosisSum = 0 THEN 'N'
      ELSE NULL
    END AS MCIPrimDiagFl
FROM ADSL_sql p
LEFT JOIN ad2 a ON p.PersonId = a.PersonId
LEFT JOIN mci2 b ON p.PersonId = b.PersonId
"

# The result is not collected here; it stays in sparklyr output mode.
PrimDiagFl <- load_sql_table(con, snapshot,sql, view_name="PrimDiagFl", output_mode="sparklyr")

In [30]:
table(dplyr::pull(PrimDiagFl,ADPrimDiagFl))

### New encounter type logic

In [11]:
# New encounter type logic: derive MCIEncType and ADEncType from the encounter
# class (and, for Outpatient, also the encounter type) concept ids.
# case_when() returns the first matching label, so the order of the rules
# sets the priority: Emergency, Inpatient, Outpatient, Virtual, Unknown, and
# finally "Other" for everything else (including missing class ids).
ADSL <- ADSL %>%
  mutate(
    MCIEncType = case_when(
      MCIEncClassId %in% c(1065217) ~ "Emergency",
      MCIEncClassId %in% c(1065223, 1065215, 1065220, 1065225) ~ "Inpatient",  # 1065225 prioritized here
      MCIEncTypeId %in% c(1065323,1065280) | MCIEncClassId %in% c(1065216, 1065227) ~ "Outpatient",  # 1065216 prioritized here
      MCIEncClassId %in% c(1065230) ~ "Virtual",
      MCIEncClassId %in% c(1067555, 2703598, 1065218, 1067558, 2649591, 0, 1067561) ~ "Unknown",
      TRUE ~ "Other"
    ),
    # Same rules applied to the AD diagnosis encounter
    ADEncType = case_when(
      ADEncClassId %in% c(1065217) ~ "Emergency",
      ADEncClassId %in% c(1065223, 1065215, 1065220, 1065225) ~ "Inpatient",  # 1065225 prioritized here
      ADEncClassId %in% c(1065216, 1065227) | ADEncTypeId %in% c(1065323,1065280) ~ "Outpatient",  # 1065216 prioritized here
      ADEncClassId %in% c(1065230) ~ "Virtual",
      ADEncClassId %in% c(1067555, 2703598, 1065218, 1067558, 2649591, 0, 1067561) ~ "Unknown",
      TRUE ~ "Other"
    )
  )

In [32]:
# Derive ADEncType from the AD encounter class / type concept ids.
# case_when() returns the first matching label, so rule order sets priority.
# The Outpatient rule combines its two tests with `|`; the SQL-style `OR`
# used before is not an R operator and made this cell fail to parse.
ADSL <- ADSL %>%
  mutate(ADEncType = case_when(
    ADEncClassId %in% c(1065217) ~ "Emergency",
    ADEncClassId %in% c(1065223, 1065215, 1065220, 1065225) ~ "Inpatient",      # 1065225 prioritized here
    ADEncClassId %in% c(1065216, 1065227) | ADEncTypeId %in% c(1065323, 1065280) ~ "Outpatient",  # 1065216 prioritized here
    ADEncClassId %in% c(1065230) ~ "Virtual",
    ADEncClassId %in% c(1067555, 2703598, 1065218, 1067558, 2649591, 0, 1067561) ~ "Unknown",
    TRUE ~ "Other"
  ))

# QC: compare the old and new AD encounter type columns.
# The messages are labelled "AD"; they previously said "MCIENCTYPe" although
# the AD columns are the ones being checked.
# NOTE(review): ADSL3 and the NewADEncType column are not created in the
# visible part of this notebook (the derivation above writes ADEncType) —
# confirm the object and column names before relying on these counts.
data <- ADSL3
cat("Total mismatch in AD encounter type between old and new",sum(data$ADEncounterType != data$NewADEncType, na.rm = TRUE),"\n")
cat("Total missing ADEncType old:",sum(is.na(data$ADEncounterType)),"\n")
cat("Total missing ADEncType new:",sum(is.na(data$NewADEncType)),"\n")

In [11]:
data <- ADSL1
cat("Total mismatch in encounter type between old and new",sum(data$MCIEncounterType != data$NewMCIEncType, na.rm = TRUE),"\n")
cat("Total missing MCIENCTYPe old:",sum(is.na(data$MCIEncounterType)),"\n")
cat("Total missing MCIENCTYPe new:",sum(is.na(data$NewMCIEncType)),"\n")

### Merge with Medication_QC

In [None]:
# No need to run again
# Add the dispense-to-request ratio to the medication QC table and rename the
# dispense date columns.
# NOTE(review): Ratio is Inf or NaN when TotNumReqRec is 0 — confirm whether
# such rows occur and how they should be reported.
MED_QC1 <- MED_QC %>%  mutate(Ratio = round(TotNumDisRec/TotNumReqRec,2))%>%
    rename("DispenseFirstDt" = "DSSTDT", "DispenseLastDt" = "DSENDT")

# Total diagnosis encounters (before + after MCI), then attach the QC counts.
# NOTE(review): TotalDiagEnc is NA whenever either count is NA; the
# "Add values 0 if NULL" note below suggests the zero-fill is still pending.
ADSL1 <- ADSL %>%
  mutate(TotalDiagEnc = diagencbefmci + diagencaftrmci) %>%
  left_join(MED_QC1 %>% select(PersonId, TotNumReqRec, TotNumDisRec, Ratio), by = "PersonId")

# Add values 0 if NULL

In [None]:
# Promote the working copies and persist both tables as CSV files (no row
# names) in the output directory.
ADSL <- ADSL1
file_to_write1 <- paste0(output_path_local, "/ADSL.csv")
write.csv(ADSL, file_to_write1, row.names = FALSE)

MED_QC <- MED_QC1
file_to_write <- paste0(output_path_local, "/MED_QC.csv")
write.csv(MED_QC, file_to_write, row.names = FALSE)

#### Quality Checks

In [14]:
# if everything ok save it as ADSL i.e ADSL <- ADSL1 
display_df(ADSL1,10)
ADSL <- ADSL1

In [49]:
cat("MCI diag > 1 and atleast 1 outp mCI diag",sum(ADSL2$MCIOutEncCnt > 0 & ADSL2$MCITotalDiagC > 1, na.rm=TRUE),"\n")
cat("MCI diag > 1 and atleast 1 out/inp mci enco",sum((ADSL2$MCIOutEncCnt > 0 | ADSL2$MCIInpEncCnt > 0) & ADSL2$MCITotalDiagC > 1, na.rm=TRUE),"\n")

In [58]:
# Keep people with more than one MCI diagnosis record and at least one
# outpatient or inpatient MCI encounter (rows with NA conditions are dropped).
ADSLTemp <- ADSL2 %>%
  filter(
    MCIOutEncCnt > 0 | MCIInpEncCnt > 0,
    MCITotalDiagC > 1
  )

In [20]:
# Keep people with more than one MCI diagnosis record and at least one
# outpatient MCI encounter, then report how many remain.
dataset <- ADSL2 %>%
  filter(MCIOutEncCnt > 0, MCITotalDiagC > 1)
nrow(dataset)

In [21]:
# QC counts on the outpatient MCI subset (more than one MCI diagnosis record
# and at least one outpatient MCI encounter).
dataset <- ADSL2 %>%
  filter(MCIOutEncCnt > 0, MCITotalDiagC > 1)
#dataset <- ADSL2 %>% filter(!is.na(diagEnc90Out) & diagEnc90Out >= 1) 
#dataset <- ADSL2

# The counts are computed inside local() so the helper variables do not leak
# into the notebook session.
local({
  has_ad <- !is.na(dataset$ADDT)
  mci_day <- as.Date(dataset$MCIDT)
  ad_day <- as.Date(dataset$ADDT)
  on_or_after_mci <- mci_day <= ad_day

  cat("Total MCI with only 1 code:",
      sum(dataset$MCITotalDiagC == 1, na.rm = TRUE), "\n")

  cat("Total AD",
      sum(has_ad, na.rm = TRUE), "\n")

  cat("Total AD with only 1 code:",
      sum(dataset$ADTotalDiagC == 1, na.rm = TRUE), "\n")

  cat("Total AD within 30 days of MCI:",
      sum(has_ad & on_or_after_mci & ad_day <= mci_day + 30, na.rm = TRUE), "\n")

  cat("Total AD within 60 days of MCI:",
      sum(has_ad & on_or_after_mci & ad_day <= mci_day + 60, na.rm = TRUE), "\n")

  cat("Total AD within 90 days of MCI:",
      sum(has_ad & on_or_after_mci & ad_day <= mci_day + 90, na.rm = TRUE), "\n")

  cat("Subjects with follow-up within 6 month of MCI(No AD or Death, cendored):",
      sum(dataset$CNSR == 0 & dataset$AVAL < 0.5, na.rm = TRUE), "\n")
})

In [22]:
# Rows of `dataset` with CNSR == 0 and AVAL below 0.5.
cat(
  "Subjects with follow-up within 6 month of MCI(No AD or Death, cendored):",
  sum(dataset[["CNSR"]] == 0 & dataset[["AVAL"]] < 0.5, na.rm = TRUE),
  "\n"
)

In [23]:
# Distribution of MCI to AD in years
# Keep only rows with CNSR == 1, then summarise their AVAL values.
dataset1 <- dataset %>% filter(CNSR == 1)
summary(dataset1$AVAL)

In [24]:
# Plot time difference between MCI to AD
# Box plot of AVAL for the CNSR == 1 rows selected into dataset1.
boxplot(dataset1$AVAL)

In [30]:
# QC of AD-drug exposure on AD_Drug_tbl2.
# Every count passes na.rm = TRUE: a missing TotNumDisRec / DISSDAYSDRUG /
# date value then drops that row from the tally instead of turning the whole
# count into NA (items 5 and 6 already did this; 1.1, 2.1, 2.2 and 3.2 did not).
data <- AD_Drug_tbl2

# 1. Total subjects with exposure to AD drugs at any point
cat("1. Total Number of subjects in the MCI cohort with exposure to approved AD medication at any point of time:",
    sum(!is.na(data$DrugName), na.rm = TRUE), "\n")

# 1.1 TotNumDisRec > 1 OR DISSDAYS > 30 from 1
cat("1.1 TotNumDisRec > 1 OR DISSDAYS > 30 from 1:",
    sum((data$TotNumDisRec > 1 | data$DISSDAYSDRUG > 30) & !is.na(data$DrugName),
        na.rm = TRUE), "\n")

# 2. Subjects without AD but with drug exposure
cat("2. Total Number of subjects without AD but with exposure to approved AD drug at any time:",
    sum(is.na(data$ADDT) & !is.na(data$DrugName), na.rm = TRUE), "\n")

# 2.1 From (2): TotNumDisRec > 1 OR DISSDAYS > 30
cat("2.1 TotNumDisRec > 1 OR DISSDAYS > 30 from 2:",
    sum(is.na(data$ADDT) &
        !is.na(data$DrugName) &
        (data$TotNumDisRec > 1 | data$DISSDAYSDRUG > 30),
        na.rm = TRUE), "\n")

# 2.2 From (2): TotNumDisRec > 1 OR DISSDAYS > 30 without AD but with drug exp and other dementia
cat("2.2 TotNumDisRec > 1 OR DISSDAYS > 30 and no AD + Drug exp + dementia from 2:",
    sum(is.na(data$ADDT) &
        !is.na(data$DrugName) &
        !is.na(data$OthDemPrimDiagFl) &
        (data$TotNumDisRec > 1 | data$DISSDAYSDRUG > 30),
        na.rm = TRUE), "\n")

# 3. Subjects with both AD diagnosis and drug exposure
cat("3. Total number of subjects with AD + exposed to approved drug at any time:",
    sum(!is.na(data$ADDT) & !is.na(data$DrugName), na.rm = TRUE), "\n")

# 3.1 Exposure after AD (within time window): first dispense no earlier than
# 30 days before the AD date.
cat("3.1 Total subjects with exposure after AD: ADDT – 30 days <= DSSTDT from 3:",
    sum(!is.na(data$ADDT) &
        !is.na(data$DSSTDT) &
        (as.Date(data$ADDT) - 30 <= as.Date(data$DSSTDT)),
        na.rm = TRUE), "\n")

# 3.2 Exposure after AD + (TotNumDisRec > 1 OR DISSDAYS > 30)
cat("3.2 Total subjects with exposure after AD and (TotNumDisRec > 1 OR DISSDAYS > 30):",
    sum(
      !is.na(data$ADDT) &
      !is.na(data$DSSTDT) &
      ((as.Date(data$ADDT) - 30) <= as.Date(data$DSSTDT)) &
      (data$TotNumDisRec > 1 | data$DISSDAYSDRUG > 30),
      na.rm = TRUE
    ), "\n")

# 5. Total subjects with 1 ADTotalDiagC AND TRT is NOT NULL AND (TotNumDisRec > 1 OR DISSDAYS > 30))]
cat("5. Total subjects with 1 ADTotalDiagC AND TRT is NOT NULL AND (TotNumDisRec > 1 OR DISSDAYS > 30))]:",
    sum(
      data$ADTotalDiagC == 1 &
        (!is.na(data$DrugName) &
         (data$TotNumDisRec > 1 | data$DISSDAYSDRUG > 30)),
      na.rm = TRUE
    ), "\n")

# 6. Single AD diagnosis code, drug exposure, and first dispense no earlier
# than 30 days before the AD date.
cat("6. Total subjects with 1 ADTotalDiagC AND TRT start is after AD:",
    sum(
      data$ADTotalDiagC == 1 &
        !is.na(data$DrugName) &
        (as.Date(data$ADDT) - 30 <= as.Date(data$DSSTDT)),
      na.rm = TRUE
    ), "\n")

In [None]:
# 4. Rows of `data` with exactly one AD diagnosis code, a non-missing PrimDFl
# and ADPDId equal to concept id 1200406.
cat(
  "4. Total subjects with 1 ADTotalDiagC AND PrimDFl is NOT NULL AND ADPDId = Y:",
  sum(
    data[["ADTotalDiagC"]] == 1 &
      !is.na(data[["PrimDFl"]]) &
      data[["ADPDId"]] == 1200406,
    na.rm = TRUE
  ),
  "\n"
)

In [41]:
# AD cases dated more than 30 days after MCI whose first condition category is
# neither "Medical history" nor "Problem list".
AD_temp <- ADSL3 %>%
  filter(
    !is.na(ADDT),
    as.Date(ADDT) > as.Date(MCIDT) + 30,
    !(ADCondCategory %in% c("Medical history", "Problem list"))
  )

# Number of retained AD cases.
sum(!is.na(AD_temp[["ADDT"]]))

# Retained AD cases that have a single AD diagnosis code.
sum(AD_temp[["ADTotalDiagC"]] == 1, na.rm = TRUE)

In [60]:
# Subjects without an AD date whose first AD-drug dispense (DSSTDT) falls on
# or after the MCI date and within 30 / 60 / 90 days of it.
data <- AD_Drug_tbl2

# 2.3 From (2): First AD drug exposure with 30D of MCI
cat("2.3  no AD + Drug exp withn 30D of MCI:",
    sum(is.na(data$ADDT) &
        !is.na(data$DrugName) &
        as.Date(data$MCIDT) <= as.Date(data$DSSTDT) & as.Date(data$MCIDT) + 30 >= as.Date(data$DSSTDT), na.rm=TRUE), "\n")

# 2.4 From (2): First AD drug exposure with 60D of MCI
cat("2.4  no AD + Drug exp withn 60D of MCI:",
    sum(is.na(data$ADDT) &
        !is.na(data$DrugName) &
        as.Date(data$MCIDT) <= as.Date(data$DSSTDT) & as.Date(data$MCIDT) + 60 >= as.Date(data$DSSTDT), na.rm=TRUE), "\n")

# 2.5 From (2): First AD drug exposure with 90D of MCI
# The printed label now reads 2.5; it previously repeated "2.4", making the
# 60-day and 90-day lines indistinguishable by number in the output.
cat("2.5  no AD + Drug exp withn 90D of MCI:",
    sum(is.na(data$ADDT) &
        !is.na(data$DrugName) &
        as.Date(data$MCIDT) <= as.Date(data$DSSTDT) & as.Date(data$MCIDT) + 90 >= as.Date(data$DSSTDT), na.rm=TRUE), "\n")

In [None]:
# Distribution of AD variables : Quality Check
#Patient Characteristics

data <- ad_drug %>% filter(!is.na(ADDT))
# Remove AD within 30 days (n=); Remove AD with the first Diagnosis Category as “Medical History”/ “Problem List” (n=)

#data <- ad_drug %>% filter(AdFlBase == "N" & ADCondCategory != "")

# Frequency table for one categorical column.
#   df  : data frame to summarise
#   var : column name as a string
# Returns one row per level with columns Category, n and Percentage (2 d.p.).
# Category is always returned as character so that summaries of text columns
# and numeric-coded columns (e.g. ADPDId) can be stacked with bind_rows(),
# which refuses to combine character and numeric columns.
summary_table <- function(df, var) {
  df %>%
    count(!!sym(var)) %>%
    mutate(Percentage = round(n / sum(n) * 100, 2)) %>%
    rename(Category = !!sym(var)) %>%
    mutate(Category = as.character(Category))
}

# Creating summaries for all categorical variables

category_summary <- summary_table(data, "ADCondCategory")
Primdiag_summary <- summary_table(data, "ADPDId")
Encounter_summary <- summary_table(data, "ADEncounterType")

# Combine all summaries into one table
final_summary <- bind_rows(
  mutate(category_summary, Variable = "ADCondCategory"),
  mutate(Primdiag_summary, Variable = "ADPDId"),
  mutate(Encounter_summary, Variable = "ADEncounterType")
) %>%
  select(Variable, Category, n, Percentage) %>%
  arrange(Variable)

# Display the final summary table
kable(final_summary, caption = "Patient Characteristics Summary for AD")


In [None]:
#Total diagnosis encounter
# Per-subject total of diagnosis encounters before and after MCI.
# The earlier form wrapped misspelled column names (diagencbefmeci,
# diagencaftermci) in sum(), which collapsed to a single 0 for every row;
# the columns are diagencbefmci / diagencaftrmci, added row-wise.
temp <- ADSL2 %>%
  mutate(totaldisenc = diagencbefmci + diagencaftrmci) %>%
  select(totaldisenc)
summary(temp$totaldisenc)

In [24]:
# Rows of ADSL3 hit by the candidate AD exclusion rules: "Medical history"
# category, AD dated within 30 days after MCI (or earlier), and - for
# filter 1 only - a single AD diagnosis code.
cat(
  "AD filter 1",
  sum(
    ADSL3[["ADCondCategory"]] == "Medical history" |
      ADSL3[["ADTotalDiagC"]] == 1 |
      (!is.na(ADSL3[["ADDT"]]) &
         as.Date(ADSL3[["ADDT"]]) <= as.Date(ADSL3[["MCIDT"]]) + 30),
    na.rm = TRUE
  ),
  "\n"
)
cat(
  "AD filter 2",
  sum(
    ADSL3[["ADCondCategory"]] == "Medical history" |
      (!is.na(ADSL3[["ADDT"]]) &
         as.Date(ADSL3[["ADDT"]]) <= as.Date(ADSL3[["MCIDT"]]) + 30),
    na.rm = TRUE
  ),
  "\n"
)

In [31]:
# Frequency of the new MCI encounter type in ADSL3.
table(ADSL3$NewMCIEncType)

In [43]:
# Frequency of the new AD encounter type in AD_temp.
table(AD_temp$NewADEncType)

In [None]:
# Frequency of MCIPDId values in ADSL2.
table(ADSL2$MCIPDId)

In [None]:
# Frequency of ADPDId values in ADSL2.
table(ADSL2$ADPDId)

In [None]:
# Frequency of the AD condition category in ADSL2.
table(ADSL2$ADCondCategory)

In [None]:
# Counts of all_mci1 rows hit by each exclusion criterion.
# Every sum() uses na.rm = TRUE so a missing date or count cannot turn a
# whole tally into NA (three of the counts previously lacked it).
cat("Total AD:", sum(all_mci1$ADFl == "Y", na.rm = TRUE), "\n")
cat("age below 65:", sum(all_mci1$AgeAtDiagnosis < 65, na.rm = TRUE), "\n")
cat("Sex unknown:", sum(all_mci1$Sex == "Unknown", na.rm = TRUE), "\n")
cat("MCI before 2018:", sum(as.Date(all_mci1$MCIDt) < as.Date("2018-01-01"), na.rm = TRUE), "\n")
cat("dementia before mci:", sum(!is.na(all_mci1$OthDementiaDt) &
                                as.Date(all_mci1$OthDementiaDt) < as.Date(all_mci1$MCIDt),
                                na.rm = TRUE), "\n")
# History length is counted in days, inclusive of the MCI date.
cat("Less than 1 year history:",
    sum(!is.na(all_mci1$FirstDiagEncDt) &
          round(as.numeric(as.Date(all_mci1$MCIDt) - as.Date(all_mci1$FirstDiagEncDt) + 1), 2) < 365,
        na.rm = TRUE), "\n")
cat("Less than 2 diagnosis encounter within 1 year:",
    sum(!is.na(all_mci1$diagnenc1year) & all_mci1$diagnenc1year < 2, na.rm = TRUE), "\n")
cat("Death before MCI:",
    sum(!is.na(all_mci1$DTHFL) & as.Date(all_mci1$DTHDT) <= as.Date(all_mci1$MCIDt),
        na.rm = TRUE), "\n")
cat("AD before mci:", sum(!is.na(all_mci1$ADDt) &
                                as.Date(all_mci1$ADDt) < as.Date(all_mci1$MCIDt),
                                na.rm = TRUE), "\n")


### Medication QC

In [56]:
# Load the MRQC parquet artifact, convert it to a sparklyr data frame and
# expose it to SQL as the view "mrqc".
file_path <- "public/qc/MRQC.parquet"
MRQC <- load_artifacts_data(con, study, file_path, data_type = "parquet")
mrqc <- sparkr_df_to_sparklyr_df(con, MRQC)
create_view(mrqc, "mrqc")

display_df(mrqc)

In [57]:
# Count mrqc rows, restricted to persons present in ADSL_sql, that have
# ReqDiffDAYS < 30 and fewer than 2 request records.
sql <- 
"
  SELECT count(a.PersonId) cnt
  FROM mrqc a
  INNER JOIN ADSL_sql b ON a.PersonId = b.PersonId
  WHERE a.ReqDiffDAYS < 30 AND a.TotNumReqRec < 2
"

# Run the query against the snapshot and show the result.
mrqctb <- load_sql_table(con,snapshot,sql)
display_df(mrqctb)

In [45]:
# Load the MRMDQC parquet artifact from the study's artifact store.
file_path <- "public/qc/MRMDQC.parquet"
MRMDQC <- load_artifacts_data(con, study, file_path, data_type = "parquet")

In [46]:
# Convert MRMDQC to a sparklyr data frame so it can be registered as a view.
med <- sparkr_df_to_sparklyr_df(con, MRMDQC)
#MRMDQC <- MRMDQC %>% collect()

In [48]:
# Register the view "med", then count distinct persons per MedicationFL value
# among persons that are also present in ADSL_sql.
create_view(med,"med")
sql <- "
  SELECT b.MedicationFL, count(distinct b.PersonId) cnt
  FROM med b
  INNER JOIN ADSL_sql a ON a.PersonId = b.PersonId
  GROUP BY b.MedicationFL
"
med1 <- load_sql_table(con,snapshot,sql)
display_df(med1)

In [56]:
# Preview the first 10 rows of DIAG_ENCT_QC.
display_df(DIAG_ENCT_QC,10)

In [26]:
# Medication QC rows for persons in ADSL2, split by whether a first dispense
# date (DSSTDT) and/or a first request date (RequestFirstDT) is present.
med_count <- MED_QC %>% filter(PersonId %in% ADSL2[["PersonId"]])
local({
  dispensed <- !is.na(med_count[["DSSTDT"]])
  requested <- !is.na(med_count[["RequestFirstDT"]])
  cat("Disp only", sum(dispensed & !requested), "\n")
  cat("Request Only", sum(!dispensed & requested), "\n")
  cat("Unk", sum(!dispensed & !requested), "\n")
  cat("MR and MD", sum(dispensed & requested), "\n")
})

In [13]:
# Preview the first 10 rows of MED_QC.
display_df(MED_QC,10)

In [9]:
# List the column names of DIAG_ENCT_QC.
colnames(DIAG_ENCT_QC)

In [58]:
# Replace missing request/dispense record counts with 0, then compute the
# dispense-to-request ratio rounded to 2 decimals.
# NOTE(review): a request count of 0 makes Ratio Inf (or NaN when both counts
# are 0) - confirm downstream steps expect that.
MED_QC <- MED_QC %>% mutate(TotNumReqRec = ifelse(is.na(TotNumReqRec),0,TotNumReqRec),
                            TotNumDisRec = ifelse(is.na(TotNumDisRec),0,TotNumDisRec),
                            Ratio = round(TotNumDisRec/TotNumReqRec,2))

In [60]:
# Add the total diagnosis-encounter count to ADSL2 and attach the medication
# record counts and ratio from MED_QC by PersonId.
ADSL3 <- left_join(
  mutate(ADSL2, totaldiagenc = diagencbefmci + diagencaftrmci),
  select(MED_QC, PersonId, TotNumReqRec, TotNumDisRec, Ratio),
  by = "PersonId"
)

# Row count after the join.
nrow(ADSL3)
#left_join(DIAG_ENCT_QC %>% select(PersonId, TotEnctCnt), by = "PersonId")

In [21]:
library(ggplot2)
library(dplyr)

# Plots ADSL2 columns Ratio (dispense/request ratio) against TotEnctCnt.
# NOTE(review): both columns must already be present on ADSL2 - confirm.

# Scatter plot with a loess smoother; y axis on a log10 scale.
ggplot(ADSL2, aes(x = Ratio, y = TotEnctCnt)) +
  geom_point(alpha = 0.4, color = "steelblue") +
  geom_smooth(method = "loess", color = "darkred") +
  scale_y_continuous(trans = 'log10') +  # Optional: Use if encounters are skewed
  labs(title = "Dispense-to-Request Ratio vs Diagnosis Encounters",
       x = "Dispense/Request Ratio",
       y = "Total Diagnosis Encounters (log10 scale)") +
  theme_minimal()


In [23]:
# Basic plot with specified x-axis limits
# Base-graphics scatter of Ratio vs TotEnctCnt from ADSL2, x restricted to
# 0-10; the default x axis is suppressed (xaxt = 'n') so it can be redrawn.
plot(ADSL2$Ratio,ADSL2$TotEnctCnt, xlim = c(0, 10), xaxt = 'n', main = "Custom X-axis", xlab = "Dispense/Request Ratio", ylab = "Diagnosis Encounters")

# Add custom x-axis ticks from 0 to 10 in 0.5 increments
axis(1, at = seq(0, 10, by = 0.5))


In [65]:
# Counts of ADSL3 subjects with a low dispense/request ratio, or no dispense /
# request records at all, despite many diagnosis encounters.
cat("Dis/Req == 0.1 & TotalDiagnosis Enc > 10",sum(ADSL3$Ratio == 0.1 & ADSL3$totaldiagenc > 10, na.rm = TRUE),"\n")
cat("Dis/Req == 0.0 & TotalDiagnosis Enc > 10:",sum(ADSL3$Ratio == 0.0 & ADSL3$totaldiagenc> 10, na.rm = TRUE),"\n")
cat("Dis == 0.0 & TotalDiagnosis Enc > 10:",sum(ADSL3$TotNumDisRec == 0.0 & ADSL3$totaldiagenc > 10, na.rm = TRUE),"\n")
cat("Req == 0.0 & TotalDiagnosis Enc > 10:",sum(ADSL3$TotNumReqRec == 0.0 & ADSL3$totaldiagenc> 10, na.rm = TRUE),"\n")

# Threshold now matches the "> 100" label; it previously tested > 10 and so
# just repeated the first count above.
cat("Dis/Req == 0.1 & TotalDiagnosis Enc > 100:",sum(ADSL3$Ratio == 0.1 & ADSL3$totaldiagenc > 100, na.rm = TRUE),"\n")

In [61]:
# Basic plot with specified x-axis limits
# Base-graphics scatter of Ratio vs totaldiagenc from ADSL3, x restricted to
# 0-10; the default x axis is suppressed (xaxt = 'n') so it can be redrawn.
plot(ADSL3$Ratio,ADSL3$totaldiagenc, xlim = c(0, 10), xaxt = 'n', main = "Custom X-axis", xlab = "Dispense/Request Ratio", ylab = "Diagnosis Encounters")

# Add custom x-axis ticks from 0 to 10 in 0.5 increments
axis(1, at = seq(0, 10, by = 0.5))

In [15]:
# Box plots of diagnosis encounters per dispense/request ratio bin.
# Ratio is continuous, so it is cut into 0.5-wide bins starting at 0 before
# being mapped to x; mapping the raw ratio made geom_boxplot() draw a single
# box for the whole cohort. Non-finite ratios (division by zero, missing) are
# dropped first because they cannot be assigned to a bin.
ADSL2 %>%
  filter(is.finite(Ratio)) %>%
  ggplot(aes(x = cut_width(Ratio, width = 0.5, boundary = 0), y = TotEnctCnt)) +
  geom_boxplot(fill = "lightblue") +
  scale_y_continuous(trans = 'log10') +  # Optional: log-scale for skewed data
  labs(title = "Diagnosis Encounters Distribution by Ratio Bins",
       x = "Dispense/Request Ratio Bins",
       y = "Total Diagnosis Encounters") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


In [None]:
#Age distribution of subjects without any dispense or request data
# AgeAtDiagnosis is read through alias `a`; the query previously used alias
# `p`, which is not defined by any FROM/JOIN clause and fails to resolve.
# NOTE(review): ADSL2 must be registered as a Spark view for this join -
# only ADSL_sql is registered in the visible code; confirm.
sql1 <- "
 SELECT mean(a.AgeAtDiagnosis) as MeanAge, median(a.AgeAtDiagnosis) as MedAge, mode(a.AgeAtDiagnosis) as ModeAge
  FROM med b
  INNER JOIN ADSL2 a ON a.PersonId = b.PersonId
  WHERE a.DISSDAYS < 30 OR a.TotDissRec < 2
"
med2 <- load_sql_table(con,snapshot,sql1)
display_df(med2)

In [55]:
# Summary of MRMDQC.
summary(MRMDQC)

In [56]:
# Only subjects in MRMDQC which are present in ADSL2
MRMDQC_temp <- MRMDQC %>% filter(PersonId %in% ADSL2$PersonId)
# Report the size of the subset just built (this previously called nrow() on
# exqc_temp, an object this cell never creates).
nrow(MRMDQC_temp)

In [None]:
# Frequency of the medication flag in MRMDQC. The statement had no column
# after `$` and did not parse; MedicationFL is the MRMDQC column that the
# "med" view query groups by.
# NOTE(review): confirm MedicationFL is the column intended here.
table(MRMDQC$MedicationFL)

In [58]:
# Counts of exqc_temp rows with DISSDAYS below 30, fewer than 2 dispense
# records, and the AND / OR combinations of the two conditions.
local({
  short_supply <- exqc_temp[["DISSDAYS"]] < 30
  few_dispenses <- exqc_temp[["TotNumDisRec"]] < 2
  cat("Patients with only DISSDAYS < 30:", sum(short_supply, na.rm = TRUE), "\n")
  cat("Patients with TotalNumDisRec < 2:", sum(few_dispenses, na.rm = TRUE), "\n")
  cat(
    "Patients with DISSDAYS < 30 & TotalNumDisRec < 2:",
    sum(short_supply & few_dispenses, na.rm = TRUE),
    "\n"
  )
  cat(
    "Patients with DISSDAYS < 30 | TotalNumDisRec < 2:",
    sum(short_supply | few_dispenses, na.rm = TRUE),
    "\n"
  )
})

In [8]:
#Patient Characteristics
library(dplyr)
library(tidyr)
library(knitr)

# Final cohort joined to its ADSL attributes.
data <- FinalPatient %>% left_join(ADSL, by = "PersonId")

# Frequency table for one categorical column.
#   df  : data frame to summarise
#   var : column name as a string
# Returns one row per level with columns Category, n and Percentage (2 d.p.).
# Category is always returned as character so that summaries of text columns
# and numeric columns (IndexYear) can be stacked with bind_rows(), which
# refuses to combine character and numeric columns.
summary_table <- function(df, var) {
  df %>%
    count(!!sym(var)) %>%
    mutate(Percentage = round(n / sum(n) * 100, 2)) %>%
    rename(Category = !!sym(var)) %>%
    mutate(Category = as.character(Category))
}

# Creating summaries for all categorical variables

age_summary <- summary_table(data, "Age_Group")
sex_summary <- summary_table(data, "Sex")
ethnicity_summary <- summary_table(data, "Ethnicity")
race_summary <- summary_table(data, "Race")
marital_summary <- summary_table(data, "MaritalStatus")
#bmi_summary <- summary_table(data, "BMI_Group")
#bmi_impute_summary <- summary_table(data, "BMI_GROUP_impute")
region_summary <- summary_table(data, "Region")
year_summary <- summary_table(data, "IndexYear")
state_summary <- summary_table(data, "StateOrProvince")


# Combine all summaries into one table
# The IndexYear rows come from year_summary; they previously reused
# region_summary, so the table showed region counts under "IndexYear".
final_summary <- bind_rows(
  mutate(age_summary, Variable = "Age_Group"),
  mutate(sex_summary, Variable = "Sex"),
  mutate(ethnicity_summary, Variable = "Ethnicity"),
  mutate(race_summary, Variable = "Race"),
  mutate(marital_summary, Variable = "MaritalStatus"),
  mutate(region_summary, Variable = "Region"),
  mutate(year_summary, Variable = "IndexYear"),
  mutate(state_summary, Variable = "StateOrProvince")
) %>%
  select(Variable, Category, n, Percentage) %>%
  arrange(Variable)

# Display the final summary table
kable(final_summary, caption = "Patient Characteristics Summary for MCI")

In [None]:
#Distribution of Follow-up
# Summary of the FollowupTm column in ADSL2.
summary(ADSL2$FollowupTm)

In [None]:
# Histogram
# Histogram of FollowupTm from ad_final1, with 3 suggested breaks.
# NOTE(review): the summary above reads ADSL2 while this plot reads
# ad_final1, and the axis is labelled in months - confirm both are intended.
hist(ad_final1$FollowupTm, 
     main = "Time from MCI Diagnosis to Last Encounter/Death", 
     xlab = "Months", 
     col = "lightgreen", 
     breaks = 3)

In [None]:
#Total number of MCI and AD cases by years
# Derive the calendar year of the AD and MCI dates, then tabulate each.
df <- ad_final
df[["AD_year"]] <- format(as.Date(df[["AdDt"]]), format = "%Y")
df[["MCI_year"]] <- format(as.Date(df[["MCIDt"]]), format = "%Y")

# AD count by year
ad_count <- table(df[["AD_year"]])

# MCI count by year
mci_count <- table(df[["MCI_year"]])


In [None]:
# Print the per-year AD and MCI case counts.
print(ad_count)
print(mci_count)

In [None]:
data <- ad_final

# Print how many rows of `df` have at least 1, 3 and 5 encounters in `column`.
#   df     : data frame holding the encounter-count columns
#   column : name of the count column, as a string
#   label  : text printed after "Patients with >= k "
# Missing counts are ignored. Returns the three counts invisibly.
report_encounter_counts <- function(df, column, label) {
  thresholds <- c(1, 3, 5)
  # Only the first line of each group ends its label with a colon; kept so
  # the printed output matches earlier runs exactly.
  suffixes <- c(":", "", "")
  counts <- vapply(
    thresholds,
    function(k) sum(df[[column]] >= k, na.rm = TRUE),
    integer(1)
  )
  for (i in seq_along(thresholds)) {
    cat(
      paste0("Patients with >= ", thresholds[i], " ", label, suffixes[i]),
      counts[i],
      "\n"
    )
  }
  invisible(counts)
}

# Encounter-count columns to report, in output order, with their labels.
encounter_labels <- c(
  diagEnc90 = "Diagnosis Encounters where Enc is < Pandt - 90D",
  diagEnc30 = "Diagnosis Encounters where Enc is < Pandt - 30D",
  diagEnc90Out = "Outpatients Diagnosis Encounters where Enc is < Pandt - 90D",
  diagEnc30Out = "Outpatients Diagnosis Encounters where Enc is < Pandt - 30D",
  diagEnc90OutTemp = "Outpatients Diagnosis(Only Type) Encounters where Enc is < Pandt - 90D",
  diagEnc30OutTemp = "Outpatients(Only Type) Diagnosis Encounters where Enc is < Pandt - 30D"
)

last_counts <- NULL
for (column in names(encounter_labels)) {
  last_counts <- report_encounter_counts(data, column, encounter_labels[[column]])
}

# Keep the count_* variables available, holding the values of the last
# column reported, as the copy-pasted version of this cell left them.
count_1_or_more <- last_counts[1]
count_3_or_more <- last_counts[2]
count_5_or_more <- last_counts[3]

In [None]:
#Medications
# One RxNorm codeset per drug (the given code plus its descendants).

Bumetanide <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "1808")
Torasemide <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "38413")
Sildenafil <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "136411")
Ciclopirox <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "21090")
Tianeptine <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "38252")
Phenylalanine <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "8156")
Metformin <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "6809")
Losartan <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "52175")
Atenolol <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "1202")
Febuxostat <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "73689")
Pioglitazone <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "33738")
Hydroxychloroquine <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "5521")
Omeprazole <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "7646")
Pantoprazole <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "40790")
Gabapentin <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "25480")
Atorvastatin <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "83367")
Fluticasone <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "41126")
Amoxicillin <- codeset(con, snapshot, "RxNorm", selfAndDescendants, "723")


In [None]:
# Collect all_enc into a local R data frame.
DiagnosisEncounter <- all_enc %>% collect()

In [None]:
# Show the most recent Spark error, to diagnose a failed Spark call.
sparklyr::spark_last_error()

In [None]:
# List the column names of ad_final1.
colnames(ad_final1)

In [None]:
# Count the distinct persons in tbl_index_conditions_filtered
%%sql
SELECT COUNT(DISTINCT PersonId) FROM tbl_index_conditions_filtered

In [None]:
# Show up to 10 rows of ad_final1 whose MCI_AD_time is missing.
display_df(ad_final1[is.na(ad_final1$MCI_AD_time),],10)

In [None]:
# Preview the first 5 rows of ad_tb.
display_df(ad_tb,5)

In [None]:
# Preview the first 10 rows of all_mci1.
display_df(all_mci1,10)

In [None]:
#Remove variables
# Drop the listed MCI/AD columns from df.

df <- df %>% select(-c("MCIDt","EncounterId","MCIRecordedDtFl","Personrecords","MCIEncClass","MCIEncType","MCIEncounterType","ADDt","ADRecordedDtFl","ADFl","ADClassEnc","ADTypeEnc",
"ADEncounterType","AdFl30","AdDtFl","AdDtFlBase","AdBefMCIFl","MCI_AD_time"))

# convert to SQL
# NOTE(review): the view is built from pan_liv_final1b, not from the df
# trimmed above - confirm this is the intended source.
temp_data <- as.DataFrame(pan_liv_final1b)
createOrReplaceTempView(temp_data, "temp_data")

In [None]:
# Join AD and MCI variables with the df
# MCI_tbl is joined through its own alias t2; the condition previously
# referenced alias `c`, which no FROM/JOIN clause defines.
sql <- "
SELECT a.*, 
t1.ADEncounterId, t1.ADEncounterType, t1.ADDT, t1.ADRecordedDtFl, t1.ADCondCategory, t1.ADPrimaryDiagnosis, t1.ADTotalDiagC,
t2.MCIEncounterId, t2.MCIEncounterType, t2.MCIDT, t2.MCIRecordedDtFl, t2.MCICondCategory, t2.MCIPrimaryDiagnosis, t2.MCITotalDiagC
 FROM df a
 INNER JOIN AD_tbl t1 ON a.PersonId = t1.PersonId
 INNER JOIN MCI_tbl t2 ON a.PersonId = t2.PersonId
"

In [None]:
# Save Encounter File

In [66]:
# Number of distinct PersonId values in AD_Drug_tbl2.
length(unique(AD_Drug_tbl2$PersonId))

In [13]:
# Show the rows whose old and new MCI encounter type differ.
# which() keeps only rows where the comparison is TRUE; indexing with the raw
# logical vector also returned an all-NA row for every row where either
# encounter type is missing.
display_df(data[which(data$MCIEncounterType != data$NewMCIEncType), c("MCIEncClassId","MCIEncTypeId","MCIEncounterType","NewMCIEncType")])

In [47]:
# Show the structure (column names and types) of ADSL.
str(ADSL)

In [26]:
# Show up to 5 rows with a negative AVAL. which() drops rows where AVAL is
# missing; a raw logical index would return them as all-NA rows.
display_df(ADSL[which(ADSL$AVAL < 0), ], 5)

In [24]:
# Per-column summary of ADSL.
summary(ADSL)

In [67]:
# List the column names of ADSL.
colnames(ADSL)

In [26]:
# Number of antiamyloid rows whose antiamyloidFl equals 0.
sum(antiamyloid$antiamyloidFl == 0)

In [27]:
# Display the antiamyloid table.
display_df(antiamyloid)

In [8]:
# Drop the old encounter-type columns and the MCI inpatient/outpatient
# encounter counts, then give the new encounter-type columns the old names.
ADSL <- ADSL %>%
  select(-MCIEncounterType, -ADEncounterType, -MCIInpEncCnt, -MCIOutEncCnt) %>%
  rename(MCIEncounterType = NewMCIEncType, ADEncounterType = NewADEncType)

In [22]:
# Make ADSL1 the working ADSL.
ADSL <- ADSL1

In [61]:

# Data-driven check of the most common medications in the cohort: the 30
# medication concepts with the most distinct persons in MedicationRequest.

#medication_dataset <- MedicationAdministration
#medication_dataset <- MedicationDispense
#medication_dataset <- MedicationRequest

sql <-"
SELECT mcc.CodeConceptId, COUNT(DISTINCT m.PersonId) AS PersonCount
FROM MedicationRequest m 
INNER JOIN MedicationCodeConceptMap mcc 
    ON m.CodeConceptMapId = mcc.Id
GROUP BY mcc.CodeConceptId
ORDER BY PersonCount DESC
LIMIT 30

"
# Run the query, register the result as view 'topDrug' and display 30 rows.
topDrug <- load_sql_table(con, snapshot, sql, view_name='topDrug',output_mode = "sparklyr") 
display_df(topDrug,30)

In [64]:
# Basic Data Quality filter from snapshot
# Keeps subjects aged 65+ with known sex, an MCI date in 2018-2024, no AD or
# other dementia before MCI, at least 365 days (inclusive) between the first
# diagnosis encounter and MCI, at least 2 diagnosis encounters in the prior
# year, and no death on or before the MCI date.
ADSL1 <- ADSL %>%
  filter(
    AgeAtDiagnosis >= 65,
    Sex != "Unknown",
    as.Date(MCIDT) >= as.Date("2018-01-01"),
    as.Date(MCIDT) <= as.Date("2024-12-31"),
    is.na(ADDT) | as.Date(ADDT) >= as.Date(MCIDT),
    round(as.numeric(as.Date(MCIDT) - as.Date(FirstDiagEncDt) + 1), 2) >= 365,
    !is.na(diagnenc1year) & diagnenc1year >= 2,
    is.na(DTHFL) | as.Date(DTHDT) > as.Date(MCIDT),
    is.na(OthDementiaDt) | as.Date(OthDementiaDt) >= as.Date(MCIDT)
  )
cat("Total record:", nrow(ADSL1), "\n")

# Criteria on MCI defination
ADSL2 <- ADSL1 %>%
  filter(MCIOutEncCnt > 0, MCITotalDiagC > 1)
cat("Total record after MCi filter:", nrow(ADSL2), "\n")

# Remove Baseline AD cases
ADSL3 <- ADSL2 %>%
  filter(AdFlBase == "N")
cat("Total record after baseline AD exclude:", nrow(ADSL3), "\n")

# Quality filter on Medication
ADSL4 <- ADSL3 %>%
  filter(TotNumReqRec != 0)
cat("Total record after removing potential data error for medication:", nrow(ADSL4), "\n")

# Person ids of the final cohort.
FinalPatient <- ADSL4 %>%
  select(PersonId)
# Quality filter on Outcome : if AD status is undetermiuned remove from analysis
#ADSL5 <- ADSL4 %>% filter(Outcome != 2)
#cat("Total record after rmeoving undetermined outcome:"sum(nrow(ADSL5)),"\n")

In [None]:
# Columns of interest: TotEnctCnt, TotNumReqRec, TotNumDisRec, CNSR, ADT, AVAL, MCIDT, ADDT, AgeAtDiagnosis, Age_Group, Race

In [33]:
# Number of rows in ADSL.
nrow(ADSL)

In [66]:
# Drop the Outcome column from ADSL.
ADSL <- ADSL %>% select(-"Outcome")

In [5]:
# Read the file
adsl_path <- paste0(output_path_local, "/ADSL.csv")
t1 <- adsl_path
# use read.csv to read file into a R dataframe
ADSL <- read.csv(t1)
# Mutate Index year: calendar year of the MCI date
ADSL <- ADSL %>% mutate(IndexYear = as.numeric(format(as.Date(MCIDT), "%Y")))
# Write ADSL back to the file it was read from. The path previously lacked the
# "/" separator and ended in ".csv.r", so the file was written next to the
# output folder under a name that the read above never picks up.
file_to_write <- adsl_path
write.csv(ADSL, file_to_write, row.names = FALSE)
# Check the outcome and Index year count
table(Outcome=ADSL$Outcome,Year=ADSL$IndexYear)

In [6]:
# Load the saved final-cohort person list from the local output folder.
t5 <- paste0(output_path_local, "/FinalPatient.csv")
FinalPatient <- read.csv(file = t5)

In [7]:
# Cross-tabulate outcome by index year for the final cohort.
data <- left_join(
  FinalPatient,
  select(ADSL, Outcome, IndexYear, PersonId),
  by = "PersonId"
)
table(Outcome = data[["Outcome"]], Year = data[["IndexYear"]])