In [1]:
# Add libraries
library(tidyverse)
library(repr)
library(datateachr)
library(digest)
library(infer)
library(gridExtra)
library(cowplot)
library(dplyr)
library(broom)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6      [32m✔[39m [34mpurrr  [39m 0.3.4 
[32m✔[39m [34mtibble [39m 3.1.8      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.4.1 
[32m✔[39m [34mreadr  [39m 2.1.2      [32m✔[39m [34mforcats[39m 0.5.2 
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘gridExtra’


The following object is masked from ‘package:dplyr’:

    combine




In [2]:
# Load the raw survey responses from the CSV file in the working directory.
# Base read.csv() is used on purpose: with its default check.names behaviour,
# headers that are not valid R names are rewritten (presumably the source of
# the dotted names such as CHRONIC.DISEASE that later cells refer to).
survey <- read.csv(file = "survey lung cancer.csv")

# Preview the first six rows to check the columns and their types.
head(survey, n = 6)

Unnamed: 0_level_0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC.DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL.CONSUMING,COUGHING,SHORTNESS.OF.BREATH,SWALLOWING.DIFFICULTY,CHEST.PAIN,LUNG_CANCER
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>
1,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
2,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
3,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
4,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
5,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO
6,F,75,1,2,1,1,2,2,2,2,1,2,2,1,1,YES


In [3]:
# Recode the survey into analysis-ready columns with snake_case names:
#   * gender and age are carried over unchanged;
#   * each 1/2-coded symptom or habit column becomes a logical that is TRUE
#     when the raw value is 2;
#   * lung_cancer is TRUE when LUNG_CANCER is "YES".
# NOTE(review): presumably 1 = no and 2 = yes in the source data -- confirm
# against the dataset's codebook. PEER_PRESSURE is not carried over.
#
# transmute() is used because this is a row-wise recode that keeps one output
# row per input row. summarise()/summarize() is meant to collapse each group
# to a single row, and returning multiple rows from it is deprecated from
# dplyr 1.1.0 onwards.
survey_2 <- survey %>%
    transmute(gender = GENDER,
              age = AGE,
              smoking = SMOKING == 2,
              yellow_fingers = YELLOW_FINGERS == 2,
              anxiety = ANXIETY == 2,
              chronic_disease = CHRONIC.DISEASE == 2,
              fatigue = FATIGUE == 2,
              allergy = ALLERGY == 2,
              wheezing = WHEEZING == 2,
              alcohol_consuming = ALCOHOL.CONSUMING == 2,
              coughing = COUGHING == 2,
              shortness_of_breath = SHORTNESS.OF.BREATH == 2,
              swallowing_difficulty = SWALLOWING.DIFFICULTY == 2,
              chest_pain = CHEST.PAIN == 2,
              lung_cancer = LUNG_CANCER == "YES"
             )
head(survey_2)

Unnamed: 0_level_0,gender,age,smoking,yellow_fingers,anxiety,chronic_disease,fatigue,allergy,wheezing,alcohol_consuming,coughing,shortness_of_breath,swallowing_difficulty,chest_pain,lung_cancer
Unnamed: 0_level_1,<chr>,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
1,M,69,False,True,True,False,True,False,True,True,True,True,True,True,True
2,M,74,True,False,False,True,True,True,False,False,False,True,True,True,True
3,F,59,False,False,False,False,True,False,True,False,True,True,False,True,False
4,M,63,True,True,True,False,False,False,False,True,False,False,True,True,False
5,F,63,False,True,False,False,False,False,True,False,True,True,False,False,False
6,F,75,False,True,False,True,True,True,True,False,True,True,False,False,True


In [4]:
# Keep only respondents who have lung cancer, then describe each one by
# gender plus four indicator columns for their smoking / alcohol exposure:
#   none          - neither smokes nor drinks
#   only_smoking  - smokes but does not drink
#   only_drinking - drinks but does not smoke
#   both          - smokes and drinks
# The raw smoking and alcohol_consuming columns are not kept.
survey_clean_wrangled <- survey_2 %>%
    filter(lung_cancer) %>%
    transmute(gender,
              none = !(smoking | alcohol_consuming),
              only_smoking = smoking & !alcohol_consuming,
              only_drinking = alcohol_consuming & !smoking,
              both = smoking & alcohol_consuming)

head(survey_clean_wrangled)

Unnamed: 0_level_0,gender,none,only_smoking,only_drinking,both
Unnamed: 0_level_1,<chr>,<lgl>,<lgl>,<lgl>,<lgl>
1,M,False,False,True,False
2,M,False,True,False,False
3,F,True,False,False,False
4,M,False,False,False,True
5,F,False,True,False,False
6,M,False,False,False,True


In [5]:
# Build bootstrap resamples separately for the female and male lung-cancer
# respondents. Each group is resampled with replacement 1000 times, and every
# replicate has the same number of rows as the group it was drawn from.
#
# The seed is set exactly once for the whole cell. Re-seeding with the same
# value before the second resample would make both groups consume the same
# random number stream, so the female and male replicates would not be
# independent draws.
set.seed(1)

# Female respondents: exposure-group indicators only (gender column dropped).
survey_female <- survey_clean_wrangled %>%
    filter(gender == "F") %>%
    select(-gender)

bootstrap_survey_female <- survey_female %>%
    rep_sample_n(size = nrow(survey_female), reps = 1000, replace = TRUE)


# Male respondents: same preparation, resampled from the continuing RNG stream.
survey_male <- survey_clean_wrangled %>%
    filter(gender == "M") %>%
    select(-gender)

bootstrap_survey_male <- survey_male %>%
    rep_sample_n(size = nrow(survey_male), reps = 1000, replace = TRUE)


# Preview the first rows of each set of bootstrap replicates.
head(bootstrap_survey_female)
head(bootstrap_survey_male)

replicate,none,only_smoking,only_drinking,both
<int>,<lgl>,<lgl>,<lgl>,<lgl>
1,False,False,False,True
1,False,False,True,False
1,False,True,False,False
1,True,False,False,False
1,False,False,True,False
1,False,False,True,False


replicate,none,only_smoking,only_drinking,both
<int>,<lgl>,<lgl>,<lgl>,<lgl>
1,False,False,True,False
1,False,False,False,True
1,True,False,False,False
1,False,True,False,False
1,False,False,True,False
1,False,False,False,True
