# Importing and Loading packages


In [233]:
# One-time setup, kept commented out: install the `here` package.
#install.packages("here")
# Attach `here`; its here() function is used below to build file paths.
library(here, help, pos = 2, lib.loc = NULL)
# Run the project's helper script at Modules/modules.R.
source(here('Modules','modules.R'))
# One-time setup, kept commented out.
# NOTE(review): presumably defined in modules.R -- confirm.
#install_all_packages()
# NOTE(review): presumably defined in modules.R and attaches the packages this
# notebook needs (e.g. the one providing read_excel) -- confirm.
load_library_packages()

# Importing and Exploring the Dataset

In [234]:
# Read the spreadsheet Data/before_pa.xlsx (path built with here()).
df <- read_excel(path = here('Data','before_pa.xlsx'))
# Convert the result to a base data.frame.
# NOTE(review): data.frame() applies check.names = TRUE by default, which can
# rewrite column names into syntactic ones (possibly the origin of the dot in
# `co.resident_range`) -- confirm against the spreadsheet header.
df <- data.frame(df)
# Display the dimensions, the distinct column classes and the first rows.
dim(df)
unique(sapply(df, class))
head(df)

Unnamed: 0_level_0,sex,age_range,martialStatus,state,zone,co.resident_range,rooms_range,income_range,scholarity,diabetes,...,PA_barriers_before_time_family_responsabilities,PA_barriers_before_tiredness,PA_barriers_before_location_distance,PA_barriers_before_money,PA_barriers_before_time_convenience,PA_barriers_before_hard_task,PA_barriers_before_interest,PA_barriers_before_none,PA_practice_during,sedentary_time_range_during
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,2,1,25,2,1,1,4,4,0,...,0,0,0,0,1,0,0,0,1,5
2,2,3,1,21,1,2,3,6,4,0,...,0,1,0,0,0,0,0,0,0,4
3,1,3,1,21,1,2,2,5,3,0,...,0,0,0,1,0,0,0,0,0,2
4,1,3,1,21,1,1,1,3,4,0,...,0,0,0,0,1,0,0,0,0,3
5,1,2,2,25,2,2,2,6,4,0,...,1,0,0,0,0,0,0,0,0,5
6,2,3,2,21,1,2,2,6,6,0,...,1,0,0,0,0,0,0,0,0,2


# Adding new Columns

In [235]:
# Per-sport `*_before` columns that are added up for each row.
sports <- c(
  "running_before", "hiking_before", "cycling_before",
  "muscle_training_before", "soccer_before", "tennis_before",
  "volleyball_before", "basketball_before", "swimming_before",
  "surfing_before", "yoga_before", "pilates_before",
  "fighting_before", "other_sports_before"
)

# New column: row-wise total of the sport columns listed above.
df[, "pa_number_before"] <- rowSums(df[sports])

# Treating missing data

### As the dataset had few missing values, we chose to drop the rows that contain them

In [236]:
# Keep only the rows that have a value in every column.
df <- df[complete.cases(df), ]
# Reset the row names so the remaining rows are numbered from 1 again.
rownames(df) <- NULL
# Display the dimensions after the removal.
dim(df)

# Treating variables type

### Changing some class names to make insights easier with clear labels
For columns PA_practice_before and PA_practice_during:
 - 0 -> practice
 - 1 -> dont_practice

For column PA_intensity_before:
 - 4 -> 0 (class 4 means "doesn't practice", so we need to make it smaller than the other intensities)


For column PA_duration_before:
 - 5 -> 0 (class 5 means "doesn't practice", so we need to make it smaller than the other durations)


In [237]:
# Practice flags: code 0 becomes "practice", code 1 becomes "dont_practice".
# The first replacement turns the column into character, so the comparison
# with 1 that follows relies on R coercing the number to "1". Keep this order.
df$PA_practice_before <- replace(
  df$PA_practice_before, df$PA_practice_before == 0, "practice"
)
df$PA_practice_before <- replace(
  df$PA_practice_before, df$PA_practice_before == 1, "dont_practice"
)
df$PA_practice_during <- replace(
  df$PA_practice_during, df$PA_practice_during == 0, "practice"
)
df$PA_practice_during <- replace(
  df$PA_practice_during, df$PA_practice_during == 1, "dont_practice"
)

# Move the "does not practice" codes (4 for intensity, 5 for duration) to 0.
df$PA_intensity_before <- replace(
  df$PA_intensity_before, df$PA_intensity_before == 4, 0
)
df$PA_duration_before <- replace(
  df$PA_duration_before, df$PA_duration_before == 5, 0
)

# Sedentary time: codes below 5 become one label and code 5 becomes the other.
# The `< 5` step must run first, while the column is still numeric.
df$sedentary_time_range_during <- replace(
  df$sedentary_time_range_during,
  df$sedentary_time_range_during < 5,
  "less_then_8_hours"
)
df$sedentary_time_range_during <- replace(
  df$sedentary_time_range_during,
  df$sedentary_time_range_during == 5,
  "8_hour_or_more"
)

df$sedentary_time_range_before <- replace(
  df$sedentary_time_range_before,
  df$sedentary_time_range_before < 5,
  "less_then_8_hours"
)
df$sedentary_time_range_before <- replace(
  df$sedentary_time_range_before,
  df$sedentary_time_range_before == 5,
  "8_hour_or_more"
)

In [238]:
# Split the rows by their PA_practice_before label.
pa_behavior1 <- df[df[["PA_practice_before"]] == "practice", ]
pa_behavior2 <- df[df[["PA_practice_before"]] == "dont_practice", ]

# Classify one row by how its practice label changed between the two periods.
#
# `row` is indexed by name and must provide "PA_practice_before" and
# "PA_practice_during" (for example one row handed over by apply()).
# Returns one of "still_dont_practice", "change_to_practice",
# "still_practice" or "change_to_dont_practice"; the last one is the
# fallback for every combination not matched explicitly.
convert_pa_behavior <- function(row) {
  inactive_before <- row["PA_practice_before"] == "dont_practice"

  if (inactive_before && row["PA_practice_during"] == "dont_practice") {
    return("still_dont_practice")
  }
  if (inactive_before && row["PA_practice_during"] == "practice") {
    return("change_to_practice")
  }
  if (row["PA_practice_before"] == "practice" &&
        row["PA_practice_during"] == "practice") {
    return("still_practice")
  }

  "change_to_dont_practice"
}


# Label every row with its change in practice behaviour.
# apply() converts its input to a matrix first, so only the two columns that
# convert_pa_behavior() reads are passed in; this avoids coercing every other
# column of df to character on the way.
df["pa_behavior"] <- apply(
  df[, c("PA_practice_before", "PA_practice_during")],
  MARGIN = 1,
  convert_pa_behavior
)

### Columns with some sense of order need to be numeric
(this type has better support than an ordered factor)

In [239]:
# Columns that carry an order; they are stored as integers.
numeric_columns <- c(
  "age_range", "rooms_range", "income_range", "scholarity",
  "co.resident_range", "PA_weekly_frequency_before",
  "PA_intensity_before", "PA_duration_before", "pa_number_before"
)

# Every other column is treated as categorical and stored as a factor.
columns <- names(df)
categorical_columns <- setdiff(columns, numeric_columns)

df[, categorical_columns] <- lapply(df[, categorical_columns], as.factor)
df[, numeric_columns] <- lapply(df[, numeric_columns], as.integer)

# Compare rows 1 and 2 of the second column with `<`.
df[1, 2] < df[2, 2]

# Preview the integer columns.
head(df[, numeric_columns])


Unnamed: 0_level_0,age_range,rooms_range,income_range,scholarity,co.resident_range,PA_weekly_frequency_before,PA_intensity_before,PA_duration_before,pa_number_before
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,2,1,4,4,1,4,1,2,1
2,3,3,6,4,2,3,2,3,4
3,3,2,5,3,2,2,3,2,2
4,3,1,3,4,1,4,3,3,2
5,2,2,6,4,2,4,3,3,2
6,3,2,6,6,2,3,2,2,3


# Sanity Check

### There was no occurrence of DA (Alzheimer) and DP (Parkinson) in the dataset

In [240]:
# Display the factor levels of DA and DP to see which values occur.
lapply(df[c("DA", "DP")], levels)

# Remove DA, DP, state and zone. The columns are selected by name with
# standard subsetting instead of subset()'s non-standard evaluation, so no
# variable from the workspace can be picked up by accident.
# drop = FALSE keeps the result a data.frame in every case.
df <- df[, !(names(df) %in% c("DA", "DP", "state", "zone")), drop = FALSE]

### More than one column has a "dont_practice" class; we need to check whether they agree.

In [241]:
# Cross-tabulate PA_practice_before against each other column that also
# encodes "does not practice".
table(df[c("PA_practice_before", "PA_duration_before")])
table(df[c("PA_practice_before", "PA_intensity_before")])
table(df[c("PA_practice_before", "PA_weekly_frequency_before")])
table(df[c("PA_practice_before", "pa_number_before")])


                  PA_duration_before
PA_practice_before   0   1   2   3   4
     dont_practice 131   8   4   2   0
     practice        4  60 654 324  55

                  PA_intensity_before
PA_practice_before   0   1   2   3
     dont_practice 131  14   0   0
     practice        2 148 590 357

                  PA_weekly_frequency_before
PA_practice_before   1   2   3   4
     dont_practice 144   0   1   0
     practice        2 424 473 198

                  pa_number_before
PA_practice_before   0   1   2   3   4   5   6   7   9  14
     dont_practice 144   1   0   0   0   0   0   0   0   0
     practice        0 388 364 217  95  22   4   5   1   1

In [242]:
# TRUE where at least one detail column holds its "does not practice" code
# (duration 0, intensity 0, weekly frequency 1 or no sport selected).
differ_p <- with(
  df,
  PA_duration_before == 0 |
    PA_intensity_before == 0 |
    PA_weekly_frequency_before == 1 |
    pa_number_before == 0
)

# Rows labelled "practice" whose detail columns say otherwise.
strange_p <- differ_p & df$PA_practice_before == "practice"
table(strange_p)


strange_p
FALSE  TRUE 
 1236     6 

In [243]:
# TRUE where at least one detail column holds a value other than its
# "does not practice" code.
differ_dp <- with(
  df,
  PA_duration_before != 0 |
    PA_intensity_before != 0 |
    PA_weekly_frequency_before != 1 |
    pa_number_before != 0
)

# Rows labelled "dont_practice" whose detail columns say otherwise.
strange_dp <- differ_dp & df$PA_practice_before == "dont_practice"



It seems that some answers are inconsistent. We found two kinds of inconsistency, "strange practice" and "strange don't practice":

- Strange practice: people who said that they practiced an exercise before the pandemic, but who answered the exercise duration, intensity, frequency or type question as if they weren't practicing.
- Strange don't practice: people who said that they weren't practicing any exercise before the pandemic, but who answered the exercise duration, intensity, frequency or type question as if they were practicing.

To avoid losing data (mainly from our minority class), we chose to analyse, with unsupervised machine learning methods, where those people fall within our data clusters.

# Dividing Dataset by output variable

In [244]:
# Candidate target columns.
output_variables <- c(
  "sedentary_time_range_during",
  "pa_behavior",
  "PA_practice_during"
)

# Predictors only: every target column removed.
before_dataset <- df[, setdiff(names(df), output_variables)]

# One dataset per target: keep that target and drop the other two.
pa_dataset <- df[
  , setdiff(names(df), setdiff(output_variables, "PA_practice_during"))
]
sedentary_dataset <- df[
  , setdiff(names(df), setdiff(output_variables, "sedentary_time_range_during"))
]

# The pa_behavior dataset also drops PA_practice_before.
pa_behavior_dataset <- df[
  , setdiff(
    names(df),
    c(setdiff(output_variables, "pa_behavior"), "PA_practice_before")
  )
]

