## Importing packages

In [73]:
# Attach `here` for project-relative paths, then source the project's helper
# module and call its package loader.
library(here)
source(here("Modules", "modules.R"))
# install_all_packages()  # one-off install step, intentionally disabled
load_library_packages()

"package 'cluster' was built under R version 4.1.3"


## Importing and Cleaning dataset

In [74]:
# Read Data/during_dataset.xlsx and convert the result to a base data.frame.
df_during <- data.frame(
  read_excel(path = here("Data", "during_dataset.xlsx"))
)

# Distinct column classes found in the raw sheet.
unique(sapply(df_during, class))

# Keep only the rows that have no missing value in any column.
df_during <- df_during[complete.cases(df_during), ]

# Shape and preview of the cleaned data.
dim(df_during)
head(df_during)

Unnamed: 0_level_0,PA_barrier_time_family_responsabilities,PA_barrier__tiredness,PA_barrier_places_closed_dangerous,PA_barrier_money,PA_barrier_time_convenience,PA_barrier_hard_work,PA_barrier_no_interest,PA_barrier_none,PA_easiness_outdoor_activities,PA_easiness_professional_guidance,...,soccer,tennis,volleyball,basketball,swimming,surfing,yoga,pilates,fighting,other_sports
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


### Converting dataset types and labels

In [75]:
# Recode selected answer codes to 0: intensity 4, duration 5, weekly frequency 4.
df_during$PA_intensity <- replace(
  df_during$PA_intensity, df_during$PA_intensity == 4, 0
)
df_during$PA_duration <- replace(
  df_during$PA_duration, df_during$PA_duration == 5, 0
)
df_during$PA_weekly_frequency <- replace(
  df_during$PA_weekly_frequency, df_during$PA_weekly_frequency == 4, 0
)

# Flip the practice flag: an original 0 becomes 1, every other value becomes 0.
df_during$PA_practice_during <- as.numeric(df_during$PA_practice_during == 0)

# Sedentary time range: values below 5 become 0, then the value 5 becomes 1.
# The order matters, and values above 5 (if any) are left untouched.
df_during$sedentary_time_range_during <- replace(
  df_during$sedentary_time_range_during,
  df_during$sedentary_time_range_during < 5,
  0
)
df_during$sedentary_time_range_during <- replace(
  df_during$sedentary_time_range_during,
  df_during$sedentary_time_range_during == 5,
  1
)

# Number of rows whose recoded practice flag is 1.
sum(df_during$PA_practice_during)

In [76]:
# Names of the per-activity indicator columns.
sports <- c(
  "running", "hiking", "cycling", "muscle_training",
  "soccer", "tennis", "volleyball", "basketball",
  "swimming", "surfing", "yoga", "pilates",
  "fighting", "other_sports"
)

# PA_number: row-wise sum over the activity columns listed above.
df_during$PA_number <- rowSums(df_during[sports])

### Separating barriers and facilitators from the dataset

In [77]:
# The first 18 columns hold the PA_barrier_* and PA_easiness_* indicators.
# The selection is positional, so it depends on the column order of the sheet.
df_during_barriers_facilitators <- df_during[, seq_len(18)]

# Shape and preview of the extracted columns.
dim(df_during_barriers_facilitators)
head(df_during_barriers_facilitators)

Unnamed: 0_level_0,PA_barrier_time_family_responsabilities,PA_barrier__tiredness,PA_barrier_places_closed_dangerous,PA_barrier_money,PA_barrier_time_convenience,PA_barrier_hard_work,PA_barrier_no_interest,PA_barrier_none,PA_easiness_outdoor_activities,PA_easiness_professional_guidance,PA_easiness_unguided,PA_easiness_music,PA_easiness_collective,PA_easiness_home_activity,PA_easiness_family_activity,PA_easiness_home_space,PA_easiness_home_equipament,PA_easiness_others
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
3,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0,1,1,0
5,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0
6,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


### Removing inconsistent ("strange") records

In [78]:
# Pass 1: rows with PA_practice_during == 1 where at least one of duration,
# intensity, weekly frequency or number of activities is zero.
differ_p <- with(
  df_during,
  PA_duration == 0 | PA_intensity == 0 |
    PA_weekly_frequency == 0 | PA_number == 0
)
strange_p <- differ_p & df_during$PA_practice_during == 1
table(strange_p)

# Drop the flagged rows from both tables so they stay row-aligned.
df_during_barriers_facilitators <- df_during_barriers_facilitators[!strange_p, ]
df_during <- df_during[!strange_p, ]

# Pass 2 (on the already filtered data): rows with PA_practice_during == 0
# where the four measures are not all zero.
differ_dp <- !with(
  df_during,
  PA_duration == 0 & PA_intensity == 0 &
    PA_weekly_frequency == 0 & PA_number == 0
)
strange_dp <- differ_dp & df_during$PA_practice_during == 0
table(strange_dp)

# Same aligned removal for the second set of flagged rows.
df_during_barriers_facilitators <- df_during_barriers_facilitators[!strange_dp, ]
df_during <- df_during[!strange_dp, ]

strange_p
FALSE  TRUE 
 1244     6 

strange_dp
FALSE  TRUE 
 1218    26 

### Aggregate barriers and clean them


In [79]:
# Remove the "others" facilitator and the "none" barrier columns.
df_during_barriers_facilitators <- df_during_barriers_facilitators[
  !names(df_during_barriers_facilitators) %in%
    c("PA_easiness_others", "PA_barrier_none")
]

# Remaining column names.
names(df_during_barriers_facilitators)

In [80]:

# Merge related indicator pairs into one column each by adding them, so each
# new column counts how many of its two source indicators are set.

# Time barriers: family responsibilities + convenience.
df_during_barriers_facilitators$PA_barrier_time <-
  df_during_barriers_facilitators$PA_barrier_time_family_responsabilities +
  df_during_barriers_facilitators$PA_barrier_time_convenience

# Effort barriers: tiredness + hard work.
df_during_barriers_facilitators$PA_barrier_hard <-
  df_during_barriers_facilitators$PA_barrier__tiredness +
  df_during_barriers_facilitators$PA_barrier_hard_work

# Group facilitators: family activity + collective activity.
df_during_barriers_facilitators$PA_easiness_group <-
  df_during_barriers_facilitators$PA_easiness_family_activity +
  df_during_barriers_facilitators$PA_easiness_collective

# Home facilitators: home space + home equipment.
df_during_barriers_facilitators$PA_easiness_home <-
  df_during_barriers_facilitators$PA_easiness_home_space +
  df_during_barriers_facilitators$PA_easiness_home_equipament


In [81]:
# Drop the eight source indicators that were merged into the aggregated
# columns; every other column keeps its current order.
df_during_barriers_facilitators <- df_during_barriers_facilitators[
  setdiff(
    names(df_during_barriers_facilitators),
    c(
      "PA_barrier_time_family_responsabilities",
      "PA_barrier_time_convenience",
      "PA_barrier__tiredness",
      "PA_barrier_hard_work",
      "PA_easiness_family_activity",
      "PA_easiness_collective",
      "PA_easiness_home_space",
      "PA_easiness_home_equipament"
    )
  )
]

## Kmeans and Silhouette method

In [82]:
# Pairwise squared Euclidean distances between rows.
dis <- dist(df_during_barriers_facilitators)^2

In [105]:
# Model selection: for every RNG seed, fit k-means for k = 2..8 and keep the k
# with the highest mean silhouette width. One (k, sc) row per seed, in seed
# order, is written to ../Data/silhouette.csv.
num_seeds <- 100
k_values <- 2:8

# Silhouette widths are computed on squared Euclidean distances between rows.
dis <- dist(df_during_barriers_facilitators)^2

# Preallocated per-seed results (best k and its mean silhouette width).
best_k <- integer(num_seeds)
best_sc <- numeric(num_seeds)

for (seed in seq_len(num_seeds)) {
  # Mean silhouette width for each candidate k, in the same order as k_values.
  silhouette_scs <- vapply(
    k_values,
    function(k) {
      # Re-seed before every fit so each (seed, k) run is reproducible.
      set.seed(seed)
      km <- kmeans(df_during_barriers_facilitators, centers = k, nstart = 200)
      # Column 3 of the silhouette matrix is the per-observation width.
      mean(silhouette(km$cluster, dis)[, 3])
    },
    numeric(1)
  )

  # which.max returns the first maximum, so an exact tie picks the smallest k.
  best <- which.max(silhouette_scs)
  best_k[seed] <- k_values[best]
  best_sc[seed] <- silhouette_scs[best]
}

write.csv(
  data.frame(k = best_k, sc = best_sc),
  "../Data/silhouette.csv",
  row.names = FALSE
)

In [106]:
# Reload the per-seed results; rows are in seed order, so the row index is
# the seed that produced each (k, sc) pair.
best_silhouettes <- read.csv("../Data/silhouette.csv")
# seq_len() stays empty for a zero-row table, unlike seq.int(0) -> c(1, 0).
best_silhouettes$seed <- seq_len(nrow(best_silhouettes))

# Ten best seeds by mean silhouette width, highest first.
head(
  best_silhouettes[order(best_silhouettes$sc, decreasing = TRUE), ],
  n = 10
)

Unnamed: 0_level_0,k,sc,seed
Unnamed: 0_level_1,<int>,<dbl>,<int>
1,2,0.3339729,1
2,2,0.3339729,2
3,2,0.3339729,3
4,2,0.3339729,4
5,2,0.3339729,5
6,2,0.3339729,6
7,2,0.3339729,7
8,2,0.3339729,8
9,2,0.3339729,9
10,2,0.3339729,10
