## Importing packages

In [50]:
# Attach `here` (project-root-relative paths) and load the project helpers
# defined in Modules/modules.R.
# The previous call passed a stray positional `help` argument plus the default
# values of `pos` and `lib.loc`; `library()` ignores `help` when a package is
# given, so the plain call below attaches the package in exactly the same way.
library(here)
source(here('Modules','modules.R'))
#install_all_packages()
# Project helper from modules.R; presumably attaches the packages used below
# (e.g. the one providing read_excel) -- confirm in Modules/modules.R.
load_library_packages()

"package 'cluster' was built under R version 4.1.3"


## Importing and Cleaning dataset

In [4]:
# Read the "during" Excel file into a base data.frame, list the distinct
# column classes, then keep only the rows that have no missing value.
df_during <- data.frame(
    read_excel(path = here('Data','during_dataset.xlsx'))
)
unique(sapply(df_during, class))
df_during <- df_during[complete.cases(df_during), ]

# Size and first rows of the cleaned data.
dim(df_during)
head(df_during, n = 6)

Unnamed: 0_level_0,PA_barrier_time_family_responsabilities,PA_barrier__tiredness,PA_barrier_places_closed_dangerous,PA_barrier_money,PA_barrier_time_convenience,PA_barrier_hard_work,PA_barrier_no_interest,PA_barrier_none,PA_easiness_outdoor_activities,PA_easiness_professional_guidance,...,soccer,tennis,volleyball,basketball,swimming,surfing,yoga,pilates,fighting,other_sports
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


### Converting dataset types and labels

In [5]:
# Recode the physical-activity (PA) variables in place.
# For intensity, duration and weekly frequency one specific answer code is
# mapped to 0 (presumably the "none" option -- confirm against the codebook).
df_during$PA_intensity <- replace(
    df_during$PA_intensity, df_during$PA_intensity == 4, 0
)
df_during$PA_duration <- replace(
    df_during$PA_duration, df_during$PA_duration == 5, 0
)
df_during$PA_weekly_frequency <- replace(
    df_during$PA_weekly_frequency, df_during$PA_weekly_frequency == 4, 0
)

# Invert the practice flag: 0 becomes 1 and every other value becomes 0.
df_during$PA_practice_during <- as.numeric(df_during$PA_practice_during == 0)

# Sedentary time: values below 5 become 0, then the value 5 becomes 1.
# The order matters, the second step must not see the freshly written zeros
# as anything but zeros.
df_during$sedentary_time_range_during <- replace(
    df_during$sedentary_time_range_during,
    df_during$sedentary_time_range_during < 5,
    0
)
df_during$sedentary_time_range_during <- replace(
    df_during$sedentary_time_range_during,
    df_during$sedentary_time_range_during == 5,
    1
)

# Total of the recoded practice flag.
sum(df_during$PA_practice_during)

In [6]:
# Names of the per-activity columns.
sports <- c(
    "running", "hiking", "cycling", "muscle_training",
    "soccer", "tennis", "volleyball", "basketball",
    "swimming", "surfing", "yoga", "pilates",
    "fighting", "other_sports"
)

# PA_number: row-wise sum over the activity columns.
df_during$PA_number <- rowSums(df_during[sports])

### Separating barriers and facilitators from the dataset

In [7]:
# Quick look at the current column names and the first rows.
names(df_during)
head(df_during, n = 6)

Unnamed: 0_level_0,PA_barrier_time_family_responsabilities,PA_barrier__tiredness,PA_barrier_places_closed_dangerous,PA_barrier_money,PA_barrier_time_convenience,PA_barrier_hard_work,PA_barrier_no_interest,PA_barrier_none,PA_easiness_outdoor_activities,PA_easiness_professional_guidance,...,tennis,volleyball,basketball,swimming,surfing,yoga,pilates,fighting,other_sports,PA_number
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,2
4,0,0,1,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,3
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
6,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,2


In [8]:
# The first 18 columns hold the barrier / facilitator items: store them as
# factors and copy them into a dedicated data frame.
# NOTE(review): the positional range 1:18 relies on the column order of the
# input file.
df_during[seq_len(18)] <- lapply(df_during[seq_len(18)], as.factor)
df_during_barriers_facilitators <- df_during[seq_len(18)]

dim(df_during_barriers_facilitators)
head(df_during_barriers_facilitators, n = 6)

Unnamed: 0_level_0,PA_barrier_time_family_responsabilities,PA_barrier__tiredness,PA_barrier_places_closed_dangerous,PA_barrier_money,PA_barrier_time_convenience,PA_barrier_hard_work,PA_barrier_no_interest,PA_barrier_none,PA_easiness_outdoor_activities,PA_easiness_professional_guidance,PA_easiness_unguided,PA_easiness_music,PA_easiness_collective,PA_easiness_home_activity,PA_easiness_family_activity,PA_easiness_home_space,PA_easiness_home_equipament,PA_easiness_others
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
3,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0,1,1,0
5,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0
6,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


### Removing strange (inconsistent) responses

In [9]:
# Step 1: rows flagged as practising (PA_practice_during == 1) that still
# report a zero duration, intensity, weekly frequency or activity count.
differ_p <- with(
    df_during,
    PA_duration == 0 | PA_intensity == 0 |
        PA_weekly_frequency == 0 | PA_number == 0
)
strange_p <- differ_p & df_during$PA_practice_during == 1

# Drop those rows from both data frames (they share the same row order).
df_during_barriers_facilitators <- df_during_barriers_facilitators[!strange_p, ]
df_during <- df_during[!strange_p, ]
table(strange_p)

# Step 2 (on the already filtered data): rows flagged as not practising
# (PA_practice_during == 0) where at least one of the four measures is
# non-zero, i.e. not all of them are zero.
differ_dp <- with(
    df_during,
    !(PA_duration == 0 & PA_intensity == 0 &
        PA_weekly_frequency == 0 & PA_number == 0)
)
strange_dp <- differ_dp & df_during$PA_practice_during == 0

table(strange_dp)
df_during_barriers_facilitators <- df_during_barriers_facilitators[!strange_dp, ]
df_during <- df_during[!strange_dp, ]

strange_p
FALSE  TRUE 
 1244     6 

strange_dp
FALSE  TRUE 
 1218    26 

### Aggregate barriers and clean them


In [17]:
# Remove the "others" facilitator and the "none" barrier columns, keeping all
# remaining columns in their current order.
df_during_barriers_facilitators <- df_during_barriers_facilitators[
    setdiff(
        names(df_during_barriers_facilitators),
        c("PA_easiness_others", "PA_barrier_none")
    )
]
names(df_during_barriers_facilitators)

In [39]:

# Numeric 0/1 view of every item that is about to be merged: 0 when the item
# equals the label '0', 1 otherwise.

# Time-related barriers.
time_barrier_1 <- as.numeric(
    df_during_barriers_facilitators$PA_barrier_time_family_responsabilities != '0'
)
time_barrier_2 <- as.numeric(
    df_during_barriers_facilitators$PA_barrier_time_convenience != '0'
)

# Effort-related barriers.
hard_barrier_1 <- as.numeric(
    df_during_barriers_facilitators$PA_barrier__tiredness != '0'
)
hard_barrier_2 <- as.numeric(
    df_during_barriers_facilitators$PA_barrier_hard_work != '0'
)

# Group-related facilitators.
group_easiness_1 <- as.numeric(
    df_during_barriers_facilitators$PA_easiness_family_activity != '0'
)
group_easiness_2 <- as.numeric(
    df_during_barriers_facilitators$PA_easiness_collective != '0'
)

# Home-related facilitators.
home_easiness_1 <- as.numeric(
    df_during_barriers_facilitators$PA_easiness_home_space != '0'
)
home_easiness_2 <- as.numeric(
    df_during_barriers_facilitators$PA_easiness_home_equipament != '0'
)

# Each aggregated column is TRUE when at least one item of its pair is set.
# Note that `|` yields logical columns, unlike the factor columns around them.
df_during_barriers_facilitators$PA_barrier_time <- time_barrier_1 | time_barrier_2
df_during_barriers_facilitators$PA_barrier_hard <- hard_barrier_1 | hard_barrier_2
df_during_barriers_facilitators$PA_easiness_group <- group_easiness_1 | group_easiness_2
df_during_barriers_facilitators$PA_easiness_home <- home_easiness_1 | home_easiness_2


In [40]:
# Drop the eight item-level columns that feed the aggregated time / hard /
# group / home columns; all other columns keep their current order.
df_during_barriers_facilitators <- df_during_barriers_facilitators[
    setdiff(
        names(df_during_barriers_facilitators),
        c(
            "PA_barrier_time_family_responsabilities",
            "PA_barrier_time_convenience",
            "PA_barrier__tiredness",
            "PA_barrier_hard_work",
            "PA_easiness_family_activity",
            "PA_easiness_collective",
            "PA_easiness_home_space",
            "PA_easiness_home_equipament"
        )
    )
]

In [43]:
# Columns left in the barrier / facilitator frame.
colnames(df_during_barriers_facilitators)

## Silhouette method

In [44]:
# Pairwise dissimilarity between rows = number of columns on which two rows
# disagree (column count minus the number of agreements).
#
# data.matrix() converts factor columns to their integer codes and logical
# columns to 0/1; after subtracting 1 a column holds either {0, 1} or {-1, 0}.
# In both cases exactly one of the two values is zero, so for a pair of rows
# the product x * y is 1 only when both share the non-zero value, and
# (!x) * (!y) is 1 only when both share the zero value.
# NOTE(review): this assumes every column is binary -- confirm if new columns
# are added.
df_matrix <- data.matrix(df_during_barriers_facilitators) - 1

# tcrossprod(m) is m %*% t(m): agreements on the non-zero value plus
# agreements on the zero value.
similarity_matrix <- tcrossprod(df_matrix) + tcrossprod((!df_matrix) + 0)
dissimilarity_matrix <- ncol(df_matrix) - similarity_matrix
head(dissimilarity_matrix, n = 6)

Unnamed: 0,2,3,4,5,6,8,9,11,12,14,...,1246,1247,1248,1249,1250,1251,1252,1253,1254,1255
2,0,4,2,3,2,5,4,6,2,4,...,2,4,2,6,3,0,5,3,4,1
3,4,0,6,5,6,7,6,6,2,6,...,4,4,6,8,3,4,5,3,4,3
4,2,6,0,3,4,5,4,6,4,4,...,4,4,2,4,5,2,5,5,6,3
5,3,5,3,0,5,4,5,5,3,3,...,3,5,3,3,4,3,2,6,5,4
6,2,6,4,5,0,7,6,6,4,6,...,2,4,2,6,3,2,5,5,4,3
8,5,7,5,4,7,0,7,5,5,3,...,5,5,7,3,6,5,6,6,5,6


Generating Ks samples

In [48]:
# For every seed, run silhouette_values() (project helper) for each candidate
# number of clusters and record the k with the highest score. The first list
# element of the helper's result is used as the score, as before.
#
# Changes compared with the previous version of this cell:
#   * results are stored in preallocated vectors in seed order instead of
#     being prepended to growing vectors and reversed before writing;
#   * the best k is looked up in `candidate_ks` instead of being recovered via
#     the constant `9 - match(...)`, which only worked for the range 2:8;
#   * the unused `seed_k` placeholder was removed.
# The CSV written at the end has the same content as before.
num_seeds <- 100
candidate_ks <- 2:8

# max_indexs[i] / max_values[i] hold the best k and its score for seed i.
max_indexs <- integer(num_seeds)
max_values <- numeric(num_seeds)

for (seed in seq_len(num_seeds)) {
    # Score of each candidate k for this seed, in the order of candidate_ks.
    silhouette_scs <- numeric(length(candidate_ks))
    for (i in seq_along(candidate_ks)) {
        silhouette_scs[i] <- silhouette_values(
            num_clusters = candidate_ks[i],
            df = df_during,
            diss_matrix = dissimilarity_matrix,
            iters = 200,
            s = seed
        )[[1]]
    }

    mvalue <- max(silhouette_scs)
    # Search from the largest k downwards so that, on a tie, the largest k
    # wins -- the same tie-breaking as the previous implementation.
    best_pos <- match(mvalue, rev(silhouette_scs))

    max_values[seed] <- mvalue
    max_indexs[seed] <- rev(candidate_ks)[best_pos]
}

# NOTE(review): this path is relative to the working directory, whereas the
# inputs above are located with here(); confirm both point at the same folder.
write.csv(
    data.frame(k = max_indexs, sc = max_values),
    "../Data/silhouette.csv",
    row.names = FALSE
)

In [51]:
# Reload the per-seed results; row i of the file belongs to seed i, so the
# row position is stored as the seed.
best_silhouettes <- read.csv('../Data/silhouette.csv')
best_silhouettes[["seed"]] <- seq.int(nrow(best_silhouettes))

# Ten rows with the highest silhouette score.
head(
    best_silhouettes[with(best_silhouettes, order(sc, decreasing = TRUE)), ],
    n = 10
)

Unnamed: 0_level_0,k,sc,seed
Unnamed: 0_level_1,<int>,<dbl>,<int>
68,3,0.1843959,68
46,2,0.1771349,46
62,4,0.1621042,62
73,3,0.1615957,73
32,2,0.1610452,32
33,2,0.159645,33
21,2,0.1553791,21
84,2,0.1507815,84
69,2,0.1491713,69
65,4,0.1489659,65


In [None]:
# silhoutte_result <-  silhouette_values(num_clusters=3, df=df_during, diss_matrix=dissimilarity_matrix, iters=200, s=68)
# coef_data <- data.frame(coef=silhoutte_result[[2]], clust=silhoutte_result[[3]]$cluster)
# coef_data <- coef_data[order(coef_data$coef, decreasing=TRUE), ]
# coef_data <- coef_data[order(coef_data$clust, decreasing=TRUE), ]
# coef_data$clust <- as.factor(coef_data$clust )
# coef_data$ID <- seq.int(nrow(coef_data))

# ggplot(data = coef_data, aes(y=coef, x=ID, fill=clust)) + 
#   geom_bar(stat = "identity") + 
#   geom_hline(yintercept=mean(coef_data$coef), linetype="dashed", color = "red") + 
#   coord_flip() + 
#   scale_x_reverse() + 
#   theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

In [None]:
# silhoutte_result <-  silhouette_values(num_clusters=4, df=df_during, diss_matrix=dissimilarity_matrix, iters=200, s=62)
# coef_data <- data.frame(coef=silhoutte_result[[2]], clust=silhoutte_result[[3]]$cluster)
# coef_data <- coef_data[order(coef_data$coef, decreasing=TRUE), ]
# coef_data <- coef_data[order(coef_data$clust, decreasing=TRUE), ]
# coef_data$clust <- as.factor(coef_data$clust )
# coef_data$ID <- seq.int(nrow(coef_data))

# ggplot(data = coef_data, aes(y=coef, x=ID, fill=clust)) + 
#   geom_bar(stat = "identity") + 
#   geom_hline(yintercept=mean(coef_data$coef), linetype="dashed", color = "red") + 
#   coord_flip() + 
#   scale_x_reverse() 

In [None]:
# silhoutte_result <-  silhouette_values(num_clusters=3, df=df_during, diss_matrix=dissimilarity_matrix, iters=200, s=13)
# coef_data <- data.frame(coef=silhoutte_result[[2]], clust=silhoutte_result[[3]]$cluster)
# coef_data <- coef_data[order(coef_data$coef, decreasing=TRUE), ]
# coef_data <- coef_data[order(coef_data$clust, decreasing=TRUE), ]
# coef_data$clust <- as.factor(coef_data$clust )
# coef_data$ID <- seq.int(nrow(coef_data))

# ggplot(data = coef_data, aes(y=coef, x=ID, fill=clust)) + 
#   geom_bar(stat = "identity") + 
#   geom_hline(yintercept=mean(coef_data$coef), linetype="dashed", color = "red") + 
#   coord_flip() + 
#   scale_x_reverse() 