In [None]:
# IRkernel::installspec(user = FALSE)

In [None]:
library(RColorBrewer)
library(readxl)
library(ggplot2)
library(dplyr)

In [None]:
# Show the RColorBrewer palettes, restricted to the colour-blind-friendly ones.
display.brewer.all(colorblindFriendly = TRUE)

In [None]:
# Shared colour palette for the figures below: six hand-picked hex colours,
# passed to scale_color_manual() / scale_fill_manual().
# The Dark2-based definition used to be assigned and then overwritten on the
# very next line (a dead store); it is kept here only as a commented-out
# alternative.
# display.brewer.pal(n = 8, name = 'Dark2')
# color_map_use <- c("#2A363B", RColorBrewer::brewer.pal(8, "Dark2"))
color_map_use <- c(
  "#2A363B", "#355C7D", "#99B898", "#E84A5F", "#FF847C", "#FECEAB"
)

### Common: read data from Excel

In [None]:
# Load the "from_43M" sheet of the experiment workbook.
df_original <- read_excel(
  path = "/home/jinhyun/codes/geography_prediction/results/Geography prediction experiments.xlsx",
  sheet = "from_43M"
)

In [None]:
# List the column names of the freshly loaded sheet.
names(df_original)

In [None]:
# Working table derived from df_original:
#   plot_unit     - interaction of seed, pre-selection settings, selection
#                   method and model; used as the ggplot `group` for per-run
#                   lines
#   select_method - converted to a factor with an explicit level order
df <- mutate(
  df_original,
  plot_unit = interaction(
    random_seed, pre_select_method, n_pre_select, n_pre_select_goal,
    select_method, train_model
  ),
  select_method = factor(
    select_method,
    levels = c("random", "xgb", "rf", "variance", "chi2", "f_classif", "fst", "af")
  )
)

### Figure 2. Accuracy by n_select and selection method

In [None]:
# Figure 2 (per-run view): test-set accuracy against the number of selected
# features, one line per plot_unit, coloured by selection method.
# Rows kept: train_model "SVM", pre_select_method "variance",
# n_pre_select == 1000000, select_n == n_dim_reduced, and every
# select_method except "fst" and "af".
df_for_plot <- df %>%
  filter(
    train_model == "SVM",
    pre_select_method == "variance", # variance, random
    n_pre_select == 1000000,
    # n_dim_reduced == 1024,
    !(select_method %in% c("fst", "af")),
    select_n == n_dim_reduced
  )

p <- ggplot(
  data = df_for_plot,
  aes(x = select_n, y = testset_accuracy, group = plot_unit, colour = select_method)
) +
  geom_line(linewidth = 1) +
  xlab("# of features") +
  # testset_accuracy is plotted unscaled, and the summary version of this
  # figure shows the same column on a 0.2-1 axis, so the former "(%)" suffix
  # mislabelled the unit.
  ylab("Testset Accuracy") +
  scale_color_manual(values = color_map_use) +
  scale_x_continuous(trans = "log2") +
  # scale_y_continuous(limits = c(0, 1)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(p)

In [None]:
# Figure 2: mean of the target metric (+/- SEM) per selection method and
# number of selected features; written to
# results/fig2_accuracy_by_select_n.pdf.

# Per (pre-selection settings, select_n, select_method): SD, SEM, mean and
# row count of the target metric. `.groups = "drop"` returns an ungrouped
# table and avoids the implicit-regrouping message.
df_for_plot_summary <- df_for_plot %>%
  mutate(target_val = testset_accuracy) %>%
  # mutate(target_val = testset_f1_macro) %>%
  group_by(pre_select_method, n_pre_select, n_pre_select_goal, select_n, select_method) %>%
  summarise(
    sd = sd(target_val),
    sem = sd / sqrt(n()),
    # the mean must come last: it overwrites target_val inside summarise()
    target_val = mean(target_val),
    n = n(),
    .groups = "drop"
  )
df_for_plot_summary %>% head()

# The summary table is the plot-level data because it is the only table that
# actually has the `target_val` column mapped below; previously the plot-level
# data was df_for_plot (no such column) and it only worked because every layer
# overrode `data`.
p <- ggplot(
  data = df_for_plot_summary,
  aes(x = select_n, y = target_val, colour = select_method)
) +
  geom_line(aes(group = select_method)) +
  geom_errorbar(aes(ymin = target_val - sem, ymax = target_val + sem), width = 0.2) +
  ggtitle("With pre-selection (variance)") +
  xlab("# of features selected") +
  ylab("Accuracy") +
  # ylab("Macro F1 score") +
  scale_color_manual(values = color_map_use) +
  scale_x_continuous(trans = "log2", breaks = c(100, 1000, 10000, 100000)) +
  scale_y_continuous(limits = c(0.2, 1)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(p)

# Pass the plot explicitly instead of relying on last_plot().
ggsave(
  "results/fig2_accuracy_by_select_n.pdf",
  plot = p, width = 12, height = 9, units = "cm", dpi = 300
)

In [None]:
# Table: SD, SEM, mean and row count of testset_accuracy per selection method
# (SVM, variance pre-selection, n_pre_select == 1000000, select_n <= 8192),
# sorted from highest to lowest mean accuracy.
df %>%
  filter(
    train_model == "SVM",
    pre_select_method == "variance",
    n_pre_select == 1000000,
    # n_dim_reduced == 1024,
    select_n == n_dim_reduced,
    select_n <= 8192
  ) %>%
  group_by(pre_select_method, n_pre_select, n_pre_select_goal, select_method) %>%
  summarise(
    sd = sd(testset_accuracy),
    sem = sd / sqrt(n()),
    testset_accuracy = mean(testset_accuracy),
    n = n()
  ) %>%
  arrange(desc(testset_accuracy))


In [None]:
# Table: SD, SEM, mean and row count of testset_accuracy per pre-selection
# method and selection method (SVM, n_pre_select == 1000000, select_n == 128),
# sorted from highest to lowest mean accuracy.
df %>%
  filter(
    train_model == "SVM",
    # pre_select_method == "variance",
    n_pre_select == 1000000,
    # n_dim_reduced == 1024,
    select_n == n_dim_reduced,
    select_n == 128
  ) %>%
  group_by(pre_select_method, n_pre_select, n_pre_select_goal, select_n, select_method) %>%
  summarise(
    sd = sd(testset_accuracy),
    sem = sd / sqrt(n()),
    testset_accuracy = mean(testset_accuracy),
    n = n()
  ) %>%
  arrange(desc(testset_accuracy))


# Figure 2 bar plot

In [None]:
# Figure 2 bar plot: mean testset_accuracy (+/- SEM) per selection method,
# with dodged bars coloured by pre-selection method; written to
# results/fig2_barplot.pdf.
df_for_plot <- df %>%
  filter(
    train_model == "SVM",
    pre_select_method %in% c("random", "variance", "fst", "af"),
    n_pre_select == 1000000,
    select_n == n_dim_reduced,
    select_n <= 8192,
    !(select_method %in% c("fst", "af"))
  ) %>%
  mutate(
    pre_select_method = factor(
      pre_select_method,
      levels = c("random", "variance", "fst", "af")
    )
  )

# One row per (pre-selection settings, selection method).
df_for_plot_summary <- df_for_plot %>%
  group_by(pre_select_method, n_pre_select, n_pre_select_goal, select_method) %>%
  summarise(
    sd = sd(testset_accuracy),
    sem = sd / sqrt(n()),
    testset_accuracy = mean(testset_accuracy),
    n = n()
  )

head(df_for_plot_summary, n = 20)

p <- ggplot(
  data = df_for_plot,
  aes(
    x = select_method, y = testset_accuracy,
    colour = pre_select_method, fill = pre_select_method
  )
) +
  geom_col(
    data = df_for_plot_summary,
    position = position_dodge(0.8), width = 0.65
  ) +
  # geom_jitter(position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8), colour = "black") +
  geom_errorbar(
    aes(ymin = testset_accuracy - sem, ymax = testset_accuracy + sem),
    data = df_for_plot_summary,
    width = 0.2, position = position_dodge(0.8), colour = "black"
  ) +
  # ggtitle("After pre-selection") +
  labs(x = "Feature selection method", y = "Accuracy") +
  scale_color_manual(values = color_map_use) +
  scale_fill_manual(values = color_map_use) +
  # scale_x_continuous(trans='log2', breaks = c(100, 1000, 10000, 100000)) +
  scale_y_continuous(limits = c(0, 1)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(p)

ggsave("results/fig2_barplot.pdf", plot = p, width = 14, height = 9, units = "cm", dpi = 300)

# Fig2. By n_pre_select

In [None]:
# Figure 2, by n_pre_select: testset_accuracy against the number of
# pre-selected features at select_n == 128, one panel per pre-selection
# method. Raw rows are drawn as jittered points; group means are drawn as
# lines with SEM error bars.
df_for_plot <- df %>%
  filter(
    train_model == "SVM",
    pre_select_method %in% c("random", "variance", "fst", "af"),
    select_n == n_dim_reduced,
    select_n == 128
  ) %>%
  mutate(
    pre_select_method = factor(
      pre_select_method,
      levels = c("random", "variance", "fst", "af")
    )
  )

# `.groups = "drop"` returns an ungrouped table and avoids the
# implicit-regrouping message.
df_for_plot_summary <- df_for_plot %>%
  group_by(pre_select_method, n_pre_select, n_pre_select_goal, select_n, select_method) %>%
  summarise(
    sd = sd(testset_accuracy),
    sem = sd / sqrt(n()),
    testset_accuracy = mean(testset_accuracy),
    n = n(),
    .groups = "drop"
  )

df_for_plot_summary %>% tail(10)

p <- ggplot(
  data = df_for_plot,
  aes(x = n_pre_select, y = testset_accuracy, colour = select_method)
) +
  geom_jitter(position = position_jitter(0.2)) +
  geom_line(aes(group = select_method), data = df_for_plot_summary) +
  geom_errorbar(
    aes(ymin = testset_accuracy - sem, ymax = testset_accuracy + sem),
    data = df_for_plot_summary, width = 0.2
  ) +
  # NOTE(review): this title looks copied from another figure; the panels are
  # faceted by pre-selection method, so "Without pre-selection" is probably
  # not what is meant. Confirm the intended wording.
  ggtitle("Without pre-selection") +
  # The x axis is n_pre_select (select_n is fixed at 128 above), so the old
  # label "# of features selected" described the wrong variable.
  xlab("# of pre-selected features") +
  ylab("Accuracy") +
  scale_color_manual(values = color_map_use) +
  scale_x_continuous(trans = "log2") + # breaks = c(100, 1000, 10000, 100000, 1e6, 1e7, 1e8)
  scale_y_continuous(limits = c(0.2, 1)) +
  facet_grid(cols = vars(pre_select_method)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(p)


# ggsave("results/fig2_barplot.pdf", width=14, height=9, units = c("cm"), dpi = 300)

# resource tables

In [None]:
# Resource table: the select* and train_* columns for SVM runs with seed 42
# at select_n 8192 and 131072, ordered by select_n and then selection method.
df %>%
  filter(
    train_model == "SVM",
    select_n %in% c(8192, 131072),
    n_dim_reduced == select_n,
    random_seed == 42
  ) %>%
  select(
    random_seed,
    select_method,
    starts_with("select"),
    starts_with("train_")
  ) %>%
  arrange(select_n, select_method)

# hybrid feature selection

In [None]:
# Load the "results_hyb" sheet and tag every row with a plot_unit factor
# (interaction of n_original, pre_selection, random_seed and select_method).
df_hyb <- read_excel(
  path = "/home/jinhyun/codes/geography_prediction/results/Geography prediction experiments.xlsx",
  sheet = "results_hyb"
)
df_hyb <- mutate(
  df_hyb,
  plot_unit = interaction(n_original, pre_selection, random_seed, select_method)
  # select_method = factor(paste(n_original, select_method))
)

In [None]:
# Hybrid-selection rows used by the plots below: select_n == 256, excluding
# pre_selection == 4000000.
# select_cpu_time is divided by 60 here.
# NOTE(review): the CPU-time axes below are labelled "hours", which assumes
# the raw column is in minutes - confirm.
df_for_plot <- df_hyb %>%
  filter(select_n == 256) %>%
  # filter(select_method == "xgb") %>%
  filter(pre_selection != 4000000) %>%
  mutate(select_cpu_time = select_cpu_time / 60)


In [None]:
# Hybrid selection: testset_accuracy against pre_selection. Raw rows are drawn
# as jittered points; per-group means are drawn as lines with SEM error bars.
df_for_plot_summary <- df_for_plot %>%
  group_by(pre_selection, select_n, select_method) %>%
  summarise(
    sd = sd(testset_accuracy),
    sem = sd / sqrt(n()),
    testset_accuracy = mean(testset_accuracy),
    n = n()
  )

p <- ggplot(
  data = df_for_plot,
  aes(x = pre_selection, y = testset_accuracy, colour = select_method)
) +
  geom_jitter(position = position_jitter(0.2)) +
  geom_line(aes(group = select_method), data = df_for_plot_summary) +
  geom_errorbar(
    aes(ymin = testset_accuracy - sem, ymax = testset_accuracy + sem),
    data = df_for_plot_summary, width = 0.2
  ) +
  labs(x = "# of pre-selection features ", y = "Accuracy") +
  scale_color_manual(values = color_map_use) +
  scale_x_continuous(trans = "log2", breaks = c(100, 1000, 10000, 100000, 1e6)) +
  # scale_y_continuous(limits = c(0, 1)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(p)

In [None]:
# Hybrid selection: mean select_cpu_time (+/- SEM) against pre_selection;
# written to results/fig2_CPU_time.pdf.
# NOTE(review): select_cpu_time was already divided by 60 when df_for_plot was
# built; the "hours" label assumes the raw unit is minutes - confirm.
df_for_plot_summary <- df_for_plot %>%
  group_by(pre_selection, select_n, select_method) %>%
  summarise(
    sd = sd(select_cpu_time),
    sem = sd / sqrt(n()),
    select_cpu_time = mean(select_cpu_time),
    n = n()
  )
head(df_for_plot_summary)

p <- ggplot(
  data = df_for_plot,
  aes(x = pre_selection, y = select_cpu_time, colour = select_method)
) +
  # geom_jitter(position = position_jitter(0.2)) +
  geom_line(aes(group = select_method), data = df_for_plot_summary) +
  geom_errorbar(
    aes(ymin = select_cpu_time - sem, ymax = select_cpu_time + sem),
    data = df_for_plot_summary, width = 0.2
  ) +
  labs(x = "# of features ", y = "CPU Time for feature selection (hours)") +
  scale_color_manual(values = color_map_use) +
  # scale_x_continuous(trans='log2', breaks = c(100, 1000, 10000, 100000, 1e6)) +
  # scale_y_continuous(limits = c(0, 1)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(p)

ggsave("results/fig2_CPU_time.pdf", plot = p, width = 12, height = 9, units = "cm", dpi = 300)

In [None]:
# Hybrid selection: mean select_memory_usage (+/- SEM) against pre_selection;
# written to results/fig2_mem.pdf.
df_for_plot_summary <- df_for_plot %>%
  group_by(pre_selection, select_n, select_method) %>%
  summarise(
    sd = sd(select_memory_usage),
    sem = sd / sqrt(n()),
    select_memory_usage = mean(select_memory_usage),
    n = n()
  )
head(df_for_plot_summary)

p <- ggplot(
  data = df_for_plot,
  aes(x = pre_selection, y = select_memory_usage, colour = select_method)
) +
  # geom_jitter(position = position_jitter(0.2)) +
  geom_line(aes(group = select_method), data = df_for_plot_summary) +
  geom_errorbar(
    aes(ymin = select_memory_usage - sem, ymax = select_memory_usage + sem),
    data = df_for_plot_summary, width = 0.2
  ) +
  labs(x = "# of features ", y = "Peak memory for feature selection (GB)") +
  scale_color_manual(values = color_map_use) +
  # scale_x_continuous(trans='log2', breaks = c(100, 1000, 10000, 100000, 1e6)) +
  # scale_y_continuous(limits = c(0, 1)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(p)

ggsave("results/fig2_mem.pdf", plot = p, width = 12, height = 9, units = "cm", dpi = 300)

In [None]:

# Scatter of select_memory_usage against select_cpu_time for the "xgb"
# selection method, coloured by pre_selection (as a factor); written to
# results/fig2_resources.pdf.
p <- ggplot(
  data = filter(df_for_plot, select_method == "xgb"),
  aes(x = select_cpu_time, y = select_memory_usage, colour = factor(pre_selection))
) +
  # geom_jitter(position = position_jitter(0.2)) +
  geom_point() +
  # geom_line(aes(group = select_method), data = df_for_plot_summary) +
  # geom_errorbar(aes(ymin = select_memory_usage-sem, ymax = select_memory_usage+sem), data = df_for_plot_summary, width = 0.2) +
  labs(
    title = "Resources for features selection",
    x = "CPU time (hours)",
    y = "Peak memory (GB)"
  ) +
  scale_color_manual(values = color_map_use) +
  # scale_x_continuous(trans='log2', breaks = c(100, 1000, 10000, 100000, 1e6)) +
  # scale_y_continuous(limits = c(0, 1)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(p)
ggsave("results/fig2_resources.pdf", plot = p, width = 12, height = 9, units = "cm", dpi = 300)

In [None]:
# SD, SEM, mean and row count of select_cpu_time for the "xgb" selection
# method, per (pre_selection, select_n, select_method); printed as a table.
df_for_plot_summary <- df_for_plot %>%
  filter(select_method == "xgb") %>%
  group_by(pre_selection, select_n, select_method) %>%
  summarise(
    sd = sd(select_cpu_time),
    sem = sd / sqrt(n()),
    select_cpu_time = mean(select_cpu_time),
    n = n()
  )
df_for_plot_summary
# lr_model = lm(pre_selection ~ select_memory_usage, data = df_for_plot_summary)
# summary(lr_model)
# plot(df_for_plot_summary$pre_selection, df_for_plot_summary$select_memory_usage, pch = 16, col = "blue") #Plot the results
# abline(lr_model)
# predict(lr_model)

# Fig 3a

In [None]:
# Load the "from_43M_cls" sheet of the experiment workbook.
df_fig4 <- read_excel(
  path = "/home/jinhyun/codes/geography_prediction/results/Geography prediction experiments.xlsx",
  sheet = "from_43M_cls"
)

In [None]:
# List the column names of the per-class results sheet.
colnames(df_fig4)

In [None]:
# For every (random_seed, class_target) pair, find the smallest select_n whose
# testset_f1_macro lies within 0.02 of that pair's maximum testset_f1_macro,
# then write the table to results/minimum_n_by_cls.csv.
df_minimum_n_by_cls <- df_fig4 %>%
  group_by(random_seed, class_target) %>%
  filter(testset_f1_macro > max(testset_f1_macro) - 0.02) %>%
  summarise(select_n = min(select_n), .groups = "drop")

# write.table(..., sep = ",") with its default row.names = TRUE emits a header
# that is one field shorter than the data rows, which is not a well-formed CSV.
# write.csv() without row names keeps header and rows aligned.
write.csv(df_minimum_n_by_cls, "results/minimum_n_by_cls.csv", row.names = FALSE)

# Supplementary figures

## By encoding

In [None]:
# Load the "encoding_compare" sheet (replaces the earlier df_original).
df_original <- read_excel(
  path = "/home/jinhyun/codes/geography_prediction/results/Geography prediction experiments.xlsx",
  sheet = "encoding_compare"
)

In [None]:
# List the column names of the encoding-comparison sheet.
df_original %>% colnames

In [None]:
# Number of distinct run configurations in the sheet (one per combination of
# the seven key columns).
df_original %>%
  distinct(encoding, kernel, random_seed, select_method, select_n, n_dim_reduced, train_model) %>%
  nrow()

# Keep exactly one row per run configuration: the row with the highest
# valset_accuracy, with ties broken by trainset_accuracy, then
# testset_accuracy, then original row order.
# The result is ungrouped so that later mutate()/filter() calls operate on the
# whole table instead of once per single-row group.
df <- df_original %>%
  group_by(encoding, kernel, random_seed, select_method, select_n, n_dim_reduced, train_model) %>%
  filter(valset_accuracy == max(valset_accuracy)) %>%
  filter(trainset_accuracy == max(trainset_accuracy)) %>%
  filter(testset_accuracy == max(testset_accuracy)) %>%
  slice(1) %>%
  ungroup()
df %>% nrow()

In [None]:
# Row count of df_for_plot.
# NOTE(review): when the notebook is run top to bottom, df_for_plot here is
# still the table built from df_hyb earlier; the encoding-comparison version
# is only assigned in the next cell.
df_for_plot %>% nrow

In [None]:
# Supplementary figure: testset_accuracy against the number of features for
# SVM with random selection; colour encodes the kernel and line type encodes
# the encoding. The boolean + sigmoid combination is excluded. Written to
# results/fig_supp_encoding_and_kernel.pdf.
df_for_plot <- df %>%
  filter(
    train_model == "SVM",
    select_method == "random",
    select_n == n_dim_reduced,
    !(encoding == "boolean" & kernel == "sigmoid")
  ) %>%
  mutate(
    plot_unit = interaction(encoding, kernel, random_seed, select_method, train_model)
  )

p <- ggplot(
  data = df_for_plot,
  aes(x = select_n, y = testset_accuracy, group = plot_unit, colour = kernel)
) +
  geom_point(size = 2) +
  geom_line(aes(linetype = encoding), linewidth = 1) +
  labs(x = "# of features", y = "Testset Accuracy") +
  scale_color_manual(values = color_map_use[c(2, 3, 4)]) +
  scale_x_continuous(trans = "log2") +
  # scale_y_continuous(limits = c(0, 1)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(p)

ggsave("results/fig_supp_encoding_and_kernel.pdf", plot = p, width = 11, height = 9, units = "cm", dpi = 300)

## By classifier

In [None]:
# Supplementary figure: testset_accuracy per classifier at select_n 8192 and
# 131072 (encoding "0-3", random selection, kernels other than rbf/sigmoid);
# written to results/fig_supp_by_classifier.pdf.
df_for_plot <- df %>%
  filter(
    encoding == "0-3",
    !(kernel %in% c("rbf", "sigmoid")),
    select_method == "random",
    select_n == n_dim_reduced,
    select_n %in% c(8192, 131072)
  ) %>%
  mutate(
    plot_unit = interaction(random_seed, select_method, train_model),
    train_model = factor(train_model, levels = c("SVM", "XGB", "RF"))
  )

# NOTE(review): geom_col() draws one bar per row of df_for_plot. If several
# rows (e.g. seeds) share the same select_n and train_model, their bars are
# drawn on top of each other rather than averaged - confirm this is intended.
p <- ggplot(
  data = df_for_plot,
  aes(
    x = factor(select_n), y = testset_accuracy,
    colour = train_model, fill = train_model
  )
) +
  geom_col(position = position_dodge(0.8), width = 0.65) +
  # geom_line(linewidth = 1) +
  labs(x = "# of features", y = "Testset Accuracy") +
  scale_color_manual(values = color_map_use[c(2, 3, 4)]) +
  scale_fill_manual(values = color_map_use[c(2, 3, 4)]) +
  # scale_x_continuous(trans='log2') +
  # scale_y_continuous(limits = c(0, 1)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(p)

ggsave("results/fig_supp_by_classifier.pdf", plot = p, width = 11, height = 9, units = "cm", dpi = 300)

## By classifier #2

In [None]:
# Reload the "from_43M" sheet (replaces the encoding-comparison df_original).
df_original <- read_excel(
  path = "/home/jinhyun/codes/geography_prediction/results/Geography prediction experiments.xlsx",
  sheet = "from_43M"
)

In [None]:
# Rebuild df with the plot_unit factor, then keep the rows for the classifier
# comparison: variance pre-selection, random selection,
# n_pre_select == 1000000, select_n == n_dim_reduced, every model except
# "GBLUP". train_model becomes a factor with a fixed level order.
df <- mutate(
  df_original,
  plot_unit = interaction(
    random_seed, pre_select_method, n_pre_select, n_pre_select_goal,
    select_method, train_model
  )
)

df_for_plot <- df %>%
  filter(
    pre_select_method == "variance", # variance, random
    select_method %in% c("random"),
    train_model != "GBLUP",
    n_pre_select == 1000000,
    select_n == n_dim_reduced
  ) %>%
  mutate(
    train_model = factor(train_model, levels = c("SVM", "XGB", "RF", "SNP-BLUP"))
  )

In [None]:
# Peek at the first rows of the key columns, sorted by seed, pre-selection
# method, select_n, selection method and model.
df_for_plot %>%
  select(
    random_seed, pre_select_method, select_n,
    select_method, train_model, testset_accuracy
  ) %>%
  arrange(random_seed, pre_select_method, select_n, select_method, train_model) %>%
  head()

In [None]:
# Row count per (pre_select_method, select_n, select_method, train_model);
# only the first rows are shown.
df_for_plot %>%
  select(
    random_seed, pre_select_method, select_n,
    select_method, train_model, testset_accuracy
  ) %>%
  group_by(pre_select_method, select_n, select_method, train_model) %>%
  summarise(n = n()) %>%
  head()

In [None]:
# Supplementary figure "By classifier #2": mean of the target metric (+/- SEM)
# per classifier against the number of selected features.

# Per (pre-selection settings, select_n, select_method, train_model): SD, SEM,
# mean and row count of the target metric. `.groups = "drop"` returns an
# ungrouped table and avoids the implicit-regrouping message.
df_for_plot_summary <- df_for_plot %>%
  mutate(target_val = testset_accuracy) %>%
  # mutate(target_val = testset_f1_macro) %>%
  group_by(
    pre_select_method, n_pre_select, n_pre_select_goal,
    select_n, select_method, train_model
  ) %>%
  summarise(
    sd = sd(target_val),
    sem = sd / sqrt(n()),
    # the mean must come last: it overwrites target_val inside summarise()
    target_val = mean(target_val),
    n = n(),
    .groups = "drop"
  )
df_for_plot_summary %>% head()

# The summary table is the plot-level data because it is the only table that
# has the `target_val` column mapped below; df_for_plot has no such column.
p <- ggplot(
  data = df_for_plot_summary,
  aes(x = select_n, y = target_val, colour = train_model)
) +
  geom_line(aes(group = train_model)) +
  geom_errorbar(aes(ymin = target_val - sem, ymax = target_val + sem), width = 0.2) +
  xlab("# of features") +
  # The y axis is limited to 0.2-1 below, i.e. accuracy is plotted as a
  # fraction, so the former "(%)" suffix mislabelled the unit.
  ylab("Testset Accuracy") +
  scale_color_manual(values = color_map_use[c(2, 3, 4, 1)]) +
  scale_x_continuous(trans = "log2", breaks = c(100, 1000, 10000, 100000)) +
  scale_y_continuous(limits = c(0.2, 1)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
print(p)

# The "By classifier" bar plot earlier in this notebook already writes
# results/fig_supp_by_classifier.pdf; saving under the same name silently
# overwrote it, so this figure gets its own file.
ggsave(
  "results/fig_supp_by_classifier_2.pdf",
  plot = p, width = 12, height = 9, units = "cm", dpi = 300
)