# Train scMLP, a multilayer perceptron-based classifier

- How to train an MLP classifier
- How to format the classifier to use it with adverSCarial

In [1]:
library(reticulate)
use_python("/usr/bin/python3", required = TRUE)

In [2]:
library(keras)
library(dplyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [3]:
packageVersion("keras")

[1] ‘2.15.0’

# Load data

In [4]:
df_train <- read.table("data//v2//seurat_scaled_pbmc_train.txt")
df_test <- read.table("data//v2//seurat_scaled_pbmc_test.txt")

In [5]:
df_train[1:5, c(colnames(df_train)[1:5],"y")]

Unnamed: 0_level_0,AL627309.1,AP006222.2,RP11.206L10.2,RP11.206L10.9,LINC00115,y
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
TCCCGATGAGATCC-1,-0.05812316,-0.03357571,-0.04166819,-0.03364562,-0.08223981,Memory CD4 T
TTCATGTGTGGTGT-1,-0.05812316,-0.03357571,-0.04166819,-0.03364562,-0.08223981,Memory CD4 T
CCAGTGCTAACCAC-1,-0.05812316,-0.03357571,-0.04166819,-0.03364562,-0.08223981,Memory CD4 T
AACTCTTGCAGGAG-1,-0.05812316,-0.03357571,-0.04166819,-0.03364562,-0.08223981,Memory CD4 T
CGAGCCGAGGCGAA-1,-0.05812316,-0.03357571,-0.04166819,-0.03364562,-0.08223981,Memory CD4 T


In [26]:
# Unbalanced data
table(df_train$y)


           B   CD14+ Mono        CD8 T           DC FCGR3A+ Mono Memory CD4 T 
         172          240          140           16           81          236 
 Naive CD4 T           NK     Platelet 
         356           72            7 

In [7]:
# Generate bootstrap data
#
# Builds the extra rows needed to bring `subset_df` up to `target_count`
# rows. Every column is resampled with replacement independently of the
# others, so a generated row can mix values observed in different rows.
#
# Args:
#   subset_df: data frame to draw values from.
#   target_count: number of rows wanted once the result is appended to
#     `subset_df`.
#
# Returns:
#   A data frame with the column names of `subset_df` and
#   `target_count - nrow(subset_df)` rows (zero rows when nothing is missing).
generate_bootstrap_samples <- function(subset_df, target_count) {
  n_available <- nrow(subset_df)
  n_needed <- target_count - n_available

  # Nothing to add: hand back an empty frame with the same columns instead of
  # failing on a negative row count.
  if (n_needed <= 0) {
    return(subset_df[0, , drop = FALSE])
  }
  if (n_available == 0) {
    stop("cannot generate bootstrap samples from an empty data frame",
         call. = FALSE)
  }

  bootstrap_samples <- data.frame(matrix(ncol = ncol(subset_df), nrow = n_needed))
  names(bootstrap_samples) <- names(subset_df)

  for (col in names(subset_df)) {
    # Draw row positions rather than calling sample() on the values: for a
    # single numeric value x >= 1, sample(x, ...) draws from 1:x instead of
    # repeating x.
    picked_rows <- sample.int(n_available, n_needed, replace = TRUE)
    bootstrap_samples[[col]] <- subset_df[[col]][picked_rows]
  }

  bootstrap_samples
}


In [8]:
# Generate a balanced dataset by using bootstrap data if necessary
#
# Every distinct value of `label_column` ends up with exactly `target_count`
# rows: groups that are large enough are randomly down-sampled without
# replacement, smaller groups keep all their rows and are topped up with
# rows from generate_bootstrap_samples().
#
# Args:
#   df: data frame holding the samples.
#   label_column: name of the column of `df` that holds the labels.
#   target_count: number of rows wanted for each label.
#
# Returns:
#   A data frame in which the groups are stacked in order of first
#   appearance of their label.
balance_data <- function(df, label_column, target_count) {
  per_label <- list()

  for (current_label in unique(df[[label_column]])) {
    label_rows <- df[df[[label_column]] == current_label, ]
    n_rows <- nrow(label_rows)

    if (n_rows < target_count) {
      # Too few rows: keep all of them and append generated ones.
      filler <- generate_bootstrap_samples(label_rows, target_count)
      per_label[[length(per_label) + 1]] <- rbind(label_rows, filler)
    } else {
      # Enough rows: keep a random subset of the requested size.
      kept <- sample(n_rows, target_count)
      per_label[[length(per_label) + 1]] <- label_rows[kept, ]
    }
  }

  # Stack the groups one after the other, starting from an empty frame.
  Reduce(rbind, per_label, data.frame())
}


In [9]:
df_train_boot <- balance_data(df_train, "y", 100)

In [27]:
# Balanced data
table(df_train_boot$y)


           B   CD14+ Mono        CD8 T           DC FCGR3A+ Mono Memory CD4 T 
         100          100          100          100          100          100 
 Naive CD4 T           NK     Platelet 
         100          100          100 

In [10]:
# Prepare data
# Feature matrices: every column except the label column "y". A logical mask
# is used instead of `-which(...)`, which would select no column at all if
# "y" were missing; `drop = FALSE` keeps a data frame even when a single
# feature column remains.
x_train <- as.matrix(df_train_boot[, names(df_train_boot) != "y", drop = FALSE])
x_test <- as.matrix(df_test[, names(df_test) != "y", drop = FALSE])


# One hot encoding
# One indicator column per label and no intercept (`- 1`). The column names
# of y_train are the label prefixed by the term text "df_train_boot$y".
y_train <- model.matrix(~ df_train_boot$y - 1)
y_test <- model.matrix(~ df_test$y - 1)

In [11]:
# Architecture of the model
# Fully connected network: two hidden ReLU layers (128 then 64 units), each
# followed by dropout at rate 0.5, and a softmax output layer with one unit
# per distinct label of df_train.
model <- keras_model_sequential() %>%
  layer_dense(units = 128, activation = "relu", input_shape = ncol(x_train)) %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = length(unique(df_train$y)), activation = "softmax")

# Compilation
# Categorical cross-entropy loss, Adam optimizer with its default settings,
# accuracy reported as the metric.
compile(
  model,
  loss = "categorical_crossentropy",
  optimizer = optimizer_adam(),
  metrics = "accuracy"
)


In [12]:
# Train the model
# 20 epochs with batches of 32 rows; 20% of the training data is held out
# for validation. The returned training history is kept in `history`.
history <- fit(
  model,
  x = x_train,
  y = y_train,
  batch_size = 32,
  epochs = 20,
  validation_split = 0.2
)

In [13]:
# Make predictions
# Output of the trained model for every row of the test matrix.
predictions <- predict(model, x_test)

In [14]:
save_model_hdf5(model, "repr_data/classifiers/scMLP/dl_model.h5")

In [16]:
# Save the class labels in the column order of y_train. The "df_train_boot$y"
# prefix in front of each column name is stripped first, and the file is
# written without row or column headers.
write.table(sub("df_train_boot\\$y", "", unique(colnames(y_train))),
            "repr_data/classifiers/scMLP/new_colnames.txt",
            row.names = FALSE, col.names = FALSE)

# Format the Classifier
To work with adverSCarial, the classifier needs to be formatted in a specific way.

In [18]:
# Classifier wrapper around the saved MLP.
#
# Loads the model and the class labels from disk, scores every row of
# `expr`, and reports the most frequent predicted cell type among the cells
# whose cluster equals `target`.
#
# Args:
#   expr: expression table with one row per cell; converted with
#     as.matrix() and passed to the model.
#   clusters: cluster label of each cell, in the same order as the rows of
#     `expr`.
#   target: label of the cluster to classify.
#
# Returns:
#   c("UNDETERMINED", 1) when no cell belongs to `target`; otherwise a list
#   with the elements `prediction`, `odd`, `typePredictions` and `cellTypes`.
scMLP_classifier <- function(expr, clusters, target){
    # which() ignores NA labels, so they neither break the emptiness test
    # nor select all-NA rows further down.
    target_cells <- which(clusters == target)

    # Nothing to classify: answer before loading the model and predicting.
    if (length(target_cells) == 0) {
        # NOTE(review): this is a character vector, unlike the list returned
        # below, so `$prediction` cannot be used on it. Kept unchanged for
        # compatibility; confirm what callers expect.
        return(c("UNDETERMINED", 1))
    }

    expr <- as.matrix(expr)
    message("load model")
    library(reticulate)
    use_python("/usr/bin/python3", required = TRUE)
    library(keras)
    # Kept local: nothing is written to the global environment.
    mlp_model <- load_model_hdf5("repr_data/classifiers/scMLP/dl_model.h5")
    class_labels <- read.table("repr_data/classifiers/scMLP/new_colnames.txt")$V1

    # One row per cell, one column per class label.
    predictions <- predict(mlp_model, expr)
    colnames(predictions) <- class_labels
    rownames(predictions) <- rownames(expr)
    predictions <- as.data.frame(predictions)

    # Best-scoring class of each target cell (the first one on ties).
    cell_types <- apply(predictions[target_cells, , drop = FALSE], 1, function(scores) {
        names(scores)[which.max(scores)]
    })
    # Majority vote over the target cells.
    cell_type_counts <- table(cell_types)
    str_class <- names(cell_type_counts[order(cell_type_counts, decreasing = TRUE)][1])

    list(
        # Cell type prediction for the cluster
        prediction = str_class,
        # Score of the predicted cell type (constant 1 here)
        odd = 1,
        # Score for each cell type for each cell
        typePredictions = as.data.frame(t(predictions)),
        # Cell type for each cell
        cellTypes = cell_types)
}


In [19]:
# Inputs for the classifier.
# Features only: a logical mask is used instead of `-which(...)`, which would
# select no column at all if "y" were missing; `drop = FALSE` keeps a data
# frame even when a single feature column remains.
expr_df <- df_test[, names(df_test) != "y", drop = FALSE]
# Label of each row of df_test, named by the row names of df_test.
clusters_df <- setNames(df_test$y, rownames(df_test))

Classification of a cluster.

In [20]:
myPreds <- scMLP_classifier(expr_df, clusters_df, "NK")

load model



In [21]:
# Cell type prediction for the cluster
myPreds$prediction

In [22]:
# Score for each cell type for each cell
myPreds$typePredictions[1:5,1:5]

Unnamed: 0_level_0,AAACATACAACCAC-1,AAACATTGATCAGC-1,AAACGCACTGGTAC-1,AAATGTTGCCACAA-1,AACACGTGCAGAGG-1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
B,0.0006802717,2.453048e-09,0.00373296,0.0004155398,0.0019878533
CD14+ Mono,0.0004720952,2.920255e-08,0.005437304,0.0042161699,0.0007037706
CD8 T,0.0462912805,0.003352702,0.152427167,0.2917648554,0.0910318717
DC,0.0021782795,3.081934e-06,0.025633126,0.0205157008,0.007218238
FCGR3A+ Mono,0.0006689421,1.103449e-07,0.006230432,0.0082531357,0.0007133698


In [23]:
# Cell type for each cell
head(myPreds$cellTypes)

In [24]:
# Run the classifier once per distinct label of clusters_df and print that
# label next to the cell type predicted for its cells. Each call overwrites
# myPreds, so only the result for the last label remains afterwards.
for ( myClust in unique(clusters_df)){
    myPreds <- scMLP_classifier(expr_df, clusters_df, myClust)
    message("Cell type: ", myClust, ", prediction: ", myPreds$prediction)
}


load model

Cell type: Memory CD4 T, prediction: Memory CD4 T

load model

Cell type: B, prediction: B

load model

Cell type: CD14+ Mono, prediction: CD14+ Mono

load model

Cell type: NK, prediction: NK

load model

Cell type: CD8 T, prediction: CD8 T

load model

Cell type: Naive CD4 T, prediction: Naive CD4 T

load model

Cell type: FCGR3A+ Mono, prediction: FCGR3A+ Mono

load model

Cell type: DC, prediction: DC

load model

Cell type: Platelet, prediction: NK



## The function scMLP_classifier can be used as a classifier with the adverSCarial package functions

In [25]:
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Paris
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_1.1.2       keras_2.15.0      reticulate_1.36.1

loaded via a namespace (and not attached):
 [1] crayon_1.5.2      vctrs_0.6.5       cli_3.6.2         zeallot_0.1.0    
 [5] rlang_1.1.3       png_0.1-8 