# Train scMLP, a multilayer perceptron-based classifier, on a kidney 10k-cell dataset

- Train an MLP classifier
- Format the classifier to use it with adverSCarial
- Run a single-gene attack

In [3]:
library(reticulate)
use_python("/usr/bin/python3", required = TRUE)

In [4]:
library(keras)
library(dplyr)
library(adverSCarial)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [3]:
packageVersion("keras")

[1] ‘2.15.0’

In [5]:
# Base names of the available datasets; this notebook uses the second one
c_basen <- c("hgnc_axilla_10k", "hgnc_kidney_10k", "hgnc_liver_10k")
basen <- c_basen[[2]]

# Load data

In [6]:
# Read the training set and the test set of the selected dataset
df_train_boot <- read.table(
  file = paste0("data//v5/data//sc//", basen, "_train_balanced.txt")
)
df_test <- read.table(
  file = paste0("data//v5/data//sc//", basen, "_test.txt")
)

In [6]:
# Number of cells per class in the training set (read from the "_train_balanced" file)
table(df_train_boot$y)


  epithelial cell of proximal tubule                          kidney cell 
                                 100                                  100 
              kidney epithelial cell kidney loop of Henle epithelial cell 
                                 100                                  100 
                    mesenchymal cell                             podocyte 
                                 100                                  100 

In [7]:
# Prepare data: the feature matrices hold every column except the label "y".
# A logical mask is used instead of `-which(...)`: `-which()` silently selects
# zero columns when no column matches, and `drop = FALSE` keeps a matrix with
# its column name even if a single feature remains.
x_train <- as.matrix(df_train_boot[, names(df_train_boot) != "y", drop = FALSE])
x_test <- as.matrix(df_test[, names(df_test) != "y", drop = FALSE])


# One hot encoding: one indicator column per class, no intercept.
# Column names of y_train keep the "df_train_boot$y" prefix.
# NOTE(review): each matrix is built from the classes present in its own data
# frame, so y_test only lines up with y_train if both contain the same classes.
y_train <- model.matrix(~ df_train_boot$y - 1)
y_test <- model.matrix(~ df_test$y - 1)

In [8]:
# Architecture of the model: two hidden ReLU layers (128 and 64 units), each
# followed by a dropout layer (rate 0.5), then a softmax output layer with one
# unit per class found in the training labels
model <- keras_model_sequential()
model <- layer_dense(
  model,
  units = 128,
  activation = "relu",
  input_shape = c(ncol(df_train_boot) - 1)
)
model <- layer_dropout(model, rate = 0.5)
model <- layer_dense(model, units = 64, activation = "relu")
model <- layer_dropout(model, rate = 0.5)
model <- layer_dense(
  model,
  units = length(unique(df_train_boot$y)),
  activation = "softmax"
)

# Compilation
compile(
  model,
  loss = "categorical_crossentropy",
  optimizer = optimizer_adam(),
  metrics = c("accuracy")
)


In [9]:
# Train the model, holding out 20% of the training data for validation
history <- fit(
  model,
  x_train,
  y_train,
  epochs = 20,
  batch_size = 32,
  validation_split = 0.2
)

In [10]:
# Make predictions on the test set
predictions <- predict(model, x_test)

In [11]:
# Save the trained model in HDF5 format for later use by the classifier
save_model_hdf5(
  object = model,
  filepath = paste0("repr_data/classifiers/scMLP/", basen, "_dl_model.h5")
)

In [12]:
# Save the class names: the one-hot column names with the "df_train_boot$y"
# prefix removed, one per line, without row or column headers
write.table(
  x = sub("df_train_boot\\$y", "", unique(colnames(y_train))),
  file = paste0("repr_data/classifiers/scMLP/", basen, "_new_colnames.txt"),
  row.names = FALSE,
  col.names = FALSE
)

# Format the Classifier
To work with adverSCarial, the classifier needs to be formatted in a specific way.

In [7]:
#' Classify a cluster of cells with the saved scMLP model
#'
#' Loads the Keras model and the class names saved for the dataset `basen`
#' (a variable looked up outside this function), predicts a class for every
#' row of `expr`, and returns the most frequent predicted class among the
#' cells whose cluster label equals `target`.
#'
#' @param expr Expression data coercible with `as.matrix()`, one row per cell.
#' @param clusters Vector of cluster labels, one per row of `expr`.
#' @param target Label of the cluster to classify.
#' @return If no cell is labelled `target`, the character vector
#'   `c("UNDETERMINED", "1")`. Otherwise a list with:
#'   `prediction` (most frequent predicted class in the target cluster),
#'   `odd` (always 1; no score is computed),
#'   `typePredictions` (data frame of class scores, classes in rows and cells
#'   in columns), and `cellTypes` (predicted class of each target cell).
scMLP_classifier <- function(expr, clusters, target) {
    expr <- as.matrix(expr)

    # Check the target cluster before the costly model load and prediction.
    # `%in%` never returns NA, so cells with an NA label are simply not selected
    # instead of making the `if` condition fail.
    in_target <- clusters %in% target
    if (!any(in_target)) {
        return(c("UNDETERMINED", 1))
    }

    message("load model")
    library(reticulate)
    use_python("/usr/bin/python3", required = TRUE)
    library(keras)
    # Local assignments only: nothing is written to the global environment.
    mlp_model <- load_model_hdf5(paste0("repr_data/classifiers/scMLP/", basen, "_dl_model.h5"))
    class_names <- read.table(paste0("repr_data/classifiers/scMLP/", basen, "_new_colnames.txt"))$V1

    # One row of class scores per cell, labelled with class and cell names
    predictions <- predict(mlp_model, expr)
    colnames(predictions) <- class_names
    rownames(predictions) <- rownames(expr)
    predictions <- as.data.frame(predictions)

    # Highest-scoring class for each cell of the target cluster (first on ties)
    cell_types <- apply(predictions[in_target, ], 1, function(x) {
        names(x[x == max(x)])[1]
    })
    # Majority vote over the target cells
    cell_type_counts <- table(cell_types)
    str_class <- names(cell_type_counts[order(cell_type_counts, decreasing = TRUE)][1])

    list(
        # Cell type prediction for the cluster
        prediction = str_class,
        # Score of the predicted cell type (constant)
        odd = 1,
        # Score for each cell type for each cell
        typePredictions = as.data.frame(t(predictions)),
        # Cell type for each cell
        cellTypes = cell_types
    )
}


In [8]:
# Expression data of the test set: every column except the label "y".
# A logical mask with `drop = FALSE` avoids the `-which()` pitfall (zero
# columns selected when nothing matches) and keeps a data frame in all cases.
expr_df <- df_test[, names(df_test) != "y", drop = FALSE]
# Cluster label of each test cell, named by cell identifier
clusters_df <- setNames(df_test$y, rownames(df_test))

Classification of a cluster.

In [17]:
# Classify the cells labelled "kidney cell" in the test set
myPreds <- scMLP_classifier(
  expr = expr_df,
  clusters = clusters_df,
  target = "kidney cell"
)

load model



In [18]:
# Cell type prediction for the cluster
myPreds$prediction

In [19]:
# Score for each cell type for each cell
myPreds$typePredictions[1:5,1:5]

Unnamed: 0_level_0,4834STDY7002875_F16_KI_45P_CAGAATCTCGCCCTTA-1,4834STDY7002875_F16_KI_45P_CCTTCCCTCTGAGGGA-1,4834STDY7002875_F16_KI_45P_CGCGTTTAGGTGATTA-1,4834STDY7002875_F16_KI_45P_CTACCCAAGCTTATCG-1,4834STDY7002875_F16_KI_45P_TTCTCCTTCGAGAGCA-1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
epithelial cell of proximal tubule,2.870023e-07,0.0005449362,0.9369884,0.0002297257,2.057816e-11
kidney cell,4.372414e-06,4.750591e-06,0.05101459,0.0009547377,0.2800552
kidney epithelial cell,0.9999946,0.004431158,0.0111873,0.9986643,0.7199416
kidney loop of Henle epithelial cell,2.068693e-09,2.303834e-05,0.000199579,1.029187e-05,2.785402e-16
mesenchymal cell,7.293742e-08,0.9949893,3.633398e-05,9.63294e-06,7.214583e-11


In [20]:
# Cell type for each cell
head(myPreds$cellTypes)

In [21]:
# Classify every cluster of the test set and report label vs. prediction
for (myClust in unique(clusters_df)) {
  myPreds <- scMLP_classifier(
    expr = expr_df,
    clusters = clusters_df,
    target = myClust
  )
  message("Cell type: ", myClust, ", prediction: ", myPreds[["prediction"]])
}


load model

Cell type: kidney epithelial cell, prediction: kidney epithelial cell

load model

Cell type: kidney cell, prediction: kidney cell

load model

Cell type: podocyte, prediction: podocyte

load model

Cell type: epithelial cell of proximal tubule, prediction: epithelial cell of proximal tubule

load model

Cell type: mesenchymal cell, prediction: mesenchymal cell

load model

Cell type: kidney loop of Henle epithelial cell, prediction: kidney cell



## The function scMLP_classifier can be used as classifier with the adverSCarial package functions

# Run a single-gene attack
Example of single-gene attack with the positive-aberrant modification on the mesenchymal cell cluster.
The argument returnFirstFound=T indicates that the process should stop the attack when a successful attack is detected and return the corresponding gene

In [9]:
start_time <- Sys.time()

In [10]:
# Single-gene attack on the "mesenchymal cell" cluster, stopping at the first
# gene whose modification changes the classification
myAttack <- advSingleGene(
  expr_df,
  clusters_df,
  "mesenchymal cell",
  scMLP_classifier,
  returnFirstFound = TRUE,
  advMethod = "positive_aberrant"
)

predictWithNewValue data.frame data.frame

load model

Split number: 1/100

predictWithNewValue data.frame data.frame

load model

cellType: kidney cell

classifTarget: mesenchymal cell

target: mesenchymal cell

predictWithNewValue data.frame data.frame

load model

cellType: kidney cell

classifTarget: mesenchymal cell

target: mesenchymal cell

predictWithNewValue data.frame data.frame

load model

cellType: kidney cell

classifTarget: mesenchymal cell

target: mesenchymal cell

predictWithNewValue data.frame data.frame

load model

cellType: kidney cell

classifTarget: mesenchymal cell

target: mesenchymal cell

predictWithNewValue data.frame data.frame

load model

cellType: kidney epithelial cell

classifTarget: mesenchymal cell

target: mesenchymal cell

predictWithNewValue data.frame data.frame

load model

cellType: kidney epithelial cell

classifTarget: mesenchymal cell

target: mesenchymal cell

predictWithNewValue data.frame data.frame

load model

cellType: kidney epitheli

### Computation time

In [11]:
Sys.time() - start_time

Time difference of 12.44148 mins

In [12]:
myAttack

$MIR1302.2HG
[1] "kidney cell" "1"          


Build the modified RNA expression matrix.

In [15]:
# Apply the positive-aberrant modification to the first gene returned by the
# attack, on the cells of the "mesenchymal cell" cluster
mod_expr_df <- advModifications(
  expr_df,
  clusters = clusters_df,
  target = "mesenchymal cell",
  genes = names(myAttack@values)[1],
  advMethod = "positive_aberrant"
)

Check if the attack was successful.

In [16]:
# Re-classify the target cluster on the modified expression data
mod_myPreds <- scMLP_classifier(
  expr = mod_expr_df,
  clusters = clusters_df,
  target = "mesenchymal cell"
)
mod_myPreds[["prediction"]]

load model



In [17]:
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Paris
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] adverSCarial_1.3.6 dplyr_1.1.4        keras_2.15.0       reticulate_1.36.1 

loaded via a namespace (and not attached):
 [1] Matrix_1.6-5          jsonlite_1.8.8        compiler_4.3.3       
 [4] crayon_1.5.3