# Train scMLP, a multilayer perceptron-based classifier, on a 10k-cell liver dataset

- Train an MLP classifier
- Format the classifier to use it with adverSCarial
- Run a single-gene attack

In [1]:
library(reticulate)
use_python("/usr/bin/python3", required = TRUE)

In [2]:
library(keras)
library(dplyr)
library(adverSCarial)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [3]:
packageVersion("keras")

[1] ‘2.15.0’

In [4]:
# Base names of the available datasets; the third one (liver) is used here.
c_basen <- c(
  "hgnc_axilla_10k",
  "hgnc_kidney_10k",
  "hgnc_liver_10k"
)
basen <- c_basen[3]

# Load data

In [5]:
# Select the dataset, then read its training and test tables.
# Both tables are expected to carry the label in a column named "y"
# (that column is read by the cells below).
basen <- c_basen[3]
df_train_boot <- read.table(
  paste0("data//v5/data//sc//", basen, "_train_balanced.txt")
)
df_test <- read.table(
  paste0("data//v5/data//sc//", basen, "_test.txt")
)

In [6]:
# Class distribution of the balanced training set
table(df_train_boot$y)


blood vessel smooth muscle cell                endothelial cell 
                            100                             100 
                     macrophage                  malignant cell 
                            100                             100 
               mature NK T cell                        monocyte 
                            100                             100 

In [7]:
# Prepare data: every column except the label column "y" is a feature.
# A logical mask is used rather than `-which(names(df) == "y")`: if "y" were
# missing, `-integer(0)` would silently select zero columns. `drop = FALSE`
# keeps a data frame (and its column name) even with a single feature column.
x_train <- as.matrix(df_train_boot[, names(df_train_boot) != "y", drop = FALSE])
x_test <- as.matrix(df_test[, names(df_test) != "y", drop = FALSE])


# One hot encoding
# model.matrix() names each column "<term><level>", so the columns of y_train
# are prefixed with "df_train_boot$y"; that prefix is stripped when the class
# labels are written to disk further down.
# NOTE(review): y_train and y_test are encoded independently, so their columns
# only line up if both tables contain the same set of classes.
y_train <- model.matrix(~ df_train_boot$y - 1)
y_test <- model.matrix(~ df_test$y - 1)

In [8]:
# Network definition: two hidden ReLU layers (128 and 64 units), each followed
# by a dropout layer (rate 0.5), and a softmax output layer with one unit per
# distinct label found in the training table.
model <- keras_model_sequential() %>%
  layer_dense(
    units = 128,
    activation = "relu",
    input_shape = ncol(df_train_boot) - 1
  ) %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(
    units = length(unique(df_train_boot$y)),
    activation = "softmax"
  )

# Compilation: categorical cross-entropy loss (labels are one-hot encoded),
# Adam optimizer with its default settings, accuracy as the reported metric.
compile(
  model,
  loss = "categorical_crossentropy",
  optimizer = optimizer_adam(),
  metrics = "accuracy"
)


In [9]:
# Fit the network for 20 epochs with mini-batches of 32 cells, holding out
# 20% of the training rows for validation.
# NOTE(review): Keras takes the validation fraction from the end of the input
# arrays; if the rows of x_train are ordered by class the validation set may
# not be representative - confirm the row order of the training table.
history <- fit(
  model,
  x = x_train,
  y = y_train,
  epochs = 20,
  batch_size = 32,
  validation_split = 0.2
)

In [10]:
# Predict the softmax class scores for every cell of the test set
predictions <- predict(model, x_test)

In [11]:
# Persist the trained model so the classifier wrapper can reload it later.
# The target directory is created first (no-op if it already exists) so that
# saving does not fail on a fresh checkout.
dir.create("repr_data/classifiers/scMLP", recursive = TRUE, showWarnings = FALSE)
save_model_hdf5(model, paste0("repr_data/classifiers/scMLP/", basen, "_dl_model.h5"))

In [12]:
# Save the class labels in the column order of y_train, i.e. the order of the
# model's output units. model.matrix() prefixes every column name with
# "df_train_boot$y"; that prefix is removed so only the label remains.
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
write.table(
  sub("df_train_boot\\$y", "", unique(colnames(y_train))),
  paste0("repr_data/classifiers/scMLP/", basen, "_new_colnames.txt"),
  row.names = FALSE,
  col.names = FALSE
)

# Format the Classifier
To work with adverSCarial, the classifier needs to be formatted in a specific way.

In [13]:
#' Classify one cluster of cells with the saved scMLP model
#'
#' Reloads the Keras model and the class labels stored under
#' `repr_data/classifiers/scMLP/` for the dataset named by the global variable
#' `basen`, scores every row of `expr`, and reports the class predicted most
#' often among the cells whose cluster label equals `target`.
#'
#' @param expr Expression table with one row per cell; it is converted with
#'   `as.matrix()` and passed to the model as is, so its columns must match
#'   the features the model was trained on.
#' @param clusters Vector of cluster labels, one per row of `expr`.
#' @param target Cluster label to classify.
#' @return When at least one cell belongs to `target`, a list with
#'   `prediction` (most frequent predicted class among the target cells),
#'   `odd` (always 1), `typePredictions` (data frame of scores, classes in
#'   rows and cells in columns) and `cellTypes` (predicted class of each
#'   target cell). When no cell belongs to `target`, the character vector
#'   `c("UNDETERMINED", "1")`.
scMLP_classifier <- function(expr, clusters, target){
    expr <- as.matrix(expr)
    message("load model")
    # Packages are attached inside the function so that the classifier is
    # usable wherever it is called from.
    library(reticulate)
    use_python("/usr/bin/python3", required = TRUE)
    library(keras)
    # The model and labels are kept in local variables: the previous `<<-`
    # assignments overwrote objects in the caller's global environment.
    mlp_model <- load_model_hdf5(paste0("repr_data/classifiers/scMLP/",basen,"_dl_model.h5"))
    class_labels <- read.table(paste0("repr_data/classifiers/scMLP/",basen,"_new_colnames.txt"))$V1

    predictions <- predict(mlp_model, expr)
    colnames(predictions) <- class_labels
    rownames(predictions) <- rownames(expr)
    predictions <- as.data.frame(predictions)

    # which() drops NA comparisons, so a missing cluster label neither breaks
    # the `if` below nor introduces NA rows in the subset.
    target_rows <- which(clusters == target)
    if (length(target_rows) == 0) {
        # NOTE(review): this branch returns a character vector while the
        # normal path returns a list - confirm which form adverSCarial expects.
        return(c("UNDETERMINED", 1))
    }

    # Highest-scoring class for each target cell (first one in case of ties).
    cell_types <- apply(predictions[target_rows, , drop = FALSE], 1, function(x) {
        names(x)[which.max(x)]
    })
    # Majority vote over the target cells (first class in case of ties).
    type_counts <- table(cell_types)
    str_class <- names(type_counts)[which.max(type_counts)]

    list(
        # Cell type prediction for the cluster
        prediction = str_class,
        # Score of the predicted cell type
        odd = 1,
        # Score for each cell type for each cell
        typePredictions = as.data.frame(t(predictions)),
        # Cell type for each cell
        cellTypes = cell_types
    )
}


In [14]:
# Inputs for the classifier: the expression table (every column except the
# label column "y") and the per-cell cluster labels named by cell id.
# A logical mask replaces `-which(names(df_test) == "y")`, which would select
# zero columns if "y" were absent; drop = FALSE keeps a data frame even when
# only one feature column remains.
expr_df <- df_test[, names(df_test) != "y", drop = FALSE]
clusters_df <- df_test$y
names(clusters_df) <- rownames(df_test)

Classification of a cluster.

In [15]:
myPreds <- scMLP_classifier(expr_df, clusters_df, 'macrophage')

load model



In [16]:
# Cell type prediction for the cluster
myPreds$prediction

In [17]:
# Score for each cell type for each cell
myPreds$typePredictions[1:5,1:5]

Unnamed: 0_level_0,HTAPP-944-SMP-7479-TST-channel1_ACCTACCGTTCCTAAG-1,HTAPP-944-SMP-7479-TST-channel1_CATGCCTAGAGCCCAA-1,HTAPP-944-SMP-7479-TST-channel1_ATTTCTGTCGCCAATA-1,HTAPP-944-SMP-7479-TST-channel1_TTGAGTGTCTAGACCA-1,HTAPP-944-SMP-7479-TST-channel1_AGACACTGTCTCGACG-1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
blood vessel smooth muscle cell,6.046081e-33,0.0,2.589625e-34,1.2427479999999999e-26,1.530412e-37
endothelial cell,9.295307e-33,0.0,1.796802e-33,1.617502e-26,6.159909e-37
macrophage,1.087805e-26,1.235575e-29,6.347991e-27,2.222953e-21,1.365307e-28
malignant cell,0.9999999,0.9999999,0.9999999,0.9999999,0.9999999
mature NK T cell,1.514486e-32,0.0,2.8215090000000003e-33,3.107428e-25,9.357691e-37


In [18]:
# Cell type for each cell
head(myPreds$cellTypes)

In [19]:
# Classify each cluster of the test set in turn and report the predicted
# cell type next to the cluster's own label.
for (myClust in unique(clusters_df)) {
  myPreds <- scMLP_classifier(
    expr = expr_df,
    clusters = clusters_df,
    target = myClust
  )
  message("Cell type: ", myClust, ", prediction: ", myPreds$prediction)
}


load model

Cell type: malignant cell, prediction: malignant cell

load model

Cell type: endothelial cell, prediction: endothelial cell

load model

Cell type: blood vessel smooth muscle cell, prediction: blood vessel smooth muscle cell

load model

Cell type: macrophage, prediction: macrophage

load model

Cell type: monocyte, prediction: monocyte

load model

Cell type: mature NK T cell, prediction: macrophage



## The function scMLP_classifier can be used as a classifier with the adverSCarial package functions

# Run a single-gene attack
Example of a single-gene attack with the positive-aberrant modification on the macrophage cluster.
The argument returnFirstFound=TRUE indicates that the process should stop as soon as a successful attack is detected and return the corresponding gene.

In [20]:
start_time <- Sys.time()

In [21]:
# Single-gene attack on the macrophage cluster with the positive-aberrant
# modification; the search stops at the first gene that changes the
# classification. TRUE is spelled out because T is a reassignable variable.
myAttack <- advSingleGene(
  expr_df,
  clusters_df,
  "macrophage",
  scMLP_classifier,
  returnFirstFound = TRUE,
  advMethod = "positive_aberrant"
)

predictWithNewValue data.frame data.frame

load model

Split number: 1/100

predictWithNewValue data.frame data.frame

load model

cellType: monocyte

classifTarget: macrophage

target: macrophage

predictWithNewValue data.frame data.frame

load model

cellType: monocyte

classifTarget: macrophage

target: macrophage

predictWithNewValue data.frame data.frame

load model

cellType: endothelial cell

classifTarget: macrophage

target: macrophage

predictWithNewValue data.frame data.frame

load model

cellType: monocyte

classifTarget: macrophage

target: macrophage

predictWithNewValue data.frame data.frame

load model

cellType: endothelial cell

classifTarget: macrophage

target: macrophage

predictWithNewValue data.frame data.frame

load model

cellType: endothelial cell

classifTarget: macrophage

target: macrophage

predictWithNewValue data.frame data.frame

load model

cellType: blood vessel smooth muscle cell

classifTarget: macrophage

target: macrophage

predictWithNewValue dat

### Computation time

In [22]:
Sys.time() - start_time

Time difference of 3.544124 mins

In [23]:
myAttack

$ENSG00000238009
[1] "endothelial cell" "1"               


Build the modified RNA expression matrix.

In [24]:
# Apply the positive-aberrant modification to the first gene returned by the
# attack, on the cells of the macrophage cluster.
# NOTE(review): assumes the attack result exposes a `values` slot - confirm
# against the installed adverSCarial version.
mod_expr_df <- advModifications(
  expr_df,
  clusters = clusters_df,
  target = "macrophage",
  genes = names(myAttack@values)[1],
  advMethod = "positive_aberrant"
)

Check if the attack was successful.

In [25]:
mod_myPreds <- scMLP_classifier(mod_expr_df, clusters_df, 'macrophage')
mod_myPreds$prediction

load model



In [26]:
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Paris
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] adverSCarial_1.3.6 dplyr_1.1.4        keras_2.15.0       reticulate_1.36.1 

loaded via a namespace (and not attached):
 [1] Matrix_1.6-5          jsonlite_1.8.8        compiler_4.3.3       
 [4] crayon_1.5.3