# Train scRF, a random forest-based classifier on an axilla 10k cells dataset

- Train a random forest classifier
- Format the classifier to use it with adverSCarial
- Run a max-change attack

In [1]:
library(randomForest)
library(dplyr)
library(adverSCarial)

randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘dplyr’


The following object is masked from ‘package:randomForest’:

    combine


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
packageVersion("randomForest")

[1] ‘4.7.1.1’

In [3]:
# Base names of the available datasets; each one is used as a file-name prefix
# for the data files and for the saved model.
c_basen <- c("hgnc_axilla_10k", "hgnc_brain_7k", "hgnc_liver_6k")

In [4]:
# Pick the first dataset (the axilla 10k one). `basen` is also read as a global
# by scRF_classifier() to locate the saved model file.
basen <- c_basen[1]
# Training table (file name suffix "_train_balanced") and test table.
df_train_boot <- read.table(
    file = paste0("data//v5/data//sc//", basen, "_train_balanced.txt")
)
df_test <- read.table(
    file = paste0("data//v5/data//sc//", basen, "_test.txt")
)

In [5]:
dim(df_train_boot)

In [1]:
# Square root of 25346 -- presumably the number of gene columns in the training
# table (TODO confirm against dim(df_train_boot)). Per the randomForest
# documentation, the default `mtry` for classification is sqrt(p), with p the
# number of predictors.
sqrt(25346)

In [6]:
# Prepare data
# Feature matrices: every column except the label column "y".
# Columns are selected with a logical mask rather than `-which(...)`: if no
# column were named "y", `which()` would return integer(0) and the negative
# index would silently select zero columns. `drop = FALSE` keeps a data frame
# even if a single feature column remains.
x_train <- as.matrix(df_train_boot[, names(df_train_boot) != "y", drop = FALSE])
x_test <- as.matrix(df_test[, names(df_test) != "y", drop = FALSE])

In [7]:
# Class labels taken from the "y" column and converted to factors; randomForest
# treats a factor response as a classification problem.
# NOTE(review): the two factors are built independently, so their level sets
# are only identical if both files contain the same set of cell types.
y_train <- as.factor(df_train_boot$y)
y_test <- as.factor(df_test$y)

In [8]:
# Balanced training set
table(y_train)

y_train
  blood vessel endothelial cell blood vessel smooth muscle cell 
                            100                             100 
                     fibroblast                      macrophage 
                            100                             100 
                 malignant cell                          T cell 
                            100                             100 

Train model with default values

In [9]:
# Fit the forest with the package defaults for every tuning parameter.
# NOTE(review): no set.seed() call is visible in this notebook, so the fitted
# model may differ between runs.
rf_model <- randomForest(x = x_train, y = y_train)

In [10]:
# Persist the fitted model; scRF_classifier() reloads it from this same path.
saveRDS(
    object = rf_model,
    file = paste0("repr_data/classifiers/scRF/", basen, "_random_forest_model.rds")
)

In [11]:
# Reload the saved model from disk (same path as the saveRDS() call above).
rfModel <- readRDS(
    file = paste0("repr_data/classifiers/scRF/", basen, "_random_forest_model.rds")
)

# Format the Classifier
To work with adverSCarial, the classifier needs to be formatted in a certain way.

In [12]:
#' Classify one cluster of cells with the saved scRF random forest model
#'
#' Wrapper with the `(expr, clusters, target)` signature used throughout this
#' notebook with adverSCarial. The model is reloaded from disk on every call;
#' its path is built from the global variable `basen`.
#'
#' @param expr Expression table, one row per cell; passed to `predict()` as the
#'   new data.
#' @param clusters Vector of cluster labels, one per row of `expr`.
#' @param target Label of the cluster to classify.
#' @return If no element of `clusters` equals `target`, the character vector
#'   `c("UNDETERMINED", "1")`. Otherwise a list with:
#'   `prediction`, the most frequent per-cell predicted type in the target
#'   cluster; `odd`, always 1; `typePredictions`, a data frame of class
#'   probabilities with cell types in rows and all cells of `expr` in columns;
#'   `cellTypes`, the predicted type of each cell of the target cluster.
scRF_classifier <- function(expr, clusters, target){
    library(randomForest)
    in_target <- clusters == target
    # Nothing to classify: return early, before loading the model and
    # predicting, since neither is needed for this result.
    if (sum(in_target) == 0) {
        return(c("UNDETERMINED", 1))
    }
    # Load the model
    rfModel <- readRDS(paste0("repr_data/classifiers/scRF/",basen,"_random_forest_model.rds"))
    # Predict with the model: one row per cell, one column per cell type
    predictions <- predict(rfModel, expr, type = "prob")
    # `drop = FALSE` keeps a matrix even when the cluster has a single cell;
    # otherwise the subset collapses to a vector and apply() fails.
    target_probs <- predictions[in_target, , drop = FALSE]
    # Cell type with the highest probability for each cell (first one on ties)
    cell_types <- apply(target_probs, 1, function(x) {
        names(x)[which.max(x)]
    })
    # Majority vote over the cells of the cluster (first one on ties). The
    # table is kept local instead of being written to the global environment.
    table_cell_type <- table(cell_types)
    str_class <- names(table_cell_type)[which.max(table_cell_type)]
    resSCtype <- list(
        # Cell type prediction for the cluster
        prediction = str_class,
        # Score of the predicted cell type (constant, not computed)
        odd = 1,
        # Score for each cell type for each cell
        typePredictions = as.data.frame(t(predictions)),
        # Cell type for each cell
        cellTypes = cell_types)

    return(resSCtype)
}

In [13]:
# Build the two inputs passed to scRF_classifier(): the expression table
# without the label column, and the label vector named by cell.
# NOTE(review): the name `df_pbmc_test` is kept as-is, although the data loaded
# in this notebook is the axilla dataset.
df_pbmc_test <- df_test
# Logical mask instead of `-which(...)`: with no "y" column, the negative index
# on an empty `which()` result would select zero columns. `drop = FALSE`
# guarantees a data frame is returned.
expr_df <- df_pbmc_test[, names(df_pbmc_test) != "y", drop = FALSE]
clusters_df <- df_pbmc_test$y
names(clusters_df) <- rownames(df_pbmc_test)

Classification of a cluster.

In [14]:
myPreds <- scRF_classifier(expr_df, clusters_df, "macrophage")

In [15]:
# Cell type prediction for the cluster
myPreds$prediction

In [16]:
# Score for each cell type for each cell
myPreds$typePredictions[1:5,1:5]

Unnamed: 0_level_0,HTAPP-878-SMP-7149-TST-channel1_GATGTTGCAAACGTGG-1,HTAPP-878-SMP-7149-TST-channel1_CTTTCAAGTAGGTACG-1,HTAPP-878-SMP-7149-TST-channel1_CACCGTTGTTCTGACA-1,HTAPP-878-SMP-7149-TST-channel1_GAGTTTGCACAACGTT-1,HTAPP-878-SMP-7149-TST-channel1_GATGACTTCTTTGCTA-1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
blood vessel endothelial cell,0.0,0.056,0.002,0.002,0.002
blood vessel smooth muscle cell,0.002,0.024,0.0,0.0,0.0
fibroblast,0.0,0.026,0.0,0.0,0.0
macrophage,0.004,0.026,0.0,0.0,0.0
malignant cell,0.99,0.862,0.998,0.998,0.998


In [17]:
# Cell type for each cell
head(myPreds$cellTypes)

## Check the prediction for each cell type

In [18]:
# Classify every cluster in turn and print the predicted label next to the
# true one. The loop assigns to `myPreds`, so after it finishes that variable
# holds the result for the last cluster only.
for (myClust in unique(clusters_df)) {
    myPreds <- scRF_classifier(
        expr = expr_df,
        clusters = clusters_df,
        target = myClust
    )
    message("Cell type: ", myClust, ", prediction: ", myPreds$prediction)
}


Cell type: malignant cell, prediction: malignant cell

Cell type: blood vessel endothelial cell, prediction: blood vessel endothelial cell

Cell type: fibroblast, prediction: fibroblast

Cell type: macrophage, prediction: macrophage

Cell type: blood vessel smooth muscle cell, prediction: blood vessel smooth muscle cell

Cell type: T cell, prediction: T cell



## The function `scRF_classifier` can be used as a classifier with the adverSCarial package functions

# Run a max-change attack
Example of max-change attack with the positive-aberrant modification on the macrophage cluster.

In [19]:
start_time <- Sys.time()

In [20]:
# Max-change attack on the "macrophage" cluster using scRF_classifier as the
# classifier. This is the long-running step timed by `start_time`.
myAttack <- advMaxChange(
    expr_df,
    clusters_df,
    "macrophage",
    scRF_classifier,
    advMethod = "perc99",
    maxSplitSize = 1000
)

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

result length: 15841

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame data.frame

predictWithNewValue data.frame d

### Computation time

In [21]:
Sys.time() - start_time

Time difference of 1.119891 hours

In [22]:
length(myAttack)

Check if the attack was successful.

In [23]:
# Apply the same "perc99" modification to the genes returned by the attack
# (stored in the `values` slot of `myAttack`), for the macrophage cluster.
mod_expr_df <- advModifications(
    expr_df,
    clusters = clusters_df,
    target = "macrophage",
    genes = myAttack@values,
    advMethod = "perc99"
)

In [24]:
# Re-classify the macrophage cluster on the modified expression table and
# display the predicted label.
mod_myPreds <- scRF_classifier(
    expr = mod_expr_df,
    clusters = clusters_df,
    target = "macrophage"
)
mod_myPreds$prediction

The cluster is still classified as "macrophage" after the modification of the 13714 genes.

In [25]:
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Paris
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] adverSCarial_1.3.6   dplyr_1.1.4          randomForest_4.7-1.1

loaded via a namespace (and not attached):
 [1] Matrix_1.6-5          jsonlite_1.8.8        compiler_4.3.3       
 [4] crayon_1.5.3          tid