# Train scRF, a random forest-based classifier

- How to train a random forest classifier
- How to format the classifier to use it with adverSCarial

In [1]:
library(randomForest)
library(dplyr)

randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘dplyr’


The following object is masked from ‘package:randomForest’:

    combine


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Show the installed randomForest version used for this run.
packageVersion("randomForest")

[1] ‘4.7.1.1’

In [3]:
# Load the train and test tables. Later cells use the column `y` as the
# cell-type label and every other column as a feature.
df_train <- read.table("data//v2//seurat_scaled_pbmc_train.txt")
df_test <- read.table("data//v2//seurat_scaled_pbmc_test.txt")

In [4]:
# Unbalanced training set: number of rows per label in `y`
table(df_train$y)


           B   CD14+ Mono        CD8 T           DC FCGR3A+ Mono Memory CD4 T 
         172          240          140           16           81          236 
 Naive CD4 T           NK     Platelet 
         356           72            7 

In [5]:
# Preview the first five rows, showing the first five columns plus the label `y`.
df_train[1:5, c(colnames(df_train)[1:5],"y")]

Unnamed: 0_level_0,AL627309.1,AP006222.2,RP11.206L10.2,RP11.206L10.9,LINC00115,y
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
TCCCGATGAGATCC-1,-0.05812316,-0.03357571,-0.04166819,-0.03364562,-0.08223981,Memory CD4 T
TTCATGTGTGGTGT-1,-0.05812316,-0.03357571,-0.04166819,-0.03364562,-0.08223981,Memory CD4 T
CCAGTGCTAACCAC-1,-0.05812316,-0.03357571,-0.04166819,-0.03364562,-0.08223981,Memory CD4 T
AACTCTTGCAGGAG-1,-0.05812316,-0.03357571,-0.04166819,-0.03364562,-0.08223981,Memory CD4 T
CGAGCCGAGGCGAA-1,-0.05812316,-0.03357571,-0.04166819,-0.03364562,-0.08223981,Memory CD4 T


In [6]:
# Peek at the first few training labels.
head(df_train$y)

In [7]:
# Generate bootstrap data
#
# Builds the extra rows needed to bring `subset_df` up to `target_count` rows.
# Each column is resampled with replacement independently of the other
# columns, so a generated row combines values taken from different original
# rows rather than copying one original row.
#
# Args:
#   subset_df:    data frame to resample from.
#   target_count: total number of rows wanted once the generated rows are
#                 added to `subset_df`.
#
# Returns:
#   A data frame with the same column names as `subset_df` and
#   max(target_count - nrow(subset_df), 0) rows.
generate_bootstrap_samples <- function(subset_df, target_count) {
  n_available <- nrow(subset_df)
  n_needed <- max(target_count - n_available, 0)

  # Already at or above the target: return an empty frame with the same
  # columns instead of failing on a negative row count.
  if (n_needed == 0) {
    return(subset_df[0, , drop = FALSE])
  }
  if (n_available == 0) {
    stop("cannot draw bootstrap samples from an empty data frame",
         call. = FALSE)
  }

  bootstrap_samples <- data.frame(matrix(ncol = ncol(subset_df), nrow = n_needed))
  names(bootstrap_samples) <- names(subset_df)

  for (col in names(subset_df)) {
    # Draw row positions rather than values: sample(x, ...) interprets a
    # single number x >= 1 as the range 1:x, which would invent values when
    # `subset_df` has exactly one row.
    picked <- sample.int(n_available, n_needed, replace = TRUE)
    bootstrap_samples[[col]] <- subset_df[[col]][picked]
  }

  bootstrap_samples
}


In [8]:
# Generate a balanced dataset by using bootstrap data if necessary
#
# For every distinct non-missing label, the rows carrying that label are
# either downsampled without replacement to `target_count` rows, or kept in
# full and topped up to `target_count` rows with
# generate_bootstrap_samples(). Rows whose label is NA are left out.
#
# Args:
#   df:           data frame holding the features and the label column.
#   label_column: name of the label column in `df`.
#   target_count: number of rows wanted for every label.
#
# Returns:
#   A data frame with `target_count` rows per label, labels appearing in
#   order of first occurrence in `df`. An empty data frame when `df` has no
#   labels.
balance_data <- function(df, label_column, target_count) {
  label_values <- df[[label_column]]
  # unique() would keep NA as a label of its own, so drop it up front.
  labels <- unique(label_values[!is.na(label_values)])

  # Collect the per-label pieces and bind them once at the end rather than
  # growing a data frame inside the loop.
  pieces <- vector("list", length(labels))
  piece_idx <- 0L

  for (label in labels) {
    # which() discards NA comparisons; indexing with the raw logical vector
    # would add an all-NA row for every missing label.
    subset_df <- df[which(label_values == label), , drop = FALSE]

    if (nrow(subset_df) >= target_count) {
      keep <- sample(nrow(subset_df), target_count)
      subset_balanced <- subset_df[keep, , drop = FALSE]
    } else {
      extra_samples <- generate_bootstrap_samples(subset_df, target_count)
      subset_balanced <- rbind(subset_df, extra_samples)
    }

    piece_idx <- piece_idx + 1L
    pieces[[piece_idx]] <- subset_balanced
  }

  if (length(pieces) == 0) {
    return(data.frame())
  }

  do.call(rbind, pieces)
}


In [9]:
# Balance every label to exactly 100 rows: larger classes are downsampled and
# smaller ones are topped up with bootstrap rows.
# NOTE(review): balance_data() draws random samples and no set.seed() call is
# visible in this notebook, so the result presumably changes between runs — confirm.
df_train_boot <- balance_data(df_train, "y", 100)

In [10]:
# Check the size of the balanced table (9 labels x 100 rows each are expected).
dim(df_train_boot)

In [15]:
# Prepare data: feature matrices holding every column except the label `y`.
# A logical mask is used rather than `-which(...)`, which would select zero
# columns if `y` were absent; drop = FALSE keeps a data frame even when a
# single feature column remains.
x_train <- as.matrix(df_train_boot[, names(df_train_boot) != "y", drop = FALSE])
x_test <- as.matrix(df_test[, names(df_test) != "y", drop = FALSE])

In [16]:
# Labels as factors. The levels of `y_train` and `y_test` are derived
# independently from each table.
y_train <- as.factor(df_train_boot$y)
y_test <- as.factor(df_test$y)

In [37]:
# Balanced training set: every label should now have the same number of rows
table(y_train)

y_train
           B   CD14+ Mono        CD8 T           DC FCGR3A+ Mono Memory CD4 T 
         100          100          100          100          100          100 
 Naive CD4 T           NK     Platelet 
         100          100          100 

Train model with default values

In [18]:
# Fit the forest on the balanced data. No tuning arguments are passed, so the
# package defaults apply.
# NOTE(review): no set.seed() call is visible before this fit, so the model
# presumably differs between runs — confirm.
rf_model <- randomForest(x_train, y_train)

In [23]:
# Persist the fitted model; RF_classifier() reads it back from this same path.
saveRDS(rf_model, "repr_data/classifiers/scRF/random_forest_model.rds")

In [24]:
# Reload the saved model from disk into `rfModel`.
# Note: RF_classifier() reads the file into its own local `rfModel`, so it
# does not depend on this variable.
rfModel <- readRDS("repr_data/classifiers/scRF/random_forest_model.rds")

# Format the Classifier
To work with adverSCarial, the classifier needs to be formatted in a specific way.

In [39]:
# Random forest classifier wrapped in the (expr, clusters, target) form used
# with adverSCarial.
#
# Args:
#   expr:     table passed to predict() for the saved model, one row per cell.
#   clusters: vector with one cluster label per row of `expr`.
#   target:   the cluster label to classify.
#
# Returns:
#   When no cell belongs to `target`: the character vector
#   c("UNDETERMINED", "1").
#   Otherwise a list with
#     prediction:      most frequent predicted cell type among the target cells
#     odd:             always 1
#     typePredictions: data frame of class probabilities for every cell in
#                      `expr`, cell types in rows and cells in columns
#     cellTypes:       predicted cell type of each target cell
RF_classifier <- function(expr, clusters, target){
    # which() also discards NA cluster labels, which would otherwise make the
    # guard below fail and add NA rows to the selection.
    target_rows <- which(clusters == target)
    # Check first: with nothing to classify there is no need to load the
    # model or predict on every cell.
    if (length(target_rows) == 0){
        return( c("UNDETERMINED",1))
    }
    library(randomForest)
    # Load the model
    rfModel <- readRDS("repr_data/classifiers/scRF/random_forest_model.rds")
    # Predict with the model
    predictions <- predict(rfModel, expr, type="prob")
    # drop = FALSE keeps a one-row matrix when the cluster holds a single
    # cell, so apply() still works.
    target_predictions <- predictions[target_rows, , drop = FALSE]
    # Get the cell type with the highest probability for each target cell
    # (first one on ties)
    cell_types <- apply(target_predictions, 1, function(x){
        names(x)[which.max(x)]
    })
    # Kept local: a `<<-` assignment here would overwrite a global variable
    # on every call.
    table_cell_type <- table(cell_types)
    # Most frequent predicted cell type (first one on ties)
    str_class <- names(table_cell_type)[which.max(table_cell_type)]
    resSCtype <- list(
        # Cell type prediction for the cluster
        prediction=str_class,
        # Score of the predicted cell type
        odd=1,
        # Score for each cell type for each cell
        typePredictions=as.data.frame(t(predictions)),
        # Cell type for each cell
        cellTypes=cell_types)

    return(resSCtype)
}

In [26]:
# Split the test table into the classifier inputs: the expression values
# (every column except `y`) and the per-cell cluster labels, named by cell.
# A logical mask is used rather than `-which(...)`, which would select zero
# columns if `y` were absent.
df_pbmc_test <- df_test
expr_df <- df_pbmc_test[, names(df_pbmc_test) != "y", drop = FALSE]
clusters_df <- df_pbmc_test$y
names(clusters_df) <- rownames(df_pbmc_test)

Classification of a cluster.

In [40]:
# Classify the test cells whose cluster label is "NK".
myPreds <- RF_classifier(expr_df, clusters_df, "NK")

In [34]:
# Cell type predicted for the target cluster (the most frequent per-cell prediction)
myPreds$prediction

In [35]:
# Score for each cell type for each cell: cell types are in rows and cells in
# columns; show the first five of each
myPreds$typePredictions[1:5,1:5]

Unnamed: 0_level_0,AAACATACAACCAC-1,AAACATTGATCAGC-1,AAACGCACTGGTAC-1,AAATGTTGCCACAA-1,AACACGTGCAGAGG-1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
B,0.052,0.076,0.102,0.08,0.106
CD14+ Mono,0.016,0.04,0.038,0.054,0.034
CD8 T,0.254,0.19,0.198,0.242,0.228
DC,0.012,0.018,0.022,0.04,0.03
FCGR3A+ Mono,0.008,0.02,0.02,0.036,0.026


In [36]:
# Predicted cell type for the first few cells of the target cluster
head(myPreds$cellTypes)

## Check the prediction for each cell type

In [41]:
# Run the classifier once per cluster label found in the test set and print
# the cell type it assigns to that cluster.
# Note: `myPreds` is overwritten on every iteration, so after the loop it
# holds the result for the last cluster only.
for ( myClust in unique(clusters_df)){
    myPreds <- RF_classifier(expr_df, clusters_df, myClust)
    message("Cell type: ", myClust, ", prediction: ", myPreds$prediction)
}


Cell type: Memory CD4 T, prediction: Memory CD4 T

Cell type: B, prediction: B

Cell type: CD14+ Mono, prediction: CD14+ Mono

Cell type: NK, prediction: NK

Cell type: CD8 T, prediction: CD8 T

Cell type: Naive CD4 T, prediction: Naive CD4 T

Cell type: FCGR3A+ Mono, prediction: FCGR3A+ Mono

Cell type: DC, prediction: DC

Cell type: Platelet, prediction: Platelet



## The function RF_classifier can be used as a classifier with the adverSCarial package functions

In [38]:
# Print the R version, platform and attached packages for this session.
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Paris
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_1.1.2          randomForest_4.7-1.1

loaded via a namespace (and not attached):
 [1] digest_0.6.31    IRdisplay_1.1    R6_2.5.1         utf8_1.2.4      
 [5] base64enc_0.1-3  fastmap_1.1.1    tidyselect