# Annotations

## Setup

In [2]:
# LOAD LIBRARIES
library(Seurat)
library(tidyverse)
library(future)
library(ggplot2)
library(dplyr)
library(presto)
library(cowplot)
library(tictoc)

library(enrichR)
library(GPTCelltype)
library(openai)

In [3]:
# Load
load.data <- function(
    file_name,
    output = F,
    reduced.output = T 
) {
    if (output | reduced.output) {
        print(paste("Loading data for time point:", file_name))
    }

    # Load the data
    sc_data <- Read10X(data.dir = paste(path_to_data, "expression_", file_name, sep = ""), gene.column = 1)

    # Create Seurat object
    sc_data <- CreateSeuratObject(counts = sc_data, min.cells = 3, min.features = 500, project = file_name, names.delim = "-", names.field = 2)

    # Normalize the data
    sc_data <- NormalizeData(sc_data, normalization.method = "LogNormalize", scale.factor = 1e6, verbose = output)

    # Find variable features
    sc_data <- FindVariableFeatures(sc_data, selection.method = "mvp", nfeatures = 2000, verbose = output)

    # Scale the data
    sc_data <- ScaleData(sc_data, verbose = output)

    return(sc_data)
}

# Cluster
PCA.cluster <- function(
    data = sc_data_scaled, 
    file_name = timepoints[time_point], 
    res = 1, 
    n_dim = 40, 
    save = F,
    output = F,
    reduced.output = T    
) {  
    if (output | reduced.output) {
        print(paste("Running PCA and clustering for time point:", file_name))
        print(paste("- Resolution:", res))
        print(paste("- Dimensions:", n_dim))
    }
        
    # PCA
    data <- RunPCA(data, npcs = n_dim, verbose = output)
    #print(ElbowPlot(object = data, ndims = 50))

    # Cluster the cells
    data <- FindNeighbors(data, dims = 1:n_dim, verbose = output)
    data <- FindClusters(data, resolution = res, verbose = output)
    
    #print(table(Idents(data)))

    # Save the PCA plot
    if (save) {
        name_new_dir <- paste(name_new_dir_partial, "/", file_name, "/cluster", param, sep="")
        if (!dir.exists(name_new_dir)) {dir.create(name_new_dir)} 
    
        print(paste("Saving PCA for time point", file_name, "in", name_new_dir))
        save(data, file = paste(name_new_dir, "/PCA_res_", res, "_dim_", n_dim, "_", file_name, ".Robj", sep=""))
    }
    return(data)
}

# FIND ALL MARKERS
cluster.markers <- function(
    data, 
    file_name = timepoints[time_point],
    output = F,
    reduced.output = T
) {
    if (output | reduced.output) {
        print(paste("Finding all markers for time point:", file_name))
    }

    # Find all markers for every cluster compared to all remaining cells
    markers <- FindAllMarkers(data,
                              only.pos = TRUE,   # Considera solo i marker espressi positivamente
                              min.pct = 0.25,    # Percentuale minima di espressione nelle cellule del cluster
                              logfc.threshold = 0.25,  # Soglia minima di LogFC
                              verbose = output)
        
    return(markers)
}

In [4]:
tic("Setup Clusterization and Markers")

# SET UP NAMES
timepoints <- c("23days", "1month", "1.5month", "2month", "3month", "4month", "5month", "6month")
housekeeping_genes <- c("ACTB", "DLG4")
genes_of_interest <- c("SRCIN1", "KIAA1217", "CIT")
path_to_data <- "/sharedFolder/Data/"

dir_annotations <- "Results/Annotations"
if (!dir.exists(dir_annotations)) {dir.create(dir_annotations)}

f_name <- "23days"
clustering_resolution <- 0.5
n_of_dimesnions <- 20

# Load data
sc_data_scaled <- load.data(file_name = f_name, reduced.output = T)

# Clusterize
sc_data <- PCA.cluster(
    data = sc_data_scaled, 
    file_name = f_name, 
    res = clustering_resolution, 
    n_dim = n_of_dimesnions,
    reduced.output = T
)

# Find Markers
cluster_markers <- cluster.markers(data = sc_data, file_name = f_name, output = F, reduced.output = T)

toc()

[1] "Loading data for time point: 23days"
[1] "Running PCA and clustering for time point: 23days"
[1] "- Resolution: 0.5"
[1] "- Dimensions: 20"
[1] "Finding all markers for time point: 23days"
Setup Clusterization and Markers: 95.965 sec elapsed


In [5]:
print(table(Idents(sc_data)))


   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
6742 2736 2618 2448 2042 2037 1941 1810 1800 1792 1203 1025  727  508  116  106 
  16 
  85 


In [6]:
length(table(Idents(sc_data)))

## Main

### Base

In [7]:
summary(cluster_markers)
head(cluster_markers)

     p_val             avg_log2FC          pct.1            pct.2       
 Min.   :0.0000000   Min.   : 0.2500   Min.   :0.1670   Min.   :0.0000  
 1st Qu.:0.0000000   1st Qu.: 0.3978   1st Qu.:0.3250   1st Qu.:0.1770  
 Median :0.0000000   Median : 0.6083   Median :0.4350   Median :0.2760  
 Mean   :0.0001703   Mean   : 0.9779   Mean   :0.5015   Mean   :0.3412  
 3rd Qu.:0.0000000   3rd Qu.: 1.0635   3rd Qu.:0.6370   3rd Qu.:0.4470  
 Max.   :0.0099990   Max.   :13.8465   Max.   :1.0000   Max.   :0.9990  
                                                                        
   p_val_adj            cluster         gene          
 Min.   :0.0000000   7      :1339   Length:13730      
 1st Qu.:0.0000000   9      :1232   Class :character  
 Median :0.0000000   8      :1209   Mode  :character  
 Mean   :0.1003849   15     :1197                     
 3rd Qu.:0.0000002   16     :1140                     
 Max.   :1.0000000   12     : 965                     
                     (Other):66

Unnamed: 0_level_0,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,cluster,gene
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<chr>
DCT,0,1.780295,0.383,0.161,0,0,DCT
AMBN,0,1.767947,0.431,0.175,0,0,AMBN
HEY1,0,1.708512,0.464,0.202,0,0,HEY1
LINC01551,0,1.623801,0.767,0.311,0,0,LINC01551
OLFM3,0,1.543925,0.396,0.189,0,0,OLFM3
SFRP1,0,1.514941,0.968,0.51,0,0,SFRP1


In [8]:
cl_0 <- cluster_markers %>% filter(cluster %in% 0) 

In [9]:
nrow(cl_0)

In [10]:
a <- as.data.frame(summary(cluster_markers$cluster))
a

Unnamed: 0_level_0,summary(cluster_markers$cluster)
Unnamed: 0_level_1,<int>
0,679
1,885
2,638
3,453
4,666
5,837
6,510
7,1339
8,1209
9,1232


In [78]:
top_genes <- cluster_markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC) %>% as.data.frame()
nrow(top_genes)

In [79]:
head(top_genes)

Unnamed: 0_level_0,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,cluster,gene
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<chr>
1,0,1.780295,0.383,0.161,0,0,DCT
2,0,1.767947,0.431,0.175,0,0,AMBN
3,0,1.708512,0.464,0.202,0,0,HEY1
4,0,1.623801,0.767,0.311,0,0,LINC01551
5,0,1.543925,0.396,0.189,0,0,OLFM3
6,0,1.514941,0.968,0.51,0,0,SFRP1


### EnrichR

In [52]:
dbs <- listEnrichrDbs()
unique(dbs$libraryName)

In [6]:
# Install if needed
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) BiocManager::install("org.Hs.eg.db")
if (!requireNamespace("AnnotationDbi", quietly = TRUE)) BiocManager::install("AnnotationDbi")

library(org.Hs.eg.db)
library(AnnotationDbi)

Bioconductor version 3.16 (BiocManager 1.30.22), R 4.2.0 (2022-04-22)

Installing package(s) 'org.Hs.eg.db'

Old packages: 'abind', 'ape', 'BH', 'BiocManager', 'bit', 'bit64', 'bitops',
  'bslib', 'cachem', 'caTools', 'colorspace', 'commonmark', 'cowplot',
  'crayon', 'crosstalk', 'curl', 'data.table', 'DBI', 'deldir', 'DEoptimR',
  'digest', 'docopt', 'dotCall64', 'dqrng', 'farver', 'fastDummies', 'fastICA',
  'fastmap', 'fitdistrplus', 'FNN', 'fontawesome', 'fs', 'future',
  'future.apply', 'ggplot2', 'ggrepel', 'ggridges', 'globals', 'gplots',
  'gtable', 'gtools', 'hdf5r', 'highr', 'htmltools', 'htmlwidgets', 'httpuv',
  'igraph', 'kernlab', 'knitr', 'later', 'leiden', 'leidenbase', 'listenv',
  'littler', 'locfit', 'mathjaxr', 'MatrixModels', 'matrixStats', 'metap',
  'miniUI', 'mixtools', 'multcomp', 'munsell', 'mvtnorm', 'parallelly',
  'patchwork', 'pheatmap', 'plotly', 'plotrix', 'polyclip', 'progress',
  'progressr', 'promises', 'qlcMatrix', 'quantreg', 'R.oo', 'R.utils', 'RA

In [81]:
# Assuming your dataframe is called top_genes and you want to map the gene column named "gene"
# Example: top_genes$gene contains gene symbols for one cluster

# Convert gene symbols to Entrez IDs
gene_symbols <- top_genes$gene
entrez_ids <- suppressMessages(mapIds(org.Hs.eg.db, keys = gene_symbols, column = "ENTREZID", keytype = "SYMBOL", multiVals = "first"))

# Add Entrez IDs to your data.frame
top_genes$entrez <- entrez_ids

In [82]:
head(top_genes)

Unnamed: 0_level_0,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,cluster,gene,entrez
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<chr>,<chr>
1,0,1.780295,0.383,0.161,0,0,DCT,1638
2,0,1.767947,0.431,0.175,0,0,AMBN,258
3,0,1.708512,0.464,0.202,0,0,HEY1,23462
4,0,1.623801,0.767,0.311,0,0,LINC01551,387978
5,0,1.543925,0.396,0.189,0,0,OLFM3,118427
6,0,1.514941,0.968,0.51,0,0,SFRP1,6422


In [83]:
sum(is.na(top_genes$entrez)) # Number of genes not mapped

In [84]:
unmapped_genes <- top_genes[is.na(top_genes$entrez), "gene"]
unmapped_genes
top_genes[is.na(top_genes$entrez) & top_genes$gene %in% genes_of_interest, ]

p_val,avg_log2FC,pct.1,pct.2,p_val_adj,cluster,gene,entrez
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<chr>,<chr>


In [85]:
# Remove rows with NA Entrez IDs
top_genes <- top_genes[!is.na(top_genes$entrez), ]

In [59]:
?capture.output

0,1
capture.output {utils},R Documentation

0,1
...,Expressions to be evaluated.
file,"A file name or a connection, or NULL to return the output as a character vector. If the connection is not open, it will be opened initially and closed on exit."
append,"logical. If file a file name or unopened connection, append or overwrite?"
"type, split","are passed to sink(), see there."


In [86]:
tic()

genes_cluster <- top_genes %>% filter(cluster == 0) %>% pull(gene)
# Optionally, use Entrez IDs instead:
# genes_cluster <- top_genes %>% filter(cluster == cl) %>% pull(entrez)

# Perform enrichment analysis
enriched <- suppressMessages(suppressWarnings((enrichr(genes_cluster, databases = c(
        "Allen_Brain_Atlas_10x_scRNA_2021"#, 
        # "PanglaoDB_Augmented_2021", 
        # "CellMarker_Augmented_2021/2024", 
        # "Azimuth_Cell_Types_2021/2023", 
        # "Tabula_Sapiens/Tabula_Muris", 
        # "Descartes_Cell_Types_and_Tissue_2021", 
        # "HuBMAP_ASCT_plus_B_*", 
        # "Allen_Brain_Atlas_up/down"
)))))

toc()
#class(enriched)
head(enriched)
print("a")

Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
12.911 sec elapsed


Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>
Mouse 372 SMC up,4/73,0.0004046182,0.08264024,0,0,12.636407,98.722771,CCND2;COL4A2;FILIP1;SYNE2
Human Inh L2 PAX6 FREM2 up,7/328,0.0009632692,0.08264024,0,0,4.853016,33.705059,EFNB2;LINC01551;FOXG1;TRIM24;CACHD1;RAB8B;CNTLN
Human Inh L1 PAX6 MIR101-1 up,3/44,0.0011940188,0.08264024,0,0,15.798515,106.330809,EFNB2;TRIM24;CNTLN
Mouse 358 Astro up,4/100,0.0013222439,0.08264024,0,0,9.070055,60.120180,TTYH1;ID4;SOX9;SAT1
Mouse 356 Astro up,3/72,0.0048859972,0.23162675,0,0,9.374291,49.884183,TTYH1;SOX9;SAT1
Mouse 334 L6b/CT ENT up,2/26,0.0067368642,0.23162675,0,0,17.814516,89.075444,URI1;RAB8B
Mouse 31 Sncg up,8/606,0.0082013131,0.23162675,0,0,2.968823,14.260627,SOX2;RSL1D1;URI1;GAR1;GPATCH4;MPDZ;EIF3D;GNL3
Mouse 343 L6b CTX up,3/87,0.0082513789,0.23162675,0,0,7.694488,36.913342,EMX1;URI1;RAB8B
Mouse 268 ProS up,2/29,0.0083385630,0.23162675,0,0,15.832736,75.789160,EFNB2;RAB8B
Human Astro L1 FGFR3 SERPINI2 up,3/101,0.0123712614,0.27676560,0,0,6.590617,28.948487,RFX4;CACHD1;ZFP36L1


[1] "a"


In [87]:
head(enriched$Allen_Brain_Atlas_10x_scRNA_2021)

Unnamed: 0_level_0,Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>
1,Mouse 372 SMC up,4/73,0.0004046182,0.08264024,0,0,12.636407,98.72277,CCND2;COL4A2;FILIP1;SYNE2
2,Human Inh L2 PAX6 FREM2 up,7/328,0.0009632692,0.08264024,0,0,4.853016,33.70506,EFNB2;LINC01551;FOXG1;TRIM24;CACHD1;RAB8B;CNTLN
3,Human Inh L1 PAX6 MIR101-1 up,3/44,0.0011940188,0.08264024,0,0,15.798515,106.33081,EFNB2;TRIM24;CNTLN
4,Mouse 358 Astro up,4/100,0.0013222439,0.08264024,0,0,9.070055,60.12018,TTYH1;ID4;SOX9;SAT1
5,Mouse 356 Astro up,3/72,0.0048859972,0.23162675,0,0,9.374291,49.88418,TTYH1;SOX9;SAT1
6,Mouse 334 L6b/CT ENT up,2/26,0.0067368642,0.23162675,0,0,17.814516,89.07544,URI1;RAB8B


In [90]:
allen <- enriched$Allen_Brain_Atlas_10x_scRNA_2021 %>% as.data.table()
allen$cluster <- 0
nrow(allen)
head(allen)

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes,cluster
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>,<dbl>
Mouse 372 SMC up,4/73,0.0004046182,0.08264024,0,0,12.636407,98.72277,CCND2;COL4A2;FILIP1;SYNE2,0
Human Inh L2 PAX6 FREM2 up,7/328,0.0009632692,0.08264024,0,0,4.853016,33.70506,EFNB2;LINC01551;FOXG1;TRIM24;CACHD1;RAB8B;CNTLN,0
Human Inh L1 PAX6 MIR101-1 up,3/44,0.0011940188,0.08264024,0,0,15.798515,106.33081,EFNB2;TRIM24;CNTLN,0
Mouse 358 Astro up,4/100,0.0013222439,0.08264024,0,0,9.070055,60.12018,TTYH1;ID4;SOX9;SAT1,0
Mouse 356 Astro up,3/72,0.0048859972,0.23162675,0,0,9.374291,49.88418,TTYH1;SOX9;SAT1,0
Mouse 334 L6b/CT ENT up,2/26,0.0067368642,0.23162675,0,0,17.814516,89.07544,URI1;RAB8B,0


In [91]:
allel_human <- allen[grepl("Human", allen$Term, ignore.case = TRUE) & allen$Adjusted.P.value < 10e-2]
nrow(allel_human)
head(allel_human)
#head(allel_human$Term, 10)

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes,cluster
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>,<dbl>
Human Inh L2 PAX6 FREM2 up,7/328,0.0009632692,0.08264024,0,0,4.853016,33.70506,EFNB2;LINC01551;FOXG1;TRIM24;CACHD1;RAB8B;CNTLN,0
Human Inh L1 PAX6 MIR101-1 up,3/44,0.0011940188,0.08264024,0,0,15.798515,106.33081,EFNB2;TRIM24;CNTLN,0


In [65]:
annotation.enrichR <- function (
    markers = cluster_markers,
    top_n = 50,
    database = "Allen_Brain_Atlas_10x_scRNA_2021",
    output = NULL #or paste0("Results/Annotations/EnrichR_output_cluster", cl, ".txt")
) {
    tic("EnrichR")
    top_genes <- cluster_markers %>% group_by(cluster) %>% top_n(n = top_n, wt = avg_log2FC) %>% as.data.frame()

    # if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) BiocManager::install("org.Hs.eg.db")
    # if (!requireNamespace("AnnotationDbi", quietly = TRUE)) BiocManager::install("AnnotationDbi")
    # library(org.Hs.eg.db)
    # library(AnnotationDbi)
    # Convert gene symbols to Entrez IDs
    gene_symbols <- top_genes$gene
    entrez_ids <- suppressMessages(
        mapIds(org.Hs.eg.db, keys = gene_symbols, column = "ENTREZID", keytype = "SYMBOL", multiVals = "first")
    )
    # Add Entrez IDs to your data.frame
    top_genes$entrez <- entrez_ids
    # Remove rows with NA Entrez IDs
    top_genes <- top_genes[!is.na(top_genes$entrez), ]

    annotation_list <- list()
    
    for (cl in unique(top_genes$cluster)) {
        genes_cluster <- top_genes %>% filter(cluster == cl) %>% pull(entrez)
        # Optionally, use Entrez IDs instead:
        # genes_cluster <- top_genes %>% filter(cluster == cl) %>% pull(entrez)
        
        # Perform enrichment analysis
        enriched <- enrichr(genes_cluster, databases = "Allen_Brain_Atlas_10x_scRNA_2021")

        annotation <- enriched$Allen_Brain_Atlas_10x_scRNA_2021 %>% as.data.table()
        annotation$cluster <- cl

        annotation_list[[cl]] <- allen[grepl("Human", allen$Term, ignore.case = TRUE) & allen$Adjusted.P.value < 5e-2]
    }
    
    toc()

    return(annotation_list)
}

In [226]:
markers = cluster_markers
top_n = 200
database = "Allen_Brain_Atlas_10x_scRNA_2021"
output = NULL

In [227]:
top_genes <- cluster_markers %>% group_by(cluster) %>% top_n(n = top_n, wt = avg_log2FC) %>% as.data.frame()

In [228]:
gene_symbols <- top_genes$gene
entrez_ids <- suppressMessages(
    mapIds(org.Hs.eg.db, keys = gene_symbols, column = "ENTREZID", keytype = "SYMBOL", multiVals = "first")
)
# Add Entrez IDs to your data.frame
top_genes$entrez <- entrez_ids
# Remove rows with NA Entrez IDs
top_genes <- top_genes[!is.na(top_genes$entrez), ]

In [97]:
a <- unique(top_genes$cluster)[1]

In [101]:
as.numeric(a)+1

In [69]:
annotation_list <- list()

In [70]:
for (cl in unique(top_genes$cluster)) {
    genes_cluster <- top_genes %>% filter(cluster == cl) %>% pull(entrez)
    # Optionally, use Entrez IDs instead:
    # genes_cluster <- top_genes %>% filter(cluster == cl) %>% pull(entrez)
    
    # Perform enrichment analysis
    enriched <- enrichr(genes_cluster, databases = "Allen_Brain_Atlas_10x_scRNA_2021")

    annotation <- enriched$Allen_Brain_Atlas_10x_scRNA_2021 %>% as.data.table()
    annotation$cluster <- cl

    annotation_list[[cl]] <- allen[grepl("Human", allen$Term, ignore.case = TRUE) & allen$Adjusted.P.value < 50e-2]
}

Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing 

In [75]:
cl <- 0
genes_cluster <- top_genes %>% filter(cluster == cl) %>% pull(entrez)
# Optionally, use Entrez IDs instead:
# genes_cluster <- top_genes %>% filter(cluster == cl) %>% pull(entrez)

# Perform enrichment analysis
enriched <- enrichr(genes_cluster, databases = "Allen_Brain_Atlas_10x_scRNA_2021")

annotation <- enriched$Allen_Brain_Atlas_10x_scRNA_2021 %>% as.data.table()

Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.


In [77]:
enriched

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>


In [73]:
annotation_list$'0'

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>
Human Inh L2 PAX6 FREM2 up,7/328,0.0009632692,0.08264024,0,0,4.853016,33.705059,EFNB2;LINC01551;FOXG1;TRIM24;CACHD1;RAB8B;CNTLN
Human Inh L1 PAX6 MIR101-1 up,3/44,0.0011940188,0.08264024,0,0,15.798515,106.330809,EFNB2;TRIM24;CNTLN
Human Astro L1 FGFR3 SERPINI2 up,3/101,0.0123712614,0.2767656,0,0,6.590617,28.948487,RFX4;CACHD1;ZFP36L1
Human Inh L1-5 VIP CD27-AS1 up,2/37,0.0133465555,0.2767656,0,0,12.208909,52.69972,FOXG1;RAB8B
Human Inh L5-6 SST KLHL1 up,2/40,0.0154880314,0.2767656,0,0,11.24335,46.858773,ARX;RAB8B
Human Endo L2-5 NOSTRIN SRGN up,2/43,0.0177664394,0.29610732,0,0,10.419093,41.993569,YES1;SYNE2
Human Inh L5-6 PVALB FAM150B up,2/48,0.0218566319,0.3104164,0,0,9.284245,35.495998,FILIP1;CREB5
Human Astro L1-6 FGFR3 PLCG1 up,2/64,0.0372052475,0.37459555,0,0,6.882761,22.653269,TTYH1;RFX4
Human Exc L6 FEZF2 KLK7 down,1/11,0.0510387981,0.37523538,0,0,21.164894,62.969139,QKI
Human Exc L3-5 RORB LAMA4 down,1/12,0.0555488338,0.37523538,0,0,19.239845,55.612633,QKI


In [31]:
genes_cluster <- top_genes %>% filter(cluster == 1) %>% pull(entrez)
# Optionally, use Entrez IDs instead:
# genes_cluster <- top_genes %>% filter(cluster == cl) %>% pull(entrez)

# Perform enrichment analysis
enriched <- enrichr(genes_cluster, databases = "Allen_Brain_Atlas_10x_scRNA_2021")

Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.


In [32]:
enriched

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>


In [211]:
annotation <- enriched$Allen_Brain_Atlas_10x_scRNA_2021 %>% as.data.table()
annotation$cluster <- 1

ERROR: Error in enriched$Allen_Brain_Atlas_10x_scRNA_2021: $ operator is invalid for atomic vectors


In [66]:
enrichR_list <- annotation.enrichR()

Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing results... Done.
Uploading data to Enrichr... Done.
  Querying Allen_Brain_Atlas_10x_scRNA_2021... Done.
Parsing 

In [67]:
enrichR_list

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>

Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes
<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>


In [None]:
library(dplyr)
library(enrichR)

clusters <- unique(top_genes$cluster)
all_results <- list()

for (cl in clusters) {
  gene_list <- top_genes %>% filter(cluster == cl) %>% pull(gene)
  enr <- enrichr(gene_list, databases = "Allen_Brain_Atlas_10x_scRNA_2021")
  res <- enr$Allen_Brain_Atlas_10x_scRNA_2021
  res$cluster <- cl
  all_results[[cl]] <- res
}

final_df <- bind_rows(all_results)
write.csv(final_df, "enrichr_all_clusters.csv", row.names = FALSE)

In [188]:
capture.output(enrichR_list, file = "aa.pdf")

In [122]:
enrichR_list[[2]]$Term

In [None]:
tic("Perform enrichment analysis")
for (cl in unique(top_genes$cluster)) {
    genes_cluster <- top_genes %>% filter(cluster == cl) %>% pull(entrez)
    # Optionally, use Entrez IDs instead:
    # genes_cluster <- top_genes %>% filter(cluster == cl) %>% pull(entrez)
    
    # Perform enrichment analysis
    enriched <- enrichr(genes_cluster, databases = "Allen_Brain_Atlas_10x_scRNA_2021")
    

}

toc()

In [35]:
?enrichr

0,1
enrichr {enrichR},R Documentation

0,1
genes,"(Required). Character vector of Entrez gene symbols as input. A data.frame of gene symbols in first column is also acceptable, optionally a score denoting the degree of membership between 0 and 1 in the second column."
databases,(Required). Character vector of databases to search. See https://maayanlab.cloud/Enrichr/ for available databases.
background,"(Optional). Character vector of Entrez gene symbols to be used as background. A data.frame of gene symbols in first column is also acceptable. Default is ""NULL"". Enrichment analysis with background genes is only available on the main site (Enrichr). Also, it is using a different API service (Speedrichr), hence it is a little slower to complete and return the results."
include_overlap,"(Optional). Download database in GMT format to include 'Overlap' in the resulting data.frame when analysing with a background. Default is ""FALSE""."
sleepTime,(Optional) Time to wait (in seconds) between sending requests to the server to prevent the same results being returned as the previous request. Default is 1.


### singleR

In [1]:
BiocManager::install("devtools")

Bioconductor version 3.16 (BiocManager 1.30.22), R 4.2.0 (2022-04-22)

Installing package(s) 'devtools'

also installing the dependencies ‘credentials’, ‘zip’, ‘gitcreds’, ‘ini’, ‘diffobj’, ‘gert’, ‘gh’, ‘whisker’, ‘downlit’, ‘httr2’, ‘rmarkdown’, ‘xopen’, ‘brew’, ‘brio’, ‘evaluate’, ‘praise’, ‘waldo’, ‘usethis’, ‘desc’, ‘pkgbuild’, ‘pkgdown’, ‘pkgload’, ‘profvis’, ‘rcmdcheck’, ‘roxygen2’, ‘rversions’, ‘sessioninfo’, ‘testthat’, ‘urlchecker’


Old packages: 'abind', 'ape', 'BH', 'BiocManager', 'bit', 'bit64', 'bitops',
  'bslib', 'cachem', 'caTools', 'colorspace', 'commonmark', 'cowplot',
  'crayon', 'crosstalk', 'curl', 'data.table', 'DBI', 'deldir', 'DEoptimR',
  'digest', 'docopt', 'dotCall64', 'dqrng', 'farver', 'fastDummies', 'fastICA',
  'fastmap', 'fitdistrplus', 'FNN', 'fontawesome', 'fs', 'future',
  'future.apply', 'ggplot2', 'ggrepel', 'ggridges', 'globals', 'gplots',
  'gtable', 'gtools', 'hdf5r', 'highr', 'htmltools', 'htmlwidgets', 'httpuv',
  'igraph', 'kernlab', 'knitr'

In [2]:
devtools::install_github('dviraran/SingleR')
# this might take long, though mostly because of the installation of Seurat.

Downloading GitHub repo dviraran/SingleR@HEAD



Error in utils::download.file(url, path, method = method, quiet = quiet,  : 
  download from 'https://api.github.com/repos/dviraran/SingleR/tarball/HEAD' failed


In [4]:
BiocManager::install("SingleR")

Bioconductor version 3.16 (BiocManager 1.30.22), R 4.2.0 (2022-04-22)

Installing package(s) 'SingleR'

also installing the dependencies ‘ScaledMatrix’, ‘rsvd’, ‘BiocSingular’, ‘BiocNeighbors’


Old packages: 'abind', 'ape', 'BH', 'BiocManager', 'bit', 'bit64', 'bitops',
  'bslib', 'cachem', 'caTools', 'colorspace', 'commonmark', 'cowplot',
  'crayon', 'crosstalk', 'curl', 'data.table', 'DBI', 'deldir', 'DEoptimR',
  'digest', 'docopt', 'dotCall64', 'dqrng', 'farver', 'fastDummies', 'fastICA',
  'fastmap', 'fitdistrplus', 'FNN', 'fontawesome', 'fs', 'future',
  'future.apply', 'ggplot2', 'ggrepel', 'ggridges', 'globals', 'gplots',
  'gtable', 'gtools', 'hdf5r', 'highr', 'htmltools', 'htmlwidgets', 'httpuv',
  'igraph', 'kernlab', 'knitr', 'later', 'leiden', 'leidenbase', 'listenv',
  'littler', 'locfit', 'mathjaxr', 'MatrixModels', 'matrixStats', 'metap',
  'miniUI', 'mixtools', 'multcomp', 'munsell', 'mvtnorm', 'parallelly',
  'patchwork', 'pheatmap', 'plotly', 'plotrix', 'polyclip', 

In [5]:
library(SingleR)

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges

### GPTCelltype

In [None]:
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

In [138]:
Sys.setenv(OPENAI_API_KEY = "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")

In [142]:
openai::list_models()

Unnamed: 0_level_0,id,object,created,owned_by
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>
1,text-embedding-ada-002,model,1671217299,openai-internal
2,whisper-1,model,1677532384,openai-internal
3,gpt-3.5-turbo,model,1677610602,openai
4,tts-1,model,1681940951,openai-internal
5,gpt-3.5-turbo-16k,model,1683758102,openai-internal
6,davinci-002,model,1692634301,system
7,babbage-002,model,1692634615,system
8,gpt-3.5-turbo-instruct,model,1692901427,system
9,gpt-3.5-turbo-instruct-0914,model,1694122472,system
10,dall-e-3,model,1698785189,system


In [154]:
nrow(cluster_markers)
dt <- cluster_markers[cluster_markers$cluster %in% c(0:10),]
nrow(dt)

In [167]:
length(unique(dt$cluster))

In [189]:
dt <- cluster_markers %>% group_by(cluster) %>% top_n(n = 50, wt = avg_log2FC) %>% as.data.frame()
nrow(dt)
dt <- dt[dt$cluster %in% c(0),]
nrow(dt)

In [190]:
# Suppose df is your FindAllMarkers result
csv_string <- paste(capture.output(write.csv(dt, row.names = FALSE)), collapse = "\n")
char_count <- nchar(csv_string)
token_estimate <- char_count / 4
cat("Estimated tokens:", token_estimate, "\n")

Estimated tokens: 837.5 


In [163]:
# Suppose df is your FindAllMarkers result
csv_string <- paste(capture.output(write.csv(cluster_markers, row.names = FALSE)), collapse = "\n")
char_count <- nchar(csv_string)
token_estimate <- char_count / 4
cat("Estimated tokens:", token_estimate, "\n")

Estimated tokens: 272570.2 


In [141]:
library(GPTCelltype)
library(openai)

tic()
res <- gptcelltype(
    cluster_markers, tissuename = "Human brain",
    model = "gpt-4"
)
toc()

[1] "Note: OpenAI API key found: returning the cell type annotations."


ERROR: Error: OpenAI API request failed [429]:

You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.


In [143]:
openai::num_tokens_from_messages(
  messages = list(list(role = "user", content = "how can i calculate how many tocken will i need to do soemthing with the gpt 4 api? I what to use gptcelltype to annotate some cells. I need to do it a few times")),
  model = "gpt-4"
)

ERROR: Error: 'num_tokens_from_messages' is not an exported object from 'namespace:openai'


In [132]:
?gptcelltype

0,1
gptcelltype {GPTCelltype},R Documentation

0,1
input,"Either the differential gene table returned by Seurat FindAllMarkers() function, or a list of genes."
tissuename,Optional input of tissue name.
model,A valid GPT-4 or GPT-3.5 model name list on https://platform.openai.com/docs/models. Default is 'gpt-4-32k'.
topgenenumber,Number of top differential genes to be used if input is Seurat differential genes.
openai_key,"The OpenAI key obtained from https://platform.openai.com/account/api-keys The default is NA, which will resulting outputing the prompt itself. If an actual key is provided, then the output will be the celltype annotations from the GPT model specified by the user."
