In [30]:
# 载入需要的R包
# Load the required packages
library(stringr)
library(tidyverse)
library(data.table)
library(ape)
library(kmer)
library(amap)
library(igraph)
library(pheatmap)
library(psych)
library(Seurat)
library(Biostrings)
library(reshape2)
library(ggplot2)
library(dplyr)

# Collection and processing of RNA editing sites datasets

In [2]:
# 设置RNA编辑位点的侧翼序列长度，
# setting length of flanking sequence of C-to-U RNA editing sites
# Number of nucleotides kept on each side of the candidate cytidine. It is passed
# to extract_seq() as base_len and embedded in every output FASTA file name below.
flank_len <- 100

In [3]:
# 读取来自REDIdb 3.0数据库的RNA编辑位点的信息
# load the detailed information of chloroplast RNA editing sites collected in REDIdb 3.0
# Gene-level table: one genomic (base_seq) and one edited cDNA (cdna_seq) sequence
# per Genbank accession / organism / gene. `header = TRUE` is spelled out instead of
# relying on partial argument matching (`head`) and the reassignable `T`.
dna <- read.csv("../REDIdb/REDIdb Table report.csv", header = TRUE) %>%
    dplyr::select(Genbank, Organism, Gene_name, base_seq, cdna_seq) %>%
    unique()
# Site-level table: one row per editing event. The join keys are stated explicitly
# (they are the columns dplyr picked implicitly) so that a schema change in either
# CSV cannot silently alter the join.
REDIdb <- read.csv("../REDIdb/REDIdb_Table2.csv") %>%
    inner_join(dna, by = c("Genbank", "Organism", "Gene_name")) %>%
    dplyr::select(-c(Number_of_events, Genomic.position, Genomic.codon,
                     Edited.codon, Genomic.AA, Edited.AA))

[1m[22mJoining with `by = join_by(Genbank, Organism, Gene_name)`


In [4]:
# Preview the first rows of the joined site-level table.
head(REDIdb)

Unnamed: 0_level_0,Genbank,Organism,Location,Gene_name,cDNA.position,base_seq,cdna_seq
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>
1,Z80874,Bazzania trilobata,chloroplast,ndhB,383,TTTGTATCTCTAGAATGTTTAAGTTTATGTTCATATTTATTATCTGGTTATACTAAAAAAGACGTTCGATCTAACGAAGCTGTTATGAAATATTTACTTATAGGTGGAACAAGTTCATCTATATTAATTTATGGTTTTTCTTGGTTATATGGCTTATCGGGAGGAGAATTTCAACTTCAAAAAATAGCGAATGGACTTATAAATACAGAAATGTATAATTCTTCGGGAACTTTACTTGGACTTATATTTATCATTGCAGGAATTGGATTTAAACTTTCCTTAGTACCTTTTCATCAATGGACTCCCGATGTATATGAAGGATCCCCCACCCCAGTTGTTGCTTTTCTTTCGGTTGCTTCGAAAATAGCAGGTTTAGCTTTATCAGCTCGTCCCTTGAATATTGTTTTTCCCTTTCTTTTTAACCAATGGCATTTTATTCTAGAA,TTTGTATCTCTAGAATGTTTAAGTTTATGTTCATATTTATTATCTGGTTATACTAAAAAAGACGTTCGATCTAACGAAGCTGTTATGAAATATTTACTTATAGGTGGAACAAGTTCATCTATATTAATTTATGGTTTTTCTTGGTTATATGGCTTATCGGGAGGAGAATTTCAACTTCAAAAAATAGCGAATGGACTTATAAATACAGAAATGTATAATTCTTCGGGAACTTTACTTGGACTTATATTTATCATTGCAGGAATTGGATTTAAACTTTCCTTAGTACCTTTTCATCAATGGACTCCCGATGTATATGAAGGATCCCCCACCCCAGTTGTTGCTTTTCTTTCGGTTGCTTCGAAAATAGCAGGTTTAGCTTTATTAGCTCGTCTCTTGAATATTGTTTTTCCCTTTCTTTTTAACCAATGGCATTTTATTCTAGAA
2,Z80874,Bazzania trilobata,chloroplast,ndhB,392,TTTGTATCTCTAGAATGTTTAAGTTTATGTTCATATTTATTATCTGGTTATACTAAAAAAGACGTTCGATCTAACGAAGCTGTTATGAAATATTTACTTATAGGTGGAACAAGTTCATCTATATTAATTTATGGTTTTTCTTGGTTATATGGCTTATCGGGAGGAGAATTTCAACTTCAAAAAATAGCGAATGGACTTATAAATACAGAAATGTATAATTCTTCGGGAACTTTACTTGGACTTATATTTATCATTGCAGGAATTGGATTTAAACTTTCCTTAGTACCTTTTCATCAATGGACTCCCGATGTATATGAAGGATCCCCCACCCCAGTTGTTGCTTTTCTTTCGGTTGCTTCGAAAATAGCAGGTTTAGCTTTATCAGCTCGTCCCTTGAATATTGTTTTTCCCTTTCTTTTTAACCAATGGCATTTTATTCTAGAA,TTTGTATCTCTAGAATGTTTAAGTTTATGTTCATATTTATTATCTGGTTATACTAAAAAAGACGTTCGATCTAACGAAGCTGTTATGAAATATTTACTTATAGGTGGAACAAGTTCATCTATATTAATTTATGGTTTTTCTTGGTTATATGGCTTATCGGGAGGAGAATTTCAACTTCAAAAAATAGCGAATGGACTTATAAATACAGAAATGTATAATTCTTCGGGAACTTTACTTGGACTTATATTTATCATTGCAGGAATTGGATTTAAACTTTCCTTAGTACCTTTTCATCAATGGACTCCCGATGTATATGAAGGATCCCCCACCCCAGTTGTTGCTTTTCTTTCGGTTGCTTCGAAAATAGCAGGTTTAGCTTTATTAGCTCGTCTCTTGAATATTGTTTTTCCCTTTCTTTTTAACCAATGGCATTTTATTCTAGAA
3,Z80873,Phaseolus vulgaris,chloroplast,ndhB,86,TTTGTAGCTCTAGAATGTTTCAGTTTATGTTCCTATCTACTATCTGGATATACCAAGAAAGATGTACGGTCTAATGAGGCTACTACGAAATATTTACTCATGGGTGGGGCAAGCTCTTCTATTCTGGTTCATGGTTTCTCTTGGCTATATGGTTCATCCGGGGGAGAGATCGAGCTTCAAGAAATAGTGAATGGTCTTATCAATACACAAATGTATAACTCCCCAGGAATTTTAATTGCACTTTTATTCATCACTGTAGGAATTGGGTTCAAGCTTTCCCCAGCCCCTTCTCATCAATGGACTCCTGACGTATACGAAGGATCTCCCACTCCAGTCGTTGCTTTTCTTTCTGTTACTTCGAAAGTAGCTGCTTCAGCTTCAGCCACTCGAATTTTCGATATCCCTTTTTATTTCTCATCAAACGAATGGCATCTTCTTCTGGAA,TTTGTAGCTCTAGAATGTTTCAGTTTATGTTCCTATCTACTATCTGGATATACCAAGAAAGATGTACGGTCTAATGAGGCTACTATGAAATATTTACTCATGGGTGGGGCAAGCTCTTCTATTCTGGTTTATGGTTTCTCTTGGCTATATGGTTCATCCGGGGGAGAGATCGAGCTTCAAGAAATAGTGAATGGTCTTATCAATACACAAATGTATAACTCCCCAGGAATTTTAATTGCACTTTTATTCATCACTGTAGGAATTGGGTTCAAGCTTTCCCTAGCCCCTTTTCATCAATGGACTCCTGACGTATACGAAGGATCTCCCACTCCAGTCGTTGCTTTTCTTTCTGTTACTTCGAAAGTAGCTGCTTTAGCTTTAGCCACTCGAATTTTCGATATCCCTTTTTATTTCTCATCAAACGAATGGCATCTTCTTCTGGAA
4,Z80873,Phaseolus vulgaris,chloroplast,ndhB,130,TTTGTAGCTCTAGAATGTTTCAGTTTATGTTCCTATCTACTATCTGGATATACCAAGAAAGATGTACGGTCTAATGAGGCTACTACGAAATATTTACTCATGGGTGGGGCAAGCTCTTCTATTCTGGTTCATGGTTTCTCTTGGCTATATGGTTCATCCGGGGGAGAGATCGAGCTTCAAGAAATAGTGAATGGTCTTATCAATACACAAATGTATAACTCCCCAGGAATTTTAATTGCACTTTTATTCATCACTGTAGGAATTGGGTTCAAGCTTTCCCCAGCCCCTTCTCATCAATGGACTCCTGACGTATACGAAGGATCTCCCACTCCAGTCGTTGCTTTTCTTTCTGTTACTTCGAAAGTAGCTGCTTCAGCTTCAGCCACTCGAATTTTCGATATCCCTTTTTATTTCTCATCAAACGAATGGCATCTTCTTCTGGAA,TTTGTAGCTCTAGAATGTTTCAGTTTATGTTCCTATCTACTATCTGGATATACCAAGAAAGATGTACGGTCTAATGAGGCTACTATGAAATATTTACTCATGGGTGGGGCAAGCTCTTCTATTCTGGTTTATGGTTTCTCTTGGCTATATGGTTCATCCGGGGGAGAGATCGAGCTTCAAGAAATAGTGAATGGTCTTATCAATACACAAATGTATAACTCCCCAGGAATTTTAATTGCACTTTTATTCATCACTGTAGGAATTGGGTTCAAGCTTTCCCTAGCCCCTTTTCATCAATGGACTCCTGACGTATACGAAGGATCTCCCACTCCAGTCGTTGCTTTTCTTTCTGTTACTTCGAAAGTAGCTGCTTTAGCTTTAGCCACTCGAATTTTCGATATCCCTTTTTATTTCTCATCAAACGAATGGCATCTTCTTCTGGAA
5,Z80873,Phaseolus vulgaris,chloroplast,ndhB,281,TTTGTAGCTCTAGAATGTTTCAGTTTATGTTCCTATCTACTATCTGGATATACCAAGAAAGATGTACGGTCTAATGAGGCTACTACGAAATATTTACTCATGGGTGGGGCAAGCTCTTCTATTCTGGTTCATGGTTTCTCTTGGCTATATGGTTCATCCGGGGGAGAGATCGAGCTTCAAGAAATAGTGAATGGTCTTATCAATACACAAATGTATAACTCCCCAGGAATTTTAATTGCACTTTTATTCATCACTGTAGGAATTGGGTTCAAGCTTTCCCCAGCCCCTTCTCATCAATGGACTCCTGACGTATACGAAGGATCTCCCACTCCAGTCGTTGCTTTTCTTTCTGTTACTTCGAAAGTAGCTGCTTCAGCTTCAGCCACTCGAATTTTCGATATCCCTTTTTATTTCTCATCAAACGAATGGCATCTTCTTCTGGAA,TTTGTAGCTCTAGAATGTTTCAGTTTATGTTCCTATCTACTATCTGGATATACCAAGAAAGATGTACGGTCTAATGAGGCTACTATGAAATATTTACTCATGGGTGGGGCAAGCTCTTCTATTCTGGTTTATGGTTTCTCTTGGCTATATGGTTCATCCGGGGGAGAGATCGAGCTTCAAGAAATAGTGAATGGTCTTATCAATACACAAATGTATAACTCCCCAGGAATTTTAATTGCACTTTTATTCATCACTGTAGGAATTGGGTTCAAGCTTTCCCTAGCCCCTTTTCATCAATGGACTCCTGACGTATACGAAGGATCTCCCACTCCAGTCGTTGCTTTTCTTTCTGTTACTTCGAAAGTAGCTGCTTTAGCTTTAGCCACTCGAATTTTCGATATCCCTTTTTATTTCTCATCAAACGAATGGCATCTTCTTCTGGAA
6,Z80873,Phaseolus vulgaris,chloroplast,ndhB,290,TTTGTAGCTCTAGAATGTTTCAGTTTATGTTCCTATCTACTATCTGGATATACCAAGAAAGATGTACGGTCTAATGAGGCTACTACGAAATATTTACTCATGGGTGGGGCAAGCTCTTCTATTCTGGTTCATGGTTTCTCTTGGCTATATGGTTCATCCGGGGGAGAGATCGAGCTTCAAGAAATAGTGAATGGTCTTATCAATACACAAATGTATAACTCCCCAGGAATTTTAATTGCACTTTTATTCATCACTGTAGGAATTGGGTTCAAGCTTTCCCCAGCCCCTTCTCATCAATGGACTCCTGACGTATACGAAGGATCTCCCACTCCAGTCGTTGCTTTTCTTTCTGTTACTTCGAAAGTAGCTGCTTCAGCTTCAGCCACTCGAATTTTCGATATCCCTTTTTATTTCTCATCAAACGAATGGCATCTTCTTCTGGAA,TTTGTAGCTCTAGAATGTTTCAGTTTATGTTCCTATCTACTATCTGGATATACCAAGAAAGATGTACGGTCTAATGAGGCTACTATGAAATATTTACTCATGGGTGGGGCAAGCTCTTCTATTCTGGTTTATGGTTTCTCTTGGCTATATGGTTCATCCGGGGGAGAGATCGAGCTTCAAGAAATAGTGAATGGTCTTATCAATACACAAATGTATAACTCCCCAGGAATTTTAATTGCACTTTTATTCATCACTGTAGGAATTGGGTTCAAGCTTTCCCTAGCCCCTTTTCATCAATGGACTCCTGACGTATACGAAGGATCTCCCACTCCAGTCGTTGCTTTTCTTTCTGTTACTTCGAAAGTAGCTGCTTTAGCTTTAGCCACTCGAATTTTCGATATCCCTTTTTATTTCTCATCAAACGAATGGCATCTTCTTCTGGAA


In [5]:
# 提取RNA编辑信息
# extracting RNA editing type
# For every site, read the nucleotide at cDNA.position from the genomic sequence
# (base) and from the edited cDNA sequence (cdna). substr() is vectorised over
# its arguments, so no per-row loop is needed and a zero-row table yields a
# zero-row result instead of an error.
Detail <- data.frame(
    base = substr(REDIdb$base_seq, REDIdb$cDNA.position, REDIdb$cDNA.position),
    cdna = substr(REDIdb$cdna_seq, REDIdb$cDNA.position, REDIdb$cDNA.position)
)
# Substitution label, e.g. "C->T".
Detail$Detail <- paste0(Detail$base, "->", Detail$cdna)

In [6]:
# 选择C->U类型的RNA编辑位点进行分析
# Keep only sites whose genomic-to-cDNA difference is a C-to-T substitution (C-to-U in mRNA, written as C-to-T in DNA) for sequence feature extraction.
# Attach the per-site substitution columns (base, cdna, Detail), keep the rows
# labelled "C->T", and drop exact duplicate rows.
REDIdb <- unique(
    dplyr::filter(data.frame(REDIdb, Detail), Detail == "C->T")
)

In [7]:
# 对于混合的碱基，用N进行表示
# Ambiguous (mixed-base) symbols in the genomic sequence, e.g. M (adenine/cytosine), are replaced with N.
#' Mask every non-A/C/G/T symbol in nucleotide sequences with "N"
#'
#' Sequences are upper-cased first, then any character other than A, T, G or C
#' is replaced by "N".
#'
#' @param sequence Character vector of nucleotide sequences (coerced with
#'   as.character if needed).
#' @return Character vector of the same length as `sequence`. Empty strings
#'   stay empty and a zero-length input gives `character(0)`; NA stays NA.
ChangeN <- function(sequence){
    # One vectorised regex pass replaces the character-by-character loop, whose
    # `1:length(x)` index turned an empty sequence into "N".
    gsub("[^ATGC]", "N", toupper(sequence))
}

In [8]:
# 提取RNA编辑位点的侧翼序列
# For each RNA editing site, extract the flanking nucleotides (nt) of a given length on both sides of the edited cytidine.
#' Extract the flanking sequence around each candidate cytidine
#'
#' For every row of `data`, takes `base_len` nucleotides upstream and
#' `base_len` nucleotides downstream of `cDNA.position` in `base_seq`, pads
#' with "N" where the window runs past either end of the sequence, and removes
#' the central nucleotide itself. Windows in which at least 90% of the
#' positions are "N" are discarded, and only the first occurrence of each
#' distinct flanking sequence is kept.
#'
#' @param data Data frame with columns `Genbank`, `Organism`, `Gene_name`,
#'   `base_seq` and `cDNA.position` (1-based position within `base_seq`).
#' @param base_len Number of nucleotides kept on each side (>= 1).
#' @return Data frame with columns `Genbank`, `Org`, `gene`, `pos` and
#'   `final_base` (a string of length `2 * base_len`). A zero-row data frame
#'   with the same columns is returned when `data` is empty or no window
#'   passes the N filter.
extract_seq <- function(data, base_len=50){
    stopifnot(is.numeric(base_len), length(base_len) == 1, base_len >= 1)
    empty <- data.frame(Genbank = character(0), Org = character(0),
                        gene = character(0), pos = integer(0),
                        final_base = character(0))
    if(is.null(data) || nrow(data) == 0){
        return(empty)
    }
    base <- as.character(data$base_seq)
    pos <- data$cDNA.position
    if(anyNA(base) || anyNA(pos)){
        stop("extract_seq: base_seq and cDNA.position must not contain NA",
             call. = FALSE)
    }
    len <- nchar(base)
    start <- pos - base_len
    end <- pos + base_len
    # Window positions start..0 and (len+1)..end do not exist in the sequence;
    # they are filled with "N" so that every window has 2 * base_len + 1 symbols.
    left_pad <- strrep("N", pmax(1 - start, 0))
    right_pad <- strrep("N", pmax(end - len, 0))
    # substr() clamps a start below 1 to 1 and an end beyond the string length.
    window <- paste0(left_pad, substr(base, start, end), right_pad)
    # Drop the central nucleotide (window position base_len + 1).
    final_base <- paste0(substr(window, 1, base_len),
                         substr(window, base_len + 2, base_len * 2 + 1))
    n_count <- nchar(final_base) - nchar(gsub("N", "", final_base, fixed = TRUE))
    keep <- n_count / (base_len * 2) < 0.9
    if(!any(keep)){
        return(empty)
    }
    dataset <- data.frame(Genbank = data$Genbank[keep],
                          Org = data$Organism[keep],
                          gene = data$Gene_name[keep],
                          pos = pos[keep],
                          final_base = final_base[keep])
    dataset <- dataset[!duplicated(dataset$final_base),]
    return(dataset)
}

In [9]:
# Mask ambiguous symbols in both sequence columns, then give every site a key of
# the form <Organism>_<Gene_name>_<cDNA.position>.
REDIdb[["base_seq"]] <- ChangeN(REDIdb[["base_seq"]])
REDIdb[["cdna_seq"]] <- ChangeN(REDIdb[["cdna_seq"]])
REDIdb[["ID"]] <- with(REDIdb, paste0(Organism, "_", Gene_name, "_", cDNA.position))

In [10]:
# 提取RNA编辑位点的侧翼序列
# For each RNA editing site, extract the flanking nucleotides (nt) on both sides of the edited cytidine; these sequences are the positive instances.
# Flanking sequences of the edited cytidines (positive instances).
positive <- REDIdb %>% extract_seq(base_len = flank_len)

In [11]:
# Rows = distinct positive flanking sequences; columns = Genbank, Org, gene, pos, final_base.
dim(positive)

In [12]:
# 提取编辑基因上未编辑的胞嘧啶的侧翼序列构成初始的负样本集
# Also extract the upstream and downstream flanking sequences of cytidines in the edited genes that carry no RNA editing
# modification and are absent from the positive dataset; these form the primary negative samples.
# One row per distinct gene sequence that carries at least one edited site.
REDIdb_back <- REDIdb %>%
    dplyr::select(Genbank,Organism,Gene_name, base_seq) %>%
    unique()
# Expand to one row per cytidine in each sequence. gregexpr() returns -1 when a
# sequence has no "C"; such sequences simply contribute no rows (the previous
# per-row data.frame() call stopped with an error on them). local() keeps the
# intermediate variables out of the global environment.
REDIdb_back <- local({
    c_sites <- lapply(gregexpr("C", REDIdb_back$base_seq, fixed = TRUE),
                      function(hit){
                          as.integer(hit[!is.na(hit) & hit > 0])
                      })
    n_sites <- lengths(c_sites)
    data.frame(Genbank = rep(REDIdb_back$Genbank, times = n_sites),
               Organism = rep(REDIdb_back$Organism, times = n_sites),
               Gene_name = rep(REDIdb_back$Gene_name, times = n_sites),
               cDNA.position = as.integer(unlist(c_sites)),
               base_seq = rep(REDIdb_back$base_seq, times = n_sites))
})
REDIdb_back <- unique(REDIdb_back)
# Drop every cytidine whose <Organism>_<Gene_name>_<position> key matches an
# edited site, so that only non-edited cytidines remain.
REDIdb_back <- REDIdb_back %>%
    mutate(ID=paste0(Organism,"_",Gene_name,"_",cDNA.position)) %>%
    filter(!ID %in% REDIdb$ID)

In [13]:
# Number of non-edited cytidines (rows) available as candidate negative sites.
dim(REDIdb_back)

In [14]:
# 提取未编辑的胞嘧啶的侧翼序列
# extracted the corresponding sequences on upstream and downstream region of cytidines without RNA editing modification
# Flanking sequences of the non-edited cytidines; any sequence that also occurs
# among the positives is removed so that the two sets are disjoint.
negative <- REDIdb_back %>% extract_seq(base_len = flank_len)
negative <- subset(negative, !(final_base %in% positive$final_base))


In [15]:
# Rows = distinct negative flanking sequences that do not occur in the positive set.
dim(negative)

In [16]:
# 保存正样本集
# save positive instances
# Positive sequences as a named DNAStringSet. Each name has the form
# positive_<index>_<Genbank>_<Organism>_<gene>_<position>.
positive2 <- setNames(
    DNAStringSet(positive$final_base),
    paste("positive", seq_len(nrow(positive)), positive$Genbank, positive$Org,
          positive$gene, positive$pos, sep = "_")
)
positive2 %>%
    writeXStringSet(paste0("../model/Choose_flank_",flank_len,"_positive_seq.fasta"))

In [17]:
# Print a summary of the positive DNAStringSet.
positive2

DNAStringSet object of length 1586:
       width seq                                            names               
   [1]   200 [47m[30mG[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mC[39m[49m[47m[30mC[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mC[39m[49m[47m[30mA[39m[49m[47m[30mT[39m[49m[47m[30mC[39m[49m[47m[30mA[39m[49m[47m[30mA[39m[49m[47m[30mT[39m[49m[47m[30mG[39m[49m[47m[30mG[39m[49m[47m[30mA[39m[49m[47m[30mC[39m[49m[47m[30mT[39m[49m[47m[30mC[39m[49m...[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m

In [18]:
# 保存负样本集
# save negative instances
# Negative sequences as a named DNAStringSet. Each name has the form
# negative_<index>_<Genbank>_<Organism>_<gene>_<position>.
negative2 <- setNames(
    DNAStringSet(negative$final_base),
    paste("negative", seq_len(nrow(negative)), negative$Genbank, negative$Org,
          negative$gene, negative$pos, sep = "_")
)
negative2 %>%
    writeXStringSet(paste0("../model/Choose_flank_",flank_len,"_negative_seq.fasta"))

In [19]:
# Print a summary of the negative DNAStringSet.
negative2

DNAStringSet object of length 95960:
        width seq                                           names               
    [1]   200 [47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m...[47m[30mA[39m[49m[47m[30mA[39m[49m[47m[30mA[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mC[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mG[39m[49m[47m[30mG[39m[49m[47m[30mT[39m[49m[47m[30mG[39m[49m[47m[30mG[39m[49m[47

In [20]:
# 保存正样本集和负样本集
# save positive and negative instances
# Positives followed by negatives in a single DNAStringSet, reusing the names
# already assigned to positive2 and negative2, written to one FASTA file.
pos_neg <- setNames(
    DNAStringSet(c(positive$final_base, negative$final_base)),
    c(names(positive2), names(negative2))
)
pos_neg %>%
    writeXStringSet(paste0("../model/Choose_flank_",flank_len,"_sequence.fasta"))

In [21]:
# Print a summary of the combined positive + negative DNAStringSet.
pos_neg

DNAStringSet object of length 97546:
        width seq                                           names               
    [1]   200 [47m[30mG[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mC[39m[49m[47m[30mC[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mC[39m[49m[47m[30mA[39m[49m[47m[30mT[39m[49m[47m[30mC[39m[49m[47m[30mA[39m[49m[47m[30mA[39m[49m[47m[30mT[39m[49m[47m[30mG[39m[49m[47m[30mG[39m[49m[47m[30mA[39m[49m[47m[30mC[39m[49m[47m[30mT[39m[49m...[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47

# Selection of negative samples

In [22]:
# 将正负样本集转换成k-mer矩阵
# each positive sample and primary negative sample sequence was represented by a matrix of k-mer count 
# that consisted of different short subsequences of length k
# k-mer length used for the count matrix.
k <- 5
# Re-read the combined FASTA file and count k-mers for every sequence.
data <- paste0("../model/Choose_flank_",flank_len,"_sequence.fasta") %>%
    read.dna(format = "fasta")
data_kmer <- kcount(data, k = k)
# Transpose so that k-mers are rows and sequences are columns.
count <- data_kmer %>%
    data.frame() %>%
    t()

In [23]:
# 根据Seurat流程对k-mer矩阵进行标准化、降维和聚类等分析
# perform data normalization, data scaling, dimensional reduction, and nucleotide sequences clustering using the Seurat v.4.1.1 R package
# Setup the Seurat objects
# Build the Seurat object from the k-mer count matrix. `count` was transposed
# above, so rows (k-mers) act as features and columns (sequences) as cells.
objs <- CreateSeuratObject(counts = count, project = "all", min.cells = 1, min.features = 1)
# Log-normalise the counts of each sequence with a scale factor of 30.
objs <- NormalizeData(objs, normalization.method = "LogNormalize", scale.factor = 30)
# NOTE(review): with k = 5 there are at most 4^5 = 1024 A/C/G/T k-mers, so
# nfeatures = 3000 presumably keeps every k-mer as a variable feature - confirm.
objs <- FindVariableFeatures(objs, selection.method = "vst", nfeatures = 3000)
# Scale all features, not only the variable ones.
all.genes <- rownames(objs)
objs <- ScaleData(objs, features = all.genes)
# PCA restricted to the variable features; only 10 components are computed.
objs <- RunPCA(objs,npcs =10, features = VariableFeatures(object = objs))
# Find nearest neighbors
# All 10 computed components (dims = 1:10) feed the neighbour graph, the
# clustering (resolution 0.5) and the UMAP embedding.
objs <- FindNeighbors(objs, dims = 1:10)
objs <- FindClusters(objs, resolution = 0.5)
objs <- RunUMAP(objs, dims = 1:10)

Centering and scaling data matrix

PC_ 1 
Positive:  GGTCC, CCCAC, CTGGT, CTGGG, CCGGT, GGTGC, CTGCT, GCTGG, TCTGG, GCTTT 
	   CGCTT, TTGGG, GCTGC, CGCTC, GCTCA, CCGCT, GGTTT, GGCTG, TCCGG, TGGTC 
	   GGTTG, GTGCC, TGGGT, CGGTC, GCCGC, TAGCT, TTGGT, CTTCG, GTCCC, GGACC 
Negative:  AAAAA, AAATT, AAAAT, AGAAA, AAGAA, AAAAG, AAAGA, ATAAA, TAAAA, GAAAA 
	   AAATA, GAAAT, CAAAA, AATAA, AGAAT, AATTA, TAGAA, TGAAA, AATTT, TCAAA 
	   AATAT, ATGAA, ATAGA, TTAAA, TATAA, GAATT, ATATA, ACAAA, AACAA, GAAGA 
PC_ 2 
Positive:  GAAAA, GAAGA, AGAGG, AGAAG, GTAGA, AGACG, AAGAG, GAGGG, GTCGA, AGGGA 
	   AAGGG, CGAGA, TCGAG, AGACA, GAGAA, AGAAA, GAAGC, CGAGG, CCGAG, GAAAC 
	   GAGAC, TGAGA, GCAGA, CCGTG, GCGGA, TAGAC, AAAGA, GACGA, ATTGA, CCCCG 
Negative:  TTTCT, CTTTT, TTCTT, TTTTC, TTTAT, TTTTT, ATTTT, TATTT, TTTTA, TTATT 
	   CTTCT, CTTAT, TCTTT, TTCTA, GCTTT, CTTTC, TTCTC, TCTAT, TCCTT, TTCCT 
	   TTTCA, TTCAT, ACTCC, TGCTT, ATTTC, TTACT, TCTTA, TCTTC, TTATA, CCTTT 
PC_ 3 
Positive:  GGTGG, TGGGC, TTA

Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 97546
Number of edges: 2312692

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9272
Number of communities: 93
Elapsed time: 7 seconds


"The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session"
09:05:28 UMAP embedding parameters a = 0.9922 b = 1.112

09:05:28 Read 97546 rows and found 10 numeric columns

09:05:28 Using Annoy for neighbor search, n_neighbors = 30

09:05:28 Building Annoy index with metric = cosine, n_trees = 50

0%   10   20   30   40   50   60   70   80   90   100%

[----|----|----|----|----|----|----|----|----|----|

*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
|

09:05:37 Writing NN index file to temp file C:\Users\jasonxu\AppData\Local\Temp\RtmpcN2nML\file366427cd2468

09:05:37 Searching Annoy index using 1 thread, search_k = 3000

09:06:05 Annoy recall = 100%

09:06:05 Commencing smooth kNN distance calibration using 1 thread
 with targe

In [24]:
# Cross-tabulate cluster membership against sample origin (orig.ident), giving
# one row per cluster with one count column per orig.ident level; the code below
# relies on the `negative` and `positive` columns. pivot_wider() replaces the
# superseded spread(); as.data.frame() keeps `tbl` a plain data frame as before.
# Clusters are ordered by ascending number of positive sequences, so the
# clusters containing the fewest positives come first.
tbl <- table(objs@meta.data[,c("seurat_clusters","orig.ident")]) %>%
    data.frame() %>%
    pivot_wider(names_from = orig.ident, values_from = Freq) %>%
    as.data.frame() %>%
    arrange(positive)
# Per-sequence metadata with the sequence name exposed as a regular column.
meta <- objs@meta.data
meta$samples <- rownames(meta)

In [25]:
# Reload the full negative set from the FASTA file written earlier; the
# sequences are later selected from it by name.
Negative <- paste0("../model/Choose_flank_",flank_len,"_negative_seq.fasta") %>%
    readDNAStringSet()

In [26]:
# Print a summary of the reloaded negative DNAStringSet.
Negative

DNAStringSet object of length 95960:
        width seq                                           names               
    [1]   200 [47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m[47m[37mN[39m[49m...[47m[30mA[39m[49m[47m[30mA[39m[49m[47m[30mA[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mC[39m[49m[47m[30mT[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mT[39m[49m[47m[30mA[39m[49m[47m[30mG[39m[49m[47m[30mG[39m[49m[47m[30mT[39m[49m[47m[30mG[39m[49m[47m[30mG[39m[49m[47

In [27]:
# 按照正负样本集的比例进行负样本的选择
# the ratio of the positive and negative instances was set as 1∶1, 1∶2, 1∶3, 1∶4 and 1∶5, respectively
# Display labels for the positive:negative ratios 1:1 to 1:5.
Ratio <- paste0("1:", 1:5)

In [28]:
# For each positive:negative ratio 1:ratio, draw nrow(positive) * ratio negative
# sequences from the clusters that contain the fewest positives and write them
# to their own FASTA file.
for(ratio in seq_along(Ratio)){
    print(ratio)
    print("Sampling..............................................")
    print(paste0("The ratio of positive and negative instance is ",Ratio[ratio]))
    n_target <- nrow(positive) * ratio
    # `tbl` is sorted by ascending number of positives. Take clusters from the
    # top until their cumulative number of negatives reaches the target (the
    # cluster that crosses the target is included); if the target is never
    # reached, use every cluster.
    reached <- which(cumsum(tbl$negative) >= n_target)
    n_clusters <- if(length(reached) > 0) reached[1] else nrow(tbl)
    inx <- seq_len(n_clusters)
    need <- meta$samples[meta$seurat_clusters %in% tbl$seurat_clusters[inx]]
    # Negative sequences are the ones whose name starts with "negative_"; the
    # prefix is anchored so that a match elsewhere in a name cannot let a
    # positive sequence through.
    need <- need[startsWith(need, "negative_")]
    print(length(need))
    if(length(need) < n_target){
        stop("Only ", length(need), " negative sequences are available but ",
             n_target, " are required for ratio ", Ratio[ratio], call. = FALSE)
    }
    # The seed is reset before every draw so that each ratio is reproducible
    # on its own.
    set.seed(123)
    need2 <- sample(need, n_target)
    tmp <- Negative[need2]
    # Build the path once so that the printed path is the one actually written
    # (the message used to omit the leading "../").
    out_file <- paste0("../model/Choose_flank_",flank_len,"_negative_seq_",ratio,".fasta")
    print(out_file)
    writeXStringSet(tmp, out_file)
}

[1] 1
[1] "Sampling.............................................."
[1] "The ratio of positive and negative instance is 1:1"
[1] 1743
[1] "model/Choose_flank_100_negative_seq_1.fasta"
[1] 2
[1] "Sampling.............................................."
[1] "The ratio of positive and negative instance is 1:2"
[1] 4817
[1] "model/Choose_flank_100_negative_seq_2.fasta"
[1] 3
[1] "Sampling.............................................."
[1] "The ratio of positive and negative instance is 1:3"
[1] 4817
[1] "model/Choose_flank_100_negative_seq_3.fasta"
[1] 4
[1] "Sampling.............................................."
[1] "The ratio of positive and negative instance is 1:4"
[1] 6537
[1] "model/Choose_flank_100_negative_seq_4.fasta"
[1] 5
[1] "Sampling.............................................."
[1] "The ratio of positive and negative instance is 1:5"
[1] 9570
[1] "model/Choose_flank_100_negative_seq_5.fasta"


# R Session Information

In [29]:
# Record the R version, platform and attached package versions for reproducibility.
utils::sessionInfo()

R version 4.3.1 (2023-06-16 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19044)

Matrix products: default


locale:
[1] LC_COLLATE=Chinese (Simplified)_China.utf8 
[2] LC_CTYPE=Chinese (Simplified)_China.utf8   
[3] LC_MONETARY=Chinese (Simplified)_China.utf8
[4] LC_NUMERIC=C                               
[5] LC_TIME=Chinese (Simplified)_China.utf8    

time zone: Asia/Shanghai
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] reshape2_1.4.4      Biostrings_2.68.1   GenomeInfoDb_1.36.1
 [4] XVector_0.40.0      IRanges_2.34.1      S4Vectors_0.38.1   
 [7] BiocGenerics_0.46.0 SeuratObject_4.1.3  Seurat_4.3.0.1     
[10] psych_2.3.6         pheatmap_1.0.12     igraph_1.5.0.1     
[13] amap_0.8-19         kmer_1.1.2          ape_5.7-1          
[16] data.table_1.14.8   lubridate_1.9.2     forcats_1.0.0      
[19] dplyr_1.1.2         