# Running codes in R environment

In [None]:
# load R package
# 加载R包
library(readxl)
library(stringr)
library(Biostrings)
library(tidyverse)
library(ggplot2)
library(ggsci)
library(patchwork)

In [3]:
# read protein sequence in Arabidopsis
# 读取拟南芥的蛋白序列
protein <- readAAStringSet("genome/Athaliana_167_protein_flt.fa")

In [4]:
# read Sumoylation sites
# 读取Sumoylation sites信息
high <- read_excel("Sumoylation from highly sensitive.xlsx")
high <- unique(high)

In [5]:
high2 <- NULL
for(i in 1:nrow(high)){
    Gene <- unlist(str_split(high$`Fasta headers`[i],"\\|"))[1]
    Gene <- gsub(" ","",Gene)
    Position <- unlist(str_split(high$`Positions within proteins`[i],";"))[1]
    re <- data.frame(Gene,Position,stringsAsFactors = F)
    high2 <- rbind(high2,re)
}

In [6]:
# read Sumoylation sites
# 读取Sumoylation sites信息
low <- read_excel("Sumoylation from low-throughput studies.xlsx")
proteomic <- read_excel("Sumoylation from proteomic studies.xlsx")
low <- rbind(low,proteomic)

In [7]:
low <- low %>%
    dplyr::select(Protein,Position=`Positions within proteins`) %>%
    unique()

In [8]:
Name <- data.frame(Gene=names(protein))
Protein <- lapply(Name$Gene,function(g){
    g <- unlist(str_split(g,"\\."))[1]
})
Protein <- unlist(Protein)
Name$Protein <- Protein
Name <- unique(Name)

In [9]:
low <- low %>%
    inner_join(Name) %>%
    dplyr::select(Gene,Position) %>%
    unique()

[1m[22mJoining with `by = join_by(Protein)`
"[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mℹ[39m Row 7 of `x` matches multiple rows in `y`.
[36mℹ[39m Row 17200 of `y` matches multiple rows in `x`.


In [10]:
data <- rbind(high2,low)
data <- unique(data)

In [11]:
# check Sumoylation sites
# 核对一下Sumoylation sites信息
positive <- NULL
for(i in 1:nrow(data)){
    g <- data$Gene[i]
    pro <- as.character(protein[g])
    pos <- data$Position[i]
    AA <- substr(pro,pos,pos)
    if(AA=="K"){
        re <- data.frame(Gene=g,Position=pos)
        positive <- rbind(positive,re)
    }
}

In [12]:
positive <- positive %>%
    mutate(Name = paste0(Gene,":",Position)) %>%
    mutate(Type = 'positive') 

In [13]:
#save(positive,file="positive.Rdata")

In [14]:
nrow(positive)

In [15]:
# extract non-sumoylation sites
# 提取没有发生sumoylation的赖氨酸位点作为负样本集
negative <- lapply(unique(positive$Gene),function(g){
    pro <- as.character(protein[g])
    aa <- unlist(str_split(pro,""))
    pos <- which(aa=="K")
    Name <- paste0(g,":",pos)
    re <- data.frame(Gene=g, Position=pos,Name)
})

In [16]:
negative <- do.call('rbind',negative)
negative <- unique(negative)
negative <- negative %>%
    dplyr::filter(!Name %in% positive$Name) %>%
    mutate(Type = 'negative')     

In [17]:
nrow(negative)

In [18]:
Data <- rbind(positive,negative)

In [19]:
Original <- Data %>%
                group_by(Type) %>%
                summarise(Number=n())

In [20]:
Original

Type,Number
<chr>,<int>
negative,61342
positive,2390


In [None]:
# taking n residues upstream and downstream of each lysine
# 提取上下游的序列
# n is 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, respectively.
# n表示上下游序列的长度
for(base_len in seq(16,34,2)){
    AA <- c()
    for(i in 1:nrow(Data)){
        pro <- protein[Data$Gene[i]]
        pos <- as.numeric(Data$Position[i])
        len <- str_length(as.character(pro))
        start <- pos-base_len
        end <- pos+base_len
        if(start<1){
            start_str <- paste(rep("X",abs(start)+1),collapse ='')
        }else{
            start_str <- ""
        }
        if(end>len){
            end_str <- paste(rep("X",end-len),collapse ='')
        }else{
            end_str <- ""
        }
        aa <- paste0(start_str,substr(pro,start,end),end_str)
        aa <- paste0(substr(aa,1,base_len),substr(aa,base_len+2,base_len*2+1))
        AA <- c(AA,aa)
    }
    Data$Sequence <- AA
    neg <- subset(Data,Type=="negative")
    neg2 <- AAStringSet(neg$Sequence)
    names(neg2) <- neg$Name
    writeXStringSet(neg2,paste0("CNN/negative_f",base_len,"_for_cd-hit.fa"))
    pos <- subset(Data,Type=="positive")
    Positive <- pos %>%
                dplyr::select(Gene=Name,Sequence) %>%
                mutate(Label="positive")
    sim <- c(0.4,0.5,0.6,0.7) # sequence similarity
    N <- c(2,3,4,5)
    for(inx in 1:4){
        s <- sim[inx]
        # reduce redundant sequence for negative datasets using CD-HIT
        # 使用CD-HIT软件进行冗余序列去除
        system(paste0("cd-hit -i CNN/negative_f",base_len,"_for_cd-hit.fa -o CNN/negative_f",base_len,
                      "_for_cd-hit_c",s,"_uniq.fa -c ",s," -T 30 -n ",N[inx]))
        Negative <- readBStringSet(paste0("CNN/","negative_f",base_len,"_for_cd-hit_c",s,"_uniq.fa")) 
        Negative <- data.frame(Gene=names(Negative),Sequence=Negative,Label="negative")
        Data2 <- rbind(Positive,Negative)
        # train/test split
        # 将数据集划分为训练集和测试集
        set.seed(123)
        inx <- sample(1:nrow(Data2),0.75*nrow(Data2))
        train <- Data2[inx,]
        test <- Data2[-inx,]
        write.csv(train,paste0("CNN/f_",base_len,"_c",s,"_train_positive_negative.csv"),quote=F,row.names=F)
        write.csv(test,paste0("CNN/f_",base_len,"_c",s,"_test_positive_negative.csv"),quote=F,row.names=F)
    }
}

# Running codes in Python environment

In [None]:
# balance.py
# The python script is used to balance positive and negative samples
# 使用python脚本进行降采样