In [8]:
# Remove every (non-hidden) object listed by ls() from the current environment
# so the analysis below starts from an empty workspace.
rm(list=ls()) 

In [9]:
# Output directory for every figure written by this notebook.
figure_folder <- "Figures"
# Create the directory only when it is missing, so re-running the cell does not
# raise the "already exists" warning.
if (!dir.exists(figure_folder)) {
  dir.create(figure_folder)
}

“'Figures' already exists”


In [11]:
# Install Rtsne only when it is not already available, instead of downloading
# and reinstalling it on every run of the notebook.
if (!requireNamespace("Rtsne", quietly = TRUE)) {
  install.packages("Rtsne")
}

Installing package into ‘/home/jupyter/.R/library’
(as ‘lib’ is unspecified)



In [13]:
library(ggplot2)
library(dplyr)
library(Rtsne)

In [14]:
# Cell-type labels of the two populations compared throughout the analysis:
# the starting population and the target population.
departure_cell <- "ProB"
destination_cell <- "Mono"

In [15]:
# Load the imputed count table (presumably scImpute output, judging by the file
# name). The first column supplies the row names and the first line the
# column names.
imputed_data <- read.table(
  "Data/scimpute_count.csv",
  header = TRUE,
  sep = ",",
  row.names = 1
)

In [16]:
# Load the raw (pre-imputation) count table with the same layout as the imputed
# one: header line for column names, first column for row names.
raw_data <- read.table(
  "Data/raw_data.csv",
  header = TRUE,
  sep = ",",
  row.names = 1
)

In [17]:
# Inspect the size (rows, columns) of the raw count table.
dim(raw_data)

In [18]:
# Inspect the size (rows, columns) of the imputed count table; it is indexed
# below with the same column positions as raw_data.
dim(imputed_data)

In [19]:
# Read the list of relevant transcription factors and flatten it to a vector.
# stringsAsFactors = FALSE keeps the names as plain character strings on every
# R version; as a factor, the alias renames and the name-based row lookups
# further down would not behave as intended.
rel_TFs <- unlist(
  read.table("Data/TFS", header = FALSE, stringsAsFactors = FALSE)
)

In [20]:
# Translate the aliases used in the TF list into the gene symbols used as row
# names of the count tables. Names of the lookup vector are the aliases, values
# are the replacement symbols. No replacement symbol is itself an alias, so a
# single pass gives the same result as applying the renames one after another.
tf_alias_to_symbol <- c(
  CEBPa  = "CEBPA",
  CEBPb  = "CEBPB",
  E2A    = "TCF3",
  EBF    = "EBF1",
  Eto2   = "CBFA2T3",
  Fli1   = "FLI1",
  Foxo1  = "FOXO1",
  Gata1  = "GATA1",
  Gata2  = "GATA2",
  Gfi1b  = "GFI1B",
  Ldb1   = "LDB1",
  Lmo2   = "LMO2",
  Lyl1   = "LYL1",
  Meis1  = "MEIS1",
  Mtgr1  = "CBFA2T2",
  Oct2   = "POU2F2",
  p300   = "EP300",
  P65    = "RELA",
  Pparg  = "PPARG",
  PU1    = "SPI1",
  Runx1  = "RUNX1",
  SCL    = "TAL1",
  Stat3  = "STAT3",
  Stat4  = "STAT4",
  Stat5a = "STAT5A",
  Stat5b = "STAT5B",
  Stat6  = "STAT6"
)

# Entries that are not aliases are left untouched.
is_alias <- rel_TFs %in% names(tf_alias_to_symbol)
rel_TFs[is_alias] <- unname(tf_alias_to_symbol[as.character(rel_TFs[is_alias])])

In [21]:
# Number of transcription factors in the list after the alias renames.
length(rel_TFs)

In [22]:
# Read the cell-type annotation file and flatten it into a single vector.
# NOTE(review): presumably one label per column of the count tables, in the
# same order; confirm against the data files.
labels <- unlist(
  read.table("Data/cell_types.txt")
)

In [23]:
# Positions in the label vector belonging to each of the two populations;
# these positions are used below as column indices into the count tables.
departure_ind <- which(labels == departure_cell)
destination_ind <- which(labels == destination_cell)

In [24]:
# Restrict the imputed table to the transcription factors of interest.
# Rows are looked up by name, so the symbols are coerced to character first
# (a factor would index by its integer codes), and any symbol that has no
# matching row is reported instead of silently producing an all-NA row.
tf_symbols <- as.character(rel_TFs)
absent_tfs <- setdiff(tf_symbols, rownames(imputed_data))
if (length(absent_tfs) > 0) {
  warning(
    "TFs not found in imputed_data: ",
    paste(absent_tfs, collapse = ", "),
    call. = FALSE
  )
}
TF_data <- imputed_data[tf_symbols, ]

In [25]:
# Split the TF table into the two populations. drop = FALSE keeps each result
# a data frame even when a population contains a single cell; without it the
# selection collapses to a vector, ncol() returns NULL and the label
# construction below fails.
departure_data <- TF_data[, departure_ind, drop = FALSE]
destination_data <- TF_data[, destination_ind, drop = FALSE]

# NOTE: `labels` is overwritten here. From this point on it holds one label per
# selected cell (departure cells first, then destination cells) rather than the
# labels read from the annotation file, so the index lookup on the original
# labels must not be re-run after this cell.
labels <- c(
  rep.int(departure_cell, ncol(departure_data)),
  rep.int(destination_cell, ncol(destination_data))
)

In [26]:
# Place the two populations side by side: departure cells first, destination
# cells second (the same order as the rebuilt label vector).
all_tf_data <- cbind(departure_data, destination_data)

# Binarise expression: 1 where the count is positive, 0 otherwise. Multiplying
# the logical matrix by 1 converts TRUE/FALSE to numeric 1/0.
bin_tf_data <- (all_tf_data > 0) * 1

In [27]:
# Same TF-by-population selection as for the imputed data, applied to the raw
# counts. Symbols are coerced to character for the name-based row lookup, and
# symbols without a matching row are reported rather than silently becoming
# all-NA rows.
raw_tf_symbols <- as.character(rel_TFs)
absent_raw_tfs <- setdiff(raw_tf_symbols, rownames(raw_data))
if (length(absent_raw_tfs) > 0) {
  warning(
    "TFs not found in raw_data: ",
    paste(absent_raw_tfs, collapse = ", "),
    call. = FALSE
  )
}
TF_raw_data <- raw_data[raw_tf_symbols, ]

# drop = FALSE keeps single-cell populations as one-column data frames.
departure_raw_data <- TF_raw_data[, departure_ind, drop = FALSE]
destination_raw_data <- TF_raw_data[, destination_ind, drop = FALSE]
all_tf_raw_data <- cbind(departure_raw_data, destination_raw_data)

In [28]:
# Tally zero and positive entries before (raw) and after (imputed) imputation.
# Summing a logical matrix counts its TRUE entries; na.rm = TRUE ignores NA
# comparisons, exactly as which() does.
zero_raw <- sum(all_tf_raw_data == 0, na.rm = TRUE)
non_zero_raw <- sum(all_tf_raw_data > 0, na.rm = TRUE)
zero_all <- sum(all_tf_data == 0, na.rm = TRUE)
non_zero_all <- sum(all_tf_data > 0, na.rm = TRUE)

# Entries that were zero in the raw data and are positive after imputation.
changed <- sum(all_tf_raw_data == 0 & all_tf_data > 0, na.rm = TRUE)

In [29]:
# Long-format table of all positive TF counts, tagged by imputation status.
# The values are kept numeric from the start. Binding them to a label with
# cbind() would turn everything into a character matrix, which loses
# floating-point precision on the round trip, yields factor level codes on
# R < 4.0, and fails when there are no positive entries at all.
raw_positive <- as.numeric(unlist(all_tf_raw_data[all_tf_raw_data > 0]))
imputed_positive <- as.numeric(unlist(all_tf_data[all_tf_data > 0]))

counts_mat_raw <- data.frame(
  Counts = raw_positive,
  Imputation = rep("Before Imputation", length(raw_positive)),
  stringsAsFactors = FALSE
)
counts_mat_imputed <- data.frame(
  Counts = imputed_positive,
  Imputation = rep("After Imputation", length(imputed_positive)),
  stringsAsFactors = FALSE
)

counts_mat <- rbind(counts_mat_raw, counts_mat_imputed)

# Factor with explicit level order so that "Before" is drawn ahead of "After"
# wherever this column is used for faceting.
counts_mat$Imputation_ordered <- factor(
  counts_mat$Imputation,
  levels = c("Before Imputation", "After Imputation")
)

In [45]:
# Sanity check: row positions of counts_mat whose Counts value is NA
# (an empty result means every count is a valid number).
which(is.na(counts_mat$Counts))

In [None]:
# Histogram of the non-zero TF counts, one facet per imputation status.
# Every component is chained with `+`; previously the legend-title layer was a
# separate statement and therefore never became part of the saved plot.
nonzero_counts_plot <- ggplot(counts_mat, aes(x = Counts, fill = Imputation)) +
  geom_histogram(binwidth = 1, boundary = 0) +
  theme_bw() +
  theme(text = element_text(size = 12)) +
  theme(axis.text.x = element_text(vjust = 0.5, hjust = 1)) +
  facet_grid(
    ~Imputation_ordered,
    # scales = "free_x",  # Let the x axis vary across facets.
    # space = "free_x",   # Let facet widths vary so all bars share one width.
    switch = "x"          # Move the facet labels to the bottom.
  ) +
  xlab("Non-Zero Counts") +
  ylab("Frequency") +
  labs(fill = "Imputation Status")


In [62]:
# Write the histogram to the figure folder as both PDF and PNG
# (7 x 4 inches, 300 dpi). Arguments are spelled out in full rather than
# relying on partial matching of `file` to `filename`.
ggsave(
  filename = paste0(figure_folder, "/no_zero_histogram.pdf"),
  plot = nonzero_counts_plot,
  width = 7, height = 4, units = "in", dpi = 300
)
ggsave(
  filename = paste0(figure_folder, "/no_zero_histogram.png"),
  plot = nonzero_counts_plot,
  width = 7, height = 4, units = "in", dpi = 300
)

In [352]:
# Fix the RNG seed so the embedding is reproducible between runs.
set.seed(1)

# Embed the binarised TF profiles with t-SNE. The matrix is transposed so that
# each cell (column of bin_tf_data) becomes one row of the input. The duplicate
# check is switched off, and theta = 0 is passed as the speed/accuracy setting.
tsne_result <- Rtsne(
  X = t(bin_tf_data),
  theta = 0,
  check_duplicates = FALSE
)


In [353]:
# Combine the embedding coordinates (the Y component of the Rtsne result) with
# the per-cell labels in one data frame.
tsne_out <- data.frame(
  data = tsne_result[["Y"]],
  labels = labels
)

In [359]:
# Plotting frame: the first two columns of tsne_out are used as the x/y
# coordinates and the third column as the cell-type label.
df <- data.frame(
  x = tsne_out[[1]],
  y = tsne_out[[2]],
  labels = tsne_out[[3]]
)

# Scatter plot of the embedding, one point per cell, coloured by label.
tsne_plot <- ggplot(df, aes(x = x, y = y)) +
  geom_point(aes(colour = factor(labels))) +
  labs(
    x = "tSNE Dimension 1",
    y = "tSNE Dimension 2",
    color = "Cell Type"
  ) +
  theme_bw() +
  theme(text = element_text(size = 12))

In [360]:
# Write the t-SNE scatter plot to the figure folder as both PDF and PNG
# (7 x 4 inches, 300 dpi), with fully named arguments.
ggsave(
  filename = paste0(figure_folder, "/tsne_plot.pdf"),
  plot = tsne_plot,
  width = 7, height = 4, units = "in", dpi = 300
)
ggsave(
  filename = paste0(figure_folder, "/tsne_plot.png"),
  plot = tsne_plot,
  width = 7, height = 4, units = "in", dpi = 300
)

In [None]:
# Data inconsistency check: count the distinct on/off TF patterns (from the raw
# counts) that occur in both populations, i.e. binary profiles that cannot
# tell a departure cell from a destination cell.

# Collapse each column's 0/1 expression pattern into one comma-separated string.
binary_signature <- function(counts) {
  apply((counts > 0) + 0, 2, toString)
}

length(
  intersect(
    binary_signature(departure_raw_data),
    binary_signature(destination_raw_data)
  )
)