# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></div><div class="lev1 toc-item"><a href="#Overview-visualization" data-toc-modified-id="Overview-visualization-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Overview visualization</a></div><div class="lev1 toc-item"><a href="#Differential-expression" data-toc-modified-id="Differential-expression-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Differential expression</a></div>

# Setup

In [9]:
library(ggplot2)
library(ggfortify)
library(ggdendro)
library(limma)
library(gridExtra)

source("~/src/JupyterRReuse/proteomics_multivariate_vis.R")
source("~/src/JupyterRReuse/visualization_utils.R")
source("~/src/JupyterRReuse/proteomics_stats.R")

In [10]:
plot_pca <- function(data_m, design_m, pc1, pc2, color_factor, colors, custom_names, legend=T, title_app="", cont_scale=F) {
    
    if (!cont_scale) {
        getPalette <- colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))        
    }
    title <- paste0("PCA, PC", pc1, " vs PC", pc2, " ", title_app)
    plt <- make_expression_pca(data_m, design_m, color_factor=color_factor, title=title, 
                               pca_axis1=pc1, pca_axis2=pc2, show_labels=T,
                               only_text=T, color_text=T, custom_names=custom_names)
    if (!cont_scale) {
        plt <- plt + scale_color_manual(values=getPalette(colors))        
    }
    
    if (!legend) {
        plt <- plt + theme(legend.position="none")
    }
    
    plt
}

In [11]:
run <- "batch6_pure"
expression_fp <- paste0(run, "/combined.final.tsv")
design_fp <- paste0(run, "/design.tsv")

In [12]:
design_df <- read.csv(design_fp, sep="\t")
design_df$sample <- design_df$name
head(design_df)

name,biorepgroup,techrepgroup,condition,sample
l1,1,1,low,l1
l2,2,1,low,l2
l3,3,1,low,l3
h1,1,1,high,h1
h2,2,1,high,h2
h3,3,1,high,h3


In [13]:
raw_data_df <- read.csv(expression_fp, sep="\t", skip=1)
# head(raw_data_df)
data_df <- raw_data_df[, as.character(design_df$name)]
head(data_df)

l1,l2,l3,h1,h2,h3
14.7235,14.7367,14.7332,14.7332,14.6915,14.7066
5.24973,5.31606,3.18005,2.00266,5.63899,2.83046
7.77064,6.21096,7.43814,7.14611,6.54429,7.11163
7.75447,7.24843,5.89693,7.91458,2.63139,5.05307
4.75854,3.91385,3.79878,3.19971,3.72088,3.77434
5.56952,5.56952,5.56952,5.56952,5.56952,5.56952


In [14]:
head(raw_data_df)

X.rt_cf,mz_cf,intensity_cf,charge_cf,quality_cf,peptide_0,protein_0,l1,l2,l3,h1,h2,h3
758.0202,251.2006,14.7208,1,0.98257,,,14.7235,14.7367,14.7332,14.7332,14.6915,14.7066
619.2683,598.3479,4.03632,2,0.626087,CYAVSGWPGKK/DIHDISLNLR/ISVDSATMMNK/IYPSVKDFVK,sp|P17802|MUTY_ECOLI/sp|P45568|DXR_ECOLI/sp|Q43845|SPSA_SOLTU/sp|Q9ST62|NDB1_SOLTU,5.24973,5.31606,3.18005,2.00266,5.63899,2.83046
641.4936,1049.5615,7.03696,1,0.868349,GALGGDVYLGK/GFLTVDEIR/MGFAIAAAAAR/VGNLNAYFR,sp|P0ABQ0|COABC_ECOLI/sp|P17979|API8_SOLTU/sp|P30745|MOAA_ECOLI/sp|P58519|API5_SOLTU/sp|Q03197|API10_SOLTU/sp|Q43646|API2_SOLTU,7.77064,6.21096,7.43814,7.14611,6.54429,7.11163
630.4054,446.5379,6.08315,2,0.56149,ELEVFVR/FWDRLR/FWRDLR/MSYLNLR/NPFLGCNK/TVALFRGK,sp|P0A8M3|SYT_ECOLI/sp|P30130|FIMD_ECOLI/sp|P32088|MATK_SOLTU/sp|P60584|CAIA_ECOLI/sp|P62517|OPGH_ECOLI/sp|P76084|PAAI_ECOLI,7.75447,7.24843,5.89693,7.91458,2.63139,5.05307
813.1883,645.9428,3.86102,3,0.864484,GFAPVIHGIARGTAQVTIK/ISSVETMEAWVSQQRGK/LQLRDMAFNSPNSEWK/VASTAARAITSPSSLVFTR/YRVALDSDAWEFGGHGR,sp|P30130|FIMD_ECOLI/sp|P30924|GLGB_SOLTU/sp|P31433|YICH_ECOLI/sp|Q07511|FDH_SOLTU/sp|Q43845|SPSA_SOLTU,4.75854,3.91385,3.79878,3.19971,3.72088,3.77434
676.12,272.2637,5.56952,1,0.983663,,,5.56952,5.56952,5.56952,5.56952,5.56952,5.56952


In [58]:
parse_annot <- function(annot_string) {
    
    if (is.na(annot_string)) {
        return (-2)
    }
    
    fields <- unlist(strsplit(as.character(annot_string), "/"))
#     print(fields)
    print(fields)
    annots <- sapply(fields, function(field) { unlist(strsplit(field, "_"))[[2]] })
#     print(unique(annots))
    uniq_annots <- unique(annots)
    print(uniq_annots)
    
    if (length(uniq_annots) != 1) {
        -1
    }
    else if (uniq_annots == "SOLTU") {
        1
    }
    else if (uniq_annots == "ECOLI") {
        0
    }
    else {
        stop(paste0("Unknown annotation: ", uniq_annots[1]))
    }
}

In [47]:
print(as.character(prot_col[1:10]))

 [1] NA                                                                                                                                                  
 [2] "sp|P17802|MUTY_ECOLI/sp|P45568|DXR_ECOLI/sp|Q43845|SPSA_SOLTU/sp|Q9ST62|NDB1_SOLTU"                                                                
 [3] "sp|P0ABQ0|COABC_ECOLI/sp|P17979|API8_SOLTU/sp|P30745|MOAA_ECOLI/sp|P58519|API5_SOLTU/sp|Q03197|API10_SOLTU/sp|Q43646|API2_SOLTU"                   
 [4] "sp|P0A8M3|SYT_ECOLI/sp|P30130|FIMD_ECOLI/sp|P32088|MATK_SOLTU/sp|P60584|CAIA_ECOLI/sp|P62517|OPGH_ECOLI/sp|P76084|PAAI_ECOLI"                      
 [5] "sp|P30130|FIMD_ECOLI/sp|P30924|GLGB_SOLTU/sp|P31433|YICH_ECOLI/sp|Q07511|FDH_SOLTU/sp|Q43845|SPSA_SOLTU"                                           
 [6] NA                                                                                                                                                  
 [7] "sp|P08200|IDH_ECOLI/sp|P0ADD5|YJJP_ECOLI/sp|P24082|TRAN_ECOLI/sp|P3928

In [59]:
sapply(as.character(prot_col[1:10]), parse_annot)

[1] "sp|P17802|MUTY_ECOLI" "sp|P45568|DXR_ECOLI"  "sp|Q43845|SPSA_SOLTU"
[4] "sp|Q9ST62|NDB1_SOLTU"
[1] "ECOLI" "SOLTU"
[1] "sp|P0ABQ0|COABC_ECOLI" "sp|P17979|API8_SOLTU"  "sp|P30745|MOAA_ECOLI" 
[4] "sp|P58519|API5_SOLTU"  "sp|Q03197|API10_SOLTU" "sp|Q43646|API2_SOLTU" 
[1] "ECOLI" "SOLTU"
[1] "sp|P0A8M3|SYT_ECOLI"  "sp|P30130|FIMD_ECOLI" "sp|P32088|MATK_SOLTU"
[4] "sp|P60584|CAIA_ECOLI" "sp|P62517|OPGH_ECOLI" "sp|P76084|PAAI_ECOLI"
[1] "ECOLI" "SOLTU"
[1] "sp|P30130|FIMD_ECOLI" "sp|P30924|GLGB_SOLTU" "sp|P31433|YICH_ECOLI"
[4] "sp|Q07511|FDH_SOLTU"  "sp|Q43845|SPSA_SOLTU"
[1] "ECOLI" "SOLTU"
[1] "sp|P08200|IDH_ECOLI"  "sp|P0ADD5|YJJP_ECOLI" "sp|P24082|TRAN_ECOLI"
[4] "sp|P39285|MSCM_ECOLI" "sp|P76346|MTFA_ECOLI"
[1] "ECOLI"
[1] "sp|P05194|AROD_ECOLI"  "sp|P08200|IDH_ECOLI"   "sp|P0A8A2|YEEN_ECOLI" 
[4] "sp|P16919|RHSD_ECOLI"  "sp|P34094|PHYB_SOLTU"  "sp|P52124|YFJI_ECOLI" 
[7] "sp|Q2VEI6|RPOC2_SOLTU"
[1] "ECOLI" "SOLTU"
[1] "sp|O64390|HXK1_SOLTU" "sp|P0A867|TALA_ECOLI" "sp|P12995|BIO

# Overview visualization

In [60]:
p1_1 <- plot_pca(data_df, design_df, 1, 2, "condition", colors=2, custom_names=design_df$name, legend=F)
p1_2 <- plot_pca(data_df, design_df, 3, 4, "condition", colors=2, custom_names=design_df$name, legend=T, title_app="(test)")
options(repr.plot.width=10, repr.plot.height=5)
multiplot(p1_1, p1_2, cols=2)


ERROR: Error in prcomp.default(t(expr_m_nona), scale = TRUE, center = TRUE): cannot rescale a constant/zero column to unit variance


# Differential expression

In [None]:
calculate_anova <- function(row, cond) {
    
    anova_df <- data.frame(cbind(Intensity=unlist(row), Cond=cond))
    
    av <- aov(Intensity~Cond, anova_df)
    av_summary <- summary(av)
    p_val <- av_summary[[1]]["Cond", "Pr(>F)"]
    p_val
}

In [None]:
head(data_df)

In [None]:
p_vals <- apply(data_df, 1, calculate_anova, cond=design_df$condition)


In [None]:
p_vals <- unlist(p_vals)
q_vals <- p.adjust(p_vals, method="BH")

In [None]:
length(p_vals)

In [None]:
length(p_vals[which(p_vals < 0.1)])
sig_indices <- which(q_vals < 0.1)

In [None]:
head(raw_data_df[sig_indices,])

In [None]:
test_df <- cbind(Y=unlist(data_df[1,]), Cond=design_df$condition)
print(test_df)
summary(aov(Y~Cond, data.frame(test_df)))[[1]]["Cond", "Pr(>F)"]

In [None]:
raw_data_df[1,]

In [None]:
aov(data_df[1,])

In [None]:
calculate_anova <- function(row, levels) {
    fit <- 
}