# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Generate-pre-processed-FASTA-files" data-toc-modified-id="Generate-pre-processed-FASTA-files-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Generate pre-processed FASTA files</a></div><div class="lev2 toc-item"><a href="#Setup" data-toc-modified-id="Setup-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Setup</a></div><div class="lev2 toc-item"><a href="#Prepare-low-spike" data-toc-modified-id="Prepare-low-spike-12"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Prepare low-spike</a></div><div class="lev2 toc-item"><a href="#Prepare-high-spikes" data-toc-modified-id="Prepare-high-spikes-13"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Prepare high-spikes</a></div><div class="lev1 toc-item"><a href="#Simulate-MS-data" data-toc-modified-id="Simulate-MS-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Simulate MS-data</a></div><div class="lev1 toc-item"><a href="#Post-process-data" data-toc-modified-id="Post-process-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Post-process data</a></div><div class="lev1 toc-item"><a href="#Normalization" data-toc-modified-id="Normalization-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Normalization</a></div>

# Generate pre-processed FASTA files

Using the custom-made `seqtk generate_spikein.py` script.

## Setup

In [1]:
run=batch2
mkdir ${run}

In [2]:
sample_names_low="l1 l2 l3"
sample_names_high="h1 h2 h3"
samples="${sample_names_low} ${sample_names_high}"

In [3]:
echo -e "name\tbiorepgroup\ttechrepgroup\tcondition" > ${run}/design.tsv
echo -e "l1\t1\t1\tlow" >> ${run}/design.tsv
echo -e "l2\t2\t1\tlow" >> ${run}/design.tsv
echo -e "l3\t3\t1\tlow" >> ${run}/design.tsv
echo -e "h1\t1\t1\thigh" >> ${run}/design.tsv
echo -e "h2\t2\t1\thigh" >> ${run}/design.tsv
echo -e "h3\t3\t1\thigh" >> ${run}/design.tsv

cat ${run}/design.tsv

name	biorepgroup	techrepgroup	condition
l1	1	1	low
l2	2	1	low
l3	3	1	low
h1	1	1	high
h2	2	1	high
h3	3	1	high


## Prepare low-spike

In [4]:
for name in ${sample_names_low}; do
    echo "Generating sample: ${name}"
    lfqtk generate_spikein \
        --background_fa data/uniprot_ecoli.fasta \
        --spikein_fa data/uniprot_potato.fasta \
        --output_fa ${run}/${name}.fa \
        --offset_mean 0 \
        --offset_std 0 \
        --back_int 1000000 \
        --back_noise_std 50000 \
        --back_count 100 \
        --spike_int 1000000 \
        --spike_noise_std 50000 \
        --spike_count 20 \
        --verbose
done

Generating sample: l1
5268 entries loaded from data/uniprot_ecoli.fasta as background
858 entries loaded from data/uniprot_potato.fasta as spike-in
100 entries picked as background, 20 as spike-in
120 entries written to batch2/l1.fa
Generating sample: l2
5268 entries loaded from data/uniprot_ecoli.fasta as background
858 entries loaded from data/uniprot_potato.fasta as spike-in
100 entries picked as background, 20 as spike-in
120 entries written to batch2/l2.fa
Generating sample: l3
5268 entries loaded from data/uniprot_ecoli.fasta as background
858 entries loaded from data/uniprot_potato.fasta as spike-in
100 entries picked as background, 20 as spike-in
120 entries written to batch2/l3.fa


## Prepare high-spikes

In [5]:
for name in ${sample_names_high}; do
    echo "Generating sample: ${name}"
    lfqtk generate_spikein \
        --background_fa data/uniprot_ecoli.fasta \
        --spikein_fa data/uniprot_potato.fasta \
        --output_fa ${run}/${name}.fa \
        --offset_mean 0 \
        --offset_std 0 \
        --back_int 1000000 \
        --back_noise_std 50000 \
        --back_count 100 \
        --spike_int 2000000 \
        --spike_noise_std 50000 \
        --spike_count 20 \
        --verbose
done

Generating sample: h1
5268 entries loaded from data/uniprot_ecoli.fasta as background
858 entries loaded from data/uniprot_potato.fasta as spike-in
100 entries picked as background, 20 as spike-in
120 entries written to batch2/h1.fa
Generating sample: h2
5268 entries loaded from data/uniprot_ecoli.fasta as background
858 entries loaded from data/uniprot_potato.fasta as spike-in
100 entries picked as background, 20 as spike-in
120 entries written to batch2/h2.fa
Generating sample: h3
5268 entries loaded from data/uniprot_ecoli.fasta as background
858 entries loaded from data/uniprot_potato.fasta as spike-in
100 entries picked as background, 20 as spike-in
120 entries written to batch2/h3.fa


# Simulate MS-data

Generate the OpenMS tool `MSSimulator`.

In [7]:
for sample in ${samples}; do
    echo "Processing sample: ${sample}"
    MSSimulator \
        -in ${run}/${sample}.fa \
        -out_fm ${run}/${sample}.featureXML \
        -out_id ${run}/${sample}.idXML \
        > ${run}/${sample}.mssim.log
done


l1
l2
l3
h1
h2
h3


In [10]:
for sample in ${samples}; do \
    echo "Processing sample: ${sample}"
    IDMapper \
        -id ${run}/${sample}.idXML \
        -in ${run}/${sample}.featureXML \
        -out ${run}/${sample}.mapped.featureXML \
        > ${run}/${sample}.mapped.featureXML.log
done

Processing sample: l1
Processing sample: l2
Processing sample: l3
Processing sample: h1
Processing sample: h2
Processing sample: h3


In [12]:
for sample in ${samples}; do 
    echo "Aligning sample: ${sample}"
    MapAlignerPoseClustering \
        -in ${run}/${sample}.mapped.featureXML \
        -out ${run}/${sample}.mapped.aligned.featureXML \
        > ${run}/${sample}.mapped.aligned.featureXML.log
done

Aligning sample: l1
Aligning sample: l2
Aligning sample: l3
Aligning sample: h1
Aligning sample: h2
Aligning sample: h3


In [None]:
FeatureLinkerUnlabeledQT \
    -in ${run}/*.mapped.aligned.featureXML \
    -out combined.consensusXML

Progress of 'reading input':


# Post-process data

Extract and prepare the consensus data for normalization - transform it to an appropriate format.

In [None]:
TextExporter \
    -in ${run}/combined.consensusXML \
    -out ${run}/combined.linked_features.csv \
    -consensus:features ${run}/combined.features.csv

In [None]:
util_scripts/openms_to_normalyzer.py \
    -i ${run}/merged.features.csv \
    -o ${run}/merged.final.tsv \
    --design ${run}/design.tsv \
    --delim_in '\t' \
    --delim_out '\t'

# Normalization

Perform Loess normalization (maybe even easier in R?)