# IDENTIFICATION OF NOVEL CLASSES OF NEOANTIGENS IN CANCER | Data processing

In [None]:
import os
import pandas as pd

In [None]:
# --- Notebook configuration -------------------------------------------------
# Root working directory; per-project folders live under GENERAL/<project>.
GENERAL = "/users/genomics/marta"  # same as previous step

# Projects to process; extend the list to run additional projects.
projects = ["BLCA"]
# Space-separated form handed to the %%bash cells, which iterate over it.
bash_projects = " ".join(projects)

# Location of genome indexes / annotation files used by the %%bash cells.
# NOTE(review): this path starts with /genomics/users while GENERAL starts
# with /users/genomics -- confirm that both are correct on the cluster.
GENOMEDIR = "/genomics/users/marta/genomes"

# Directory whose fastq_files/ subfolder holds the raw reads.
# NOTE(review): pinned to BLCA, so every entry of `projects` reads the same
# FASTQ files -- revisit if more projects are added.
TCGA_DATA = "/datasets/marta/TCGA/BLCA"

# NOTE(review): the STAR submission cell passes "$CLUSTERDIR", which is not
# defined in this cell -- confirm where it is supposed to come from.


## FastQScreen

In [None]:
%%bash -s "$bash_projects" "$TCGA_DATA" "$GENERAL" "$GENOMEDIR"

# Run FastQ Screen on every *r1* FASTQ file and aggregate the reports with MultiQC.
#   $1  space-separated list of projects
#   $2  directory containing fastq_files/
#   $3  root working directory (per-project output goes to $3/<project>/analysis)
#   $4  genomes directory holding Index_Genomes_Bowtie2/fastq_screen.conf

module load FastQ-Screen/0.14.1
module load Bowtie2/2.4.2-GCC-10.2.0             # Required for Fastqscreen
module load Miniconda3/4.9.2

config=$4/Index_Genomes_Bowtie2/fastq_screen.conf

# $1 is intentionally unquoted: it is split on whitespace into project names.
for proj in $1; do
    echo "$proj"

    # Create the directory that fastq_screen actually writes to (it includes
    # the project name); -p makes the cell safe to re-run.
    outdir=$3/$proj/analysis/02_fastqscreen
    mkdir -p "$outdir"

    # Only files matching *r1*gz are screened.
    for file in "$2"/fastq_files/*r1*gz; do
        # Skip the literal pattern when the glob matches nothing.
        [ -e "$file" ] || continue
        echo "${file##*/}"
        fastq_screen --conf "$config" --outdir "$outdir" "$file"
    done

    # Run MultiQC from inside the output directory; the subshell keeps the
    # working directory of the cell unchanged and skips MultiQC if cd fails.
    ( cd "$outdir" && multiqc . )
done



## FastQC

In [None]:
%%bash -s "$bash_projects" "$TCGA_DATA" "$GENERAL"

# Run FastQC on every *r1* FASTQ file and aggregate the reports with MultiQC.
#   $1  space-separated list of projects
#   $2  directory containing fastq_files/
#   $3  root working directory (per-project output goes to $3/<project>/analysis)

module load Miniconda3/4.9.2
module load FastQC/0.11.7-Java-1.8.0_162

# $1 is intentionally unquoted: it is split on whitespace into project names.
for proj in $1; do
    outdir=$3/$proj/analysis/03_fastqc
    # -p: create missing parents and do not fail when the cell is re-run.
    mkdir -p "$outdir"

    # Only files matching *r1*gz are checked.
    for file in "$2"/fastq_files/*r1*gz; do
        # Skip the literal pattern when the glob matches nothing.
        [ -e "$file" ] || continue
        echo "${file##*/}"
        # --outdir is the documented long form of the output-directory option.
        fastqc --outdir "$outdir" "$file"
    done

    # Run MultiQC from inside the output directory; the subshell keeps the
    # working directory of the cell unchanged and skips MultiQC if cd fails.
    ( cd "$outdir" && multiqc . )

done




## Remove adapters if necessary

## Alignment on *H.sapiens* Genome v.38

The alignment is performed with STAR in 2-pass mode, keeping only uniquely mapped reads.

First the index(es) must be generated

In [None]:
# %%bash -s "$GENOMEDIR"
#
# mkdir -p $1/Index_Genomes_STAR
# mkdir $1/Index_Genomes_STAR/Idx_Gencode_v38_hg38_readlength75

In [None]:
# %%bash -s "$GENERAL" "$GENOMEDIR"
#
# ######################################DONE IN CLUSTER###############################################
#
# sbatch scripts/1_processing/index_STAR.sh $1 $2

Now indexes are stored in `/users/genomics/sergiov/annotations_and_indexes/index_50`

In [None]:
%%bash -s "$GENERAL" "$CLUSTERDIR" "$GENOMEDIR"

# ----------------------------- DONE IN CLUSTER -----------------------------
# Submits the STAR alignment script through sbatch.
# NOTE(review): CLUSTERDIR is not defined in the configuration cell of this
# notebook; if it is undefined in the kernel, $2 will not hold a real path.
# NOTE(review): GENOMEDIR is passed as $3 but is not forwarded to the script.

general_dir=$1
cluster_dir=$2

# Expansions are left unquoted, exactly as the positional parameters were.
sbatch scripts/1_processing/STAR_TCGA.sh $general_dir $cluster_dir

Create a summary file with the number of uniquely mapped reads per sample and the percentage of the input reads they represent.

In [None]:
%%bash -s "$GENERAL" "$bash_projects"

# Build one CSV per project from the STAR *Log.final.out files.
#   $1  root working directory
#   $2  space-separated list of projects
# Columns: Sample,Input_reads,Uniquely_mapped_reads,%alignment

for proj in $2; do
    summary=$1/$proj/results/uniquely_mapped_reads_v41.csv

    # Start from scratch: drop the summary left by a previous run, if any.
    if [ -f "$summary" ] ; then
        rm "$summary"
    fi
    echo -e "Sample,Input_reads,Uniquely_mapped_reads,%alignment" >> $summary

    for log in $1/$proj/analysis/05_STAR/uniquely_mapped_2pass_BAM_files/*Log.final.out; do
        # Sample name = path with everything from "Log" onwards removed,
        # then everything up to and including "BAM_files/" removed.
        sample=${log%%Log*}
        sample=${sample##*BAM_files/}

        # Sixth whitespace-separated field of lines 6, 9 and 10 of the log.
        input_reads=$(sed '6q;d' $log | awk '{print $6}')
        unique_reads=$(sed '9q;d' $log | awk '{print $6}')
        unique_pct=$(sed '10q;d' $log | awk '{print $6}')

        echo -e $sample","$input_reads","$unique_reads","$unique_pct >> $summary
    done
done

## RSeQC

Are the reads stranded (strand-specific)? If so, in which orientation?


In [None]:
%%bash -s "$GENERAL" "$bash_projects" "$GENOMEDIR"

# Run RSeQC infer_experiment.py on every sample listed in the per-project
# uniquely-mapped-reads summary; one <sample>.out file is written per sample.
#   $1  root working directory
#   $2  space-separated list of projects
#   $3  genomes directory holding Annot_files_GTF/

module load Miniconda3/4.9.2

# $2 is intentionally unquoted: it is split on whitespace into project names.
for proj in $2; do

    RSEQC=$1/$proj/analysis/05.2_RSeQC
    mkdir -p "$RSEQC"

    # The summary cell of this notebook writes uniquely_mapped_reads_v41.csv;
    # fall back to the un-suffixed name this cell used to read so that
    # summaries produced earlier keep working.
    summary=$1/$proj/results/uniquely_mapped_reads_v41.csv
    if [ ! -f "$summary" ]; then
        summary=$1/$proj/results/uniquely_mapped_reads.csv
    fi
    if [ ! -f "$summary" ]; then
        echo "No uniquely mapped reads summary found for $proj, skipping" >&2
        continue
    fi

    # Skip the header line; columns are sample, input reads, uniquely mapped
    # reads and percentage. Only sample and uniq are used below.
    tail -n +2 "$summary" | while IFS=, read -r sample input uniq percent; do

        echo "$sample"
        # -s receives the sample's uniquely mapped read count from the summary.
        # stdin is redirected so the command cannot consume the loop's input.
        infer_experiment.py \
            -r "$3/Annot_files_GTF/gencode.v38.primary_assembly.annotation_gene.bed" \
            -i "$1/$proj/analysis/05_STAR/uniquely_mapped_2pass_BAM_files/${sample}Aligned.sortedByCoord.out.bam" \
            -s "$uniq" > "$RSEQC/${sample}.out" < /dev/null

    done
done

In [None]:
%%bash -s "$GENERAL" "$bash_projects"

# output for stringtie
# Collect, per project, the strandness parameter reported for each RSeQC
# *.out file into summary_stringtie.csv (tab-separated: sample, parameter).
#   $1  root working directory
#   $2  space-separated list of projects

# $2 is intentionally unquoted: it is split on whitespace into project names.
for proj in $2; do

    rseqc_dir=$1/$proj/analysis/05.2_RSeQC
    # Write to the per-project directory ($proj, not the whole list in $2) so
    # the file removed here is the same file that is appended to below.
    summary=$rseqc_dir/summary_stringtie.csv

    if [[ -f $summary ]]; then
        rm "$summary"
    fi

    for file in "$rseqc_dir"/*out; do
        # Skip the literal pattern when the glob matches nothing.
        [ -e "$file" ] || continue
        sample=${file##*/}
        sample=${sample%%.out*}
        # The command substitution is left unquoted on purpose so the helper's
        # output is collapsed onto a single line, as before.
        echo -e "$sample\t"$(python ~/Documents/scripts/Chris_decide_strandness_parameter.py -tab "$file" -tool stringtie) >> "$summary"
    done
done

In [None]:
%%bash -s "$GENERAL" "$bash_projects"

# output for featureCounts
# Collect, per project, the strandness parameter reported for each RSeQC
# *.out file into summary_featureCounts.csv (tab-separated: sample, parameter).
#   $1  root working directory
#   $2  space-separated list of projects

# $2 is intentionally unquoted: it is split on whitespace into project names.
for proj in $2; do

    rseqc_dir=$1/$proj/analysis/05.2_RSeQC
    # Write to the per-project directory ($proj, not the whole list in $2) so
    # the file removed here is the same file that is appended to below.
    summary=$rseqc_dir/summary_featureCounts.csv

    if [[ -f $summary ]]; then
        rm "$summary"
    fi

    for file in "$rseqc_dir"/*out; do
        # Skip the literal pattern when the glob matches nothing.
        [ -e "$file" ] || continue
        sample=${file##*/}
        sample=${sample%%.out*}
        # The command substitution is left unquoted on purpose so the helper's
        # output is collapsed onto a single line, as before.
        echo -e "$sample\t"$(python ~/Documents/scripts/Chris_decide_strandness_parameter.py -tab "$file" -tool featureCounts) >> "$summary"
    done
done