# IDENTIFICATION OF NOVEL CLASSES OF NEOANTIGENS IN CANCER | Data preprocessing

In [None]:
import os
import pandas as pd
from Bio import SeqIO


In [None]:
# Establish the paths used by the pipeline and create the working folders.
# This cell is personalized per dataset: adjust PROJECT and the base paths as needed.
def _ensure_dir(path, exists_message):
    """Create *path* (with any missing parents) on a best-effort basis.

    Prints *exists_message* when the path is already present, and prints the
    actual OS error for any other failure (permissions, read-only file
    system, ...) instead of misreporting it as "already exists".
    """
    try:
        os.makedirs(path)
    except FileExistsError:
        print(exists_message)
    except OSError as err:
        print("Could not create %s: %s" % (path, err))


PROJECT="BLCA"
GENERAL="/users/genomics/marta"
DIR=os.path.join(GENERAL,PROJECT) # path where to store all the intermediate steps and outputs of the pipeline
_ensure_dir(DIR, "Directory for %s already exists" %PROJECT)

TCGA_DATA="/datasets/marta/TCGA/BLCA"

FASTQDIR=os.path.join(TCGA_DATA,"fastq_files") # path where to store fastq files
_ensure_dir(FASTQDIR, "Fastq_files directory exists")

GENOMEDIR="/genomics/users/marta/genomes"

# Create each output folder independently: sharing one try block meant that an
# already-existing "analysis" folder prevented "results" from ever being created.
for _subdir in ("analysis", "results"):
    _ensure_dir(os.path.join(DIR, _subdir), "Directory exists")
#os.makedirs(os.path.join(DIR,"scripts"))



## Convert BAM files to FASTQ

In [None]:
%%bash -s "$PROJECT" "$TCGA_DATA" "$DIR"
# Convert the normal and tumor BAM files of every patient listed in
# paired_patients.csv into paired-end FASTQ files with Picard SamToFastq.
# Positional arguments: $1 = PROJECT (unused here), $2 = TCGA_DATA, $3 = DIR (unused here).
# The CSV is expected to have a header row followed by "patient,normal,tumor" rows,
# where normal/tumor are sub-directory names under TCGA_DATA.

module load Java/11.0.2
module load picard/2.25.1-Java-11

data_dir=$2

# cat $2/patients.csv | tail -n +2 | while IFS=, read patient normal tumor gender age; do
# tail -n +2 skips the header row; read -r keeps backslashes in fields intact.
tail -n +2 "$data_dir/paired_patients.csv" | while IFS=, read -r patient normal tumor ; do

    # NOTE(review): the output names depend only on the patient, so if a sample
    # directory holds more than one BAM each conversion overwrites the previous FASTQ pair.
    for bam in "$data_dir/$normal"/*.bam; do
        # An unmatched glob stays as the literal pattern; do not hand that to Picard.
        [[ -e $bam ]] || continue
        echo "${bam##*/}"
        java -jar "$EBROOTPICARD/picard.jar" SamToFastq I="$bam" F="$data_dir/${normal}/${patient}_normal_r1.fastq" F2="$data_dir/${normal}/${patient}_normal_r2.fastq"
    done
    for bam in "$data_dir/$tumor"/*bam; do
        [[ -e $bam ]] || continue
        echo "${bam##*/}"
        java -jar "$EBROOTPICARD/picard.jar" SamToFastq I="$bam" F="$data_dir/${tumor}/${patient}_tumor_r1.fastq" F2="$data_dir/${tumor}/${patient}_tumor_r2.fastq"
    done
done

In [None]:
%%bash -s "$TCGA_DATA"
# Compress every per-sample FASTQ file found one level below TCGA_DATA and
# collect a copy of each .gz in TCGA_DATA/fastq_files.
# Positional argument: $1 = TCGA_DATA.

outdir=$1/fastq_files
# Make sure the destination is a directory; otherwise cp would create a regular
# file called "fastq_files" and overwrite it on every iteration.
mkdir -p "$outdir"

for file in "$1"/*/*.fastq; do
    # An unmatched glob stays as the literal pattern; skip it.
    [[ -e $file ]] || continue
    echo "${file##*/}"
    if [[ ! -f  ${file}.gz  ]]; then
        echo "starting gzip"
        # Copy only when compression succeeded, and report failures instead of printing "done".
        if gzip "$file" && cp "${file}.gz" "$outdir/"; then
            echo "done"
        else
            echo "failed to gzip or copy ${file##*/}" >&2
        fi
    else
        echo "previously gziped"
    fi
done


### Generate a CSV file with patient IDs

In [None]:
# Build the patient table from the compressed FASTQ files. File names start with
# the patient ID followed by "_" (e.g. <patient>_normal_r1.fastq.gz), and each
# patient has several files (normal/tumor, r1/r2), so the IDs are collected into
# a set to get exactly one row per patient; sorting makes the output deterministic.
fastq_dir = os.path.join(TCGA_DATA,"fastq_files")
full_files = sorted({x.split("_")[0] for x in os.listdir(fastq_dir) if x.endswith(".gz")})

patients = pd.DataFrame({'patient':full_files})
patients['normal'] = patients['patient'] + "_normal"
patients['tumor'] = patients['patient'] + "_tumor"
# index=False keeps the row index out of the CSV.
patients.to_csv(os.path.join(DIR,"results/paired_patients.csv"), index=False)
patients