# IDENTIFICATION OF NOVEL CLASSES OF NEOANTIGENS IN CANCER | Transcriptome Assembly

In [None]:
# Load the rpy2 IPython extension so that the %%R cells of this notebook can run.
%load_ext rpy2.ipython

## 0. Data preparation

This first cell should be modified according to the data that is going to be used. It is only available for datasets with paired samples per patient: normal and tumor. 

The **PROJECT** variable should be changed according to the GEO identifier.

From the GEO website, the *SRR_Acc_List.txt* and *SraRunTable.txt* files should be manually downloaded and saved in a directory. This directory should be specified in the **SRR** variable.

The pipeline is developed with the intention of running the most computationally expensive programs in a cluster. 
In this case, a Gluster File System has been used. The code to run on a cluster may need to be adapted.

In [None]:
import os,re,shutil,glob,openpyxl
import pandas as pd
from Bio import SeqIO
from gtfparse import read_gtf
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib import pyplot as plt
from IPython.display import Image

PROJECT="GSE193567" # GEO accession of the dataset to analyse

# Path where all the intermediate steps and outputs of the pipeline are stored.
# exist_ok=True keeps the cell re-runnable while still surfacing real errors
# (e.g. permissions), which a bare `except:` would have hidden.
DIR=os.path.join("data",PROJECT)
os.makedirs(DIR, exist_ok=True)

CLUSTERDIR="/users/genomics/marta" #path where to run and store things that run in a cluster
SRR="/projects_eg/datasets/"+PROJECT # path where SRR_Acc_List.txt and SraRunTable.txt are stored. It should be inside a folder named with GEO accession
SRR_ACC=os.path.join(SRR,"SRR_Acc_List.txt")
SRA=os.path.join(SRR,"SraRunTable.txt")

FASTQDIR=os.path.join(DIR,"fastq_files") #path where to store fastq files
os.makedirs(FASTQDIR, exist_ok=True)

# Keep a copy of the GEO run tables next to the fastq files
shutil.copy(SRR_ACC, os.path.join(FASTQDIR,"SRR_Acc_List.txt"))
shutil.copy(SRA, os.path.join(FASTQDIR,"SraRunTable.txt"))

GENOMEDIR="genomes"

# Each output directory is created independently, so an already existing
# "analysis" directory no longer prevents "results" from being created.
# NOTE: DIR/scripts is not created here; the sbatch cell expects the scripts
# under DIR/scripts to be in place already.
for subdir in ("analysis","results"):
    os.makedirs(os.path.join(DIR,subdir), exist_ok=True)



In [None]:
%%R

# Attach the R packages up front for the %%R cells.
# require() is used (not library()), so a missing package only raises a
# warning and the remaining packages are still attached.
# purrr is currently left out of the list.
pkgs <- c("tidyr", "dplyr", "rtracklayer", "ggplot2",
          "RColorBrewer", "devtools", "stringr", "edgeR")
for (pkg in pkgs) {
  require(pkg, character.only = TRUE)
}

Get a three-column file with patient_id, normal_id and tumor_id for later use.

In [None]:
# Build the patient / normal run / tumor run table.
# The run table is read from FASTQDIR, which is where the setup cell copies
# SraRunTable.txt (it is not copied to DIR itself).
metadata = pd.read_csv(os.path.join(FASTQDIR,"SraRunTable.txt"))
metadata = metadata[['Run','Individual','tissue']]

# One frame per tissue type, keyed by the individual
normal = metadata.loc[metadata['tissue'] == "non-tumor", ['Individual','Run']]
tumor = metadata.loc[metadata['tissue'] == "tumor", ['Individual','Run']].rename(columns ={'Run' : 'Run_t'})

# Inner join: only individuals with both a normal and a tumor run are kept
patients = pd.merge(normal, tumor, on=['Individual'])
# Keep the second space-separated token of 'Individual'
# (assumes values shaped like "<label> <id>" - TODO confirm for other datasets)
patients['Individual'] = patients['Individual'].str.split(' ').str[1]

# Headerless CSV (patient,normal,tumor) consumed by the bash cells
patients_summary = os.path.join(DIR,"results/patients.csv")
patients.to_csv(patients_summary,index=False, header=False)

patients_id=list(patients.iloc[:,0])
normal_id=list(patients.iloc[:,1])
tumor_id=list(patients.iloc[:,2])

patients

## 06.Transcript assembly with StringTie

In [None]:
%%bash -s "$DIR"

# Create the StringTie output directory; -p keeps the cell re-runnable
# (a plain mkdir exits non-zero when the directory already exists).
mkdir -p "$1/analysis/06_stringtie"

In [None]:
%%bash -s "$PROJECT" "$CLUSTERDIR" "$DIR" "$GENOMEDIR"

######################################DONE IN CLUSTER###############################################

# Submit the StringTie loop script with sbatch.
# Positional arguments: $1=PROJECT, $2=CLUSTERDIR, $3=DIR, $4=GENOMEDIR.
# They are quoted so that an empty value or a path containing spaces cannot
# shift or split the arguments received by the script.
sbatch "$3/scripts/1_transcriptome_assembly/loop_stringtie.sh" "$1" "$2" "$3" "$4"

Make summary files with transcriptome

In [None]:
%%bash -s "$DIR"
## Total transcriptome + Presumably annotated + Presumably novel
# Writes one row per StringTie GTF: sample, number of transcript lines,
# transcript lines mentioning "reference" (annotated) and the rest (novel).
OUT="$1/results/transcriptome.txt"
# ">" truncates the table so that every run rebuilds it from scratch
echo -e "Sample\tTotal_transcripts\tAnnotated_transcripts\tNovel_transcript" > "$OUT"

for file in "$1"/analysis/06_stringtie/*gtf; do
    [ -e "$file" ] || continue   # the glob matched nothing
    # Strip the directory first, so that dots or "cov" in the path cannot interfere
    name="${file##*/}"
    # Skip coverage files and the per-sample *_genes.gtf tables present on a re-run
    if [[ $name == *cov* || $name == *_genes.gtf ]]; then
        continue
    fi
    name="${name%%.*}"
    # Single pass over the file instead of three awk | grep | wc pipelines
    counts=$(awk '($3 == "transcript") { total++; if ($0 ~ /reference/) annotated++ }
                  END { printf "%d\t%d\t%d", total, annotated, total - annotated }' "$file")
    printf '%s\t%s\n' "$name" "$counts" >> "$OUT"
done


Pass from transcripts to genes

In [None]:
%%R -i DIR

# Collapse every per-sample StringTie GTF (SRR<digits>.gtf) into a two-column
# table of unique (gene_id, ref_gene_name) pairs, written as <sample>_genes.gtf.
stringtie_dir <- file.path(DIR, "analysis/06_stringtie")

# read the stringtie files (pattern anchored so that only SRR<digits>.gtf matches)
gtf_filepaths <- list.files(stringtie_dir, pattern = "^SRR[0-9]+\\.gtf$", full.names = TRUE)
gtf_filenames <- basename(gtf_filepaths)

# seq_along() yields no iterations when no file is found (1:length() would yield 1, 0)
for (i in seq_along(gtf_filepaths)) {
  # sample name = file name without the extension, whatever the accession length
  name <- sub("\\.gtf$", "", gtf_filenames[i])
  gtf.gr <- rtracklayer::import(gtf_filepaths[i])
  gtf.df <- as.data.frame(gtf.gr)
  # a file without any ref_gene_name attribute has no such column: treat all as NA
  if (!"ref_gene_name" %in% colnames(gtf.df)) {
    gtf.df$ref_gene_name <- NA
  }
  genes <- unique(gtf.df[, c("gene_id", "ref_gene_name")])
  # get one of each to know the number of genes if we consider one transcript per gene
  filename <- file.path(stringtie_dir, paste0(name, "_genes.gtf"))
  write.table(genes, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
}


In [None]:
%%bash -s "$DIR"
## Total genes + Presumably annotated + Presumably novel
# Writes one row per *_genes.gtf table: sample, number of rows, rows whose
# second column is not "NA" (annotated) and rows whose second column is "NA" (novel).
OUT="$1/results/transcriptome_genes.txt"
# ">" truncates the table so that every run rebuilds it from scratch
echo -e "Sample\tTotal_genes\tAnnotated_genes\tNovel_genes" > "$OUT"

for file in "$1"/analysis/06_stringtie/*_genes.gtf; do
    [ -e "$file" ] || continue   # the glob matched nothing
    # Strip the directory first, so that dots or "cov" in the path cannot interfere
    name="${file##*/}"
    if [[ $name == *cov* ]]; then
        continue
    fi
    name="${name%%.*}"
    # Single pass over the file instead of three separate scans
    counts=$(awk '{ if ($2 == "NA") novel++; else annotated++ }
                  END { printf "%d\t%d\t%d", NR, annotated, novel }' "$file")
    printf '%s\t%s\n' "$name" "$counts" >> "$OUT"
done


From here, I **need** to know whether what is considered novel is really novel. Intersect with the annotated genes: the features that do not overlap at all are certainly novel, so they are kept.

`$GENOMEDIR/Annot_files_GTF/gencode.v38.primary_assembly.annotation.gtf`

In [None]:
%%bash -s "$DIR"

# Split every per-sample StringTie GTF by its strand column (field 7):
# "." goes to NOoriented/, anything else goes to oriented/.
STRINGTIE="$1/analysis/06_stringtie"
# -p keeps the cell re-runnable
mkdir -p "$STRINGTIE/NOoriented" "$STRINGTIE/oriented"

for file in "$STRINGTIE"/*.gtf; do
    [ -e "$file" ] || continue   # the glob matched nothing
    name=${file##*/}
    # Skip coverage and gene-table files; tested on the file name only, so a
    # directory containing "cov" or "gene" does not exclude every sample
    if [[ $name == *cov* || $name == *gene* ]]; then
        continue
    fi
    name=${name%%.*}
    #separation of features into not oriented / oriented
    awk '($7 == ".") {print}' "$file" > "$STRINGTIE/NOoriented/${name}_NOoriented.gtf"
    awk '($7 != ".") {print}' "$file" > "$STRINGTIE/oriented/${name}_oriented.gtf"
done

**FORCING STRANDNESS**

For the oriented novel transcripts (strand known), we force strandedness when checking the overlap with the reference.

In [None]:
%%bash -s "$DIR" "$GENOMEDIR"

# Same BEDTools module as the unstranded intersection cell, so that both
# halves of the analysis run with one version.
module load BEDTools/2.27.1

REF="$2/Annot_files_GTF/gencode.v38.primary_assembly.annotation.gtf"
ORIENTED="$1/analysis/06_stringtie/oriented"

# Only the <sample>_oriented.gtf inputs are processed; a plain *.gtf glob would
# also pick up the *_novel.gtf / *_novel_complete.gtf outputs on a re-run.
for file in "$ORIENTED"/*_oriented.gtf; do
    [ -e "$file" ] || continue   # the glob matched nothing
    name=${file##*/}
    name=${name%%.*}
    #select features that do not overlap at all with the gencode annotation (-v), on the same strand (-s)
    bedtools intersect -v -s -a "$file" -b "$REF" > "$ORIENTED/${name}_novel.gtf"
done

I want to see if there are orphan exons ("alone exons"); I don't want them, only full transcripts. Check each transcript line, store its gene id (STRG.XX), and see whether the exons that follow carry the same id. If so, write them to the new file; if not, skip them.

In [None]:
# Keep only complete transcripts: every "transcript" line is written, and an
# "exon" line is written only when its gene_id equals the gene_id of the most
# recent "transcript" line. Exons left without their transcript are dropped.
oriented_dir = os.path.join(DIR,"analysis/06_stringtie/oriented")

for file in os.listdir(oriented_dir):
    if not file.endswith("novel.gtf"):
        continue
    outname=file[:-4]+"_complete.gtf"
    with open(os.path.join(oriented_dir,file)) as inp, \
         open(os.path.join(oriented_dir,outname), 'w') as out:
        t_gene_id = None
        # Stream the file instead of loading it whole with readlines()
        for line in inp:
            tab_split_line = line.split("\t")
            # Blank or truncated lines have no feature column: skip them
            if len(tab_split_line) < 3:
                continue
            feature = tab_split_line[2]
            if feature not in ("transcript", "exon"):
                continue
            # First quoted value before the first ';' (the gene_id in StringTie output)
            match = re.search(r'"([^"]*)"', line.split(";", 1)[0])
            if match is None:
                continue
            gene_id = match.group(1)
            if feature == "transcript":
                t_gene_id = gene_id
                out.write(line)
            elif gene_id == t_gene_id:
                out.write(line)


**WITHOUT FORCING STRANDNESS**

For the novel transcripts that are NOT oriented (strand unknown), we do not force strandedness when checking the overlap with the reference.

In [None]:
%%bash -s "$DIR" "$GENOMEDIR"

module load BEDTools/2.27.1

REF="$2/Annot_files_GTF/gencode.v38.primary_assembly.annotation.gtf"
NOORIENTED="$1/analysis/06_stringtie/NOoriented"

# Only the <sample>_NOoriented.gtf inputs are processed; a plain *.gtf glob would
# also pick up the *_novel.gtf / *_novel_complete.gtf outputs on a re-run.
for file in "$NOORIENTED"/*_NOoriented.gtf; do
    [ -e "$file" ] || continue   # the glob matched nothing
    name=${file##*/}
    name=${name%%.*}
    #select features that do not overlap at all with the gencode annotation (-v), regardless of strand
    bedtools intersect -v -a "$file" -b "$REF" > "$NOORIENTED/${name}_novel.gtf"
done

I want to see if there are orphan exons ("alone exons"); I don't want them, only full transcripts. Check each transcript line, store its gene id (STRG.XX), and see whether the exons that follow carry the same id. If so, write them to the new file; if not, skip them.

In [None]:
# Keep only complete transcripts among the unstranded features: every
# "transcript" line is written, and an "exon" line is written only when its
# gene_id equals the gene_id of the most recent "transcript" line.
nooriented_dir = os.path.join(DIR,"analysis/06_stringtie/NOoriented")

for file in os.listdir(nooriented_dir):
    if not file.endswith("novel.gtf"):
        continue
    outname=file[:-4]+"_complete.gtf"
    with open(os.path.join(nooriented_dir,file)) as inp, \
         open(os.path.join(nooriented_dir,outname), 'w') as out:
        t_gene_id = None
        # Stream the file instead of loading it whole with readlines()
        for line in inp:
            tab_split_line = line.split("\t")
            # Blank or truncated lines have no feature column: skip them
            if len(tab_split_line) < 3:
                continue
            feature = tab_split_line[2]
            if feature not in ("transcript", "exon"):
                continue
            # First quoted value before the first ';' (the gene_id in StringTie output)
            match = re.search(r'"([^"]*)"', line.split(";", 1)[0])
            if match is None:
                continue
            gene_id = match.group(1)
            if feature == "transcript":
                t_gene_id = gene_id
                out.write(line)
            elif gene_id == t_gene_id:
                out.write(line)


Concatenate both results

In [None]:
%%bash -s "$DIR" "$patients_summary"

STRINGTIE="$1/analysis/06_stringtie"
# -p keeps the cell re-runnable
mkdir -p "$STRINGTIE/intersect_novel"

# $2 is the headerless CSV with rows patient,normal,tumor
# read -r keeps backslashes literal
while IFS=, read -r patient normal tumor; do
    tumor=${tumor%$'\r'}   # tolerate CRLF line endings in the CSV
    #concatenate both results in order to obtain a gtf file with all novel transcripts considered
    for sample in "$normal" "$tumor"; do
        cat "$STRINGTIE/oriented/${sample}_oriented_novel_complete.gtf" \
            "$STRINGTIE/NOoriented/${sample}_NOoriented_novel_complete.gtf" \
            > "$STRINGTIE/intersect_novel/${sample}_novel.gtf"
    done
done < "$2"


Make summary file with novel genes per sample

In [None]:
%%bash -s "$DIR"

# Summary table: one row per sample with the number of "transcript" lines
# found in its concatenated novel GTF.
OUT=$1/results/real_novel_genes.txt
# Drop the table left by a previous run before appending to it
[ -f "$OUT" ] && rm "$OUT"
echo -e "sample\tnovel_genes" >> $OUT 

for gtf in $1/analysis/06_stringtie/intersect_novel/*_novel.gtf; do
    # sample id = file name up to the first underscore
    sample=${gtf##*/}
    sample=${sample%%_*}
    n_transcripts=$(awk '($3 == "transcript")' $gtf | wc -l)
    echo -e ${sample}"\t"${n_transcripts} >> $OUT
done

Create reference transcriptome file per sample.
For the annotated features, we consider those from gencode, stringtie information is only used for the novel transcriptome assembly

In [None]:
%%bash -s "$DIR" "$GENOMEDIR"

# Build one sorted annotation per sample: the sample's novel features
# concatenated with the gencode reference, then sorted with gff3sort.pl.
export PATH=/genomics/users/marta/tools/gff3sort-master:$PATH

REF="$2/Annot_files_GTF/gencode.v38.primary_assembly.annotation.gtf"
OUTDIR="$1/analysis/06_stringtie/stringtie_reference_annotations"
# -p keeps the cell re-runnable
mkdir -p "$OUTDIR"

for file in "$1"/analysis/06_stringtie/intersect_novel/*novel.gtf; do
    [ -e "$file" ] || continue   # the glob matched nothing
    name=${file##*/}
    name=${name%%_*}
    merged="${OUTDIR}/${name}_reference_annotation.gtf"
    sorted="${OUTDIR}/${name}_reference_annotation_sorted.gtf"
    cat "$file" "$REF" > "$merged"
    #sort the newly assembled transcriptome
    if gff3sort.pl --precise --chr_order natural "$merged" > "$sorted"; then
        # the unsorted intermediate is only removed once sorting succeeded
        rm "$merged"
    else
        echo "gff3sort.pl failed for ${name}; unsorted file kept at ${merged}" >&2
        # do not leave a truncated sorted file behind for downstream steps
        rm -f "$sorted"
    fi
done