# IDENTIFICATION OF NOVEL CLASSES OF NEOANTIGENS IN CANCER | Tumor specific


In [1]:
import os, glob, re, gtfparse
import pandas as pd
import numpy as np
from Bio import SeqIO
from functools import reduce


INFO:numexpr.utils:Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [29]:
GENERAL="/users/genomics/marta" # same as previous step
projects=["BLCA"] # more project can be added here
bash_projects = " ".join(projects)

GENOMEDIR="/genomics/users/marta/genomes"
GENOMEFASTA=GENOMEDIR+"/GRCh38/GRCh38.primary_assembly.genome.fa"

In [30]:
transcript_gene=pd.read_csv(os.path.join(GENOMEDIR,"transcript_gene_v41.txt"), skiprows=1, names=['gene_id', 'gene_name', 'gene_type'])

## Tumor-specific

Select tumor-specific genes based on gene expression threshold

In [8]:
for proj in projects:
    print(proj)
    fc = pd.read_csv(os.path.join(GENERAL,proj,"analysis/07_quantification/straightforward/TPMs_genenames.csv"))
    patients=pd.read_csv(os.path.join(GENERAL,proj,"results/paired_patients.csv"))


    for index,patient in patients.iterrows():
        patient_fc = fc[["gene_name",patient.iloc[1],patient.iloc[2]]]
        filename=str(patient.iloc[0])+"_table_of_counts_TPM_complete.csv"
        patient_fc.to_csv(os.path.join(GENERAL,proj,"analysis/07_quantification/straightforward",str(patient.iloc[0]),filename),index=False)

        tumor_specific_patient = patient_fc.loc[(patient_fc[patient.iloc[1]] <= 0.1) & (patient_fc[patient.iloc[2]] >= 1)]

        filename=str(patient.iloc[0])+"_tumor_specific_genes_TPM.csv"
        try:
            os.makedirs(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",str(patient.iloc[0])))
        except:
            print("Directory exists")
        tumor_specific_patient.to_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",str(patient.iloc[0]),filename),index=False)



BLCA
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists
Directory exists


Summary

In [7]:
%%bash -s "$bash_projects" "$GENERAL"

for proj in $1; do
    OUT=$2/$proj/results/tumor_specific_genes_v41.csv

    if [ -f "$OUT" ] ; then
        rm "$OUT"
    fi
    dir=$2/$proj/analysis/08_tumor_specific/straightforward/
    cd $dir
    echo -e "Patient,Total tumor-specific genes">> $OUT

    for patient in */; do
        ## transcript level
        echo -e $patient","$(cat $patient/*_tumor_specific_genes_TPM.csv | tail -n +2 | cut -d, -f1 | sort | uniq | wc -l)>> $OUT

    done
done

### Generate list tumor specific

**Known**

Coding or non-coding?

For the annotated features, we are differentially interested in the coding and non-coding ones (lncRNA/processed_pseudogenes)

In [19]:
for proj in projects:
    patients_dir = [ f.path for f in os.scandir(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward")) if f.is_dir() ]
    print("%s, %s patients" %(proj, len(patients_dir)))
    genetypes_needed=['lncRNA','processed_pseudogene']

    #### outfile for the common list
    outfile_noncoding=os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/common_tumorspecific_noncoding_genes.csv")
    total_noncoding = pd.DataFrame(columns=['gene_name'])

    outfile_coding=os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/common_tumorspecific_coding_genes.csv")
    total_coding = pd.DataFrame(columns=['gene_name'])

    for p in patients_dir:
        p = str(p.split("/")[-1])
        try:
            csv = pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/"+p+"_tumor_specific_genes_TPM.csv")
            cds_list = list()
            cds_df = pd.DataFrame(columns=['gene_name'])
            nocds_list = list()
            nocds_df = pd.DataFrame(columns=['gene_name'])

            csv_type = csv.merge(transcript_gene, on=['gene_name'], how="inner")
            #### lncRNA + processed pseudogenes
            NOcsv_selected = csv_type[csv_type['gene_type'].isin(genetypes_needed)]
            NOcsv_selected.to_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/"+p+"_tumor_specific_genes_TPM_NOCDS_selected.csv", index=None)

            noncoding = NOcsv_selected[['gene_name','gene_id']]
            total_noncoding = pd.concat([total_noncoding, noncoding], axis=0)

            #### protein coding
            csv_selected = csv_type[csv_type['gene_type'] == "protein_coding"]
            csv_selected.to_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/"+p+"_tumor_specific_genes_TPM_CDS.csv", index=None)

            coding = csv_selected[['gene_name','gene_id']]
            total_coding = pd.concat([total_coding, coding], axis=0)
        except:
            continue

    #### count how many patients have each transcript
    total_noncoding['n'] = total_noncoding.groupby('gene_name')['gene_name'].transform('count')
    total_noncoding.drop_duplicates(inplace=True)
    total_noncoding.sort_values(by=['n'], ascending=False).to_csv(outfile_noncoding, index=None)

    total_coding['n']=total_coding.groupby('gene_name')['gene_name'].transform('count')
    total_coding.drop_duplicates(inplace=True)
    total_coding.sort_values(by=['n'], ascending=False).to_csv(outfile_coding, index=None)


BLCA, 18 patients


## Tissue expression control (GTEx)

In [21]:
GTEx=pd.read_csv(os.path.join(GENOMEDIR,"GTEx/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct"), sep="\t", skiprows=2)
GTEx.drop('Description', inplace=True, axis=1)
GTEx['Name'] = GTEx['Name'].str[:-2]
for proj in projects:

    nocoding = pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/common_tumorspecific_noncoding_genes.csv"))
    to_compare = nocoding.gene_id.values.tolist()
    output = os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/GTEX_validated_noncoding_genes.csv")

    shared = GTEx[GTEx['Name'].isin(to_compare)]
    shared.to_csv(output, index=False)

    coding = pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/common_tumorspecific_coding_genes.csv"))
    to_compare = coding.gene_id.values.tolist()
    output = os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/GTEX_validated_coding_genes.csv")

    shared = GTEx[GTEx['Name'].isin(to_compare)]
    shared.to_csv(output, index=False)

For now, we are only interested in those genes expressed only in **testis** or **ovary**

In [22]:
##  MEDIAN TPM = 0.5
for proj in projects:
    print(proj)
    ### non-coding
    common_non_coding = pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/common_tumorspecific_noncoding_genes.csv"))
    non_coding_GTEx = pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/GTEX_validated_noncoding_genes.csv"))
    to_save = non_coding_GTEx[['Name','Testis','Ovary']]
    non_coding_GTEx.drop(['Testis','Ovary','Name'], axis=1, inplace=True)

    # True False greater than 0
    non_coding_GTEx_05 = non_coding_GTEx.gt(0.5)

    # count True columns (greater than 0)
    non_coding_GTEx['gt'] = non_coding_GTEx_05[(non_coding_GTEx_05 == True)].count(axis='columns')

    # get rows with no grater than 0
    selected_non_coding_GTEx = non_coding_GTEx.loc[non_coding_GTEx['gt'] == 0]
    selected_non_coding_GTEx.drop('gt', axis=1, inplace=True)

    selected_non_coding_GTEx = pd.merge(to_save, selected_non_coding_GTEx , left_index=True, right_index=True)
    identifs_no_healthy_tissues = selected_non_coding_GTEx.Name.values.tolist()
    common_non_coding_noGTEx = common_non_coding[common_non_coding['gene_id'].isin(identifs_no_healthy_tissues)]
    print("Non-Coding ", len(common_non_coding_noGTEx))
    common_non_coding_noGTEx.to_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/common_tumorspecific_noncoding_genes_GTExvalidated_testis+ovary05.csv"), index=False)


    ### coding
    common_coding = pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/common_tumorspecific_coding_genes.csv"))
    coding_GTEx = pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/GTEX_validated_coding_genes.csv"))
    to_save = coding_GTEx[['Name','Testis','Ovary']]
    coding_GTEx.drop(['Testis','Ovary','Name'], axis=1, inplace=True)

    # True False greater than 0
    coding_GTEx_05 = coding_GTEx.gt(0.5)

    # count True columns (greater than 0)
    coding_GTEx['gt'] = coding_GTEx_05[(coding_GTEx_05 == True)].count(axis='columns')

    # get rows with no grater than 0
    selected_coding_GTEx = coding_GTEx.loc[coding_GTEx['gt'] == 0]
    selected_coding_GTEx.drop('gt', axis=1, inplace=True)
    selected_coding_GTEx = pd.merge(to_save, selected_coding_GTEx , left_index=True, right_index=True)
    identifs_no_healthy_tissues = selected_coding_GTEx.Name.values.tolist()
    common_coding_noGTEx = common_coding[common_coding['gene_id'].isin(identifs_no_healthy_tissues)]
    print("Coding ", len(common_coding_noGTEx))
    common_coding_noGTEx.to_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/common_tumorspecific_coding_genes_GTExvalidated_testis+ovary05.csv"), index=False)


BLCA
Non-Coding  979
Coding  143


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_non_coding_GTEx.drop('gt', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_coding_GTEx.drop('gt', axis=1, inplace=True)


Generate mini gtf of:

- lncRNA tumor-specific genes not expressed in adult tissues

- protein-coding tumor-specific not expressed in adult tissues



In [39]:
for proj in projects:
    print(proj)

    ref = pd.read_csv("/users/genomics/sergiov/annotations_and_indexes/gencode.v41.primary_assembly.annotation.gtf", sep="\t", header=None, comment="#")
    patients_dir = [ f.path for f in os.scandir(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward")) if f.is_dir() ]

    NOCDS_GTEx = pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/common_tumorspecific_noncoding_genes_GTExvalidated_testis+ovary05.csv"))
    GTEx_ids = NOCDS_GTEx.gene_id.values.tolist()

    CDS_GTEx = pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/common_tumorspecific_coding_genes_GTExvalidated_testis+ovary05.csv"))
    CDS_GTEx_ids = CDS_GTEx.gene_id.values.tolist()

    for p in patients_dir:
        #### non-coding
        GTEx_full_lncRNA = pd.DataFrame(columns = [0])
        GTEx_full_CDS = pd.DataFrame(columns = [0])
        p = str(p.split("/")[-1])
        if p != "rRNA":
            print(p)
            CSV_REF=pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/" + p + "_tumor_specific_genes_TPM_NOCDS_selected.csv")

            CSV_GTEx = CSV_REF[CSV_REF['gene_id'].isin(GTEx_ids)]
            CSV_GTEx.to_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/" + p + "_tumor_specific_genes_TPM_NOCDS_selected_GTEx05.csv", index=None)
            CSV_GTEx_ids = CSV_GTEx.gene_id.values.tolist()
            for geneid in CSV_GTEx_ids:
                GTEx = ref[ref[8].str.contains(geneid)]
                GTEx_full_lncRNA = pd.concat([GTEx, GTEx_full_lncRNA])

            GTEx_full_lncRNA.drop_duplicates(inplace=True)
            GTEx_full_lncRNA.to_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/" + p + "_tumor_specific_genes_TPM_NOCDS_selected_GTEx05.gtf", sep="\t", header=None, index=None, quoting = 3)

            #### coding
            CSV_REF=pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/" + p + "_tumor_specific_genes_TPM_CDS.csv")

            CSV_GTEx = CSV_REF[CSV_REF['gene_id'].isin(CDS_GTEx_ids)]
            CSV_GTEx.to_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/" + p + "_tumor_specific_genes_TPM_CDS_GTEx05.csv", index=None)
            CSV_GTEx_ids = CSV_GTEx.gene_id.values.tolist()
            for geneid in CSV_GTEx_ids:
                GTEx = ref[ref[8].str.contains(geneid)]
                GTEx_full_CDS = pd.concat([GTEx, GTEx_full_CDS])

            GTEx_full_CDS.drop_duplicates(inplace=True)
            GTEx_full_CDS.to_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/" + p + "_tumor_specific_genes_TPM_CDS_GTEx05.gtf", sep="\t", header=None, index=None, quoting = 3)
            print("Patient %s processed" %(p))


BLCA
TCGA-GD-A3OP
TCGA-K4-A54R
TCGA-GD-A3OQ
TCGA-GC-A3WC
TCGA-CU-A0YN
TCGA-GD-A2C5
TCGA-K4-A3WV
TCGA-BT-A20R
TCGA-BT-A20U
TCGA-GC-A3BM
TCGA-BT-A2LA
TCGA-GC-A6I3
TCGA-BT-A2LB
TCGA-BT-A20Q
TCGA-CU-A0YR
TCGA-BT-A20W
TCGA-K4-A5RI
TCGA-BT-A20N


Summary

In [40]:
%%bash -s "$bash_projects" "$GENERAL"

for proj in $1; do
    OUT=$2/$proj/results/tumor_specific_genes_GTEx_TPM05_v41.csv
    if [ -f "$OUT" ] ; then
        rm "$OUT"
    fi
    dir=$2/$proj/analysis/08_tumor_specific/straightforward
    cd $dir
    echo -e "Patient,Coding genes,Non-coding genes">> $OUT

    for patient in */; do
        echo -e $patient","$(cat $patient/*_tumor_specific_genes_TPM_CDS_GTEx05.csv | tail -n +2 | cut -d, -f1 | sort | uniq | wc -l)","$(cat $patient/*_tumor_specific_genes_TPM_NOCDS_selected_GTEx05.csv | tail -n +2 | wc -l) >> $OUT

    done
done

Get fasta of non-coding

In [41]:
%%bash -s "$GENERAL" "$bash_projects" "$GENOMEFASTA"

export PATH=/genomics/users/marta/tools/gffread-0.12.7.Linux_x86_64/:$PATH

for proj in $2; do
    echo $proj
    for patient in $1/$proj/analysis/08_tumor_specific/straightforward/TCGA*; do
        p=${patient##*/}
        echo $p
        p=${p%%/*}
        echo $p
        file=$1/$proj/analysis/08_tumor_specific/straightforward/${p}/${p}_tumor_specific_genes_TPM_NOCDS_selected_GTEx05.gtf

        #get fasta
        gffread --attrs gene_name,transcript_type,transcript_name -w ${file%%.*}.fa -g $3 $file

        #replace spaces by ;
        sed -i 's/\ /;/g' ${file%%.*}.fa
    done
done

BLCA
TCGA-BT-A20N
TCGA-BT-A20N
TCGA-BT-A20Q
TCGA-BT-A20Q
TCGA-BT-A20R
TCGA-BT-A20R
TCGA-BT-A20U
TCGA-BT-A20U
TCGA-BT-A20W
TCGA-BT-A20W
TCGA-BT-A2LA
TCGA-BT-A2LA
TCGA-BT-A2LB
TCGA-BT-A2LB
TCGA-CU-A0YN
TCGA-CU-A0YN
TCGA-CU-A0YR
TCGA-CU-A0YR
TCGA-GC-A3BM
TCGA-GC-A3BM
TCGA-GC-A3WC
TCGA-GC-A3WC
TCGA-GC-A6I3
TCGA-GC-A6I3
TCGA-GD-A2C5
TCGA-GD-A2C5
TCGA-GD-A3OP
TCGA-GD-A3OP
TCGA-GD-A3OQ
TCGA-GD-A3OQ
TCGA-K4-A3WV
TCGA-K4-A3WV
TCGA-K4-A54R
TCGA-K4-A54R
TCGA-K4-A5RI
TCGA-K4-A5RI


In [48]:
%%bash -s "$GENERAL" "$bash_projects" "$GENOMEDIR"

for proj in $2; do
    echo $proj
    for patient in $1/$proj/analysis/08_tumor_specific/straightforward/TCGA*; do
        p=${patient##*/}
        p=${p%%/*}
        echo $p
        file=$1/$proj/analysis/08_tumor_specific/straightforward/${p}/${p}_tumor_specific_genes_TPM_NOCDS_selected_GTEx05.fa
        awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' < $file > ${file%%.*}_oneline.fa
    done
done


BLCA
TCGA-BT-A20N
TCGA-BT-A20Q
TCGA-BT-A20R
TCGA-BT-A20U
TCGA-BT-A20W
TCGA-BT-A2LA
TCGA-BT-A2LB
TCGA-CU-A0YN
TCGA-CU-A0YR
TCGA-GC-A3BM
TCGA-GC-A3WC
TCGA-GC-A6I3
TCGA-GD-A2C5
TCGA-GD-A3OP
TCGA-GD-A3OQ
TCGA-K4-A3WV
TCGA-K4-A54R
TCGA-K4-A5RI


for the fasta sequences, keep the longest transcript

In [103]:
for proj in projects:
    patients = [o for o in os.listdir(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/")) if os.path.isdir(os.path.join(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/"),o))]

    for p in patients:
        df = pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/"+p+"_tumor_specific_genes_TPM_NOCDS_selected_GTEx05_oneline.fa", header=None, sep="\t")

        headers = df[df[0].str.startswith(">")]
        headers = headers[0].values.tolist()

        sequences = df[~df[0].str.startswith(">")]
        sequences = sequences[0].values.tolist()

        fasta = pd.DataFrame({'header':headers, 'seq':sequences})
        fasta['length'] = fasta['seq'].str.len()
        fasta['gene_name'] = fasta['header'].str.split("=", expand=True)[1]
        fasta['gene_name'] = fasta['gene_name'].str.split(";", expand=True)[0]
        
        # select longest transcript per gene
        longest = fasta.groupby('gene_name').max('length').reset_index()

        fasta_longest = fasta.merge(longest, on=['gene_name','length'])
        fasta_longest = fasta_longest[['header','seq']]
        
        with open(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/"+p+"_tumor_specific_genes_TPM_NOCDS_selected_GTEx05_oneline_longest.fa", 'w') as output_file:
            for index,row in fasta_longest.iterrows():
                output_file.write(row['header'] + '\n' + row['seq'] + '\n')
        
        
        

In [104]:
%%bash -s "$GENERAL" "$bash_projects" "$GENOMEDIR"

## count fasta sequences to prove the process has been correctly done
for proj in $2; do
    echo $proj
    for patient in $1/$proj/analysis/08_tumor_specific/straightforward/TCGA*; do
        p=${patient##*/}
        p=${p%%/*}
        echo -e $p"\t"$(cat $1/$proj/analysis/08_tumor_specific/straightforward/${p}/${p}_tumor_specific_genes_TPM_NOCDS_selected_GTEx05.csv | tail -n +2 | wc -l)"\t"$(grep '>' $1/$proj/analysis/08_tumor_specific/straightforward/${p}/${p}_tumor_specific_genes_TPM_NOCDS_selected_GTEx05_oneline_longest.fa | cut -d";" -f1 | sort | uniq | wc -l)
    done
done

BLCA
TCGA-BT-A20N	86	86
TCGA-BT-A20Q	41	41
TCGA-BT-A20R	142	142
TCGA-BT-A20U	34	34
TCGA-BT-A20W	35	35
TCGA-BT-A2LA	446	447
TCGA-BT-A2LB	54	54
TCGA-CU-A0YN	66	66
TCGA-CU-A0YR	33	33
TCGA-GC-A3BM	33	33
TCGA-GC-A3WC	120	120
TCGA-GC-A6I3	44	44
TCGA-GD-A2C5	15	15
TCGA-GD-A3OP	7	7
TCGA-GD-A3OQ	25	25
TCGA-K4-A3WV	152	152
TCGA-K4-A54R	44	44
TCGA-K4-A5RI	58	58


Get fasta of coding CDS

In [107]:
%%bash -s "$GENERAL" "$bash_projects" "$GENOMEFASTA"

export PATH=/genomics/users/marta/tools/gffread-0.12.7.Linux_x86_64/:$PATH

for proj in $2; do
    echo $proj
    for patient in $1/$proj/analysis/08_tumor_specific/straightforward/TCGA*; do
        p=${patient##*/}
        p=${p%%/*}
        echo $p
        file=$1/$proj/analysis/08_tumor_specific/straightforward/${p}/${p}_tumor_specific_genes_TPM_CDS_GTEx05.gtf

        #get fasta
        gffread --attrs gene_name,transcript_name -x ${file%%.*}.fa -g $3 $file

        #replace spaces by ;
        sed -i 's/\ /;/g' ${file%%.*}.fa
    done
done

BLCA
TCGA-BT-A20N
TCGA-BT-A20Q
TCGA-BT-A20R
TCGA-BT-A20U
TCGA-BT-A20W
TCGA-BT-A2LA
TCGA-BT-A2LB
TCGA-CU-A0YN
TCGA-CU-A0YR
TCGA-GC-A3BM
TCGA-GC-A3WC
TCGA-GC-A6I3
TCGA-GD-A2C5
TCGA-GD-A3OP
TCGA-GD-A3OQ
TCGA-K4-A3WV
TCGA-K4-A54R
TCGA-K4-A5RI


In [108]:
%%bash -s "$GENERAL" "$bash_projects" "$GENOMEDIR"

for proj in $2; do
    echo $proj
    for patient in $1/$proj/analysis/08_tumor_specific/straightforward/TCGA*; do
        p=${patient##*/}
        p=${p%%/*}
        echo $p
        file=$1/$proj/analysis/08_tumor_specific/straightforward/${p}/${p}_tumor_specific_genes_TPM_CDS_GTEx05.fa
        awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' < $file > ${file%%.*}_oneline.fa
    done
done


BLCA
TCGA-BT-A20N
TCGA-BT-A20Q
TCGA-BT-A20R
TCGA-BT-A20U
TCGA-BT-A20W
TCGA-BT-A2LA
TCGA-BT-A2LB
TCGA-CU-A0YN
TCGA-CU-A0YR
TCGA-GC-A3BM
TCGA-GC-A3WC
TCGA-GC-A6I3
TCGA-GD-A2C5
TCGA-GD-A3OP
TCGA-GD-A3OQ
TCGA-K4-A3WV
TCGA-K4-A54R
TCGA-K4-A5RI


for the fasta sequences, keep the longest transcript

In [111]:
for proj in projects:
    patients = [o for o in os.listdir(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/")) if os.path.isdir(os.path.join(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward/"),o))]

    for p in patients:
        try:
            df = pd.read_csv(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/"+p+"_tumor_specific_genes_TPM_CDS_GTEx05_oneline.fa", header=None, sep="\t")

            headers = df[df[0].str.startswith(">")]
            headers = headers[0].values.tolist()

            sequences = df[~df[0].str.startswith(">")]
            sequences = sequences[0].values.tolist()

            fasta = pd.DataFrame({'header':headers, 'seq':sequences})
            fasta['length'] = fasta['seq'].str.len()
            fasta['gene_name'] = fasta['header'].str.split("=", expand=True)[1]
            fasta['gene_name'] = fasta['gene_name'].str.split(";", expand=True)[0]

            # select longest transcript per gene
            longest = fasta.groupby('gene_name').max('length').reset_index()

            fasta_longest = fasta.merge(longest, on=['gene_name','length'])
            fasta_longest = fasta_longest[['header','seq']]

            with open(os.path.join(GENERAL,proj,"analysis/08_tumor_specific/straightforward",p)+"/"+p+"_tumor_specific_genes_TPM_CDS_GTEx05_oneline_longest.fa", 'w') as output_file:
                for index,row in fasta_longest.iterrows():
                    output_file.write(row['header'] + '\n' + row['seq'] + '\n')
        except:
            print("Patient ",p," has no protein-coding tumor-specific genes")
        
        

Patient  TCGA-GC-A3BM  has no protein-coding tumor-specific genes
Patient  TCGA-BT-A20W  has no protein-coding tumor-specific genes


In [113]:
%%bash -s "$GENERAL" "$bash_projects" "$GENOMEDIR"

## count fasta sequences to prove the process has been correctly done
for proj in $2; do
    for patient in $1/$proj/analysis/08_tumor_specific/straightforward/TCGA*; do
        p=${patient##*/}
        p=${p%%/*}
        echo -e $p"\t"$(cat $1/$proj/analysis/08_tumor_specific/straightforward/${p}/${p}_tumor_specific_genes_TPM_CDS_GTEx05.csv | tail -n +2 | wc -l)"\t"$(grep '>' $1/$proj/analysis/08_tumor_specific/straightforward/${p}/${p}_tumor_specific_genes_TPM_CDS_GTEx05_oneline_longest.fa | cut -d";" -f1 | sort | uniq | wc -l)
    done
done

TCGA-BT-A20N	9	12
TCGA-BT-A20Q	2	2
TCGA-BT-A20R	34	47
TCGA-BT-A20U	5	5


grep: /users/genomics/marta/BLCA/analysis/08_tumor_specific/straightforward/TCGA-BT-A20W/TCGA-BT-A20W_tumor_specific_genes_TPM_CDS_GTEx05_oneline_longest.fa: No such file or directory


TCGA-BT-A20W	0	0
TCGA-BT-A2LA	68	79
TCGA-BT-A2LB	12	15
TCGA-CU-A0YN	9	10
TCGA-CU-A0YR	7	8


grep: /users/genomics/marta/BLCA/analysis/08_tumor_specific/straightforward/TCGA-GC-A3BM/TCGA-GC-A3BM_tumor_specific_genes_TPM_CDS_GTEx05_oneline_longest.fa: No such file or directory


TCGA-GC-A3BM	0	0
TCGA-GC-A3WC	22	31
TCGA-GC-A6I3	10	13
TCGA-GD-A2C5	4	5
TCGA-GD-A3OP	1	2
TCGA-GD-A3OQ	9	11
TCGA-K4-A3WV	21	26
TCGA-K4-A54R	7	7
TCGA-K4-A5RI	12	18
