In [1]:
import os,re,glob,csv
import pandas as pd
import numpy as np
from collections import Counter
from rna_seq_normalization import Normalization as Norm
from functools import reduce
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47"
specie = "human"
# GENOMEDIR = "/genomics/users/marta/genomes"
GENOMEDIR = "/data/genomics/marta/genomes"

### cancer data
cancer_dir = users_dir + "/cancers"
raw_cancer_dir="/users/genomics/marta/TCGA_RNASeq"
tcga_projects=["TCGA-BRCA","TCGA-LUSC","TCGA-PRAD","TCGA-KIRC","TCGA-LUAD","TCGA-BLCA"]#,"TCGA-LIHC"]
other_projects=["GSE102101_KIRC","GSE133624_BLCA","GSE22260_PRAD","PRJEB2449_PRAD","SRP238334_KIRC","GSE214846_LIHC","GSE229705_LUAD","SRP107326_COAD","TCGA_COAD"]
# deleted=["GSE103001_BRCA","GSE89223_PRAD"]
manuscript_projects = ["liver_adjacent_totalRNA_LIHC","hcc_normal_totalRNA_LIHC","GSE193567_LIHC","LIHC_TCGA_LIHC"]
all_projects = tcga_projects + other_projects + manuscript_projects

cancertypes = ["BRCA","BLCA","LUAD","KIRC","PRAD","LUSC","LIHC","COAD"]
## annotation file
annotation="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/gencode.v47.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/transID_geneID_isoforms_selected.1to1.csv")

def count_greater_than_one(row):
    return (row > 1).sum()

## Quantify with TCGAData and new reference
`featureCounts_newRef_cancer.sh`

output in: `/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/cancers/featureCounts`

## TPM

In [4]:
for file in os.listdir(os.path.join(cancer_dir,"featureCounts")):
    if file.endswith(".txt"):

        proj = file.split("featureCounts_")[-1]
        cancer_type = proj[:-4]
        print(cancer_type)
        toc = pd.read_csv(os.path.join(cancer_dir,"featureCounts",file), sep="\t", comment="#")
        toc = toc[toc['Geneid'].str.contains('PAR_')==False]
        toc['Geneid']=toc['Geneid'].str.split('.').str[0]
        toc.rename(columns={'Geneid':'transcript_id'}, inplace=True)

        filter_col = [col for col in toc if col.startswith('/')]
        for col in filter_col:
            new_col=col.split("Aligned")[0]
            new_col=new_col.split("/")[-1]
            if proj in tcga_projects:
                new_col=cancer_type+new_col[4:]
            toc.rename(columns={col:new_col}, inplace=True)

        length = toc['Length']
        genes = toc['transcript_id']
        # we are only interested in the columns with counts
        counts = toc
        counts.drop(["Chr","Start","End","Strand","Length","transcript_id"],axis=1, inplace=True)
        # calculate TPMs
        tpm_df = Norm.tpm(counts, length)
        # add transcript_id and length again
        tpms = pd.concat([genes,tpm_df, length], axis=1)
        tpms.to_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+cancer_type+".csv", index=None)
 

TCGA-BRCA
GSE102101_KIRC
TCGA-LUSC
TCGA-PRAD
TCGA-BLCA
GSE229705_LUAD
PRJEB2449_PRAD
GSE133624_BLCA
TCGA_COAD_SE
GSE22260_PRAD
GSE214846_LIHC
TCGA_COAD_PE
liver_adjacent_totalRNA_LIHC
TCGA-KIRC
hcc_normal_totalRNA_LIHC
SRP107326_COAD
TCGA-LUAD
GSE193567_LIHC
SRP238334_KIRC
LIHC_TCGA_LIHC


In [5]:
## merge single-end and paired-end TCGA COAD
SE_COAD = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD_SE.csv"))
print(len(SE_COAD.columns))
PE_COAD = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD_PE.csv"))
print(len(PE_COAD.columns))

COAD = SE_COAD.merge(PE_COAD, on=["transcript_id","Length"])
COAD.to_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD.csv"), index=None)
print(len(COAD.columns))

19
66
83


In [6]:
## filter patients we are interested in, from those not TCGA
for proj in all_projects:
    print(proj)
    if proj in tcga_projects:
        cancer_type = proj[5:]

        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        tumor_transcripts = list()
        patients=pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq",proj,"results/QC_patients1.csv"))
        print(len(patients)," patients")
        # patients['normal'] = patients['normal'].str.replace('TCGA', cancer_type)
        # patients['tumor'] = patients['tumor'].str.replace('TCGA', cancer_type)

    elif proj in other_projects:
        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        print(fc)
        tumor_transcripts = list()
        patients=pd.read_csv(os.path.join("/users/genomics/marta/cancers_RNASeq",proj,"results/patients.csv"))
        print(len(patients)," patients")

        samples_to_keep = patients.tumor.values.tolist() + patients.normal.values.tolist() + ['Length','transcript_id']
        fc_selected = fc[fc.columns.intersection(samples_to_keep)]

    elif proj in manuscript_projects:
        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        tumor_transcripts = list()

        if "liver_adjacent_totalRNA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/liver_adjacent_totalRNA/results/clean_patients.csv"))
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "hcc_normal_totalRNA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/hcc_normal_totalRNA/results/clean_patients.csv"))
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "GSE193567" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/GSE193567/results/clean_patients.csv"))        
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "LIHC_TCGA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/LIHC_TCGA/results/clean_patients.csv"))    
            patients['normal'] = patients['patient'] + "_normal"
            patients['tumor'] = patients['patient'] + "_tumor"
        print(len(patients)," patients")

    for index,patient in patients.iterrows():
        if proj in tcga_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())
        if proj in other_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())       
        if proj in manuscript_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())                        

    tumor1FPKM = fc[fc['transcript_id'].isin(tumor_transcripts)]
    # print(tumor1FPKM)
    known = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('ENST')]
    novel = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('TCONS')]
    merged = tumor1FPKM.merge(transcript_gene, on=['transcript_id'], how="inner")
    merged = merged[merged['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]

    lncRNA = merged[merged['gene_type'].isin(['lncRNA','processed_pseudogene'])]
    print("lncRNA: ",len(lncRNA))

    cds = merged[merged['gene_type'] == "protein_coding"]
    print("PROTEIN CODING: ",len(cds))

    novel = merged[merged['gene_type'] == "novel"]
    ## length limitations
    novel = novel[novel['Length'].astype(int) < 91667]
    novel = novel[novel['Length'].astype(int) > 300]

    print("NOVEL: ",len(novel))

    # # tumor1FPKM = pd.concat([novel, lncRNA, cds], ignore_index = True)
    # merged[['transcript_id','gene_id','gene_name','gene_type','Length']].to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_")+cancer_type+".csv",index=False)

    ## keep only tumor samples
    if proj in tcga_projects:
        tumorONLY_merged = merged[merged.columns.drop(list(merged.filter(regex='normal')))]
    else:
        tumorONLY_merged = merged.drop(columns=patients.normal.values.tolist())

    tumorONLY_merged = tumorONLY_merged[['transcript_id','gene_id','gene_id', 'gene_type', 'gene_name','Length', *patients.tumor.values.tolist()]]
    tumorONLY_merged.to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_table_of_counts_")+proj+".csv",index=False)

    ###### GET TUMOR-EXPRESSED > 1 TPM
    # Filter only integer columns
    int_columns = tumorONLY_merged.select_dtypes(include='int64')
    
    # Create a new column with the count of int columns per row whose value is > 1
    # tumorONLY_merged['n'] = int_columns.apply(lambda row: (row > 1).sum(), axis=1)
    tumorONLY_merged['n'] = tumorONLY_merged.drop(columns=['transcript_id','gene_id','gene_name','gene_type','Length']).apply(count_greater_than_one, axis=1)
    tumorONLY_merged = tumorONLY_merged[['transcript_id','gene_id','gene_name','gene_type','Length','n']]
    tumorONLY_merged.sort_values(by=['n'], ascending=False).to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_n_")+proj+".csv",index=False)

TCGA-BRCA
109  patients
lncRNA:  15803
PROTEIN CODING:  17313
NOVEL:  304
TCGA-LUSC
49  patients
lncRNA:  14578
PROTEIN CODING:  17328
NOVEL:  271
TCGA-PRAD
52  patients
lncRNA:  10409
PROTEIN CODING:  15949
NOVEL:  235
TCGA-KIRC
71  patients
lncRNA:  11415
PROTEIN CODING:  16419
NOVEL:  250
TCGA-LUAD
56  patients
lncRNA:  14375
PROTEIN CODING:  17107
NOVEL:  278
TCGA-BLCA
18  patients
lncRNA:  11285
PROTEIN CODING:  16530
NOVEL:  222
GSE102101_KIRC
         transcript_id   SRR5885319   SRR5885320   SRR5885321   SRR5885322  \
0       TCONS_00001163   830.787897   202.965330   666.785038   360.316898   
1      ENST00000387314  2477.853965   272.093328  1653.609499   313.237274   
2      ENST00000389680  4034.349225   855.853788  3014.283347   703.347008   
3       TCONS_00001164  7730.138459  1868.191546  5779.664200  1457.492971   
4      ENST00000387342  1515.029280    43.292633  1467.940510   140.294062   
...                ...          ...          ...          ...          ...   


In [7]:
## create a merged table of counts per ctype and a big patients file
## CAN THEY BE CONSIDERED AS ONE?
for ctype in cancertypes:
    print(ctype)
    
    # Lists to store DataFrames for counts and patients
    fc_list = []
    patients_list = []
    
    for proj in all_projects:
        if proj in tcga_projects and ctype in proj:
            print(proj)
            # Read the CSV files for TCGA projects
            patients = pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq", proj, "results/QC_patients1.csv"))
            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
        
        elif proj in other_projects and ctype in proj:
            print(proj)
            # Read the CSV files for other projects
            patients = pd.read_csv(os.path.join("/users/genomics/marta/cancers_RNASeq", proj, "results/patients.csv"))
            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
        
        elif proj in manuscript_projects and ctype in proj:
            print(proj)
            # Read the CSV files for manuscript projects
            if proj == "LIHC_TCGA_LIHC":
                patients = pd.read_csv(os.path.join("/projects_eg/projects/marta", proj[:-5], "results/clean_patients.csv"))
                patients['normal'] = patients['patient'] + "_normal"
                patients['tumor'] = patients['patient'] + "_tumor"

            else:
                patients = pd.read_csv(os.path.join("/projects_eg/projects/marta", proj[:-5], "results/clean_patients.csv"))

            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
    
    # Merge all fc DataFrames by 'transcript_id' and 'Length'
    if fc_list:
        merged_fc = reduce(lambda left, right: pd.merge(left, right, on=["transcript_id", "Length"], how='outer'), fc_list)
        # Save the merged fc DataFrame
        merged_fc.to_csv(os.path.join(cancer_dir, f"merged_fc_{ctype}.csv"), index=False)
    
    # Combine all patient DataFrames into one big DataFrame
    if patients_list:
        merged_patients = pd.concat(patients_list, ignore_index=True)
        # Save the merged patients DataFrame
        merged_patients.to_csv(os.path.join(cancer_dir, f"merged_patients_{ctype}.csv"), index=False)


BRCA
TCGA-BRCA
BLCA
TCGA-BLCA
GSE133624_BLCA
LUAD
TCGA-LUAD
GSE229705_LUAD
KIRC
TCGA-KIRC
GSE102101_KIRC
SRP238334_KIRC
PRAD
TCGA-PRAD
GSE22260_PRAD
PRJEB2449_PRAD
LUSC
TCGA-LUSC
LIHC
GSE214846_LIHC
liver_adjacent_totalRNA_LIHC
hcc_normal_totalRNA_LIHC
GSE193567_LIHC
LIHC_TCGA_LIHC
COAD
SRP107326_COAD
TCGA_COAD


Here to run Q3.1 - TPM distribution to know if we can treat them as one per cancer type or if there is bias according to the dataset

Instead of selecting the tumor-specific, let's look first at the log2ratio between means


In [8]:
testisRestr = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q2_TestisRestricted/human/testisRestricted_GTEx_translatedONLYtestis.noProteome.csv")
testisRestr

Unnamed: 0.1,Unnamed: 0,gene_id,gene_name,orfID,transcript_id,gene_type,orfType,length,geneORFtype,length_aa,start_codon,ORFpep,TranslatedLiver,TranslatedBrain
0,0,ENSG00000039600,SOX30,ENST00000265007.11:5:-|14|3283:361:2623|canoni...,ENST00000265007,protein_coding,canonical,2262,protein_coding_canonical,754,ATG,MERARPEPPPQPRPLRPAPPPLPVEGTSFWAAAMEPPPSSPTLSAA...,no,no
1,1,ENSG00000039600,SOX30,ENST00000265007.11:5:-|9|3283:233:491|ouORF|CTG,ENST00000265007,protein_coding,ouORF,258,protein_coding_ouORF,86,CTG,MRFEPRRVLGSKGLARLLTERGRGKVANKRLSSQSWPGRPPPPWRE...,no,no
2,2,ENSG00000046774,MAGEC2,ENST00000247452.4:X:-|15|1994:349:1471|canonic...,ENST00000247452,protein_coding,canonical,1122,protein_coding_canonical,374,ATG,MPPVPGVPFRNVDNDSPTSVELEDWVDAQHPTDEEEEEASSASSTL...,no,no
3,3,ENSG00000046774,MAGEC2,ENST00000247452.4:X:-|2|1994:165:240|uORF|CTG,ENST00000247452,protein_coding,uORF,75,protein_coding_uORF,25,CTG,MYCAAVRLVLQEPGGDELGVRHTA*,no,no
4,4,ENSG00000046774,MAGEC2,ENST00000247452.4:X:-|1|1994:91:148|uORF|CTG,ENST00000247452,protein_coding,uORF,57,protein_coding_uORF,19,CTG,MASPQGEGPEEELRDLPP*,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936,981,XLOC_000999,XLOC_000999,TCONS_00001142:9:-|259|6071:5083:5176|noncodin...,TCONS_00001142,novel,noncoding,93,novel_noncoding,31,ATG,MPSLKFGKEALSFIHSLPFGFLSWRKFFII*,no,no
937,982,XLOC_001006,XLOC_001006,TCONS_00001162:9:-|216|3510:2703:2862|noncodin...,TCONS_00001162,novel,noncoding,159,novel_noncoding,53,ATG,MALPEGATHWRHLGLLSQMVFQQVGVGISLQRPGPAPEGGEVSLGL...,no,no
938,983,XLOC_001006,XLOC_001006,TCONS_00001162:9:-|252|3510:3325:3391|noncodin...,TCONS_00001162,novel,noncoding,66,novel_noncoding,22,TTG,MPISTEGQKQGNEPPSGYRKE*,no,no
939,984,XLOC_001006,XLOC_001006,TCONS_00001162:9:-|83|3510:982:1030|noncoding|ATG,TCONS_00001162,novel,noncoding,48,novel_noncoding,16,ATG,MLPLGSVFPEPRRLR*,no,no


In [None]:
## log2ratio3x & 1 TPM
log2ratio3x_general = pd.DataFrame(columns=["gene_name","gene_id","gene_type"])
log2ratio3x_1TPM_general = pd.DataFrame(columns=["gene_name","gene_id","gene_type","ctype"])
log2ratio3x_1TPM_general_5percent = pd.DataFrame(columns=["gene_name","gene_id","gene_type"])

for ctype in cancertypes:

    print(ctype)
    tumorspecific_candidates = pd.DataFrame()

    patients = pd.read_csv(cancer_dir+"/merged_patients_"+ctype+".csv")
    patients_long = patients.melt(id_vars=['patient', 'project'], value_vars=['normal', 'tumor'], 
                    var_name='normal_tumor', value_name='sample')
    patients_long['normal_tumor'] = patients_long['normal_tumor'].replace({'normal': 'normal', 'tumor': 'tumor'})
    
    fc=pd.read_csv(cancer_dir+"/merged_fc_"+ctype+".csv")

    fc_info = fc.merge(transcript_gene, on=['transcript_id'], how="inner")
    fc_info = fc_info[fc_info['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]

    fc_info_long = pd.melt(fc_info, id_vars=["transcript_id","Length", "gene_id", "gene_type", "gene_name"], ignore_index=False).reset_index()
    fc_info_long.rename(columns={"variable":"sample","value":"TPM"}, inplace=True)
    fc_info_long = fc_info_long.merge(patients_long, on=["sample"])

    means = fc_info_long.groupby(["gene_name","gene_id","gene_type","normal_tumor"])['TPM'].mean().reset_index()
    means_pivot = means.pivot_table(index=['gene_name', 'gene_id', 'gene_type'], 
                          columns='normal_tumor', 
                          values='TPM').reset_index()

    # Avoid division by zero by adding a small constant (epsilon)
    epsilon = 1e-6  # A small value to prevent division by zero
    means_pivot['tumor'] = pd.to_numeric(means_pivot['tumor'], errors='coerce')
    means_pivot['normal'] = pd.to_numeric(means_pivot['normal'], errors='coerce')
    means_pivot['log2ratio'] = np.log2((means_pivot['tumor'] + epsilon) / (means_pivot['normal'] + epsilon))

    log2ratio3x = means_pivot[means_pivot['log2ratio'] >= np.log2(3)]
    log2ratio3x_testisRestr = log2ratio3x[log2ratio3x['gene_id'].isin(testisRestr.gene_id.values.tolist())]

    # try:
    #     os.makedirs(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype))
    # except:
    #     continue
    log2ratio3x_testisRestr.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x.csv"), index=None)
    log2ratio3x_general = pd.concat([log2ratio3x_general, log2ratio3x_testisRestr[["gene_name","gene_id","gene_type"]]])

    ### > 1 TPM
    max = fc_info_long.groupby(["gene_name","gene_id","gene_type","normal_tumor"])['TPM'].max().reset_index()
    ## tumor samples max has to be greater than 1TPM
    max = max[max['normal_tumor'] == "tumor"]
    max.rename(columns={'TPM':'max_TPM'}, inplace=True)

    TPM1 = max[max['max_TPM'] > 1]
    log2ratio3x_1TPM = log2ratio3x_testisRestr[log2ratio3x_testisRestr['gene_id'].isin(TPM1.gene_id.values.tolist())]
    log2ratio3x_1TPM.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x_1TPM.csv"), index=None)
    log2ratio3x_1TPM['ctype'] = ctype
    log2ratio3x_1TPM_general = pd.concat([log2ratio3x_1TPM_general, log2ratio3x_1TPM[["gene_name","gene_id","gene_type","ctype"]]])

    ### > 1 TPM - 5%
    # TAA_1TPM5percent = pd.read_csv(os.path.join(cancer_dir,"tumorexpressed/cancertypes/tumor_1FPKM_n5percent_pancancer.csv"))
    
    # log2ratio3x_1TPM = log2ratio3x_testisRestr[log2ratio3x_testisRestr['gene_id'].isin(TAA_1TPM5percent.gene_id.values.tolist())]
    # log2ratio3x_1TPM.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x_1TPM5percent.csv"), index=None)
    # log2ratio3x_1TPM_general_5percent = pd.concat([log2ratio3x_1TPM_general_5percent, log2ratio3x_1TPM[["gene_name","gene_id","gene_type"]]])

log2ratio3x_1TPM_general.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/log2ratio3x_1TPM_genes.csv"), index=None)

In [31]:
log2ratio3x_1TPM_general.drop_duplicates(inplace=True)
log2ratio3x_1TPM_general[['gene_name','gene_type']].groupby("gene_type").count()
# log2ratio3x_1TPM_general

Unnamed: 0_level_0,gene_name
gene_type,Unnamed: 1_level_1
lncRNA,184
novel,29
processed_pseudogene,15
protein_coding,568


In [37]:
## ORF level   
log2ratio3x_1TPM_general = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/log2ratio3x_1TPM_genes.csv"))
log2ratio3x_1TPM_general = log2ratio3x_1TPM_general.merge(transcript_gene[["gene_id","transcript_id"]], on=["gene_id"])
log2ratio3x_1TPM_general.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/log2ratio3x_1TPM_genes.csv"), index=None)

tumorReact_ORFs = log2ratio3x_1TPM_general.merge(testisRestr, on=["transcript_id","gene_name","gene_id","gene_type"])
tumorReact_ORFs = tumorReact_ORFs[['gene_name','gene_id','transcript_id','gene_type','orfID','length','ctype','orfType','geneORFtype','length_aa','start_codon','ORFpep']]
tumorReact_ORFs.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/log2ratio3x_1TPM_ORFs.csv"), index=None)
tumorReact_ORFs.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/log2ratio3x_1TPM_ORFs.csv"), index=None)
tumorReact_ORFs.groupby("geneORFtype").count()
tumorReact_ORFs[['gene_name','length_aa','geneORFtype']].drop_duplicates().groupby("geneORFtype").count()


Unnamed: 0_level_0,gene_name,length_aa
geneORFtype,Unnamed: 1_level_1,Unnamed: 2_level_1
lncRNA_noncoding,129,129
novel_noncoding,20,20
processed_pseudogene_noncoding,8,8
protein_coding_canonical,194,194
protein_coding_dORF,11,11
protein_coding_odORF,5,5
protein_coding_ouORF,19,19
protein_coding_uORF,42,42


In [34]:
tumorReact_ORFs[['gene_name','length_aa','geneORFtype','ctype']].drop_duplicates().groupby(["ctype","geneORFtype"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,gene_name,length_aa
ctype,geneORFtype,Unnamed: 2_level_1,Unnamed: 3_level_1
BLCA,lncRNA_noncoding,33,33
BLCA,novel_noncoding,7,7
BLCA,processed_pseudogene_noncoding,1,1
BLCA,protein_coding_canonical,86,86
BLCA,protein_coding_dORF,1,1
...,...,...,...
PRAD,processed_pseudogene_noncoding,1,1
PRAD,protein_coding_canonical,21,21
PRAD,protein_coding_dORF,1,1
PRAD,protein_coding_ouORF,2,2


In [None]:
## get table of counts of candidates
candidates = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/log2ratio3x_1TPM_genes.csv"))

full_table_of_counts = pd.DataFrame()
for ctype in cancertypes:
    patients = pd.read_csv(cancer_dir+"/merged_patients_"+ctype+".csv")
    patients_long = pd.melt(patients, id_vars=["patient","project"], ignore_index=False).reset_index()
    patients_long.drop("index", axis=1, inplace=True)
    patients_long.rename(columns={"variable":"normal_tumor","value":"sample"}, inplace=True)

    fc=pd.read_csv(cancer_dir+"/merged_fc_"+ctype+".csv")

    log2ratio_1TPM = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x_1TPM.csv"))
    fc_info = fc.merge(transcript_gene, on=['transcript_id'], how="inner")
    fc_info = fc_info[fc_info['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]
    # print("log2ratio 1TPM: ", len(fc_info))

    fc_info_long = pd.melt(fc_info, id_vars=["transcript_id","Length", "gene_id", "gene_type", "gene_name"], ignore_index=False).reset_index()
    fc_info_long.rename(columns={"variable":"sample","value":"TPM"}, inplace=True)
    fc_info_long = fc_info_long.merge(patients_long, on=["sample"])

    ## create full file with TPM per patient and ctype
    fc_info_long['ctype'] = ctype

    full_table_of_counts = pd.concat([full_table_of_counts, fc_info_long])

full_table_of_counts.to_csv(os.path.join(cancer_dir,"full_table_of_counts.csv"))


In [3]:
full_table_of_counts = pd.read_csv(os.path.join(cancer_dir,"full_table_of_counts.csv"))

merged = pd.merge(
    full_table_of_counts,
    candidates[['gene_id']],  # Only keep relevant columns for the merge
    left_on=['gene_id'],
    right_on=['gene_id'],
    how='left'
)
merged = merged[merged['gene_id'].isin(candidates.gene_id.values.tolist())]
merged.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/TSTR_Expression.csv"), index=False)

# Create the new column 'forced_TPM' using numpy.where
merged_tumor = merged[merged['normal_tumor'] == "tumor"]
merged_tumor.drop(columns=['index','Length','sample'], inplace=True)
merged_tumor.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/TSTR_Expression_tumor.csv"), index=False)

merged_normal = merged[merged['normal_tumor'] == "normal"]
merged_normal.drop(columns=['index','Length','sample'], inplace=True)
merged_normal.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/TSTR_Expression_normal.csv"), index=False)

  full_table_of_counts = pd.read_csv(os.path.join(cancer_dir,"full_table_of_counts.csv"))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_tumor.drop(columns=['index','Length','sample'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_normal.drop(columns=['index','Length','sample'], inplace=True)


In [4]:
candidates[['gene_name','gene_type']].drop_duplicates().groupby("gene_type").count()

Unnamed: 0_level_0,gene_name
gene_type,Unnamed: 1_level_1
lncRNA,102
novel,13
processed_pseudogene,7
protein_coding,204


In [5]:
candidates[['gene_name','gene_type','ctype']].drop_duplicates().groupby(["gene_type","ctype"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,gene_name
gene_type,ctype,Unnamed: 2_level_1
lncRNA,BLCA,25
lncRNA,BRCA,11
lncRNA,COAD,38
lncRNA,KIRC,13
lncRNA,LIHC,42
lncRNA,LUAD,25
lncRNA,LUSC,21
lncRNA,PRAD,9
novel,BLCA,2
novel,BRCA,3


In [13]:
tumorReact_ORFs.drop("ctype", axis=1).drop_duplicates().groupby("geneORFtype").count()

Unnamed: 0_level_0,gene_name,gene_id,transcript_id,gene_type,length,orfType,length_aa,start_codon,ORFpep
geneORFtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
lncRNA_noncoding,130,130,130,130,130,130,130,130,130
novel_noncoding,20,20,20,20,20,20,20,20,20
processed_pseudogene_noncoding,8,8,8,8,8,8,8,8,8
protein_coding_canonical,194,194,194,194,194,194,194,194,194
protein_coding_dORF,11,11,11,11,11,11,11,11,11
protein_coding_odORF,5,5,5,5,5,5,5,5,5
protein_coding_ouORF,19,19,19,19,19,19,19,19,19
protein_coding_uORF,42,42,42,42,42,42,42,42,42


In [14]:
set(tumorReact_ORFs[tumorReact_ORFs['geneORFtype'] == "protein_coding_uORF"].gene_name.values.tolist())

{'ADAD1',
 'ADAM29',
 'C12orf50',
 'CCDC172',
 'COX7B2',
 'DCAF8L2',
 'DYDC1',
 'FSHR',
 'IQCM',
 'LRRC74A',
 'LYPD4',
 'MAGEA1',
 'MAGEB1',
 'MAGEB3',
 'MAGEC1',
 'MAGEC2',
 'MEIOSIN',
 'OR14A2',
 'PLCZ1',
 'PLSCR2',
 'PRSS54',
 'RFPL4B',
 'RHOXF2B',
 'SMIM47',
 'SPANXB1',
 'SPMIP9',
 'TBC1D21',
 'TEX13C',
 'TPTE',
 'TRPC5OS',
 'USP26'}

Generate bed/gtf file for candidates (TSTR) to look for chromatin acessibility (Cova)

In [35]:
TSTR = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/log2ratio3x_1TPM_genes.csv"))
TSTR.drop("ctype", axis=1).drop_duplicates().groupby("gene_type").count()
print(len(set(TSTR.transcript_id.values.tolist())))

annotation_gtf = pd.read_csv(annotation, sep="\t", comment="#", header=None)
annotation_gtf = annotation_gtf[annotation_gtf[2] == "transcript"]
annotation_gtf


326


  annotation_gtf = pd.read_csv(annotation, sep="\t", comment="#", header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8
1,M,StringTie,transcript,1,577,.,-,.,"transcript_id ""TCONS_00001163""; gene_id ""XLOC_..."
3,M,ENSEMBL,transcript,577,647,.,+,.,"gene_id ""ENSG00000210049.1""; transcript_id ""EN..."
7,M,ENSEMBL,transcript,648,1601,.,+,.,"gene_id ""ENSG00000211459.2""; transcript_id ""EN..."
8,M,StringTie,transcript,683,1035,.,-,.,"transcript_id ""TCONS_00001164""; gene_id ""XLOC_..."
11,M,ENSEMBL,transcript,1602,1670,.,+,.,"gene_id ""ENSG00000210077.1""; transcript_id ""EN..."
...,...,...,...,...,...,...,...,...,...
782862,KI270751.1,HAVANA,transcript,34175,73722,.,-,.,"gene_id ""ENSG00000303867.1""; transcript_id ""EN..."
782865,KI270751.1,HAVANA,transcript,44958,53274,.,+,.,"gene_id ""ENSG00000303902.1""; transcript_id ""EN..."
782871,KI270751.1,HAVANA,transcript,133801,139253,.,-,.,"gene_id ""ENSG00000306528.1""; transcript_id ""EN..."
782875,KI270753.1,HAVANA,transcript,43135,44491,.,+,.,"gene_id ""ENSG00000297844.1""; transcript_id ""EN..."


In [18]:
# Convert the list of values into a regex pattern
pattern = '|'.join(TSTR.transcript_id.values.tolist())

# Filter the DataFrame
filtered_annotation = annotation_gtf[annotation_gtf[8].str.contains(pattern)]

filtered_annotation.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/TSTR_annotation.gtf"), sep="\t", index=None, header=None, quoting=csv.QUOTE_NONE)


In [24]:
transcript_gene

Unnamed: 0,chr,transcript_id,gene_id,gene_name,transcript_type,gene_type
0,M,ENST00000361390,ENSG00000198888,MT-ND1,protein_coding,protein_coding
1,M,ENST00000361453,ENSG00000198763,MT-ND2,protein_coding,protein_coding
2,M,ENST00000361624,ENSG00000198804,MT-CO1,protein_coding,protein_coding
3,M,ENST00000361739,ENSG00000198712,MT-CO2,protein_coding,protein_coding
4,M,ENST00000361851,ENSG00000228253,MT-ATP8,protein_coding,protein_coding
...,...,...,...,...,...,...
65159,KI270442.1,TCONS_00000004,XLOC_000003,XLOC_000003,novel,novel
65160,KI270442.1,TCONS_00000005,XLOC_000004,XLOC_000004,novel,novel
65161,KI270466.1,TCONS_00000006,XLOC_000005,XLOC_000005,novel,novel
65162,KI270467.1,TCONS_00000008,XLOC_000006,XLOC_000006,novel,novel


In [36]:
TSTR_annot = TSTR.merge(transcript_gene, on=["gene_id","gene_name","transcript_id","gene_type"])
TSTR_annot.drop("ctype", axis=1).drop_duplicates().groupby("gene_type").count()

TSTR_annot['coding_noncoding_chr'] = np.select(
    [
        (TSTR_annot['gene_type'] == "protein_coding") & (TSTR_annot['chr'] == "X"),
        (TSTR_annot['gene_type'] == "protein_coding") & (TSTR_annot['chr'] != "X"),
        (TSTR_annot['gene_type'] != "protein_coding") & (TSTR_annot['chr'] == "X"),
        (TSTR_annot['gene_type'] != "protein_coding") & (TSTR_annot['chr'] != "X"),
    ],
    [
        "CT-X",
        "CT-nonX",
        "Noncoding-X",
        "Noncoding-nonX"
    ]
)

TSTR_annot = TSTR_annot[['gene_name','gene_id','transcript_id','coding_noncoding_chr']].drop_duplicates()
TSTR_annot
TSTR_annot.groupby("coding_noncoding_chr").count()

Unnamed: 0_level_0,gene_name,gene_id,transcript_id
coding_noncoding_chr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CT-X,67,67,67
CT-nonX,137,137,137
Noncoding-X,3,3,3
Noncoding-nonX,119,119,119


In [29]:
## CT-X
CTx = TSTR_annot[TSTR_annot['coding_noncoding_chr'] == "CT-X"]
CTx_annotation = annotation_gtf[annotation_gtf[8].str.contains('|'.join(CTx.transcript_id.values.tolist()))]
CTx_annotation.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/CTx_annotation.gtf"), sep="\t", index=None, header=None, quoting=csv.QUOTE_NONE)

## CT-nonX
CTnonx = TSTR_annot[TSTR_annot['coding_noncoding_chr'] == "CT-nonX"]
CTnonx_annotation = annotation_gtf[annotation_gtf[8].str.contains('|'.join(CTnonx.transcript_id.values.tolist()))]
CTnonx_annotation.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/CTnonx_annotation.gtf"), sep="\t", index=None, header=None, quoting=csv.QUOTE_NONE)

## Noncoding-nonX
Noncodingnonx = TSTR_annot[TSTR_annot['coding_noncoding_chr'] == "Noncoding-nonX"]
Noncodingnonx_annotation = annotation_gtf[annotation_gtf[8].str.contains('|'.join(Noncodingnonx.transcript_id.values.tolist()))]
Noncodingnonx_annotation.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/Noncodingnonx_annotation.gtf"), sep="\t", index=None, header=None, quoting=csv.QUOTE_NONE)

In [30]:
%%bash -s "$cancer_dir"

module load BEDOPS

for file in $1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/*gtf; do

    outfile=${file%%.gtf*}
    outfile=${outfile}.bed
    gtf2bed < $file > $outfile

done



In [41]:
## Get fasta of ORFs per group
dir="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/cancers/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding"
ORFs = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/cancers/log2ratio3x/cancertypes/onlyStep1/TSTR_candidatesORFs_fullcharacterized.csv")
ORFs
def create_seqrecord_PROThuman(row):
    return SeqRecord(Seq(row['ORFpep']), id=row['orfID'], description="")

## CT-X
CTx = ORFs[ORFs['coding_noncoding_chr'] == "CT-X"][['orfID','ORFpep']].drop_duplicates()
seq_records_CTx = CTx.apply(create_seqrecord_PROThuman, axis=1).tolist()

with open(os.path.join(dir,"CTx.fasta"), "w") as fasta_file:
    for record in seq_records_CTx:
        fasta_file.write(f">{record.id}\n{record.seq}\n")

## CT-nonX
CTnonx = ORFs[ORFs['coding_noncoding_chr'] == "CT-nonX"][['orfID','ORFpep']].drop_duplicates()
seq_records_CTnonx = CTnonx.apply(create_seqrecord_PROThuman, axis=1).tolist()
with open(os.path.join(dir,"CTnonx.fasta"), "w") as fasta_file:
    for record in seq_records_CTnonx:
        fasta_file.write(f">{record.id}\n{record.seq}\n")

## NC-nonX
NCnonx = ORFs[ORFs['coding_noncoding_chr'] == "Noncoding-nonX"][['orfID','ORFpep']].drop_duplicates()
seq_records_NCnonx = NCnonx.apply(create_seqrecord_PROThuman, axis=1).tolist()
with open(os.path.join(dir,"Noncodingnonx.fasta"), "w") as fasta_file:
    for record in seq_records_NCnonx:
        fasta_file.write(f">{record.id}\n{record.seq}\n")

## altORFs
altORFs = ORFs[ORFs['coding_noncoding_chr'] == "other ncORFs"][['orfID','ORFpep']].drop_duplicates()
seq_records_altORFs = altORFs.apply(create_seqrecord_PROThuman, axis=1).tolist()
with open(os.path.join(dir,"altORFs.fasta"), "w") as fasta_file:
    for record in seq_records_altORFs:
        fasta_file.write(f">{record.id}\n{record.seq}\n")


In [42]:
## Get bed of ORFs per group
dir="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/cancers/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding"
ORFs = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/cancers/log2ratio3x/cancertypes/onlyStep1/TSTR_candidatesORFs_fullcharacterized.csv")
bed = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/RiboNovel_MultMap_1to1/RibORF/repre.valid.ORF.genepred.testis.bed", header=None, sep="\t")


## Canonical
CT = ORFs[(ORFs['gene_type'] == "protein_coding") & (ORFs['coding_noncoding_chr'] != "other ncORFs")].orfID.values.tolist()
CT_bed = bed[bed[3].isin(CT)]
## ORF coordinates instead of transcript coordinates
# CT_bed[1] = CT_bed[6]
# CT_bed[2] = CT_bed[7]
CT_bed.to_csv(os.path.join(dir,"canonical_ORFs.bed"), header=None, index=None, sep="\t")

## CT-X
# CTx = ORFs[ORFs['coding_noncoding_chr'] == "CT-X"].orfID.values.tolist()
# CTx_bed = bed[bed[3].isin(CTx)]
# CTx_bed.to_csv(os.path.join(dir,"CTx_ORFs.bed"), header=None, index=None, sep="\t")

## CT-nonX
# CTnonx = ORFs[ORFs['coding_noncoding_chr'] == "CT-nonX"].orfID.values.tolist()
# CTnonx_bed = bed[bed[3].isin(CTnonx)]
# CTnonx_bed.to_csv(os.path.join(dir,"CTnonx_ORFs.bed"), header=None, index=None, sep="\t")

# lncORF
lncORF = ORFs[ORFs['gene_type'] != "protein_coding"].orfID.values.tolist()
lncORF_bed = bed[bed[3].isin(lncORF)]
## ORF coordinates instead of transcript coordinates
# lncORF_bed[1] = lncORF_bed[6]
# lncORF_bed[2] = lncORF_bed[7]
lncORF_bed.to_csv(os.path.join(dir,"lncORF_ORFs.bed"), header=None, index=None, sep="\t")

## other ncORFs
other_ncORFs = ORFs[ORFs['coding_noncoding_chr'] == "other ncORFs"].orfID.values.tolist()
other_ncORFs_bed = bed[bed[3].isin(other_ncORFs)]
## ORF coordinates instead of transcript coordinates
# other_ncORFs_bed[1] = other_ncORFs_bed[6]
# other_ncORFs_bed[2] = other_ncORFs_bed[7]
other_ncORFs_bed.to_csv(os.path.join(dir,"other_ncORFs_ORFs.bed"), header=None, index=None, sep="\t")

## ncORFs
ncORFs = ORFs[~ORFs['coding_noncoding_chr'].str.contains("CT")].orfID.values.tolist()
ncORFs_bed = bed[bed[3].isin(ncORFs)]
## ORF coordinates instead of transcript coordinates
# ncORFs_bed[1] = ncORFs_bed[6]
# ncORFs_bed[2] = ncORFs_bed[7]
ncORFs_bed.to_csv(os.path.join(dir,"ncORFs_ORFs.bed"), header=None, index=None, sep="\t")


## all ORF candidates
candidates_orfIDs = ORFs.orfID.values.tolist()
candidates_bed = bed[bed[3].isin(candidates_orfIDs)]
## ORF coordinates instead of transcript coordinates
# candidates_bed[1] = candidates_bed[6]
# candidates_bed[2] = candidates_bed[7]
candidates_bed.to_csv(os.path.join(dir,"TSTR_ORFs.bed"), header=None, index=None, sep="\t")
candidates_bed[0] = "chr"+candidates_bed[0]
candidates_bed.to_csv(os.path.join(dir,"TSTR_ORFs.chr.bed"), header=None, index=None, sep="\t")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  candidates_bed[0] = "chr"+candidates_bed[0]


In [43]:
%%bash

#add chr

dir="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/cancers/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding"
sed -e 's/^/chr/' $dir/TSTR_ORFs.bed > $dir/TSTR_ORFs.chr.bed 

## NetMHCpan

How many putative peptides could arise from the sequences of the candidates? (Fully dependant on their length)

In [46]:
def extract_9mers(fasta_file, output_file):
    with open(output_file, 'w') as out_f:
        for record in SeqIO.parse(fasta_file, "fasta"):
            sequence = str(record.seq)
            for i in range(len(sequence) - 8):  # Extract 9-mers
                peptide = sequence[i:i+9]
                out_f.write(f">{record.id}_pos{i}\n{peptide}\n")

In [47]:
CTx = os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/CTx.fasta")
CTnonx = os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/CTnonx.fasta")
NCnonx = os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/Noncodingnonx.fasta")
altORFs = os.path.join(cancer_dir,"log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/altORFs.fasta")

files = list([CTx, CTnonx, NCnonx, altORFs])
files

for f in files:
    outfile = f.split(".fasta")[0] + "_9mers.fasta"

    extract_9mers(f, outfile)


In [48]:
%%bash -s "$cancer_dir"

echo "CT-X"
grep -v '^>' $1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/CTx_9mers.fasta | sort | uniq | wc -l

echo "CT-nonX"
grep -v '^>' $1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/CTnonx_9mers.fasta | sort | uniq | wc -l

echo "NC-nonX"
grep -v '^>' $1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/Noncodingnonx_9mers.fasta | sort | uniq | wc -l

echo "altORFs"
grep -v '^>' $1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/altORFs_9mers.fasta | sort | uniq | wc -l

CT-X
19068
CT-nonX
65025
NC-nonX
6397
altORFs
1986


**NetMHCpan**

HLA-A02:01

In [52]:
%%bash -s "$cancer_dir"

## HLA-A02:01
dir=$1/log2ratio3x/cancertypes/onlyStep1/netMHCpan/HLA0201
mkdir -p $dir

CTx="$1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/CTx.fasta"
CTnonx="$1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/CTnonx.fasta"
NCnonx="$1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/Noncodingnonx.fasta"
altORFs="$1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/altORFs.fasta"

samples_array=("$CTx" "$CTnonx" "$NCnonx" "$altORFs")

#=======================================#
# Run netMHCpan.sh + Create output xlsx #
#=======================================#

# Set path for software
module load netMHCpan/4.1
module load R/4.0.4-foss-2020b

for file in "${samples_array[@]}"; do

    name=${file##*/}
    name=${name%%.fasta*}

    echo $name

    ## netMHCpan
    netMHCpan -a HLA-A02:01 -f $file -l 9 -BA -xls -xlsfile ${dir}/${name}.netMHCpan.BA.xls >  ${dir}/${name}.netMHCpan.BA.out

    echo "initial"
    grep -v "^>" $1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/${name}_9mers.fasta | sort | uniq | wc -l

    echo "SB"
    grep "= SB" ${dir}/${name}.netMHCpan.BA.out > ${dir}/${name}.SB.out
    awk '{print $3}' ${dir}/${name}.SB.out | sort | uniq | wc -l

    echo "WB/SB"
    grep "= WB\|= SB" ${dir}/${name}.netMHCpan.BA.out > ${dir}/${name}.WBSB.out
    awk '{print $3}' ${dir}/${name}.WBSB.out | sort | uniq | wc -l
    echo -e "\n"
    # ## create output xlsx
    # excelFile=${dir}/${name}.netMHCpan.BA.xls
    # Rscript /data/genomics/marta/scripts/parse_netmhcpan_EL_BA_out_marta.r $excelFile

done


CTx
initial
19068
SB
200
WB/SB
603


CTnonx
initial
65025
SB
948
WB/SB
2579


Noncodingnonx
initial
6397
SB
53
WB/SB
199


altORFs
initial
1986
SB
19
WB/SB
61




Most frequent alleles

https://doi.org/10.1080/22221751.2021.1978823

In [51]:
%%bash -s "$cancer_dir"

## Most common alleles
dir=$1/log2ratio3x/cancertypes/onlyStep1/netMHCpan/FrequentHLA-I
mkdir -p $dir

CTx="$1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/CTx.fasta"
CTnonx="$1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/CTnonx.fasta"
NCnonx="$1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/Noncodingnonx.fasta"
altORFs="$1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/altORFs.fasta"

samples_array=("$CTx" "$CTnonx" "$NCnonx" "$altORFs")

#=======================================#
# Run netMHCpan.sh + Create output xlsx #
#=======================================#

# Set path for software
module load netMHCpan/4.1
module load R/4.0.4-foss-2020b

for file in "${samples_array[@]}"; do

    name=${file##*/}
    name=${name%%.fasta*}

    echo $name

    ## netMHCpan
    netMHCpan -a "$( < /projects_eg/projects/marta/VaccineCandidates/frequence_HLAalleles.csv)" -f $file -l 9 -BA -xls -xlsfile ${dir}/${name}.netMHCpan.BA.xls >  ${dir}/${name}.netMHCpan.BA.out

    echo "initial"
    grep -v "^>" $1/log2ratio3x/cancertypes/onlyStep1/annotations_coding_noncoding/${name}_9mers.fasta | sort | uniq | wc -l

    echo "SB"
    grep "= SB" ${dir}/${name}.netMHCpan.BA.out > ${dir}/${name}.SB.out
    awk '{print $3}' ${dir}/${name}.SB.out | sort | uniq | wc -l

    echo "WB/SB"
    grep "= WB\|= SB" ${dir}/${name}.netMHCpan.BA.out > ${dir}/${name}.WBSB.out
    awk '{print $3}' ${dir}/${name}.WBSB.out | sort | uniq | wc -l
    echo -e "\n"
    # ## create output xlsx
    # excelFile=${dir}/${name}.netMHCpan.BA.xls
    # Rscript /data/genomics/marta/scripts/parse_netmhcpan_EL_BA_out_marta.r $excelFile

done


CTx
initial
19068
SB
2015
WB/SB
4942


CTnonx
initial
65025
SB
7421
WB/SB
17910


Noncodingnonx
initial
6397
SB
715
WB/SB
1759


altORFs
initial
1986
SB
221
WB/SB
555


