## Immunopeptidomics new search

With alterantive ORFs (uORFs, dORFs...)


In [1]:
import os, re
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq  # Import Seq from Bio.Seq

annot=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/transID_geneID_isoforms_selected.1to1.csv")
annot

Unnamed: 0,chr,transcript_id,gene_id,gene_name,transcript_type,gene_type
0,M,ENST00000361390,ENSG00000198888,MT-ND1,protein_coding,protein_coding
1,M,ENST00000361453,ENSG00000198763,MT-ND2,protein_coding,protein_coding
2,M,ENST00000361624,ENSG00000198804,MT-CO1,protein_coding,protein_coding
3,M,ENST00000361739,ENSG00000198712,MT-CO2,protein_coding,protein_coding
4,M,ENST00000361851,ENSG00000228253,MT-ATP8,protein_coding,protein_coding
...,...,...,...,...,...,...
65159,KI270442.1,TCONS_00000004,XLOC_000003,XLOC_000003,novel,novel
65160,KI270442.1,TCONS_00000005,XLOC_000004,XLOC_000004,novel,novel
65161,KI270466.1,TCONS_00000006,XLOC_000005,XLOC_000005,novel,novel
65162,KI270467.1,TCONS_00000008,XLOC_000006,XLOC_000006,novel,novel


In [2]:
HCC_projs = ["GSE193567", "GSE214846", "liver_adjacent_totalRNA", "hcc_normal_totalRNA", "LIHC_TCGA"]

all_genes = pd.DataFrame()

for hcc_dataset in HCC_projs:
    genes = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/cancers/tumorexpressed/tumor_1FPKM_n_"+hcc_dataset+"_LIHC.csv")
    genes = genes[["gene_id","transcript_id","gene_name","gene_type"]]

    all_genes = pd.concat([all_genes, genes])

all_genes.drop_duplicates(inplace=True)

In [3]:
all_genes

Unnamed: 0,gene_id,transcript_id,gene_name,gene_type
0,XLOC_001021,TCONS_00001163,XLOC_001021,novel
1,ENSG00000197958,ENST00000361436,RPL12,protein_coding
2,ENSG00000136933,ENST00000373544,RABEPK,protein_coding
3,ENSG00000044574,ENST00000324460,HSPA5,protein_coding
4,ENSG00000165219,ENST00000297933,GAPVD1,protein_coding
...,...,...,...,...
26984,ENSG00000288860,ENST00000689333,ENSG00000288860,lncRNA
27004,ENSG00000307634,ENST00000827594,ENSG00000307634,lncRNA
27012,ENSG00000154975,ENST00000451037,CA10,protein_coding
27013,ENSG00000308371,ENST00000833657,ENSG00000308371,lncRNA


In [None]:
## Create fasta of translated sequences from candidates
specie = "human"
fasta=os.path.join("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47",specie,"RiboNovel_MultMap_1to1/Annotation/candidateORF.fa")
fasta_df_input = pd.read_csv(fasta, sep="\t", header=None)
fasta_df = pd.DataFrame({'header':fasta_df_input[0].iloc[::2].values, 'seq':fasta_df_input[0].iloc[1::2].values})
fasta_df['header'] = fasta_df.header.str[1:]
fasta_df['transcript_id'] = fasta_df['header'].str.split(":", expand=True)[0]
fasta_df['transcript_id'] = fasta_df['transcript_id'].str.split(".", expand=True)[0]
fasta_df['coords'] = fasta_df['header'].str.split("\|", expand=True)[2]
fasta_df['ORFtype'] = fasta_df['header'].str.split("\|", expand=True)[3]


In [None]:
fasta_df = fasta_df.merge(annot, on="transcript_id")

fasta_df['new_header'] = fasta_df['header']+":"+fasta_df['gene_name']+":"+fasta_df['gene_type']
fasta_df.groupby("gene_type").count()

In [None]:
candidates_TAA = fasta_df[fasta_df['transcript_id'].isin(all_genes.transcript_id.values.tolist())]
orftypes=["noncoding","uORF","dORF","ouORF","odORF"]

candidates_TAA_notcanonical = candidates_TAA[candidates_TAA["ORFtype"].isin(orftypes)]
candidates_TAA_notcanonical = candidates_TAA_notcanonical[~candidates_TAA_notcanonical["chr"].str.contains("GL")]
candidates_TAA_notcanonical = candidates_TAA_notcanonical[~candidates_TAA_notcanonical["chr"].str.contains("KI")]

candidates_TAA_notcanonical.groupby("gene_type").count()

In [None]:
to_fasta = candidates_TAA_notcanonical[['new_header','seq']]

TAA_prot = "/home/marta/241205_Immunopeptidomics_HCC/database/TAA.Wang2020candidates.PROTEIN.fa"
out_prot = open(TAA_prot,'w')

for index, row in to_fasta.iterrows():
    nucleotide_seq = Seq(row['seq'])  # Convert string to Biopython Seq object
    protein_seq = nucleotide_seq.translate()  # Translate nucleotide to protein

    out_prot.write(">%s\n%s\n" %(str(row['new_header']), str(protein_seq)[:-1]))

out_prot.close()

In [None]:
## remove redundancies of nested sequences
from collections import defaultdict

def remove_nested_sequences_by_transcript(fasta_file):
    sequences = defaultdict(list)  # A dictionary to hold sequences by transcript ID
    headers = []  # List to store the original headers
    transcript_ids = []  # List to store the transcript IDs for each sequence
    
    # Read the sequences from the FASTA file
    with open(fasta_file, 'r') as f:
        header = None
        sequence = []
        
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if header is not None:
                    # Store the sequence and header
                    sequences[transcript_id].append(("".join(sequence), header))
                header = line[1:]  # Remove the '>' symbol
                sequence = []
                transcript_id = header.split(':')[0]  # Extract the transcript ID
            else:
                sequence.append(line)
        
        # Add the last sequence
        if header is not None:
            sequences[transcript_id].append(("".join(sequence), header))
    
    # List to hold non-redundant sequences
    unique_sequences = []
    
    # Process each transcript group
    for transcript_id, seq_list in sequences.items():
        # Sort sequences by length in descending order
        seq_list.sort(key=lambda x: len(x[0]), reverse=True)
        
        # Iterate over sorted sequences, keeping only the longest non-redundant ones
        for i, (seq_i, header_i) in enumerate(seq_list):
            is_redundant = False
            # Compare with previously added sequences within the same transcript
            for j in range(i):
                seq_j, _ = seq_list[j]
                if seq_i in seq_j:
                    is_redundant = True
                    break
            
            # If not redundant, add the sequence to the output list
            if not is_redundant:
                unique_sequences.append((header_i, seq_i))
    
    # Write the output FASTA file
    output_file = '/home/marta/241205_Immunopeptidomics_HCC/database/TAA.Wang2020candidates.PROTEIN.NOredundant.fa'
    with open(output_file, 'w') as f:
        for header, seq in unique_sequences:
            f.write(f">{header}\n{seq}\n")
    
    print(f"Non-redundant sequences saved to {output_file}")

# Example usage
fasta_file = '/home/marta/241205_Immunopeptidomics_HCC/database/TAA.Wang2020candidates.PROTEIN.fa'
remove_nested_sequences_by_transcript(fasta_file)


In [None]:
%%bash 

DIR=/home/marta/241205_Immunopeptidomics_HCC

## TAA + SP/TREMBL
cat /home/marta/241205_Immunopeptidomics_HCC/database/uniprot_canonicalANDisoforms_oneline.fasta /home/marta/241205_Immunopeptidomics_HCC/database/TAA.Wang2020candidates.PROTEIN.NOredundant.fa > /home/marta/241205_Immunopeptidomics_HCC/database/TAA_SPTREMBL.fasta
## remove empty lines
sed -i '/^$/d' /home/marta/241205_Immunopeptidomics_HCC/database/TAA_SPTREMBL.fasta



In [None]:
%%bash

grep "^>" /home/marta/241205_Immunopeptidomics_HCC/database/TAA_SPTREMBL.fasta | grep -v "sp" | grep -v "tr" | wc -l
grep "^>" /home/marta/241205_Immunopeptidomics_HCC/database/TAA_SPTREMBL.fasta | grep  "sp\|tr" | wc -l
grep "^>TC" /home/marta/241205_Immunopeptidomics_HCC/database/TAA_SPTREMBL.fasta | wc -l


In [2]:
TSASciAdv = pd.read_csv("/projects_eg/projects/marta/table_to_heatmap_noabundantcase3.csv")
TSASciAdv = TSASciAdv[['gene_id','gene_name','n','gene_type','RiboSeq']].drop_duplicates()
TSASciAdv

Unnamed: 0,gene_id,gene_name,n,gene_type,RiboSeq
0,ENSG00000046774,MAGEC2,25.0,protein_coding,1
117,ENSG00000068985,PAGE1,16.0,protein_coding,1
234,ENSG00000071677,PRLH,1.0,protein_coding,1
351,ENSG00000082929,LINC01587,3.0,lncRNA,0
468,ENSG00000083622,AC000061.1,1.0,lncRNA,0
...,...,...,...,...,...
160641,ENSG00000283436,LINC01958,3.0,lncRNA,0
160758,ENSG00000283480,AL512380.2,2.0,lncRNA,0
160875,ENSG00000283573,AL157371.2,2.0,lncRNA,0
160992,ENSG00000283599,BX276092.9,13.0,protein_coding,1


In [3]:
# TSTR = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/cancers/log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes.csv")
TSTR = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/cancers/log2ratio3x/cancertypes/onlyStep1/log2ratio3x_1TPM_ORFs.csv")
TSTR = TSTR[['gene_id','gene_type','orfID','ctype']].drop_duplicates()
TSTR.columns = ["gene_id","gene_type","orfID_TSTR_RiboSeq","ctype"]
# Update 'orfID_TSTR_RiboSeq' to empty if 'gene_type' is "protein_coding"
TSTR.loc[TSTR['gene_type'] == 'protein_coding', 'orfID_TSTR_RiboSeq'] = ''
TSTR.drop("gene_type", axis=1, inplace=True)
TSTR.drop_duplicates(inplace=True)
TSTR

Unnamed: 0,gene_id,orfID_TSTR_RiboSeq,ctype
0,ENSG00000117148,,BRCA
2,ENSG00000117148,,BLCA
4,ENSG00000117148,,LUAD
6,ENSG00000117148,,KIRC
8,ENSG00000117148,,PRAD
...,...,...,...
1062,XLOC_000189,TCONS_00000243:11:+|5|322:61:301|noncoding|ATG,COAD
1063,XLOC_000280,TCONS_00000334:13:+|6|443:103:166|noncoding|ATG,COAD
1064,XLOC_000797,TCONS_00000933:5:+|46|6032:841:889|noncoding|ATG,COAD
1065,ENSG00000241560,ENST00000467304.2:3:+|11|4530:178:454|noncodin...,COAD


In [16]:
annot = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/transID_geneID_isoforms_selected.1to1.csv")
uniprotKB = pd.read_csv("/users/genomics/marta/HCC_proteomics/uniprotkb_AND_model_organism_9606_2024_02_28.tsv", sep="\t")
uniprotKB['Gene Names'] = uniprotKB['Gene Names'].str.split()
uniprotKB = uniprotKB.explode('Gene Names')

uniprotKB = uniprotKB[['Entry','Gene Names']]

## database canonical
sp_TrEMBL = pd.read_csv("/users/genomics/marta/241205_Immunopeptidomics_HCC/database/header.NOredundant.fa", header=None, sep="\t")
sp_TrEMBL['Entry'] = sp_TrEMBL[0].str.split("\|", expand=True)[1]
sp_TrEMBL = sp_TrEMBL[~sp_TrEMBL[0].str.contains("ENST")]
sp_TrEMBL = sp_TrEMBL[~sp_TrEMBL[0].str.contains("TCONS")]

sp_TrEMBL = sp_TrEMBL.merge(uniprotKB, on="Entry", how="left")
sp_TrEMBL.columns = ['header','Entry','gene_name']
sp_TrEMBL.to_csv("/users/genomics/marta/241205_Immunopeptidomics_HCC/database/canonicalheaders_entries_genenames.csv", index=None)

sp_TrEMBL

Unnamed: 0,header,Entry,gene_name
0,>tr|A0A075B6G3|A0A075B6G3_HUMAN Dystrophin OS=...,A0A075B6G3,DMD
1,>tr|A0A087WV00|A0A087WV00_HUMAN Diacylglycerol...,A0A087WV00,DGKI
2,>tr|A0A087WZT3|A0A087WZT3_HUMAN BOLA2-SMG1P6 r...,A0A087WZT3,BOLA2-SMG1P6
3,>sp|A0A087X1C5|CP2D7_HUMAN Putative cytochrome...,A0A087X1C5,CYP2D7
4,>tr|A0A0A0MQU7|A0A0A0MQU7_HUMAN HNF1 homeobox ...,A0A0A0MQU7,HNF1A
...,...,...,...
131557,>tr|X6RLU5|X6RLU5_HUMAN Calcium voltage-gated ...,X6RLU5,CACNA2D4
131558,>tr|X6RLV5|X6RLV5_HUMAN DEAD-box helicase 5 OS...,X6RLV5,DDX5
131559,>tr|X6RLY7|X6RLY7_HUMAN Calcium voltage-gated ...,X6RLY7,CACNA2D4
131560,>tr|X6RM00|X6RM00_HUMAN ELKS/RAB6-interacting/...,X6RM00,ERC1


In [4]:
proteinNames = pd.read_csv("/users/genomics/marta/241205_Immunopeptidomics_HCC/database/canonicalheaders_entries_genenames.csv")
proteinNames.drop("header", axis=1, inplace=True)
print(len(set(proteinNames.gene_name.values.tolist())))

## associate entry name with gene_name
annotated_geneNames = annot.merge(proteinNames, on="gene_name")
print(len(set(annotated_geneNames.gene_name.values.tolist())))


46331
19707


In [7]:
annotated_geneNames

Unnamed: 0,chr,transcript_id,gene_id,gene_name,transcript_type,gene_type,Entry
0,M,ENST00000361390,ENSG00000198888,MT-ND1,protein_coding,protein_coding,P03886
1,M,ENST00000361453,ENSG00000198763,MT-ND2,protein_coding,protein_coding,P03891
2,M,ENST00000361624,ENSG00000198804,MT-CO1,protein_coding,protein_coding,P00395
3,M,ENST00000361739,ENSG00000198712,MT-CO2,protein_coding,protein_coding,P00403
4,M,ENST00000361851,ENSG00000228253,MT-ATP8,protein_coding,protein_coding,P03928
...,...,...,...,...,...,...,...
80180,22,ENST00000395595,ENSG00000079974,RABL2B,protein_coding,protein_coding,C9JFZ0
80181,22,ENST00000395595,ENSG00000079974,RABL2B,protein_coding,protein_coding,F2Z2T3
80182,22,ENST00000395595,ENSG00000079974,RABL2B,protein_coding,protein_coding,F2Z3A9
80183,22,ENST00000395595,ENSG00000079974,RABL2B,protein_coding,protein_coding,F2Z3J7


In [5]:
TSTR_SciAdv = TSTR.merge(TSASciAdv, on="gene_id", how="outer")
TSTR_SciAdv

PCG_TSTR_SciAdv = TSTR_SciAdv[TSTR_SciAdv['gene_type'] == "protein_coding"]
PCG_TSTR_SciAdv = PCG_TSTR_SciAdv.merge(annotated_geneNames, on=["gene_id","gene_name","gene_type"])
PCG_TSTR_SciAdv = PCG_TSTR_SciAdv[['gene_name','transcript_id','n','Entry']].drop_duplicates()
PCG_TSTR_SciAdv

Unnamed: 0,gene_name,transcript_id,n,Entry
0,ACTL8,ENST00000375406,8.0,Q9H568
8,CALR3,ENST00000269881,2.0,Q96L12
9,CALR3,ENST00000269881,2.0,M0R0Y8
14,CCDC185,ENST00000366875,1.0,Q8N715
17,CETN1,ENST00000327228,1.0,Q12798
...,...,...,...,...
376,OR12D2,ENST00000642051,2.0,P58182
377,OR12D2,ENST00000642051,2.0,A0A140T931
378,OR12D2,ENST00000642051,2.0,A0A140T9F1
379,CCDC195,ENST00000638102,1.0,A0A1B0GUA6


In [6]:
pattern_coding_genes = '|'.join(PCG_TSTR_SciAdv.Entry.values.tolist())
pattern_coding_genes

'Q9H568|Q96L12|M0R0Y8|Q8N715|Q12798|Q5HYN5|Q8NA75|Q86TM3|Q7Z7J5|Q9BXU8|Q6NT46|P09681|Q8N7C0|Q7Z4W2|P43355|P43365|P43357|E7EMU0|O15479|O60732|Q9UBF1|O75459|Q7Z2X7|X6R922|X6RD31|Q9HBJ0|Q01851|Q16384|A0A0J9YWL9|P56180|A0A087X1C4|A6NN14|A0A0G2JQ93|A6NGE4|Q6PJQ5|A8MYU2|E5RGP7|E5RHP1|E5RJA6|Q6ZMV5|A1L453|Q6ZWI9|Q9NXZ1|Q86UG4|C9J020|H0Y8R6|Q01534|A6NC42|P43361|P43366|Q8TBY0|D6RF41|Q6ZQQ2|P0DMU9|P0DPQ3|Q5GH77|Q9BQM9|P11488|C9JCV8|O15480|P00540|A6NJL1|K7EJD1|K7ESD3|Q9H3M9|Q8NEA9|Q5JUK9|O14990|Q9BQY4|Q9BXU2|Q5H9I0|P0DPD5|A8MXY4|P81277|Q9BXT4|H9KV62|H9KV63|O43193|Q86VD1|Q9Y581|A0A3B3ITZ2|Q9BXI2|Q9UNG2|O60811|Q9BYW3|O95972|P49286|E9PR36|H0YDG4|P57773|Q969M2|A0A1W2PS75|P78386|F5GYI5|Q96Q77|M0QZ29|P01270|Q9NYA3|Q9H339|A0A2R8Y7B5|Q8TCW7|Q8WXF3|K7ENX1|P33032|Q7RTU3|Q9NZP2|Q5T619|A0A669KBK7|Q9H1F0|Q8NH53|Q8NH56|P56851|Q14507|A6NP11|A8MPY1|Q9Y5P0|Q6IFG1|Q8N0Y3|Q3LI61|Q8NH54|P59551|Q7Z6I5|Q6ZN17|A0A1B0GTK2|A0A1B0GVD3|Q6ZR62|Q07627|Q9NZK7|Q765I0|C9JU87|F2Z3I1|F8WCV4|A6NIM6|Q6IE38|H3BT63|Q8IYX0|Q9BYQ7|Q8NG

## Melanoma Cancer, Chong

In [15]:
%%bash

module load Java
module load singularity
module load Python

FDR=0.05
ref=/home/marta/241205_Immunopeptidomics_HCC/database/TAA_SPTREMBL.fasta

# samples="Me275 Me290 OMM475 T1015A T1185B OD5P ONVC"
samples="OD5P"

for s in $samples; do
    outdir=/home/marta/241205_Immunopeptidomics_HCC/Melanoma_Chong/FDR5percent/$s
    mkdir -p $outdir

    /home/marta/nextflow run nf-core/mhcquant -profile singularity --input /home/marta/immunopeptidomics_rawData/samplesheet_${s}.tsv --outdir $outdir --fasta $ref --fdr_threshold $FDR
    # grep 'PEPTIDE' $outdir/${s}_A.tsv | grep -v 'UNASSIGNED' > $outdir/${s}_A_PEPTIDE.tsv
    # grep -v 'non-unique' $outdir/${s}_A_PEPTIDE.tsv > $outdir/${s}_A_PEPTIDE.unique.tsv
    grep -v 'non-unique' $outdir/${s}_A.tsv > $outdir/${s}_A_PEPTIDE.unique.tsv

    if [ -s $outdir/${s}_A_PEPTIDE.unique.tsv ]; then
        mv /home/marta/241205_Immunopeptidomics_HCC/Melanoma_Chong/FDR5percent/$s /users/genomics/marta/241205_Immunopeptidomics_HCC/Melanoma_Chong/FDR5percent
    fi

    rm -r work
done

[33mNextflow 24.10.4 is available - Please consider updating your version to it[m


N E X T F L O W  ~  version 23.10.1
Launching `https://github.com/nf-core/mhcquant` [elated_boyd] DSL2 - revision: 1b3069246d [master]
WARN: Access to undefined parameter `monochromeLogs` -- Initialise it to a default value eg. `params.monochromeLogs = some_value`


------------------------------------------------------
                                        ,--./,-.
        ___     __   __   __   ___     /,-._.--~'
  |\ | |__  __ /  ` /  \ |__) |__         }  {
  | \| |       \__, \__/ |  \ |___     \`-._,-`-,
                                        `._,._,'
  nf-core/mhcquant v2.6.0-g1b30692
------------------------------------------------------
Core Nextflow options
  revision                 : master
  runName                  : elated_boyd
  containerEngine          : singularity
  launchDir                : /home/marta/PROJECT_SCRIPTS/241205_Immunopeptidomics_HCC
  workDir                  : /home/marta/PROJECT_SCRIPTS/241205_Immunopeptidomics_HCC/work
  projectDir              


**Analysis | FDR 5%**

In [7]:
dir="/users/genomics/marta/241205_Immunopeptidomics_HCC/Melanoma_Chong/FDR5percent"

beijer_output_FDR5 = pd.DataFrame()
coding_TSTR = pd.DataFrame()

folders = [ f.name for f in os.scandir(dir) if f.is_dir() ]

for folder in folders:
    for file in os.listdir(os.path.join(dir,folder)):
        statinfo = os.stat(os.path.join(dir,folder,file))
        
        if file.endswith("_A.tsv") and statinfo.st_size > 129:
            print(file)
            output = pd.read_csv(os.path.join(dir,folder,file), sep="\t")
            output['sequence'] = output['sequence'].str.replace('(Oxidation)', '')
            noncoding = output[output['accessions'].str.contains("ENS") | output['accessions'].str.contains("TCONS") ]
            noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
            noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
            noncoding['sample'] = folder

            ## protein coding genes
            coding = output[output['accessions'].str.contains(pattern_coding_genes, na=False)] 
            coding
            coding['sample'] = folder
            coding['Entry'] = coding['accessions'].apply(
                            lambda x: re.search(pattern_coding_genes, x).group(0) if re.search(pattern_coding_genes, x) else None
                        )
            coding = annotated_geneNames[['Entry','transcript_id']].merge(coding, on="Entry")
            coding.drop("Entry", axis=1, inplace=True)
            coding_TSTR = pd.concat([coding,coding_TSTR])    
            beijer_output_FDR5 = pd.concat([noncoding,beijer_output_FDR5])

## add protein-coding
coding_TSTR = coding_TSTR.merge(PCG_TSTR_SciAdv[['Entry','transcript_id']])
coding_TSTR = coding_TSTR[beijer_output_FDR5.columns]
beijer_output_FDR5_TSA = pd.concat([coding_TSTR, beijer_output_FDR5])
beijer_output_FDR5_TSA = beijer_output_FDR5_TSA[beijer_output_FDR5_TSA['target_decoy'] == "target"]

## TSA Science Advances
beijer_output_FDR5_TSA = beijer_output_FDR5_TSA.merge(annot, on="transcript_id")
beijer_output_FDR5_TSA = beijer_output_FDR5_TSA.merge(TSASciAdv, on="gene_id", how="left")

beijer_output_FDR5_TSA = beijer_output_FDR5_TSA[['sequence','sample','chr','accessions','gene_id','gene_name_x','gene_name_y','n','transcript_type','gene_type_x','protein_references']].sort_values(by="n", ascending=False)
beijer_output_FDR5_TSA.to_csv(os.path.join(dir,"FDR5percent_TSA_analysis.csv"), index=None)

## TSTR
beijer_output_FDR5_TSA_TSTR = beijer_output_FDR5_TSA.merge(TSTR, on="gene_id")
beijer_output_FDR5_TSA_TSTR.to_csv(os.path.join(dir,"FDR5percent_TSA_TSTR_analysis.csv"), index=None)
beijer_output_FDR5_TSA_TSTR

Me290_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

ONVC_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

OMM475_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

Me275_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

OD5P_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

T1015A_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

T1185B_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

Unnamed: 0,sequence,sample,chr,accessions,gene_id,gene_name_x,gene_name_y,n,transcript_type,gene_type_x,protein_references,orfID_TSTR_RiboSeq,ctype
0,SAYGEPRKL,T1185B,X,sp|P43355|MAGA1_HUMAN,ENSG00000198681,MAGEA1,MAGEA1,40.0,protein_coding,protein_coding,unique,,BRCA
1,SAYGEPRKL,T1185B,X,sp|P43355|MAGA1_HUMAN,ENSG00000198681,MAGEA1,MAGEA1,40.0,protein_coding,protein_coding,unique,,BLCA
2,SAYGEPRKL,T1185B,X,sp|P43355|MAGA1_HUMAN,ENSG00000198681,MAGEA1,MAGEA1,40.0,protein_coding,protein_coding,unique,,LUAD
3,SAYGEPRKL,T1185B,X,sp|P43355|MAGA1_HUMAN,ENSG00000198681,MAGEA1,MAGEA1,40.0,protein_coding,protein_coding,unique,,PRAD
4,SAYGEPRKL,T1185B,X,sp|P43355|MAGA1_HUMAN,ENSG00000198681,MAGEA1,MAGEA1,40.0,protein_coding,protein_coding,unique,,LUSC
...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,AGRMSKSLVI,OMM475,5,ENST00000296682.4:5:+|97|3193:1750:3004|odORF|...,ENSG00000164256,PRDM9,,,protein_coding,protein_coding,non-unique,,BLCA
724,AGRMSKSLVI,OMM475,5,ENST00000296682.4:5:+|97|3193:1750:3004|odORF|...,ENSG00000164256,PRDM9,,,protein_coding,protein_coding,non-unique,,LUSC
725,AGRMSKSLVI,OMM475,5,ENST00000296682.4:5:+|97|3193:1750:3004|odORF|...,ENSG00000164256,PRDM9,,,protein_coding,protein_coding,non-unique,,LIHC
726,PTSLCLELSY,ONVC,1,ENST00000608183.1:1:-|68|2631:1213:1444|noncod...,ENSG00000228918,LINC01344,,,lncRNA,lncRNA,unique,ENST00000608183.1:1:-|29|2631:358:436|noncodin...,BRCA


## Melanoma Bassani 2016

In [19]:
%%bash

module load Java
module load singularity
module load Python

FDR=0.05
ref=/home/marta/241205_Immunopeptidomics_HCC/database/TAA_SPTREMBL.fasta

## HLA-I
# samples="MM20 MM21 MM24 MM25 MM26 MM27 MM28 MM29 MM30 MM33 MM34 MM35 MM36 MM39 MM40 MM41 MM42 MM12 MM8 MM5 MM4 MM3 MM15"
samples="MM29"

for s in $samples; do
    outdir=/home/marta/241205_Immunopeptidomics_HCC/Melanoma_Bassani/FDR5percent/$s
    mkdir -p $outdir

    /home/marta/nextflow run nf-core/mhcquant -profile singularity --input /home/marta/immunopeptidomics_rawData/samplesheet_${s}.tsv --outdir $outdir --fasta $ref --fdr_threshold $FDR
    # grep 'PEPTIDE' $outdir/${s}_A.tsv | grep -v 'UNASSIGNED' > $outdir/${s}_A_PEPTIDE.tsv
    # grep -v 'non-unique' $outdir/${s}_A_PEPTIDE.tsv > $outdir/${s}_A_PEPTIDE.unique.tsv
    grep -v 'non-unique' $outdir/${s}_A.tsv > $outdir/${s}_A_PEPTIDE.unique.tsv

    if [ -s $outdir/${s}_A_PEPTIDE.unique.tsv ]; then
        mv /home/marta/241205_Immunopeptidomics_HCC/Melanoma_Bassani/FDR5percent/$s /users/genomics/marta/241205_Immunopeptidomics_HCC/Melanoma_Bassani/FDR5percent
    fi

    rm -r work
done

[33mNextflow 24.10.4 is available - Please consider updating your version to it[m


N E X T F L O W  ~  version 23.10.1
Launching `https://github.com/nf-core/mhcquant` [distraught_lattes] DSL2 - revision: 1b3069246d [master]
WARN: Access to undefined parameter `monochromeLogs` -- Initialise it to a default value eg. `params.monochromeLogs = some_value`


------------------------------------------------------
                                        ,--./,-.
        ___     __   __   __   ___     /,-._.--~'
  |\ | |__  __ /  ` /  \ |__) |__         }  {
  | \| |       \__, \__/ |  \ |___     \`-._,-`-,
                                        `._,._,'
  nf-core/mhcquant v2.6.0-g1b30692
------------------------------------------------------
Core Nextflow options
  revision                 : master
  runName                  : distraught_lattes
  containerEngine          : singularity
  launchDir                : /home/marta/PROJECT_SCRIPTS/241205_Immunopeptidomics_HCC
  workDir                  : /home/marta/PROJECT_SCRIPTS/241205_Immunopeptidomics_HCC/work
  projectDir  

In [8]:
dir="/users/genomics/marta/241205_Immunopeptidomics_HCC/Melanoma_Bassani/FDR5percent"

beijer_output_FDR5 = pd.DataFrame()
coding_TSTR = pd.DataFrame()

folders = [ f.name for f in os.scandir(dir) if f.is_dir() ]

for folder in folders:
    for file in os.listdir(os.path.join(dir,folder)):
        statinfo = os.stat(os.path.join(dir,folder,file))
        
        if file.endswith("_A.tsv") and statinfo.st_size > 129:
            print(file)
            output = pd.read_csv(os.path.join(dir,folder,file), sep="\t")
            output['sequence'] = output['sequence'].str.replace('(Oxidation)', '')
            noncoding = output[output['accessions'].str.contains("ENS") | output['accessions'].str.contains("TCONS") ]
            noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
            noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
            noncoding['sample'] = folder

            ## protein coding genes
            coding = output[output['accessions'].str.contains(pattern_coding_genes, na=False)] 
            coding
            coding['sample'] = folder
            coding['Entry'] = coding['accessions'].apply(
                            lambda x: re.search(pattern_coding_genes, x).group(0) if re.search(pattern_coding_genes, x) else None
                        )
            coding = annotated_geneNames[['Entry','transcript_id']].merge(coding, on="Entry")
            coding.drop("Entry", axis=1, inplace=True)
            coding_TSTR = pd.concat([coding,coding_TSTR])    
            beijer_output_FDR5 = pd.concat([noncoding,beijer_output_FDR5])

## add protein-coding
coding_TSTR = coding_TSTR.merge(PCG_TSTR_SciAdv[['Entry','transcript_id']])
coding_TSTR = coding_TSTR[beijer_output_FDR5.columns]
beijer_output_FDR5_TSA = pd.concat([coding_TSTR, beijer_output_FDR5])
beijer_output_FDR5_TSA = beijer_output_FDR5_TSA[beijer_output_FDR5_TSA['target_decoy'] == "target"]

## TSA Science Advances
beijer_output_FDR5_TSA = beijer_output_FDR5_TSA.merge(annot, on="transcript_id")
beijer_output_FDR5_TSA = beijer_output_FDR5_TSA.merge(TSASciAdv, on="gene_id", how="left")

beijer_output_FDR5_TSA = beijer_output_FDR5_TSA[['sequence','sample','chr','accessions','gene_id','gene_name_x','gene_name_y','n','transcript_type','gene_type_x','protein_references']].sort_values(by="n", ascending=False)
beijer_output_FDR5_TSA.to_csv(os.path.join(dir,"FDR5percent_TSA_analysis.csv"), index=None)

## TSTR
beijer_output_FDR5_TSA_TSTR = beijer_output_FDR5_TSA.merge(TSTR, on="gene_id")
beijer_output_FDR5_TSA_TSTR.to_csv(os.path.join(dir,"FDR5percent_TSA_TSTR_analysis.csv"), index=None)
beijer_output_FDR5_TSA_TSTR

MM24_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM30_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM29_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM34_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM33_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM39_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM28_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM36_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM25_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM5_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM8_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM42_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM21_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder


MM35_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM3_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM4_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM12_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM20_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM40_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM41_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder


MM26_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM15_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

MM27_A.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['accessions'].str.split(":", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['transcript_id'] = noncoding['transcript_id'].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noncoding['sample'] = folder
A val

Unnamed: 0,sequence,sample,chr,accessions,gene_id,gene_name_x,gene_name_y,n,transcript_type,gene_type_x,protein_references,orfID_TSTR_RiboSeq,ctype
0,KEADPTGHSYVL,MM36,X,sp|P43355|MAGA1_HUMAN,ENSG00000198681,MAGEA1,MAGEA1,40.0,protein_coding,protein_coding,unique,,BRCA
1,KEADPTGHSYVL,MM36,X,sp|P43355|MAGA1_HUMAN,ENSG00000198681,MAGEA1,MAGEA1,40.0,protein_coding,protein_coding,unique,,BLCA
2,KEADPTGHSYVL,MM36,X,sp|P43355|MAGA1_HUMAN,ENSG00000198681,MAGEA1,MAGEA1,40.0,protein_coding,protein_coding,unique,,LUAD
3,KEADPTGHSYVL,MM36,X,sp|P43355|MAGA1_HUMAN,ENSG00000198681,MAGEA1,MAGEA1,40.0,protein_coding,protein_coding,unique,,PRAD
4,KEADPTGHSYVL,MM36,X,sp|P43355|MAGA1_HUMAN,ENSG00000198681,MAGEA1,MAGEA1,40.0,protein_coding,protein_coding,unique,,LUSC
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1414,RVVEMLLHR,MM34,13,ENST00000813109.1:13:-|9|877:205:448|noncoding...,ENSG00000287861,ENSG00000287861,,,lncRNA,lncRNA,unique,ENST00000813109.1:13:-|10|877:247:448|noncodin...,BLCA
1415,RVVEMLLHR,MM34,13,ENST00000813109.1:13:-|9|877:205:448|noncoding...,ENSG00000287861,ENSG00000287861,,,lncRNA,lncRNA,unique,ENST00000813109.1:13:-|10|877:247:448|noncodin...,LUAD
1416,RVVEMLLHR,MM34,13,ENST00000813109.1:13:-|9|877:205:448|noncoding...,ENSG00000287861,ENSG00000287861,,,lncRNA,lncRNA,unique,ENST00000813109.1:13:-|10|877:247:448|noncodin...,KIRC
1417,RVVEMLLHR,MM34,13,ENST00000813109.1:13:-|9|877:205:448|noncoding...,ENSG00000287861,ENSG00000287861,,,lncRNA,lncRNA,unique,ENST00000813109.1:13:-|10|877:247:448|noncodin...,PRAD


## Merge results

In [9]:
## Merge them all (except PT)
dir = "/users/genomics/marta/241205_Immunopeptidomics_HCC"
data_source = ["Melanoma_Chong","Melanoma_Bassani"]

whole_output = pd.DataFrame()

for source in data_source:
    input = pd.read_csv(os.path.join(dir,source,"FDR5percent/FDR5percent_TSA_TSTR_analysis.csv"))
    input['source'] = source

    whole_output = pd.concat([whole_output, input])

whole_output_ordered = whole_output[['sequence','accessions','orfID_TSTR_RiboSeq','protein_references','gene_name_x','gene_name_y','gene_type_x','gene_id','source','sample','gene_id','ctype','n','chr']]
whole_output_ordered.drop_duplicates(inplace=True)

## add immunopep
immunopep = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q5_immunopeptidomics/human/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3x_immuno.csv")
immunopep = immunopep[["Peptide", "orfID","ORFpep","source"]]
immunopep.columns = ["Peptide", "orfID","ORFpep","source_literature"]

whole_output_ordered_immuno = whole_output_ordered.merge(immunopep, left_on=["sequence"], right_on="Peptide", how="left")
whole_output_ordered_immuno.drop_duplicates(inplace=True)
whole_output_ordered_immuno.to_csv(os.path.join(dir,"results/Melanoma_hits_combined.csv"), index=None)

# ## remove redundancies
noredundant = whole_output_ordered[["sequence","accessions","sample"]].drop_duplicates().groupby(["sequence","accessions"]).count().reset_index()[["sequence","accessions","sample"]]
noredundant.columns = ["sequence","accessions","num_samples_with_hit"]
noredundant['study'] = "Melanoma_MHCquant_Camarena"

noredundant_full = noredundant.merge(whole_output_ordered.drop(["sample","source","ctype"], axis=1), on=["sequence","accessions"])
noredundant_full.drop_duplicates(inplace=True)

## how many ctypes 
howmany_ctypes = whole_output_ordered[["sequence","accessions","ctype"]].drop_duplicates().groupby(["sequence","accessions"], as_index=False).agg({
    'ctype': lambda x: ','.join(sorted(x.unique()))  # Concatenate unique values
})
howmany_ctypes.columns = ["sequence","accessions","which_ctypes"]
noredundant_full = noredundant_full.merge(howmany_ctypes, on=["sequence","accessions"], how="outer")
noredundant_full

## immuno
immuno_small = whole_output_ordered_immuno[["sequence","accessions","orfID","source_literature"]].drop_duplicates()
immuno_grouped = immuno_small.groupby(['sequence', 'accessions', 'orfID'], as_index=False).agg({
    'source_literature': lambda x: ','.join(sorted(x.unique()))  # Concatenate unique values
})
noredundant_full = noredundant_full.merge(immuno_grouped, on=["sequence","accessions"], how="outer")
noredundant_full.drop_duplicates(inplace=True)
noredundant_full.to_csv(os.path.join(dir,"results/Melanoma_hits_combined_noredundant.csv"), index=None)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  whole_output_ordered.drop_duplicates(inplace=True)
