In [1]:
import numpy as np
import anndata
import pandas as pd
import scanpy as sc
import scipy
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join, isdir
import re

In [2]:
Path0='/media/kusterlab/internal_projects/active/TOPAS/WP31/Searches'
Path1='/home/mgander/Atlantic/data/copied_data'

In [3]:
def fix_modifications(mods):
    # Fix Modifications
    replace_with=[('_(Acetyl (Protein N-term))','_(ac)'),
                    ('pT','T(ph)'), ('pS','S(ph)'), ('pY','Y(ph)'), ('(Oxidation (M))','M(ox)'),
                  ('(Phospho (STY))', '(ph)')]
    new_mods=[]
    for m in mods:
        for t in replace_with:
            m=m.replace(t[0], t[1])
        new_mods.append(m)
    return(new_mods)

In [4]:
def load_all_exp(cancertype, pattern, cols_kept):
    mypath=Path0+'/'+cancertype
    onlydir = [f for f in listdir(mypath) if isdir(join(mypath, f))]
    
    kept=[]
    for a in onlydir:
        matches = re.findall(pattern, a)
        if len(matches)==1:
            kept.append(matches[0])
        elif len(matches)>1:
            print(matches)
    
    evs=[]
    for k in kept:
        print(k)
        df=pd.read_csv(f'{Path0}/{cancertype}/{k}/combined/txt/evidence.txt', sep='\t')
        dfs=df[cols_kept].copy()
        dfs=dfs.sort_values('Score')
        dfs=dfs.drop_duplicates(subset=['Modified sequence']).copy()
        dfs['Experiment']=k
        dfs['Cancertype']=cancertype
        evs.append(dfs)
    Df=pd.concat(evs, ignore_index=True)
    return(Df)

In [5]:
pattern = r'Batch\d+_[FP]P.*'
cols_kept=['Proteins', 'Modified sequence', 'Gene names', 'Score']+[f'Reporter intensity corrected {i}' for i in range(1,12)]

# CYL sarcoma cell lines

In [7]:
# The translator

In [36]:
# The translator for CYL cell lines
df=pd.read_csv(f'{Path0}/patient_annotation_230922_PAN_cancer_celllines_pdx.csv')
df=df[df['Program']=='CYL'].copy()
df['ccl']=['SW982', 'VAESBJ', 'SKES1', 'RDES', 'SW684', 'HT1080', 'SKLMS1', 'SW827', 'KHOS240S',
         'KHOSNP', 'RD', 'HS729', 'A204', 'G401', 'SYO1', 'MESSA']
df9=df[df['Batch Name']=='CL9'].copy()
df10=df[df['Batch Name']=='CL10'].copy()

D={}
D['Batch9']={df9['TMT Channel'].iloc[i]:df9['ccl'].iloc[i] for i in range(len(df9))}
D['Batch10']={df10['TMT Channel'].iloc[i]:df10['ccl'].iloc[i] for i in range(len(df10))}

Unnamed: 0,Sample name,Cohort,Program,Entity,Histologic Subtype,Batch Name,TMT Channel,QC,Replicate,Material issue,ccl
870,SRCM-SW-982,Workflow_Test,CYL,other,other,CL9,1,passed,,,SW982
871,SRCM-VA-ESBJ,Workflow_Test,CYL,other,other,CL9,2,passed,,,VAESBJ
872,SRCM-SK-ES1,Workflow_Test,CYL,other,other,CL9,3,passed,,,SKES1
873,SRCM-RD-ES,Workflow_Test,CYL,other,other,CL9,4,passed,,,RDES
874,SRCM-SW-684,Workflow_Test,CYL,other,other,CL9,5,passed,,,SW684
875,SRCM-HT-1080,Workflow_Test,CYL,other,other,CL9,6,passed,,,HT1080
876,SRCM-SK-LMS1,Workflow_Test,CYL,other,other,CL9,7,passed,,,SKLMS1
877,SRCM-SW-827,Workflow_Test,CYL,other,other,CL9,8,passed,,,SW827
878,SRCM-KHOS-240S,Workflow_Test,CYL,other,other,CL10,1,passed,,,KHOS240S
879,SRCM-KHOS-NP,Workflow_Test,CYL,other,other,CL10,2,passed,,,KHOSNP


In [38]:
df9=df[df['Batch Name']=='CL9'].copy()
df10=df[df['Batch Name']=='CL10'].copy()

In [39]:
D={}
D['Batch9']={df9['TMT Channel'].iloc[i]:df9['ccl'].iloc[i] for i in range(len(df9))}
D['Batch10']={df10['TMT Channel'].iloc[i]:df10['ccl'].iloc[i] for i in range(len(df10))}

In [40]:
D

{'Batch9': {1: 'SW982',
  2: 'VAESBJ',
  3: 'SKES1',
  4: 'RDES',
  5: 'SW684',
  6: 'HT1080',
  7: 'SKLMS1',
  8: 'SW827'},
 'Batch10': {1: 'KHOS240S',
  2: 'KHOSNP',
  3: 'RD',
  4: 'HS729',
  5: 'A204',
  6: 'G401',
  7: 'SYO1',
  8: 'MESSA'}}

In [18]:
cancertype='Cell_lines'
Df0=load_all_exp(cancertype, pattern, cols_kept)

Batch9_FP_CYL
Batch9_PP_CYL
Batch10_FP_CYL


  df=pd.read_csv(f'{Path0}/{cancertype}/{k}/combined/txt/evidence.txt', sep='\t')


Batch10_PP_CYL


In [19]:
Df0

Unnamed: 0,Proteins,Modified sequence,Gene names,Score,Reporter intensity corrected 1,Reporter intensity corrected 2,Reporter intensity corrected 3,Reporter intensity corrected 4,Reporter intensity corrected 5,Reporter intensity corrected 6,Reporter intensity corrected 7,Reporter intensity corrected 8,Reporter intensity corrected 9,Reporter intensity corrected 10,Reporter intensity corrected 11,Experiment,Cancertype
0,H7BZH4;X6R3N0;Q5K4L6-2;Q5K4L6,_ILQCQGFYQLCGVHQEDVIYLALPLYHMSGSLLGIVGCMGIGATV...,SLC27A3,9.4842,18798.0,5998.1,41290.0,54253.0,4757.9,5932.0,19544.0,0.0,8118.0,7662.4,5700.0,Batch9_FP_CYL,Cell_lines
1,E7ESA7;P29474,_GHMFVCGDVTMATNVLQTVQRILATEGDMELDEAGDVIGVLR_,NOS3,11.1650,47052.0,1310000.0,266480.0,232530.0,565110.0,548130.0,160700.0,1191600.0,25721.0,19786.0,0.0,Batch9_FP_CYL,Cell_lines
2,,_DSSPNSSQSHPSMPSALPSCTHHESPILLSLATASAKSIK_,,11.1930,36092.0,17862.0,20081.0,20245.0,6705.6,16126.0,22243.0,1256.3,22763.0,28838.0,27442.0,Batch9_FP_CYL,Cell_lines
3,,_EGLARSQKELFGVSGASHCTVASSGLSALPCSPEQVCVAK_,,11.4810,50867.0,49072.0,47584.0,52495.0,30265.0,58822.0,51891.0,9940.8,22682.0,12176.0,13441.0,Batch9_FP_CYL,Cell_lines
4,Q96HJ5-2;Q96HJ5,_TWIQNSFGMNIASATIALVGTAFLSLNIAVNIQSLR_,MS4A3,12.8650,71965.0,135450.0,40107.0,19264.0,56865.0,86668.0,69808.0,95512.0,23091.0,5921.1,5888.8,Batch9_FP_CYL,Cell_lines
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295547,Q9Y6D5,_GSS(Phospho (STY))LSGTDDGAQEVVK_,ARFGEF2,595.4700,199760.0,308080.0,96969.0,324580.0,117360.0,154540.0,70836.0,9081.9,171110.0,116400.0,102510.0,Batch10_PP_CYL,Cell_lines
295548,Q02952;Q02952-3;Q02952-2,_EDEKGDDVDDPENQNSALADTDASGGLTKES(Phospho (STY)...,AKAP12,595.5200,83584.0,386860.0,63349.0,660230.0,30709.0,39625.0,7224.6,0.0,232680.0,260050.0,258840.0,Batch10_PP_CYL,Cell_lines
295549,E7ETY2;Q13428-6;Q13428-7;A0A3B3IS06;H0Y8Y7;Q13...,_AGTATS(Phospho (STY))PAGSSPAVAGGTQR_,TCOF1,598.7100,286330.0,367990.0,42457.0,83982.0,86452.0,217010.0,87456.0,0.0,124520.0,135100.0,120790.0,Batch10_PP_CYL,Cell_lines
295550,O14745;O14745-2;J3QRP6,_SAS(Phospho (STY))SDTSEELNSQDSPPK_,SLC9A3R1,603.1000,280880.0,470190.0,339450.0,211230.0,131650.0,286570.0,133210.0,178400.0,232730.0,91584.0,81382.0,Batch10_PP_CYL,Cell_lines


In [42]:
Df0['Modified sequence']=fix_modifications(Df0['Modified sequence'])

In [43]:
Df0.to_pickle(f'{Path1}/TMT/Cell_lines.pkl')

In [2]:
Path1='/home/mgander/Atlantic/data/copied_data'

In [3]:
Df=pd.read_pickle(f'{Path1}/TMT/Cell_lines.pkl')

In [4]:
Df

Unnamed: 0,Proteins,Modified sequence,Gene names,Score,Reporter intensity corrected 1,Reporter intensity corrected 2,Reporter intensity corrected 3,Reporter intensity corrected 4,Reporter intensity corrected 5,Reporter intensity corrected 6,Reporter intensity corrected 7,Reporter intensity corrected 8,Reporter intensity corrected 9,Reporter intensity corrected 10,Reporter intensity corrected 11,Experiment,Cancertype
0,H7BZH4;X6R3N0;Q5K4L6-2;Q5K4L6,_ILQCQGFYQLCGVHQEDVIYLALPLYHMSGSLLGIVGCMGIGATV...,SLC27A3,9.4842,18798.0,5998.1,41290.0,54253.0,4757.9,5932.0,19544.0,0.0,8118.0,7662.4,5700.0,Batch9_FP_CYL,Cell_lines
1,E7ESA7;P29474,_GHMFVCGDVTMATNVLQTVQRILATEGDMELDEAGDVIGVLR_,NOS3,11.1650,47052.0,1310000.0,266480.0,232530.0,565110.0,548130.0,160700.0,1191600.0,25721.0,19786.0,0.0,Batch9_FP_CYL,Cell_lines
2,,_DSSPNSSQSHPSMPSALPSCTHHESPILLSLATASAKSIK_,,11.1930,36092.0,17862.0,20081.0,20245.0,6705.6,16126.0,22243.0,1256.3,22763.0,28838.0,27442.0,Batch9_FP_CYL,Cell_lines
3,,_EGLARSQKELFGVSGASHCTVASSGLSALPCSPEQVCVAK_,,11.4810,50867.0,49072.0,47584.0,52495.0,30265.0,58822.0,51891.0,9940.8,22682.0,12176.0,13441.0,Batch9_FP_CYL,Cell_lines
4,Q96HJ5-2;Q96HJ5,_TWIQNSFGMNIASATIALVGTAFLSLNIAVNIQSLR_,MS4A3,12.8650,71965.0,135450.0,40107.0,19264.0,56865.0,86668.0,69808.0,95512.0,23091.0,5921.1,5888.8,Batch9_FP_CYL,Cell_lines
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295547,Q9Y6D5,_GSS(ph)LSGTDDGAQEVVK_,ARFGEF2,595.4700,199760.0,308080.0,96969.0,324580.0,117360.0,154540.0,70836.0,9081.9,171110.0,116400.0,102510.0,Batch10_PP_CYL,Cell_lines
295548,Q02952;Q02952-3;Q02952-2,_EDEKGDDVDDPENQNSALADTDASGGLTKES(ph)PDTNGPK_,AKAP12,595.5200,83584.0,386860.0,63349.0,660230.0,30709.0,39625.0,7224.6,0.0,232680.0,260050.0,258840.0,Batch10_PP_CYL,Cell_lines
295549,E7ETY2;Q13428-6;Q13428-7;A0A3B3IS06;H0Y8Y7;Q13...,_AGTATS(ph)PAGSSPAVAGGTQR_,TCOF1,598.7100,286330.0,367990.0,42457.0,83982.0,86452.0,217010.0,87456.0,0.0,124520.0,135100.0,120790.0,Batch10_PP_CYL,Cell_lines
295550,O14745;O14745-2;J3QRP6,_SAS(ph)SDTSEELNSQDSPPK_,SLC9A3R1,603.1000,280880.0,470190.0,339450.0,211230.0,131650.0,286570.0,133210.0,178400.0,232730.0,91584.0,81382.0,Batch10_PP_CYL,Cell_lines
