<a href="https://colab.research.google.com/github/MattHodgman/ProteoHist/blob/main/get_cohort.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

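# Get the study cohort and its metadata

This notebook merges the histology metadata with the proteomic metadata, drops excluded cases, labels every sample as `tumor`, `both`, or `normal`, and writes the resulting tables to CSV.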

## Get and merge data

In [None]:
import pandas as pd
import os

In [None]:
# Input files (all paths live under /content/drive/MyDrive).
histology_metadata_csv = '/content/drive/MyDrive/BIOINF_590/Data/Metadata/cohort.csv'
proteomic_metadata_xlsx = '/content/drive/MyDrive/BIOINF_590/Data/Metadata/mmc1.xlsx'
normal_expression_tsv = '/content/drive/MyDrive/BIOINF_590/Data/ExpressionData/ucec_proteome_normal.txt'
tumor_expression_tsv = '/content/drive/MyDrive/BIOINF_590/Data/ExpressionData/ucec_proteome_tumor.txt'

# Histology metadata (CSV) and proteomic metadata (Excel workbook).
df_histology = pd.read_csv(histology_metadata_csv)
df_proteomic = pd.read_excel(proteomic_metadata_xlsx)

# Tab-separated protein expression tables: one normal file, one tumor file.
df_normal_proteomics = pd.read_csv(normal_expression_tsv, sep='\t')
df_tumor_proteomics = pd.read_csv(tumor_expression_tsv, sep='\t')

In [None]:
# Join histology rows to proteomic rows where Case_ID equals
# Proteomics_Participant_ID; column names present in both frames receive the
# _histology / _proteomics suffixes.
df_hist_prot = df_histology.merge(
    df_proteomic,
    left_on='Case_ID',
    right_on='Proteomics_Participant_ID',
    suffixes=('_histology', '_proteomics'),
)

In [None]:
# Tag each expression table with its source, then stack both into one frame.
df_normal_proteomics['Group'] = 'normal'
df_tumor_proteomics['Group'] = 'tumor'
df_target_proteins = pd.concat([df_normal_proteomics, df_tumor_proteomics])

# Attach the expression rows to the slide metadata (Case_ID matched against `sample`).
slide_columns = ['Case_ID', 'Specimen_ID', 'Slide_ID', 'Specimen_Type',
                 'Tumor_Percent_Tumor_Nuclei', 'Case_excluded']
df_new = df_hist_prot[slide_columns].merge(df_target_proteins, left_on='Case_ID', right_on='sample')

# Rows with less than 90 percent tumor nuclei are relabelled 'both'.
low_tumor_nuclei = df_new['Tumor_Percent_Tumor_Nuclei'] < 90
df_new.loc[low_tumor_nuclei, 'Group'] = 'both'

# Keep only the cases that were not excluded.
df_new = df_new.loc[df_new['Case_excluded'] == 'No']

print(df_new['Group'].value_counts())

tumor     317
both      218
normal    132
Name: Group, dtype: int64


In [None]:
# Show every row belonging to one specific case.
df_new.loc[df_new['Case_ID'] == 'C3L-01283']

Unnamed: 0,Case_ID,Specimen_ID,Slide_ID,Specimen_Type,Tumor_Percent_Tumor_Nuclei,Case_excluded,sample,ARID1A,CTNNB1,KRAS,PTEN,TP53,Group


In [None]:
def _slide_ids_for(group, specimen_type):
  """Return the sorted unique Slide_ID values of df_new rows matching both filters."""
  matches = (df_new['Group'] == group) & (df_new['Specimen_Type'] == specimen_type)
  return sorted(df_new.loc[matches, 'Slide_ID'].unique())

normal_slide_ids = _slide_ids_for('normal', 'normal_tissue')
tumor_slide_ids = _slide_ids_for('tumor', 'tumor_tissue')
both_slide_ids = _slide_ids_for('both', 'tumor_tissue')

In [None]:
# Number of slide IDs per list, in the order normal, tumor, both.
for slide_ids in (normal_slide_ids, tumor_slide_ids, both_slide_ids):
  print(len(slide_ids))

38
122
125


In [None]:
# number of patients! (distinct, non-missing Case_ID values)
df_new['Case_ID'].nunique(dropna=True)

95

In [None]:
# keep wanted columns
histology_columns = [
    'Case_ID', 'Specimen_ID', 'Slide_ID', 'Specimen_Type',
    'Weight', 'Tumor_Site', 'Tumor_Histological_Type',
    'Tumor_Segment_Acceptable', 'Tumor_Percent_Tumor_Nuclei',
    'Tumor_Percent_Total_Cellularity', 'Tumor_Percent_Necrosis',
    'Normal_Free_of_Tumor', 'Progression_or_Recurrence',
    'Gender', 'Age_at_Diagnosis', 'Ethnicity', 'Race', 'Vital_Status',
]
df_histology = df_histology.loc[:, histology_columns]

# There are a lot more columns we could look at!
proteomic_columns = [
    'idx', 'Proteomics_Participant_ID', 'Proteomics_Parent_Sample_IDs',
    'Case_excluded', 'Proteomics_TMT_batch', 'Proteomics_TMT_plex',
    'Proteomics_TMT_channel', 'Proteomics_Tumor_Normal',
    'Country', 'Histologic_Grade_FIGO', 'Myometrial_invasion_Specify',
    'Histologic_type', 'Tumor_purity', 'tumor_Stage-Pathological',
    'BMI', 'Age', 'Diabetes', 'Race', 'Ethnicity', 'Gender',
    'Tumor_Site', 'Tumor_Site_Other',
]
df_proteomic = df_proteomic.loc[:, proteomic_columns]

In [None]:
# Match rows where the histology Specimen_ID equals the proteomic
# Proteomics_Parent_Sample_IDs; column names present in both frames receive the
# _histology / _proteomics suffixes.
df = df_histology.merge(
    df_proteomic,
    left_on='Specimen_ID',
    right_on='Proteomics_Parent_Sample_IDs',
    suffixes=('_histology', '_proteomics'),
)

## Filter out excluded samples

In [None]:
# Keep only the rows of cases that were not excluded. The explicit copy makes
# `df` an independent frame, so the column assignments made in later cells
# modify it directly and do not trigger pandas' SettingWithCopyWarning.
df = df[df['Case_excluded'] == 'No'].copy()

## Group



In [None]:
# Labels of the three sample groups.
TUMOR, BOTH, NORMAL = 'tumor', 'both', 'normal'

In [None]:
# label different groups: tumor, both, normal
has_high_tumor_nuclei = df['Tumor_Percent_Tumor_Nuclei'].ge(90)
is_normal_tissue = df['Specimen_Type'].eq('normal_tissue')

# Start everything as BOTH, then overwrite. The normal-tissue rule is applied
# last, so it wins over the tumor-nuclei rule.
df['Group'] = BOTH
df.loc[has_high_tumor_nuclei, 'Group'] = TUMOR
df.loc[is_normal_tissue, 'Group'] = NORMAL

#### IMPORTANT! Some patients have data in multiple groups, so we need to prevent data leakage.

In [None]:
# number of slides/proteomic samples in each group
group_counts = df['Group'].value_counts()
print(group_counts)

normal    167
both      156
tumor     150
Name: Group, dtype: int64


In [None]:
tumor     50
both      36
normal    30

In [None]:
# number of patients with at least one sample labelled tumor or normal
is_tumor_or_normal = df['Group'].isin([TUMOR, NORMAL])
print(df.loc[is_tumor_or_normal, 'Case_ID'].nunique())

94


In [None]:
# Normal protein expression samples are missing some metadata that is contained
# in the tumor protein expression sample row of the same patient; copy it over
# for later analysis.

cols_to_copy = ['BMI','Age','Diabetes','Race_proteomics','Ethnicity_proteomics','Gender_proteomics']
normal_patients = df[df['Group'] == NORMAL]['Case_ID'].unique()

for p in normal_patients:

  # group label of each of this patient's rows, keyed by row label
  s = df[df['Case_ID'] == p]['Group']

  # every normal row of the patient, so none of them is left without metadata
  i_normal = s[s == NORMAL].index
  # candidate rows to copy from
  i_alt = s[(s == TUMOR) | (s == BOTH)].index

  # nothing to fill, or nothing to copy from
  if len(i_normal) == 0 or len(i_alt) == 0:
    continue

  # the first tumor/both row supplies the values for all normal rows
  for c in cols_to_copy:
    df.loc[i_normal, c] = df.loc[i_alt[0], c]

## Delete unwanted slides and list ones to add (**WARNING!** this could delete other files!)

In [None]:
# Make sure we have the correct slides
def delete_excess_slides(slide_list, group,
                         path='/content/drive/MyDrive/BIOINF_590/Data/Images/raw_images',
                         delete=False):
  """Compare the .svs files in a group's folder against the slides we want.

  Prints how many slides are wanted, how many are missing from the folder and
  which .svs files in the folder are not wanted. Unwanted files are removed
  from disk only when `delete` is True.

  Args:
    slide_list: iterable of slide IDs as strings, without the '.svs' extension.
    group: name of the sub-folder of `path` to inspect.
    path: folder that contains the group sub-folders.
    delete: when True, remove the unwanted .svs files; when False (default)
      they are only reported.

  Returns:
    Set of wanted file names ('<slide ID>.svs') that are not in the folder.

  Raises:
    OSError: if the group folder cannot be listed (raised by os.listdir).
  """
  group_dir = os.path.join(path, group)

  # endswith rather than a substring test, so names such as 'x.svs.part' are
  # never counted as slides (and never become candidates for removal)
  slide_files_we_have = {
      file_name for file_name in os.listdir(group_dir) if file_name.endswith('.svs')
  }

  slide_files_we_want = {slide + '.svs' for slide in slide_list}
  print('we want', len(slide_files_we_want), 'slides total')
  slide_files_to_download = slide_files_we_want - slide_files_we_have
  print('we need to download', len(slide_files_to_download), 'slides')

  # excess slides: present on disk but not wanted
  slide_files_to_remove = slide_files_we_have - slide_files_we_want
  print(f'{len(slide_files_to_remove)} to remove:', slide_files_to_remove)
  if delete:
    for file_name in slide_files_to_remove:
      file_path = os.path.join(group_dir, file_name)
      # only regular files are removed; a directory named *.svs is left alone
      if os.path.isfile(file_path):
        os.remove(file_path)

  return slide_files_to_download

In [None]:
# Number of slide IDs per list, in the order normal, tumor, both.
for slide_ids in (normal_slide_ids, tumor_slide_ids, both_slide_ids):
  print(len(slide_ids))

38
122
125


In [None]:
# Compare the 'tumor' folder with the tumor slide list and show how many files are missing.
slide_files_to_download = delete_excess_slides(slide_list=tumor_slide_ids, group='tumor')
len(slide_files_to_download)

we want 122 slides total
we need to download 73 slides
1 to remove: {'C3L-01284-22.svs'}


73

In [None]:
# Compare the 'normal' folder with the normal slide list and show the missing files.
slide_files_to_download = delete_excess_slides(slide_list=normal_slide_ids, group='normal')
slide_files_to_download

we want 38 slides total
we need to download 8 slides
0 to remove: set()


{'C3N-00200-27.svs',
 'C3N-00333-26.svs',
 'C3N-00333-28.svs',
 'C3N-00383-27.svs',
 'C3N-00383-28.svs',
 'C3N-00729-27.svs',
 'C3N-00866-24.svs',
 'C3N-01211-27.svs'}

In [None]:
# Compare the 'both' folder with the both slide list and show the missing files.
slide_files_to_download = delete_excess_slides(slide_list=both_slide_ids, group='both')
slide_files_to_download

we want 125 slides total
we need to download 125 slides
0 to remove: set()


{'C3L-00006-21.svs',
 'C3L-00143-21.svs',
 'C3L-00143-22.svs',
 'C3L-00145-21.svs',
 'C3L-00156-21.svs',
 'C3L-00156-22.svs',
 'C3L-00161-21.svs',
 'C3L-00161-22.svs',
 'C3L-00161-23.svs',
 'C3L-00161-24.svs',
 'C3L-00161-25.svs',
 'C3L-00358-21.svs',
 'C3L-00563-22.svs',
 'C3L-00586-21.svs',
 'C3L-00601-21.svs',
 'C3L-00767-21.svs',
 'C3L-00769-21.svs',
 'C3L-00771-21.svs',
 'C3L-00780-21.svs',
 'C3L-00780-22.svs',
 'C3L-00780-23.svs',
 'C3L-00780-24.svs',
 'C3L-00780-25.svs',
 'C3L-00781-22.svs',
 'C3L-00781-23.svs',
 'C3L-00781-25.svs',
 'C3L-00921-22.svs',
 'C3L-00932-21.svs',
 'C3L-00942-21.svs',
 'C3L-00947-21.svs',
 'C3L-00961-21.svs',
 'C3L-00963-21.svs',
 'C3L-01248-21.svs',
 'C3L-01256-21.svs',
 'C3L-01257-21.svs',
 'C3L-01275-21.svs',
 'C3L-01282-21.svs',
 'C3L-01311-21.svs',
 'C3L-01925-23.svs',
 'C3L-01925-27.svs',
 'C3L-01925-28.svs',
 'C3N-00151-21.svs',
 'C3N-00151-22.svs',
 'C3N-00151-23.svs',
 'C3N-00151-24.svs',
 'C3N-00200-23.svs',
 'C3N-00322-21.svs',
 'C3N-00322-2

## Write output

In [None]:
# Full metadata table, written without the row index.
metadata_csv = '/content/drive/MyDrive/BIOINF_590/Data/Metadata/metadata.csv'
df.to_csv(metadata_csv, index=False)

# Identifier columns plus the group label only.
sample_id_columns = ['idx','Case_ID','Specimen_ID','Slide_ID','Group']
sample_ids_csv = '/content/drive/MyDrive/BIOINF_590/Data/Metadata/sample_ids.csv'
df[sample_id_columns].to_csv(sample_ids_csv, index=False)