In this notebook, the TCGA-TGCT samples are divided into 5 distinct experiments according to the histological composition of the tumor. These divisions are taken from the section on "differentially abundant microRNAs" in the supplementary document of the TCGA-TGCT study: https://www.cell.com/cms/10.1016/j.celrep.2018.05.039/attachment/c0b32f0d-2761-4f95-b667-b6fe9350049c/mmc1.pdf

The divisions are: "pure seminoma (n=72) vs. samples with no seminoma component (n=61)", "pure seminoma (n=72) vs pure EC (n=18)", "pure EC (n=18) vs samples with no EC or seminoma component (n=19)", "pure teratomas (can have both mature and immature component) (n=6) vs samples with no teratoma component (n=107)" and "pure seminomas with a mutation in the KIT gene (n=25) vs pure seminomas without KIT mutations (n=47)". Here, EC stands for embryonal carcinoma.

In [1]:
import pandas as pd

In [2]:
# Sample annotation table for TCGA-TGCT, hosted in the project's GitHub repository.
url_sample_data = "https://raw.githubusercontent.com/Ignas12345/masters_project_data_and_notebooks/refs/heads/main/Data/sample_annotations/TCGA_TGCT_sample_data.csv"
# The file is pipe-delimited; its first column is used as the row index.
sample_data = pd.read_csv(
    url_sample_data,
    index_col=0,
    sep="|",
)

In [3]:
# Show the annotation columns available for building the experiment labels.
sample_data.columns

Index(['SCNA_k5', 'miRNA_k3', 'methylation_k5', 'mRNA_k4', 'RPPA_k4',
       'PARADIGM_k3', 'age_at_diagnosis', 'race', 'ethnicity',
       'two-class updated 20160629', 'multiclass updated 20160629',
       'Seminoma % ', 'Embryonal carcinoma %', 'Yolk sac tumor %', 'Mature %',
       'Immature %', 'Choriocarcinoma %', 'history_of_undescended_testis',
       'Family_History_Testicular_Cancer', 'Family_History_ Any_Cancer',
       'New_Primary_tumor', 'purity', 'ploidy', 'Genome doublings',
       'Cancer DNA fraction', 'Subclonal genome fraction', 'AverageCpH',
       'AverageCpG at CGI', 'AverageCpG at nonCGI', 'CpH Methylation Mode',
       'leukocyte infiltration', 'KIT', 'KRAS', 'NRAS', 'PIK3CA', 'q2020cov',
       '#Silent', '#Nonsilent', '#Total', 'Silent (per Mb)',
       'Nonsilent (Per Mb)', 'Total (Per Mb)', 'C:G>A:T %', 'C:G>G:C %',
       'C:G>T:A %', 'T:A>A:T %', 'T:A>C:G %', 'T:A>G:C %', 'INDEL %', 'DNM %',
       'KRAS_mut', 'KRAS_mut_multiplicity', 'KRAS_copies', 'KRAS

In [4]:
# Replace missing histology percentages with 0 so that the equality checks
# against 0 / 100 in the labelling cells never see NaN.
subtype_proportions = ['Seminoma % ','Embryonal carcinoma %', 'Yolk sac tumor %', 'Mature %', 'Immature %', 'Choriocarcinoma %']
filled_proportions = sample_data.loc[:, subtype_proportions].fillna(0)
sample_data.loc[:, subtype_proportions] = filled_proportions

In [5]:
# Empty frame sharing sample_data's index; the following cells add one label column per experiment.
divisions_by_experiment = pd.DataFrame(index=sample_data.index)

Prepare experiment #1: "pure seminoma (n=72) vs. samples with no seminoma component (n=61)"

In [6]:
#Prepare experiment #1: "pure seminoma (n=72) vs. samples with no seminoma component (n=61)"
# Label all samples at once with vectorized column comparisons instead of a
# per-sample Python loop with scalar .loc lookups.
seminoma_pct = sample_data['Seminoma % ']

# Every sample starts as 'unused'; samples with a partial seminoma component
# (neither 0 % nor 100 %) keep that label.
labels = pd.Series('unused', index=sample_data.index)
labels = labels.mask(seminoma_pct == 100, 'seminoma')
labels = labels.mask(seminoma_pct == 0, 'non_seminoma')

divisions_by_experiment['seminoma_vs_non_seminoma'] = labels
divisions_by_experiment['seminoma_vs_non_seminoma'].value_counts()

Unnamed: 0_level_0,count
seminoma_vs_non_seminoma,Unnamed: 1_level_1
seminoma,72
non_seminoma,61
unused,4


Prepare experiment #2: "pure seminoma (n=72) vs pure EC (n =18)"

In [7]:
#Prepare experiment #2: "pure seminoma (n=72) vs pure EC (n =18)"
# Label all samples at once with vectorized column comparisons instead of a
# per-sample Python loop with scalar .loc lookups.
is_pure_seminoma = sample_data['Seminoma % '] == 100
is_pure_embryonal = sample_data['Embryonal carcinoma %'] == 100

# Every sample starts as 'unused'. The embryonal mask is applied last so it
# takes precedence, mirroring the if/elif order of the original loop.
# NOTE(review): the label is spelled 'ebryonal_carcinoma' (missing "m"); it is
# kept unchanged because it is written to the exported CSV.
labels = pd.Series('unused', index=sample_data.index)
labels = labels.mask(is_pure_seminoma, 'seminoma')
labels = labels.mask(is_pure_embryonal, 'ebryonal_carcinoma')

divisions_by_experiment['seminoma_vs_embryonal'] = labels
divisions_by_experiment['seminoma_vs_embryonal'].value_counts()

Unnamed: 0_level_0,count
seminoma_vs_embryonal,Unnamed: 1_level_1
seminoma,72
unused,47
ebryonal_carcinoma,18


Prepare experiment #3 : "pure EC (n=18) vs samples with no EC or seminoma component (n=19)"

In [8]:
#Prepare experiment #3 : "pure EC vs samples with no EC or seminoma component (n=19)"
# Label all samples at once with vectorized column comparisons instead of a
# per-sample Python loop with scalar .loc lookups.
embryonal_pct = sample_data['Embryonal carcinoma %']
is_pure_embryonal = embryonal_pct == 100
has_no_seminoma_or_embryonal = (sample_data['Seminoma % '] == 0) & (embryonal_pct == 0)

# Every sample starts as 'unused'. The embryonal mask is applied last so it
# takes precedence, mirroring the if/elif order of the original loop.
# NOTE(review): the label is spelled 'ebryonal_carcinoma' (missing "m"); it is
# kept unchanged because it is written to the exported CSV.
labels = pd.Series('unused', index=sample_data.index)
labels = labels.mask(has_no_seminoma_or_embryonal, 'non_embryonal_non_seminoma')
labels = labels.mask(is_pure_embryonal, 'ebryonal_carcinoma')

divisions_by_experiment['embryonal_vs_non_embryonal_non_seminoma'] = labels
divisions_by_experiment['embryonal_vs_non_embryonal_non_seminoma'].value_counts()

Unnamed: 0_level_0,count
embryonal_vs_non_embryonal_non_seminoma,Unnamed: 1_level_1
unused,100
non_embryonal_non_seminoma,19
ebryonal_carcinoma,18


Prepare experiment #4 : "pure teratomas (n=6) vs samples with no teratoma component (n=107)"

In [9]:
# Prepare experiment #4 : "pure teratomas (n=6) vs samples with no teratoma component (n=107)":
# Label all samples at once with vectorized column comparisons instead of a
# per-sample Python loop with scalar .loc lookups.
mature_pct = sample_data['Mature %']
immature_pct = sample_data['Immature %']

# A pure teratoma may combine mature and immature components, so the two
# percentages are summed before comparing against 100.
is_pure_teratoma = (mature_pct + immature_pct) == 100
has_no_teratoma = (mature_pct == 0) & (immature_pct == 0)

# Every sample starts as 'unused'. The pure-teratoma mask is applied last so
# it takes precedence, mirroring the if/elif order of the original loop.
labels = pd.Series('unused', index=sample_data.index)
labels = labels.mask(has_no_teratoma, 'non_teratoma')
labels = labels.mask(is_pure_teratoma, 'teratoma')

divisions_by_experiment['teratoma_vs_non_teratoma'] = labels
divisions_by_experiment['teratoma_vs_non_teratoma'].value_counts()

Unnamed: 0_level_0,count
teratoma_vs_non_teratoma,Unnamed: 1_level_1
non_teratoma,107
unused,24
teratoma,6


Prepare experiment #5 : "pure seminomas with a mutation in the KIT gene (n=25) vs pure seminomas without KIT mutations (n=47)"

In [10]:
# Prepare experiment #5 : "pure seminomas with a mutation in the KIT gene (n=25) vs pure seminomas without KIT mutations (n=47)"
# Label all samples at once with vectorized column comparisons instead of a
# per-sample Python loop with scalar .loc lookups.
is_pure_seminoma = sample_data['Seminoma % '] == 100

# 'KIT_mut' is compared against the string '0'; any other value counts as mutated.
# NOTE(review): a missing (NaN) 'KIT_mut' would therefore also be labelled as
# mutated - confirm the column has no missing values for pure seminomas.
kit_is_wildtype = sample_data['KIT_mut'] == '0'
kit_is_mutated = sample_data['KIT_mut'] != '0'

# Every sample starts as 'unused'; samples that are not pure seminomas keep that label.
labels = pd.Series('unused', index=sample_data.index)
labels = labels.mask(is_pure_seminoma & kit_is_wildtype, 'KIT_wildtype_seminoma')
labels = labels.mask(is_pure_seminoma & kit_is_mutated, 'KIT_mutated_seminoma')

divisions_by_experiment['KIT_wildtype_vs_mutated_seminoma'] = labels
divisions_by_experiment['KIT_wildtype_vs_mutated_seminoma'].value_counts()

Unnamed: 0_level_0,count
KIT_wildtype_vs_mutated_seminoma,Unnamed: 1_level_1
unused,65
KIT_wildtype_seminoma,47
KIT_mutated_seminoma,25


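Prepare additional experiment #6: "tumors with a teratoma component (n = 29) vs. tumors with no teratoma component (n = 107)". A tumor counts as having a teratoma component when its mature and immature teratoma percentages sum to at least 10%; samples with a smaller non-zero sum are left unused.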

In [11]:
#Prepare additional experiment #6: "tumors with a teratoma component vs. tumors with no teratoma component"
# Label all samples at once with vectorized column comparisons instead of a
# per-sample Python loop with scalar .loc lookups.

# Minimum combined mature + immature percentage for a sample to count as
# having a teratoma component.
TERATOMA_PRESENT_MIN_PCT = 10

teratoma_pct = sample_data['Mature %'] + sample_data['Immature %']

# Every sample starts as 'unused'; a sample whose combined percentage is
# neither 0 nor at least the threshold keeps that label.
labels = pd.Series('unused', index=sample_data.index)
labels = labels.mask(teratoma_pct == 0, 'teratoma_absent')
labels = labels.mask(teratoma_pct >= TERATOMA_PRESENT_MIN_PCT, 'teratoma_present')

divisions_by_experiment['teratoma_present_vs_teratoma_absent'] = labels
divisions_by_experiment['teratoma_present_vs_teratoma_absent'].value_counts()

Unnamed: 0_level_0,count
teratoma_present_vs_teratoma_absent,Unnamed: 1_level_1
teratoma_absent,107
teratoma_present,29
unused,1


Finally, add a `multiclass_label` column, copied from the 'multiclass updated 20160629' annotation.

In [12]:
# Copy the multiclass histology annotation as-is. Both frames share the same
# index, so a direct column assignment replaces the per-sample copy loop.
divisions_by_experiment['multiclass_label'] = sample_data['multiclass updated 20160629']
divisions_by_experiment['multiclass_label'].value_counts()

Unnamed: 0_level_0,count
multiclass_label,Unnamed: 1_level_1
seminoma,72
embryonal,18
mature teratoma dominant,10
embryonal dominant,9
non-seminoma- mixed (no dominant component),9
yolk sac dominant,8
yolk sac,5
mature teratoma,3
immature teratoma dominant,3


In [13]:
# Write all label columns (with the sample index) to a CSV file, using a relative path.
divisions_by_experiment.to_csv('TCGA_TGCT_divisions_by_experiment.csv')

In [14]:
# List the sample IDs labelled as pure teratoma in experiment #4.
is_pure_teratoma = divisions_by_experiment['teratoma_vs_non_teratoma'] == 'teratoma'
divisions_by_experiment.index[is_pure_teratoma]

Index(['TCGA-2G-AAG5-01', 'TCGA-2G-AAGV-01', 'TCGA-2G-AAGY-01',
       'TCGA-SN-A84W-01', 'TCGA-XE-AAOB-01', 'TCGA-XE-AAOC-01'],
      dtype='object', name='sample')