---
title: "Data Collection"
format:
    html: 
        code-fold: false
---

{{< include overview.qmd >}} 

# Code 

In [2]:
# Import necessary libraries

import requests
import pandas as pd 
import os
import json
from datetime import datetime
import time

## Breast (BRCA) Cancer Dataset

In [None]:
# Load the BRCA clinical table from its tab-separated export and preview it.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk; the warning recorded under this cell appears to be
# the mixed-type DtypeWarning that chunked inference produces.

brca = pd.read_csv(
    '../../data/raw-data/brca/brca-clinical.tsv',
    sep='\t',
    low_memory=False,
)
brca.head()

  brca = pd.read_csv('data/raw-data/brca/brca-clinical.tsv', sep='\t')


Unnamed: 0,project.project_id,cases.case_id,cases.consent_type,cases.days_to_consent,cases.days_to_lost_to_followup,cases.disease_type,cases.index_date,cases.lost_to_followup,cases.primary_site,cases.submitter_id,...,treatments.treatment_duration,treatments.treatment_effect,treatments.treatment_effect_indicator,treatments.treatment_frequency,treatments.treatment_id,treatments.treatment_intent_type,treatments.treatment_or_therapy,treatments.treatment_outcome,treatments.treatment_outcome_duration,treatments.treatment_type
0,TCGA-BRCA,001cef41-ff86-4d3f-a140-a647ac4b10a1,Informed Consent,-34,'--,Ductal and Lobular Neoplasms,Diagnosis,'--,Breast,TCGA-E2-A1IU,...,'--,'--,'--,'--,1b884f21-eb24-467f-aba2-208af17070b9,Adjuvant,no,'--,'--,"Radiation Therapy, NOS"
1,TCGA-BRCA,001cef41-ff86-4d3f-a140-a647ac4b10a1,Informed Consent,-34,'--,Ductal and Lobular Neoplasms,Diagnosis,'--,Breast,TCGA-E2-A1IU,...,'--,'--,'--,'--,27868bc3-23c8-5e85-a0e2-314e6cdf9b2a,Adjuvant,yes,Treatment Ongoing,'--,Hormone Therapy
2,TCGA-BRCA,001cef41-ff86-4d3f-a140-a647ac4b10a1,Informed Consent,-34,'--,Ductal and Lobular Neoplasms,Diagnosis,'--,Breast,TCGA-E2-A1IU,...,'--,'--,'--,'--,aedf144c-6b7b-4d76-a3cb-4271aef10f1d,First-Line Therapy,yes,'--,'--,"Surgery, NOS"
3,TCGA-BRCA,0045349c-69d9-4306-a403-c9c1fa836644,Informed Consent,76,'--,Adenomas and Adenocarcinomas,Diagnosis,'--,Breast,TCGA-A1-A0SB,...,'--,'--,'--,'--,0a534cae-de91-5e77-a3e7-b52d46bd3966,First-Line Therapy,yes,'--,'--,"Surgery, NOS"
4,TCGA-BRCA,00807dae-9f4a-4fd1-aac2-82eb11bf2afb,Informed Consent,19,'--,Adnexal and Skin Appendage Neoplasms,Diagnosis,No,Breast,TCGA-A2-A04W,...,'--,'--,'--,'--,024faa94-ec57-4d14-b919-62dcab409958,Adjuvant,yes,Treatment Ongoing,'--,Bisphosphonate Therapy


In [4]:
# Report the dimensions of the BRCA clinical table
nrow = brca.shape[0]  # number of records
ncol = brca.shape[1]  # number of columns
print(f'The data has {nrow} rows and {ncol} columns.')

The data has 5546 rows and 210 columns.


In [6]:
# Print a heading followed by every BRCA column label as a plain Python list
print("Column names:", brca.columns.tolist(), sep="\n")

Column names:
['project.project_id', 'cases.case_id', 'cases.consent_type', 'cases.days_to_consent', 'cases.days_to_lost_to_followup', 'cases.disease_type', 'cases.index_date', 'cases.lost_to_followup', 'cases.primary_site', 'cases.submitter_id', 'demographic.age_at_index', 'demographic.age_is_obfuscated', 'demographic.cause_of_death', 'demographic.cause_of_death_source', 'demographic.country_of_birth', 'demographic.country_of_residence_at_enrollment', 'demographic.days_to_birth', 'demographic.days_to_death', 'demographic.demographic_id', 'demographic.education_level', 'demographic.ethnicity', 'demographic.gender', 'demographic.marital_status', 'demographic.occupation_duration_years', 'demographic.population_group', 'demographic.premature_at_birth', 'demographic.race', 'demographic.submitter_id', 'demographic.vital_status', 'demographic.weeks_gestation_at_birth', 'demographic.year_of_birth', 'demographic.year_of_death', 'diagnoses.adrenal_hormone', 'diagnoses.age_at_diagnosis', 'diagno

In [5]:
# Count the distinct submitter IDs in the BRCA table (one ID per patient)
unique_patients = brca.loc[:, 'cases.submitter_id'].nunique()
print(f'The data has {unique_patients} unique patients.')

The data has 1098 unique patients.


## Cervical (CESC) Cancer Dataset

### Clinical Data

In [None]:
# Read the CESC clinical table (tab-separated) and preview its first three rows

cesc = pd.read_csv(
    '../../data/raw-data/cesc/cesc-clinical.tsv',
    sep='\t',
)
cesc.head(n=3)

Unnamed: 0,project.project_id,cases.case_id,cases.consent_type,cases.days_to_consent,cases.days_to_lost_to_followup,cases.disease_type,cases.index_date,cases.lost_to_followup,cases.primary_site,cases.submitter_id,...,treatments.treatment_duration,treatments.treatment_effect,treatments.treatment_effect_indicator,treatments.treatment_frequency,treatments.treatment_id,treatments.treatment_intent_type,treatments.treatment_or_therapy,treatments.treatment_outcome,treatments.treatment_outcome_duration,treatments.treatment_type
0,TCGA-CESC,00ad0ffe-2105-4829-a495-1c2aceb5bb31,Informed Consent,0,'--,Squamous Cell Neoplasms,Diagnosis,Yes,Cervix uteri,TCGA-EK-A2R9,...,'--,'--,'--,'--,672b3cf9-bb40-4f6f-a1c9-69ac3383fbd5,'--,'--,'--,'--,"Hysterectomy, NOS"
1,TCGA-CESC,00ad0ffe-2105-4829-a495-1c2aceb5bb31,Informed Consent,0,'--,Squamous Cell Neoplasms,Diagnosis,Yes,Cervix uteri,TCGA-EK-A2R9,...,'--,'--,'--,'--,d4baa31f-8c1f-5333-afcd-836816fd1a2a,Adjuvant,unknown,'--,'--,"Pharmaceutical Therapy, NOS"
2,TCGA-CESC,00ad0ffe-2105-4829-a495-1c2aceb5bb31,Informed Consent,0,'--,Squamous Cell Neoplasms,Diagnosis,Yes,Cervix uteri,TCGA-EK-A2R9,...,'--,'--,'--,'--,e79370ba-36f0-4639-bc8f-119ba2b2457b,Adjuvant,unknown,'--,'--,"Radiation Therapy, NOS"
3,TCGA-CESC,00bca18c-b3d4-45a3-8f19-034cc40449a4,Informed Consent,2108,'--,Squamous Cell Neoplasms,Diagnosis,Yes,Cervix uteri,TCGA-C5-A2LV,...,'--,'--,'--,'--,277d525e-9674-4954-b427-3e829d469b8f,'--,'--,'--,'--,"Hysterectomy, NOS"
4,TCGA-CESC,00bca18c-b3d4-45a3-8f19-034cc40449a4,Informed Consent,2108,'--,Squamous Cell Neoplasms,Diagnosis,Yes,Cervix uteri,TCGA-C5-A2LV,...,'--,'--,'--,'--,788ff156-d009-46f7-b832-f39b11ed13ac,Adjuvant,no,'--,'--,"Radiation Therapy, NOS"


In [5]:
# Report the dimensions of the CESC clinical table
nrow = cesc.shape[0]  # number of records
ncol = cesc.shape[1]  # number of columns
print(f'The data has {nrow} rows and {ncol} columns.')

The data has 1535 rows and 210 columns.


In [6]:
# Print a heading followed by every CESC column label as a plain Python list
print("Column names:", cesc.columns.tolist(), sep="\n")

Column names:
['project.project_id', 'cases.case_id', 'cases.consent_type', 'cases.days_to_consent', 'cases.days_to_lost_to_followup', 'cases.disease_type', 'cases.index_date', 'cases.lost_to_followup', 'cases.primary_site', 'cases.submitter_id', 'demographic.age_at_index', 'demographic.age_is_obfuscated', 'demographic.cause_of_death', 'demographic.cause_of_death_source', 'demographic.country_of_birth', 'demographic.country_of_residence_at_enrollment', 'demographic.days_to_birth', 'demographic.days_to_death', 'demographic.demographic_id', 'demographic.education_level', 'demographic.ethnicity', 'demographic.gender', 'demographic.marital_status', 'demographic.occupation_duration_years', 'demographic.population_group', 'demographic.premature_at_birth', 'demographic.race', 'demographic.submitter_id', 'demographic.vital_status', 'demographic.weeks_gestation_at_birth', 'demographic.year_of_birth', 'demographic.year_of_death', 'diagnoses.adrenal_hormone', 'diagnoses.age_at_diagnosis', 'diagno

In [5]:
# Count the distinct submitter IDs in the CESC table (one ID per patient)
unique_patients = cesc.loc[:, 'cases.submitter_id'].nunique()
print(f'The data has {unique_patients} unique patients.')

The data has 307 unique patients.


### Exposure Data

In [None]:
# Read the CESC exposure table (tab-separated) and preview its first three rows

cesc_exposure_df = pd.read_csv(
    '../../data/raw-data/cesc/exposure.tsv',
    sep='\t',
)
cesc_exposure_df.head(n=3)

Unnamed: 0,project.project_id,cases.case_id,cases.submitter_id,exposures.age_at_last_exposure,exposures.age_at_onset,exposures.alcohol_days_per_week,exposures.alcohol_drinks_per_day,exposures.alcohol_frequency,exposures.alcohol_history,exposures.alcohol_intensity,...,exposures.smoking_frequency,exposures.submitter_id,exposures.time_between_waking_and_first_smoke,exposures.tobacco_smoking_onset_year,exposures.tobacco_smoking_quit_year,exposures.tobacco_smoking_status,exposures.type_of_smoke_exposure,exposures.type_of_tobacco_used,exposures.use_per_day,exposures.years_smoked
0,TCGA-CESC,00ad0ffe-2105-4829-a495-1c2aceb5bb31,TCGA-EK-A2R9,'--,'--,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,'--,Not Reported,'--,'--,'--,'--
1,TCGA-CESC,00bca18c-b3d4-45a3-8f19-034cc40449a4,TCGA-C5-A2LV,'--,20,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,'--,Current Smoker,'--,'--,'--,'--
2,TCGA-CESC,010a807f-9dc0-4e14-9533-dcf478f3d947,TCGA-C5-A902,'--,14,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,'--,Current Smoker,'--,'--,'--,'--
3,TCGA-CESC,03804f9b-df7c-462c-8984-8eb3a5ed4999,TCGA-VS-A9V2,'--,'--,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,'--,Lifelong Non-Smoker,'--,'--,'--,'--
4,TCGA-CESC,03c3fe57-ae85-4e45-9657-c3182ab5e124,TCGA-C5-A1BL,'--,16,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,1988,Current Reformed Smoker for < or = 15 yrs,'--,'--,'--,'--


The code above can also be found in our repository [here](https://github.com/dsan-5000/fall-2025-project-Munashe22/tree/main/technical-details/data-collection).