# 1. Basic setup

In [1]:
import os
import sys

sys.path.append('..')
from utils import *

# Analyst name and the trait under study; both become components of the output path.
USER = "Jiayi"
TRAIT = "Paget's Disease of bone"

# NOTE(review): machine-specific absolute paths — consider making these configurable.
DATA_ROOT = '/Users/legion/Desktop/Courses/IS389/data'
OUTPUT_ROOT = '/Users/legion/Desktop/Courses/IS389/output2'

# Per-user, per-trait output folder; whitespace runs in the trait name become '-'.
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")

# exist_ok=True already tolerates an existing directory, so a separate
# os.path.exists() check beforehand is redundant (and racy).
os.makedirs(OUTPUT_DIR, exist_ok=True)


utils.py has been loaded


## The GEO dataset

In [2]:
# Build the path of the folder that holds the GEO cohorts for this trait.
dataset = 'GEO'
trait_subdir = "Paget_s-Disease-of-bone"
trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)

# Show the entries found in that folder.
os.listdir(trait_path)

['GSE7545', 'GSE7849']

Repeat the steps below for each accession number.

In [3]:
# No obvious trait
accession_num = "GSE7545"
cohort = accession_num

# Resolve the SOFT and series-matrix file paths inside this cohort's folder.
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Paget_s-Disease-of-bone\\GSE7545\\GSE7545_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Paget_s-Disease-of-bone\\GSE7545\\GSE7545-GPL3718_series_matrix.txt.gz')

In [7]:
# The clinical data is biased
accession_num = "GSE7849"
cohort = accession_num

# Resolve the SOFT and series-matrix file paths inside this cohort's folder.
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
(soft_file, matrix_file)

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Paget_s-Disease-of-bone\\GSE7849\\GSE7849_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Paget_s-Disease-of-bone\\GSE7849\\GSE7849_series_matrix.txt.gz')

### Initial filtering and clinical data preprocessing

In [13]:
from utils import *
# Line prefixes handed to the extraction helper: series-level lines for the
# background text, sample-level lines for the clinical table.
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]

background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)
print(background_info)

!Series_title	"Age-Specific Differences of Oncogenic Pathway Deregulation Seen in Human Breast Tumors"
!Series_summary	"Breast cancer arising in young women has a poorer prognosis, is less likely to be hormone sensitive, and represents a particularly challenging clinical entity.  The biology driving the aggressive nature of breast cancer arising in young women has yet to be defined."
!Series_summary	"Among 784 patients with early stage breast cancer, using prospectively-defined, age-specific cohorts (young <= 45 years; older >= 65 years), 411 eligible patients (n = 200 < 45 years; n = 211 >= 65 years) with clinically-annotated Affymetrix microarray data were identified. Gene set enrichment analyses, signatures of oncogenic pathway deregulation and predictors of chemotherapy sensitivity were evaluated within the two age-defined cohorts."
!Series_summary	"In comparing deregulation of oncogenic pathways between age groups, a statistically higher probability of PI3K (p = 0.006) and Myc (p 

In [9]:
# Preview the first rows of the clinical table to see which characteristics each row holds.
clinical_data.head()

Unnamed: 0,!Sample_geo_accession,GSM190415,GSM190416,GSM190417,GSM190418,GSM190419,GSM190420,GSM190421,GSM190422,GSM190423,...,GSM190483,GSM190484,GSM190485,GSM190486,GSM190487,GSM190488,GSM190489,GSM190490,GSM190491,GSM190492
0,!Sample_characteristics_ch1,Tumor,Tumor,Tumor,Tumor,Tumor,Tumor,Tumor,Tumor,Tumor,...,Tumor,Tumor,Tumor,Tumor,Tumor,Tumor,Tumor,Tumor,Tumor,Tumor
1,!Sample_characteristics_ch1,EA_Designation = 1,EA_Designation = 2,EA_Designation = 3,EA_Designation = 4,EA_Designation = 5,EA_Designation = 6,EA_Designation = 7,EA_Designation = 8,EA_Designation = 9,...,EA_Designation = 80,EA_Designation = 81,EA_Designation = 82,EA_Designation = 83,EA_Designation = 85,EA_Designation = 86,EA_Designation = 87,EA_Designation = 88,EA_Designation = 89,EA_Designation = 90
2,!Sample_characteristics_ch1,%_Tumor = 80,%_Tumor = 60,%_Tumor = 70,%_Tumor = 50,%_Tumor = 50,%_Tumor = 90,%_Tumor = 60,%_Tumor = 50,%_Tumor = 80,...,%_Tumor = 60,%_Tumor = 60,%_Tumor = 50,%_Tumor = 60,%_Tumor = 70,%_Tumor = 50,%_Tumor = 90,%_Tumor = 60,%_Tumor = 60,%_Tumor = 60
3,!Sample_characteristics_ch1,Race = White,Race = White,Race = White,Race = Black,Race = Asian,Race = White,Race = White,Race = White,Race = White,...,Race = White,Race = White,Race = Black,Race = White,Race = White,Race = White,Race = White,Race = Black,Race = Am Indian,Race = White
4,!Sample_characteristics_ch1,"Histo_Desc = Ductal Carcinoma, NOS","Histo_Desc = Ductal Carcinoma, NOS","Histo_Desc = Ductal Carcinoma, NOS","Histo_Desc = Ductal Carcinoma, NOS","Histo_Desc = Ductal Carcinoma, NOS","Histo_Desc = Ductal Carcinoma, NOS","Histo_Desc = Ductal Carcinoma, NOS",Histo_Desc = Infiltrating Duct & Lobualar Carc...,"Histo_Desc = Ductal Carcinoma, NOS",...,"Histo_Desc = Ductal Carcinoma, NOS","Histo_Desc = Ductal Carcinoma, NOS",Histo_Desc = Infiltrating Duct & Lobualar Carc...,"Histo_Desc = Ductal Carcinoma, NOS",Histo_Desc = Paget Disease & Infiltrating Duct...,"Histo_Desc = Ductal Carcinoma, NOS","Histo_Desc = Carcinoma, NOS","Histo_Desc = Ductal Carcinoma, NOS","Histo_Desc = Ductal Carcinoma, NOS","Histo_Desc = Ductal Carcinoma, NOS"


Inspect the unique values of a candidate clinical row:

In [14]:
# Take the clinical row at position 2 and list its distinct values.
# NOTE(review): in the preview this row holds '%_Tumor = ...' entries, so the
# variable name "tumor_stage_row" looks inaccurate — confirm which row was intended.
tumor_stage_row = clinical_data.iloc[2]
tumor_stage_row.unique()

array(['!Sample_characteristics_ch1', '%_Tumor = 80', '%_Tumor = 60',
       '%_Tumor = 70', '%_Tumor = 50', '%_Tumor = 90'], dtype=object)

Determine the trait row, age row, and gender row. Then implement the conversion functions:

In [15]:
# Positions of the clinical rows that are passed to the feature-selection step.
# NOTE(review): in the preview, row 0 reads 'Tumor' for every sample shown —
# confirm it can actually distinguish cases from controls for this trait.
trait_row = 0
# No age or gender row was selected for this cohort.
age_row = gender_row = None

def convert_trait(trait):
    """Map a raw trait cell to a binary label.

    Returns:
        0 when the cell equals 'diabetes reversal status: Yes',
        None when the cell is missing (None or NaN),
        1 for every other value.

    NOTE(review): the matched literal refers to diabetes and does not appear in
    the clinical rows previewed for this cohort, so every sample ends up
    labelled 1. Confirm the intended control value for this trait.
    """
    # A missing cell carries no trait information; do not count it as a case.
    if trait is None:
        return None
    if isinstance(trait, float) and trait != trait:  # NaN is unequal to itself
        return None

    if trait == 'diabetes reversal status: Yes':
        return 0
    return 1


def convert_age(age_string):
    """Parse an age cell of the form '<label>:<integer>' into a rounded number.

    The integer found after the first ':' is divided by 12 and rounded to zero
    decimals. NOTE(review): presumably a months-to-years conversion — confirm
    the unit used by the cohort before relying on it.

    Returns:
        The rounded value as a float, or None when the cell is 'n.a.' (any
        letter case), is not a string (e.g. None or NaN), has no ':' part, or
        the part after ':' is not an integer.
    """
    # Non-string cells (None, NaN, numbers) have no .lower()/.split(); treat as missing
    # instead of letting an AttributeError escape.
    if not isinstance(age_string, str):
        return None
    if age_string.lower() == 'n.a.':
        return None
    try:
        raw_value = int(age_string.split(':')[1])
    except (ValueError, IndexError):
        return None
    return round(raw_value / 12, 0)

def convert_gender(gender_string):
    """Map a raw gender cell to a binary code.

    Matching is case-insensitive and ignores surrounding whitespace.

    Returns:
        1 for 'sex: female', 'sex: f', 'gender: female', 'gender: f';
        0 for 'sex: male', 'sex: m', 'gender: male', 'gender: m';
        None for anything else, including non-string cells (None, NaN).
    """
    # Non-string cells have no .lower(); treat them as missing rather than crashing.
    if not isinstance(gender_string, str):
        return None

    # Normalise once instead of lower-casing for every comparison.
    label = gender_string.strip().lower()
    if label in {'sex: female', 'sex: f', 'gender: female', 'gender: f'}:
        return 1
    if label in {'sex: male', 'sex: m', 'gender: male', 'gender: m'}:
        return 0
    return None

Check the processed clinical data:

In [16]:
# Extract the chosen clinical rows and convert their cells with the functions defined above.
selected_clinical_data = geo_select_clinical_features(
    clinical_data,
    TRAIT,
    trait_row,
    convert_trait,
    age_row=age_row,
    convert_age=convert_age,
    gender_row=gender_row,
    convert_gender=convert_gender,
)
selected_clinical_data.head()

  clinical_df = clinical_df.applymap(convert_fn)


Unnamed: 0,GSM190415,GSM190416,GSM190417,GSM190418,GSM190419,GSM190420,GSM190421,GSM190422,GSM190423,GSM190424,...,GSM190483,GSM190484,GSM190485,GSM190486,GSM190487,GSM190488,GSM190489,GSM190490,GSM190491,GSM190492
Paget's Disease of bone,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


### Genetic data preprocessing and final filtering

Check the genetic data:

In [17]:
# Read the genetic data from the series matrix file and preview its first rows.
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM190415,GSM190416,GSM190417,GSM190418,GSM190419,GSM190420,GSM190421,GSM190422,GSM190423,GSM190424,...,GSM190483,GSM190484,GSM190485,GSM190486,GSM190487,GSM190488,GSM190489,GSM190490,GSM190491,GSM190492
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000_at,1469.4,1213.7,1926.6,932.1,1314.2,2640.2,796.4,1562.6,1501.3,1157.8,...,1013.7,947.7,1393.4,1444.9,1289.9,1268.8,1068.0,1515.5,1348.4,1283.7
1001_at,201.7,109.4,117.1,312.8,111.8,147.2,117.5,540.2,105.6,184.5,...,115.3,167.5,276.5,143.0,98.0,315.4,324.0,249.1,348.9,265.3
1002_f_at,7.2,21.7,24.8,26.9,26.7,17.6,11.6,80.6,22.4,29.1,...,24.0,21.4,24.1,17.8,36.4,16.2,19.7,17.9,21.8,16.0
1003_s_at,209.1,161.0,153.1,528.8,94.8,72.8,46.0,334.9,212.6,221.4,...,253.8,57.8,44.6,259.3,154.1,70.2,295.8,41.9,197.7,47.9
1004_at,139.6,379.7,173.7,755.9,171.7,190.7,160.8,367.3,167.2,139.7,...,224.6,239.0,204.6,766.5,1075.1,75.4,332.2,114.1,100.2,222.6


Check if the gene dataset requires mapping to get the gene symbols corresponding to each data row.

In [18]:
# Set to True when the rows of genetic_data are identified by something other
# than gene symbols and therefore need to be mapped.
requires_gene_mapping = True

if requires_gene_mapping:
    # Load the annotation table from the SOFT file and print a short preview of it.
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)

# gene_annotation only exists when mapping is required; referencing it
# unconditionally raises NameError otherwise. Kept as the cell's last
# expression so the column list is still displayed.
gene_annotation.columns if requires_gene_mapping else None

{'ID': ['1000_at', '1001_at', '1002_f_at', '1003_s_at', '1004_at'], 'GB_ACC': ['X60188', 'X60957', 'X65962', 'X68149', 'X68149'], 'SPOT_ID': [nan, nan, nan, nan, nan], 'Species Scientific Name': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Annotation Date': ['Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014', 'Oct 6, 2014'], 'Sequence Type': ['Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence', 'Exemplar sequence'], 'Sequence Source': ['GenBank', 'GenBank', 'GenBank', 'GenBank', 'GenBank'], 'Target Description': ['X60188 /FEATURE=mRNA /DEFINITION=HSERK1 Human ERK1 mRNA for protein serine/threonine kinase', 'X60957 /FEATURE=cds /DEFINITION=HSTIEMR Human tie mRNA for putative receptor tyrosine kinase', 'X65962 /FEATURE=cds /DEFINITION=HSCP450 H.sapiens mRNA for cytochrome P-450', "X68149 /FEATURE=cds /DEFINITION=HSBLR1A Homo sapiens BLR1 gene for Burkitt's lymphoma receptor 1", "X68149 /FEATURE=exon#2 /DEFINITIO

Index(['ID', 'GB_ACC', 'SPOT_ID', 'Species Scientific Name', 'Annotation Date',
       'Sequence Type', 'Sequence Source', 'Target Description',
       'Representative Public ID', 'Gene Title', 'Gene Symbol',
       'ENTREZ_GENE_ID', 'RefSeq Transcript ID',
       'Gene Ontology Biological Process', 'Gene Ontology Cellular Component',
       'Gene Ontology Molecular Function'],
      dtype='object')

In [19]:
if requires_gene_mapping:
    # Annotation columns holding the row identifier and the gene symbol.
    identifier_key, gene_symbol_key = 'ID', 'Gene Symbol'
    gene_mapping = get_gene_mapping(
        gene_annotation, identifier_key, gene_symbol_key
    )
    # Replace genetic_data with its mapped version.
    genetic_data = apply_gene_mapping(genetic_data, gene_mapping)

In [20]:
# Pass the mapped table through the symbol-normalisation helper, then display the result.
# NOTE(review): genetic_data is overwritten in place, so this cell is not idempotent
# unless the helper is — confirm before re-running it on its own.
genetic_data = normalize_gene_symbols_in_index(genetic_data)
genetic_data

Unnamed: 0,GSM190415,GSM190416,GSM190417,GSM190418,GSM190419,GSM190420,GSM190421,GSM190422,GSM190423,GSM190424,...,GSM190483,GSM190484,GSM190485,GSM190486,GSM190487,GSM190488,GSM190489,GSM190490,GSM190491,GSM190492
AADAC,45.00,21.20,27.0,49.0,28.40,9.00,114.70,231.4,14.50,34.30,...,35.30,18.10,9.10,14.40,167.8,26.20,22.20,12.30,50.2,53.80
AAK1,60.56,112.84,96.5,149.2,77.16,85.84,78.98,214.6,78.68,99.08,...,91.36,103.46,84.64,90.62,125.2,114.26,138.88,120.46,126.2,91.68
AAMP,937.60,685.40,851.6,544.0,658.30,751.10,557.80,845.6,665.80,812.50,...,661.20,1031.70,788.40,678.50,98.4,1058.50,1415.20,1308.70,930.6,1476.80
AANAT,1149.40,1015.20,1053.1,1223.1,897.70,983.00,637.40,1951.7,942.10,842.90,...,1024.30,527.10,620.50,647.90,1241.6,748.60,1032.70,845.00,656.6,606.90
AASDHPPT,344.20,282.70,553.7,269.4,477.30,516.30,634.40,215.2,424.60,487.30,...,626.70,573.10,661.70,603.30,500.5,437.90,507.70,728.20,369.1,752.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZW10,55.80,9.10,48.2,17.7,64.30,38.00,35.70,70.0,11.00,18.40,...,43.90,55.10,53.90,56.60,31.4,51.30,31.10,9.60,57.9,48.90
ZWINT,837.90,1452.70,1270.4,287.0,2092.80,459.90,613.30,178.1,3487.60,1434.20,...,934.10,789.00,799.80,1127.30,809.8,365.40,1062.40,1714.00,533.2,1344.60
ZYX,2342.90,1803.00,2707.1,3035.6,1779.30,1813.10,2838.50,1544.8,2684.30,1580.10,...,2023.80,3519.60,2961.00,1501.50,2178.4,1833.00,1372.70,3333.10,1846.1,3228.40
ZZEF1,215.00,254.10,317.1,397.8,258.10,419.70,303.00,351.7,273.10,351.80,...,291.50,381.00,299.90,232.00,171.8,283.70,265.60,486.30,418.8,193.10


Use selected clinical data and genetic data to generate the merged data:

In [21]:
# Flag recorded later in the cohort info; set by hand for this cohort.
is_available = True

# Combine the selected clinical features with the genetic data.
merged_data = geo_merge_clinical_genetic_data(
    selected_clinical_data,
    genetic_data,
)

merged_data

Unnamed: 0,Paget's Disease of bone,AADAC,AAK1,AAMP,AANAT,AASDHPPT,AASS,AATF,AATK,ABAT,...,ZRSR2,ZSCAN12,ZSCAN26,ZSCAN9,ZSWIM8,ZW10,ZWINT,ZYX,ZZEF1,ZZZ3
GSM190415,1.0,45.0,60.56,937.6,1149.4,344.2,46.0,1405.1,678.0,24.100000,...,279.05,49.15,290.3,239.8,64.6,55.8,837.9,2342.9,215.0,419.1
GSM190416,1.0,21.2,112.84,685.4,1015.2,282.7,94.5,2213.7,509.5,85.433333,...,233.80,60.05,222.9,210.3,91.9,9.1,1452.7,1803.0,254.1,281.4
GSM190417,1.0,27.0,96.50,851.6,1053.1,553.7,10.7,1526.2,329.7,70.333333,...,230.70,22.00,237.6,238.1,92.2,48.2,1270.4,2707.1,317.1,253.9
GSM190418,1.0,49.0,149.20,544.0,1223.1,269.4,34.0,1085.3,1286.7,103.600000,...,671.75,45.70,179.4,341.1,232.7,17.7,287.0,3035.6,397.8,366.7
GSM190419,1.0,28.4,77.16,658.3,897.7,477.3,16.2,1859.8,580.0,53.566667,...,182.40,12.80,217.8,237.6,65.0,64.3,2092.8,1779.3,258.1,387.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM190488,1.0,26.2,114.26,1058.5,748.6,437.9,29.9,1391.8,881.3,61.766667,...,331.00,52.20,268.4,331.0,102.3,51.3,365.4,1833.0,283.7,271.1
GSM190489,1.0,22.2,138.88,1415.2,1032.7,507.7,104.1,1334.4,848.4,28.233333,...,312.90,47.40,200.8,488.8,132.5,31.1,1062.4,1372.7,265.6,296.5
GSM190490,1.0,12.3,120.46,1308.7,845.0,728.2,72.4,773.2,419.2,105.733333,...,266.00,59.65,210.1,293.1,143.7,9.6,1714.0,3333.1,486.3,131.2
GSM190491,1.0,50.2,126.20,930.6,656.6,369.1,19.1,1121.5,380.9,253.066667,...,284.45,46.60,323.2,550.7,80.2,57.9,533.2,1846.1,418.8,190.5


Check whether the merged data is biased:

In [22]:
trait_type = 'binary'

# Report the sample count before the bias check replaces merged_data.
print("The merged dataset contains {} samples.".format(len(merged_data)))

is_trait_biased, merged_data = judge_and_remove_biased_features(
    merged_data,
    TRAIT,
    trait_type=trait_type,
)
is_trait_biased

The merged dataset contains 78 samples.
For the feature 'Paget's Disease of bone', the least common label is '1.0' with 78 occurrences. This represents 100.00% of the dataset.
The distribution of the feature 'Paget's Disease of bone' in this dataset is severely biased.



True

Save the data as a CSV file:

In [23]:
# Record this cohort's status in the shared JSON file; the full details are
# only passed when the cohort is usable.
if is_available:
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
else:
    save_cohort_info(cohort, JSON_PATH, is_available)

# Export only cohorts that are both available and not trait-biased. Checking
# is_available first keeps the export consistent with the JSON record and avoids
# touching is_trait_biased when the cohort was flagged unavailable.
# NOTE(review): index=False leaves the DataFrame index out of the CSV; the merged
# table shown above is indexed by sample accession — confirm the IDs are not needed.
if is_available and not is_trait_biased:
    merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)

A new JSON file was created at: /Users/legion/Desktop/Courses/IS389/output2\Jiayi\Paget's-Disease-of-bone\cohort_info.json
