# 1. Basic setup

In [1]:
import os
import sys

sys.path.append('..')
from utils import *

USER = "Jiayi"
DATA_ROOT = '/Users/legion/Desktop/Courses/IS389/data'   
OUTPUT_ROOT = '/Users/legion/Desktop/Courses/IS389/output2'
TRAIT = 'Fibromyalgia'

OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)


utils.py has been loaded


## The GEO dataset

In [2]:
dataset = 'GEO'
trait_subdir = "Fibromyalgia"

trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)
os.listdir(trait_path)

['GSE67311']

Repeat the below steps for all the accession numbers

In [3]:
# No gene mapping
cohort = accession_num = "GSE67311"
cohort_dir = os.path.join(trait_path, accession_num)
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\Fibromyalgia\\GSE67311\\GSE67311_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\Fibromyalgia\\GSE67311\\GSE67311_series_matrix.txt.gz')

### Initial filtering and clinical data preprocessing

In [4]:
from utils import *
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']    

background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
print(background_info)

!Series_title	"Peripheral Blood Gene Expression in Fibromyalgia Patients Reveals  Potential Biological Markers and Physiological Pathways"
!Series_summary	"Fibromyalgia (FM) is a common pain disorder characterized by dysregulation in the processing of pain. Although FM has similarities with other rheumatologic pain disorders, the search for objective markers has not been successful. In the current study we analyzed gene expression in the whole blood of 70 fibromyalgia patients and 70 healthy matched controls. Global molecular profiling revealed an upregulation of several inflammatory molecules in FM patients and downregulation of specific pathways related to hypersensitivity and allergy. There was a differential expression of genes in known pathways for pain processing, such as glutamine/glutamate signaling and axonal development. We also identified a panel of candidate gene expression-based classifiers that could establish an objective blood-based molecular diagnostic to objectively i

In [5]:
clinical_data.head()

Unnamed: 0,!Sample_geo_accession,GSM1644447,GSM1644448,GSM1644449,GSM1644450,GSM1644451,GSM1644452,GSM1644453,GSM1644454,GSM1644455,...,GSM1644579,GSM1644580,GSM1644581,GSM1644582,GSM1644583,GSM1644584,GSM1644585,GSM1644586,GSM1644587,GSM1644588
0,!Sample_characteristics_ch1,diagnosis: healthy control,diagnosis: healthy control,diagnosis: healthy control,diagnosis: healthy control,diagnosis: healthy control,diagnosis: healthy control,diagnosis: healthy control,diagnosis: healthy control,diagnosis: healthy control,...,diagnosis: fibromyalgia,diagnosis: fibromyalgia,diagnosis: fibromyalgia,diagnosis: fibromyalgia,diagnosis: fibromyalgia,diagnosis: fibromyalgia,diagnosis: fibromyalgia,diagnosis: fibromyalgia,diagnosis: fibromyalgia,diagnosis: fibromyalgia
1,!Sample_characteristics_ch1,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,...,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood,tissue: peripheral blood
2,!Sample_characteristics_ch1,fiqr score: 8.5,fiqr score: -2.0,fiqr score: 9.8,fiqr score: 0.5,fiqr score: -1.0,fiqr score: -0.5,fiqr score: 2.2,fiqr score: -2.0,fiqr score: -2.0,...,fiqr score: 41.8,fiqr score: 54.5,fiqr score: 63.0,fiqr score: 64.0,fiqr score: 67.2,fiqr score: 17.8,fiqr score: 17.8,fiqr score: 41.7,fiqr score: 81.2,fiqr score: 54.7
3,!Sample_characteristics_ch1,bmi: 36,bmi: 34,bmi: 33,bmi: 22,bmi: 24,bmi: 28,bmi: 23,bmi: 48,bmi: 48,...,bmi: 22,bmi: 25,bmi: 29,bmi: 38,bmi: 28,bmi: 37,bmi: 0,bmi: 38,bmi: 20,bmi: 34
4,!Sample_characteristics_ch1,migraine: No,migraine: No,migraine: No,migraine: No,migraine: No,migraine: No,migraine: No,migraine: No,migraine: No,...,migraine: No,migraine: Yes,migraine: Yes,migraine: No,migraine: Yes,migraine: Yes,migraine: -,migraine: Yes,migraine: Yes,migraine: No


Analyze the trait row:

In [6]:
tumor_stage_row = clinical_data.iloc[0]
tumor_stage_row.unique()

array(['!Sample_characteristics_ch1', 'diagnosis: healthy control',
       'diagnosis: fibromyalgia'], dtype=object)

Determine the trait row, age row, and gender row. Then implement the conversion functions:

In [7]:
trait_row = 0
age_row = None
gender_row = None

def convert_trait(trait):
    # Check if the trait indicates a normal disease status
    if trait == 'diagnosis: healthy control':
        return 0
    else:
        return 1


def convert_age(age_string):
    if age_string.lower() == 'n.a.':
        return None
    try:
        age = int(age_string.split(':')[1])/12
        age = round(age, 0)
        return age
    except (ValueError, IndexError):
        return None

def convert_gender(gender_string):
    if (gender_string.lower() == 'sex: female' or gender_string.lower() == 'sex: f' or gender_string.lower() == 'gender: female' or gender_string.lower() == 'gender: f'):
        return 1
    elif (gender_string.lower() == 'sex: male' or gender_string.lower() == 'sex: m' or gender_string.lower() == 'gender: male' or gender_string.lower() == 'gender: m') :  # changeed 
        return 0
    else:
        return None

Check the processed clinical data:

In [8]:
selected_clinical_data = geo_select_clinical_features(clinical_data, TRAIT, trait_row, convert_trait, age_row=age_row,
                                                      convert_age=convert_age, gender_row=gender_row,
                                                      convert_gender=convert_gender)
selected_clinical_data.head()

  clinical_df = clinical_df.applymap(convert_fn)


Unnamed: 0,GSM1644447,GSM1644448,GSM1644449,GSM1644450,GSM1644451,GSM1644452,GSM1644453,GSM1644454,GSM1644455,GSM1644456,...,GSM1644579,GSM1644580,GSM1644581,GSM1644582,GSM1644583,GSM1644584,GSM1644585,GSM1644586,GSM1644587,GSM1644588
Fibromyalgia,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


### Genetic data preprocessing and final filtering

Check the genetic data:

In [9]:
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM1644447,GSM1644448,GSM1644449,GSM1644450,GSM1644451,GSM1644452,GSM1644453,GSM1644454,GSM1644455,GSM1644456,...,GSM1644579,GSM1644580,GSM1644581,GSM1644582,GSM1644583,GSM1644584,GSM1644585,GSM1644586,GSM1644587,GSM1644588
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7892501,5.62341,4.54841,4.74053,3.06227,3.65178,4.34336,5.36535,3.69126,2.52748,4.45481,...,2.93373,4.78499,2.86791,5.63311,4.38304,3.72055,3.03068,4.61205,5.38301,3.56539
7892502,5.37542,4.78069,5.70991,5.38193,5.92017,6.09522,5.78167,4.59286,5.16439,5.94071,...,5.68648,5.48387,5.50597,5.27264,6.12783,6.24468,4.73856,5.35777,5.66831,6.07197
7892503,5.14609,4.74459,5.57936,4.72783,4.83541,5.62107,5.26481,4.34715,3.71148,4.25644,...,5.23848,4.28919,4.2125,4.37036,5.32584,4.3627,5.43636,5.27041,5.41011,6.01511
7892504,9.50803,9.64513,9.51809,9.20097,9.20887,9.34666,9.15673,10.1156,9.40959,9.49014,...,9.02612,9.15626,9.21809,8.86528,9.47661,10.3121,9.50119,9.54925,9.48669,8.83406
7892505,3.1536,3.26439,4.3503,2.60544,3.78148,4.04654,5.30473,3.62526,2.82275,2.74823,...,2.64208,2.78818,3.98362,3.54878,2.34695,3.96788,3.31853,2.57307,3.194,3.0189


Check if the gene dataset requires mapping to get the gene symbols corresponding to each data row.

In [10]:
requires_gene_mapping = True

if requires_gene_mapping:
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)

gene_annotation.columns

{'ID': ['7896736', '7896738', '7896740', '7896742', '7896744'], 'GB_LIST': [nan, nan, 'NM_001005240,NM_001004195,NM_001005484,BC136848,BC136907', 'BC118988,AL137655', 'NM_001005277,NM_001005221,NM_001005224,NM_001005504,BC137547'], 'SPOT_ID': ['chr1:53049-54936', 'chr1:63015-63887', 'chr1:69091-70008', 'chr1:334129-334296', 'chr1:367659-368597'], 'seqname': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], 'RANGE_GB': ['NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10', 'NC_000001.10'], 'RANGE_STRAND': ['+', '+', '+', '+', '+'], 'RANGE_START': [53049.0, 63015.0, 69091.0, 334129.0, 367659.0], 'RANGE_STOP': [54936.0, 63887.0, 70008.0, 334296.0, 368597.0], 'total_probes': [7.0, 31.0, 24.0, 6.0, 36.0], 'gene_assignment': ['---', '---', 'NM_001005240 // OR4F17 // olfactory receptor, family 4, subfamily F, member 17 // 19p13.3 // 81099 /// NM_001004195 // OR4F4 // olfactory receptor, family 4, subfamily F, member 4 // 15q26.3 // 26682 /// NM_001005484 // OR4F5 // olfactory receptor, fami

Index(['ID', 'GB_LIST', 'SPOT_ID', 'seqname', 'RANGE_GB', 'RANGE_STRAND',
       'RANGE_START', 'RANGE_STOP', 'total_probes', 'gene_assignment',
       'mrna_assignment', 'category'],
      dtype='object')

In [159]:
if requires_gene_mapping:
    identifier_key = 'ID'
    gene_symbol_key = 'Symbol'
    gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
    genetic_data = apply_gene_mapping(genetic_data, gene_mapping)

In [160]:
genetic_data = normalize_gene_symbols_in_index(genetic_data)
genetic_data

Unnamed: 0,GSM1054461,GSM1054462,GSM1054463,GSM1054464,GSM1054465,GSM1054466,GSM1054467,GSM1054468,GSM1054469,GSM1054470,...,GSM1054504,GSM1054505,GSM1054506,GSM1054507,GSM1054508,GSM1054509,GSM1054510,GSM1054511,GSM1054512,GSM1054513
A1BG,6.1013,5.8975,6.1725,5.9452,5.8125,6.2270,6.6265,6.1640,6.2533,5.9966,...,8.1697,7.3470,7.3858,7.2936,7.3586,7.5003,7.7289,7.2938,7.2880,7.3009
A1BG-AS1,4.1155,3.8231,4.4809,3.8832,3.5292,4.0760,4.2817,4.0797,3.7546,3.7637,...,6.9534,6.8703,6.5639,7.2063,6.9356,6.7637,7.2619,6.9173,6.9877,6.8485
A1CF,3.0490,2.8329,3.0462,3.0998,2.8448,2.9790,3.1286,3.2239,2.9757,2.8277,...,3.8526,3.1797,3.2482,3.1539,3.3267,3.2850,3.4254,3.2007,3.3306,3.1708
A2M,9.5803,8.8574,8.6237,8.7689,9.6166,9.5560,9.5490,8.8824,8.9636,8.7677,...,8.3536,8.4455,6.4056,5.0722,7.2323,6.9800,8.2093,5.0774,5.0126,10.3406
A2ML1,3.2303,3.1281,3.3229,3.3928,3.1351,3.3229,3.7033,3.3818,3.2558,3.2793,...,4.7692,3.7672,3.9695,3.8859,3.8911,4.0563,4.1931,3.9923,3.7533,4.0087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,3.2158,3.7873,3.4058,3.3421,3.3949,3.2535,3.6718,3.5500,3.4669,3.2070,...,4.8983,4.7218,4.8042,4.0250,4.4530,4.5717,4.6689,4.7296,4.4920,4.3177
ZYG11B,9.0668,9.4642,9.3095,9.4767,8.9353,8.6806,8.8417,8.3534,8.3751,8.3136,...,9.4790,9.6251,9.4461,9.4394,9.2585,9.0012,9.2760,9.3080,9.1942,9.4302
ZYX,7.1701,6.9826,6.9313,7.3664,7.2823,6.6743,6.9897,7.2487,7.2203,8.2768,...,10.7341,10.9998,10.7922,10.5329,10.4911,10.2403,11.2125,10.5929,10.4838,10.8218
ZZEF1,6.3858,6.5421,6.7484,6.2827,6.6528,6.2846,6.1841,6.1643,6.1597,6.7696,...,7.7777,8.1910,8.3599,8.4375,8.4334,8.3261,7.9002,8.3949,8.2926,8.2379


Use selected clinical data and genetic data to generate the merged data:

In [161]:
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, genetic_data)
is_available = True

merged_data

Unnamed: 0,Mitochondrial Disorders,A1BG,A1BG-AS1,A1CF,A2M,A2ML1,A4GALT,A4GNT,AA06,AAA1,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
GSM1054461,0.0,6.1013,4.1155,3.049,9.5803,3.2303,5.797,3.4187,3.6406,3.049,...,3.621,4.6627,6.7564,6.9139,6.4468,3.2158,9.0668,7.1701,6.3858,5.2251
GSM1054462,1.0,5.8975,3.8231,2.8329,8.8574,3.1281,5.2234,4.0981,2.8983,3.5197,...,3.6732,4.0082,6.3447,7.2577,6.3363,3.7873,9.4642,6.9826,6.5421,5.3963
GSM1054463,1.0,6.1725,4.4809,3.0462,8.6237,3.3229,5.5002,3.6228,4.0165,3.1844,...,3.7046,4.8828,6.4101,6.6484,6.4234,3.4058,9.3095,6.9313,6.7484,4.8275
GSM1054464,1.0,5.9452,3.8832,3.0998,8.7689,3.3928,5.6048,3.449,3.0753,3.0915,...,3.3512,4.4825,6.423,7.1715,6.423,3.3421,9.4767,7.3664,6.2827,5.0468
GSM1054465,1.0,5.8125,3.5292,2.8448,9.6166,3.1351,5.619,3.5056,3.2483,2.7159,...,3.8325,4.7506,7.4742,7.4161,6.295,3.3949,8.9353,7.2823,6.6528,5.0241
GSM1054466,0.0,6.227,4.076,2.979,9.556,3.3229,5.6816,3.44,3.3479,2.835,...,3.8073,4.5413,6.7705,7.3081,6.4915,3.2535,8.6806,6.6743,6.2846,5.101
GSM1054467,0.0,6.6265,4.2817,3.1286,9.549,3.7033,5.7616,3.8976,3.4092,3.0747,...,3.5066,4.8491,6.7513,7.0802,6.5757,3.6718,8.8417,6.9897,6.1841,4.8599
GSM1054468,0.0,6.164,4.0797,3.2239,8.8824,3.3818,5.6999,3.7429,3.4481,3.0651,...,3.594,4.7115,6.8967,7.3867,6.724,3.55,8.3534,7.2487,6.1643,4.9404
GSM1054469,0.0,6.2533,3.7546,2.9757,8.9636,3.2558,5.5583,4.319,3.5288,3.3684,...,3.5712,4.4865,6.9326,7.1493,6.7174,3.4669,8.3751,7.2203,6.1597,5.0926
GSM1054470,1.0,5.9966,3.7637,2.8277,8.7677,3.2793,6.5516,3.425,3.2469,2.7997,...,3.3897,4.4323,5.666,6.7189,7.6097,3.207,8.3136,8.2768,6.7696,5.2231


Check if the merged data biased or not:

In [162]:
trait_type = 'binary'
print(f"The merged dataset contains {len(merged_data)} samples.")
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

The merged dataset contains 43 samples.
For the feature 'Mitochondrial Disorders', the least common label is '0.0' with 18 occurrences. This represents 41.86% of the dataset.
The distribution of the feature 'Mitochondrial Disorders' in this dataset is fine.



False

Save the data as a csv file:

In [12]:
is_available = False

if is_available:
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
else:
    save_cohort_info(cohort, JSON_PATH, is_available)
merged_data.head()
if not is_trait_biased:
    merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)

A new JSON file was created at: /Users/legion/Desktop/Courses/IS389/output2\Jiayi\Fibromyalgia\cohort_info.json


NameError: name 'merged_data' is not defined