# 1. Basic setup

In [1]:
import os
import sys

sys.path.append('..')
from utils import *

# Analyst name; used as a sub-directory of the output root.
USER = "Jiayi"
# Data/output roots. The original machine-specific locations stay as defaults,
# but can be overridden through environment variables so the notebook runs on
# another machine without editing this cell.
DATA_ROOT = os.environ.get('IS389_DATA_ROOT', '/Users/legion/Desktop/Courses/IS389/data')
OUTPUT_ROOT = os.environ.get('IS389_OUTPUT_ROOT', '/Users/legion/Desktop/Courses/IS389/output2')
# Trait analysed in this notebook; whitespace is replaced by '-' in the directory name.
TRAIT = "COVID-19"

OUTPUT_DIR = os.path.join(OUTPUT_ROOT, USER, '-'.join(TRAIT.split()))
JSON_PATH = os.path.join(OUTPUT_DIR, "cohort_info.json")
# exist_ok=True already tolerates an existing directory, so no separate
# os.path.exists() check is needed.
os.makedirs(OUTPUT_DIR, exist_ok=True)


utils.py has been loaded


## The GEO dataset

In [2]:
# Data source sub-directory under DATA_ROOT.
dataset = 'GEO'
# Sub-directory (inside the data source directory) holding this trait's cohorts.
trait_subdir = "COVID-19"

# List the entries under DATA_ROOT/GEO/COVID-19 (one per accession number).
trait_path = os.path.join(DATA_ROOT, dataset, trait_subdir)
os.listdir(trait_path)

['GSE207945', 'GSE243348']

Repeat the steps below for each accession number.

In [3]:
# Cohort GSE207945 -- marked "Finished" by the author.
# NOTE: cohort, cohort_dir, soft_file and matrix_file are reassigned by the
# GSE243348 cell below, so the rest of the notebook operates on that cohort.
cohort = accession_num = "GSE207945"
cohort_dir = os.path.join(trait_path, accession_num)
# Helper from utils; returns the paths of the SOFT file and the series matrix file.
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\COVID-19\\GSE207945\\GSE207945_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\COVID-19\\GSE207945\\GSE207945_series_matrix.txt.gz')

In [26]:
# Cohort GSE243348 -- originally annotated "No obvious trait".
# NOTE(review): the sample characteristics previewed further down contain a
# 'disease status' row (COVID-19+ / Healthy uninfected); confirm whether this
# annotation is still accurate.
cohort = accession_num = "GSE243348"
cohort_dir = os.path.join(trait_path, accession_num)
# Helper from utils; returns the paths of the SOFT file and the series matrix file.
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)
soft_file, matrix_file

('/Users/legion/Desktop/Courses/IS389/data\\GEO\\COVID-19\\GSE243348\\GSE243348_family.soft.gz',
 '/Users/legion/Desktop/Courses/IS389/data\\GEO\\COVID-19\\GSE243348\\GSE243348_series_matrix.txt.gz')

### Initial filtering and clinical data preprocessing

In [27]:
# Line prefixes selecting the series-level description from the matrix file.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
# Line prefixes selecting the per-sample accession IDs and characteristics.
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']

# The helpers come from the `from utils import *` in the setup cell, so the
# wildcard import is not repeated here.
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)
# print() rather than a bare expression so the multi-line text renders as lines.
print(background_info)

!Series_title	"Longitudinal gene expression profiling of self-collected blood samples in COVID-19+ and healthy participants"
!Series_summary	"Longitudinal cohort: 773 host response genes were profiled in previously vaccinated (n=16) and unvaccinated (n=14) COVID-19+ participants along with 5 healthy uninfected controls across a 2-week observational window"
!Series_summary	"Single timepoint cohort: 773 host response genes were profiled in 6 healthy uninfected participants"
!Series_overall_design	"Longitudinal cohort: 30 COVID-19+ and 5 uninfected participants were asked perform self-collection and stabilization of capillary blood using a novel technology (homeRNA) every other day for two weeks (7 longtiudinal timepoints per participant). Temporal kinetics of 773 immune genes were profiled using the nCounter direct digital counting of native mRNA."
!Series_overall_design	"Single timepoint cohort: 6 healthy uninfected participants were asked perform self-collection and stabilization of ca

In [28]:
# Preview the first sample-characteristics rows; each row holds one "key: value" field per sample.
clinical_data.head()

Unnamed: 0,!Sample_geo_accession,GSM7783810,GSM7783811,GSM7783812,GSM7783813,GSM7783814,GSM7783815,GSM7783816,GSM7783817,GSM7783818,...,GSM7784037,GSM7784038,GSM7784039,GSM7784040,GSM7784041,GSM7784042,GSM7784043,GSM7784044,GSM7784045,GSM7784046
0,!Sample_characteristics_ch1,disease status: COVID-19+,disease status: COVID-19+,disease status: COVID-19+,disease status: COVID-19+,disease status: COVID-19+,disease status: COVID-19+,disease status: COVID-19+,disease status: COVID-19+,disease status: COVID-19+,...,disease status: Healthy uninfected,disease status: Healthy uninfected,disease status: Healthy uninfected,disease status: Healthy uninfected,disease status: Healthy uninfected,disease status: Healthy uninfected,disease status: Healthy uninfected,disease status: Healthy uninfected,disease status: Healthy uninfected,disease status: Healthy uninfected
1,!Sample_characteristics_ch1,participant id: CB0101,participant id: CB0101,participant id: CB0101,participant id: CB0101,participant id: CB0101,participant id: CB0101,participant id: CB0101,participant id: CB0102,participant id: CB0102,...,participant id: HC0105,participant id: HC0105,participant id: HC0105,participant id: HC0105,participant id: CA0101,participant id: CA0108,participant id: CA0112,participant id: CA0113,participant id: CA0119,participant id: CA0120
2,!Sample_characteristics_ch1,Sex: female,Sex: female,Sex: female,Sex: female,Sex: female,Sex: female,Sex: female,Sex: female,Sex: female,...,Sex: female,Sex: female,Sex: female,Sex: female,Sex: male,Sex: female,Sex: female,Sex: female,Sex: male,Sex: male
3,!Sample_characteristics_ch1,age: 44,age: 44,age: 44,age: 44,age: 44,age: 44,age: 44,age: 29,age: 29,...,age: 40,age: 40,age: 40,age: 40,age: 36,age: 24,age: 28,age: 36,age: 27,age: 38
4,!Sample_characteristics_ch1,covid-19 vaccination history: unvaccinated,covid-19 vaccination history: unvaccinated,covid-19 vaccination history: unvaccinated,covid-19 vaccination history: unvaccinated,covid-19 vaccination history: unvaccinated,covid-19 vaccination history: unvaccinated,covid-19 vaccination history: unvaccinated,covid-19 vaccination history: unvaccinated,covid-19 vaccination history: unvaccinated,...,covid-19 vaccination history: vaccinated,covid-19 vaccination history: vaccinated,covid-19 vaccination history: vaccinated,covid-19 vaccination history: vaccinated,covid-19 vaccination history: vaccinated,covid-19 vaccination history: unvaccinated,covid-19 vaccination history: vaccinated,covid-19 vaccination history: unvaccinated,covid-19 vaccination history: unvaccinated,covid-19 vaccination history: vaccinated


Analyze the trait row:

In [29]:
# Row 0 of the characteristics table holds the 'disease status' field used as
# the trait. List its distinct values to decide how to encode them.
# (Renamed from `tumor_stage_row`, a leftover name from a different trait.)
trait_values = clinical_data.iloc[0]
trait_values.unique()

array(['!Sample_characteristics_ch1', 'disease status: COVID-19+',
       'disease status: Healthy uninfected'], dtype=object)

Determine the trait row, age row, and gender row. Then implement the conversion functions:

In [30]:
# Positions of the relevant rows in clinical_data, as seen in the preview above:
# row 0 = 'disease status', row 2 = 'Sex', row 3 = 'age'.
trait_row, gender_row, age_row = 0, 2, 3

def convert_trait(trait):
    """Encode a 'disease status: ...' cell as a binary COVID-19 label.

    Args:
        trait: Raw cell value, e.g. 'disease status: COVID-19+'.

    Returns:
        1 for 'COVID-19+', 0 for 'Healthy uninfected', and None for anything
        else (non-string cells, the row-label cell, unrecognised labels).
    """
    if not isinstance(trait, str):
        return None
    # Compare only the value after the first ':' so that case and surrounding
    # whitespace do not matter.
    label = trait.split(':', 1)[-1].strip().lower()
    if label == 'healthy uninfected':
        return 0
    if label == 'covid-19+':
        return 1
    # Unknown values are reported as missing instead of being silently
    # counted as cases.
    return None


def convert_age(age_string):
    """Parse an 'age: <number>' cell into an integer age.

    Args:
        age_string: Raw cell value, e.g. 'age: 44'.

    Returns:
        The age rounded to the nearest integer, or None when the cell is not a
        string, has no ':' separator, or its value is not a finite number
        (e.g. 'n.a.').
    """
    if not isinstance(age_string, str):
        return None
    try:
        # Parse as float first so decimal ages such as '29.0' are accepted;
        # round() then yields an int.
        return int(round(float(age_string.split(':')[1])))
    except (ValueError, IndexError, OverflowError):
        # IndexError: no ':' in the cell. ValueError: non-numeric or NaN.
        # OverflowError: infinite value.
        return None

def convert_gender(gender_string):
    """Encode a 'Sex: ...' / 'gender: ...' cell as a binary label.

    Args:
        gender_string: Raw cell value, e.g. 'Sex: female'.

    Returns:
        1 for female ('female' or 'f'), 0 for male ('male' or 'm'), and None
        for non-string cells, other field names, or unrecognised values.
        Matching is case-insensitive and ignores whitespace around the ':'.
    """
    if not isinstance(gender_string, str):
        return None
    field, separator, value = gender_string.lower().partition(':')
    if not separator or field.strip() not in ('sex', 'gender'):
        return None
    value = value.strip()
    if value in ('female', 'f'):
        return 1
    if value in ('male', 'm'):
        return 0
    return None

Check the processed clinical data:

In [31]:
# Extract the trait, age and gender rows from the raw characteristics table,
# applying the matching converter to each (helper from utils).
selected_clinical_data = geo_select_clinical_features(
    clinical_data,
    TRAIT,
    trait_row,
    convert_trait,
    age_row=age_row,
    convert_age=convert_age,
    gender_row=gender_row,
    convert_gender=convert_gender,
)
selected_clinical_data.head()

  clinical_df = clinical_df.applymap(convert_fn)
  clinical_df = clinical_df.applymap(convert_fn)
  clinical_df = clinical_df.applymap(convert_fn)


Unnamed: 0,GSM7783810,GSM7783811,GSM7783812,GSM7783813,GSM7783814,GSM7783815,GSM7783816,GSM7783817,GSM7783818,GSM7783819,...,GSM7784037,GSM7784038,GSM7784039,GSM7784040,GSM7784041,GSM7784042,GSM7784043,GSM7784044,GSM7784045,GSM7784046
COVID-19,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
Age,44,44,44,44,44,44,44,29,29,29,...,40,40,40,40,36,24,28,36,27,38
Gender,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,1,1,1,0,0


### Genetic data preprocessing and final filtering

Check the genetic data:

In [32]:
# Load the expression table from the series matrix file (helper from utils).
# In the preview below, rows are indexed by 'ID' and columns are GSM sample accessions.
genetic_data = get_genetic_data(matrix_file)
genetic_data.head()

Unnamed: 0_level_0,GSM7783810,GSM7783811,GSM7783812,GSM7783813,GSM7783814,GSM7783815,GSM7783816,GSM7783817,GSM7783818,GSM7783819,...,GSM7784037,GSM7784038,GSM7784039,GSM7784040,GSM7784041,GSM7784042,GSM7784043,GSM7784044,GSM7784045,GSM7784046
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACE,16.29,7.37,9.78,10.5,8.98,6.63,12.97,16.93,16.1,19.92,...,21.18,20.37,22.39,22.27,15.03,21.23,9.07,10.96,8.77,14.4
ACKR2,16.53,19.8,14.36,15.45,16.33,16.01,12.41,25.99,25.12,32.97,...,19.97,11.24,12.66,18.56,22.54,19.71,14.6,10.77,8.03,18.31
ACKR3,20.1,18.44,21.72,21.52,14.5,13.08,16.71,25.99,23.19,10.99,...,29.65,15.45,16.55,23.2,24.05,16.68,21.36,22.42,28.28,21.65
ACKR4,7.57,12.6,8.31,7.6,9.39,14.62,11.53,21.76,19.32,19.23,...,44.78,21.77,12.66,31.55,24.05,30.33,7.59,15.43,10.61,18.53
ACOX1,1054.81,1120.94,1648.33,1673.2,1338.32,1851.71,2153.5,1499.15,1475.12,1340.75,...,1045.69,1004.44,1040.78,1209.12,1199.25,1194.94,1088.64,1232.43,1094.51,975.54


Check if the gene dataset requires mapping to get the gene symbols corresponding to each data row.

In [33]:
# Whether the row identifiers of genetic_data must be mapped to gene symbols.
# NOTE(review): the row IDs previewed above (ACE, ACKR2, ...) already look like
# gene symbols -- confirm whether mapping is really required for this cohort.
requires_gene_mapping = True

if requires_gene_mapping:
    # Read the platform annotation from the SOFT file and print a short preview.
    gene_annotation = get_gene_annotation(soft_file)
    gene_annotation_summary = preview_df(gene_annotation)
    print(gene_annotation_summary)

# Show the annotation columns only when the annotation was loaded; an
# unconditional reference raises NameError when the flag is False.
gene_annotation.columns if requires_gene_mapping else None

{'ID': ['ACE', 'ACKR2', 'ACKR3', 'ACKR4', 'ACOX1'], 'GB_ACC': ['NM_000789.2', 'NM_001296.5', 'NM_020311.1', 'NM_016557.2', 'NM_001185039.1'], 'PROBE ID - v1.0': ['NM_000789.2:2115', 'NM_001296.5:741', 'NM_020311.1:375', 'NM_016557.2:854', 'NM_001185039.1:1094'], 'PROBE ID - v1.1': ['NM_000789.2:2115', 'NM_001296.5:741', 'NM_020311.1:375', 'NM_016557.2:854', 'NM_001185039.1:1094'], 'SPOT_ID': [nan, nan, nan, nan, nan]}


Index(['ID', 'GB_ACC', 'PROBE ID - v1.0', 'PROBE ID - v1.1', 'SPOT_ID'], dtype='object')

In [11]:
if requires_gene_mapping:
    identifier_key = 'ID'
    gene_symbol_key = 'Gene Symbol'
    # Only map when the annotation actually has the symbol column. For this
    # cohort the annotation columns are ID, GB_ACC, PROBE ID - v1.0,
    # PROBE ID - v1.1 and SPOT_ID, so there is no 'Gene Symbol' column to use.
    if gene_symbol_key in gene_annotation.columns:
        gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)
        genetic_data = apply_gene_mapping(genetic_data, gene_mapping)
    else:
        print(f"Column '{gene_symbol_key}' not found in the gene annotation; skipping gene mapping.")

In [34]:
# Normalise the gene symbols in the index (helper from utils; exact rules not visible here).
# NOTE: this rebinds genetic_data, so earlier previews of it may be stale after this cell.
genetic_data = normalize_gene_symbols_in_index(genetic_data)
genetic_data

Unnamed: 0,GSM7783810,GSM7783811,GSM7783812,GSM7783813,GSM7783814,GSM7783815,GSM7783816,GSM7783817,GSM7783818,GSM7783819,...,GSM7784037,GSM7784038,GSM7784039,GSM7784040,GSM7784041,GSM7784042,GSM7784043,GSM7784044,GSM7784045,GSM7784046
ACE,16.29,7.37,9.78,10.50,8.98,6.63,12.97,16.93,16.10,19.92,...,21.18,20.37,22.39,22.27,15.03,21.23,9.07,10.96,8.77,14.40
ACKR2,16.53,19.80,14.36,15.45,16.33,16.01,12.41,25.99,25.12,32.97,...,19.97,11.24,12.66,18.56,22.54,19.71,14.60,10.77,8.03,18.31
ACKR3,20.10,18.44,21.72,21.52,14.50,13.08,16.71,25.99,23.19,10.99,...,29.65,15.45,16.55,23.20,24.05,16.68,21.36,22.42,28.28,21.65
ACKR4,7.57,12.60,8.31,7.60,9.39,14.62,11.53,21.76,19.32,19.23,...,44.78,21.77,12.66,31.55,24.05,30.33,7.59,15.43,10.61,18.53
ACOX1,1054.81,1120.94,1648.33,1673.20,1338.32,1851.71,2153.50,1499.15,1475.12,1340.75,...,1045.69,1004.44,1040.78,1209.12,1199.25,1194.94,1088.64,1232.43,1094.51,975.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XBP1,1482.88,1728.78,1000.26,1187.84,1141.74,1075.05,875.83,1403.04,966.88,850.33,...,1051.14,967.22,1029.10,964.14,804.01,928.05,1186.09,1288.32,1105.08,1220.51
XCR1,10.11,12.47,9.10,5.01,12.16,21.93,13.69,29.02,36.72,19.92,...,24.21,26.69,46.73,46.40,36.07,36.39,10.70,17.73,14.89,26.12
YWHAQ,1804.95,1603.10,1458.75,1563.95,1511.85,1594.34,1377.09,1390.34,1655.49,1526.20,...,1308.93,1371.80,1631.76,1321.41,1523.86,1498.22,1111.90,1436.42,1406.97,1295.89
ZAP70,666.45,665.62,497.28,572.21,550.36,436.22,477.61,588.78,514.68,559.10,...,836.31,714.35,493.62,727.52,811.52,903.79,764.04,899.23,714.05,781.28


Use selected clinical data and genetic data to generate the merged data:

In [35]:
# Combine clinical features and expression values (helper from utils). In the
# output below, rows are GSM samples and columns are trait/age/gender followed by genes.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, genetic_data)
# Mark the cohort as usable now that clinical and genetic data have been merged.
is_available = True

merged_data

Unnamed: 0,COVID-19,Age,Gender,ACE,ACKR2,ACKR3,ACKR4,ACOX1,ACSL1,ACSL3,...,VSIR,VWF,WAS,WIPI1,XAF1,XBP1,XCR1,YWHAQ,ZAP70,ZBP1
GSM7783810,1.0,44.0,1.0,16.29,16.53,20.10,7.57,1054.81,3763.52,28.00,...,859.20,14.18,1346.80,33.66,819.23,1482.88,10.11,1804.95,666.45,573.54
GSM7783811,1.0,44.0,1.0,7.37,19.80,18.44,12.60,1120.94,3348.47,29.11,...,697.07,17.83,1133.98,47.27,240.81,1728.78,12.47,1603.10,665.62,253.06
GSM7783812,1.0,44.0,1.0,9.78,14.36,21.72,8.31,1648.33,5699.41,20.95,...,837.58,29.69,1510.44,48.17,255.67,1000.26,9.10,1458.75,497.28,278.85
GSM7783813,1.0,44.0,1.0,10.50,15.45,21.52,7.60,1673.20,6284.57,28.35,...,1034.45,29.09,1587.53,39.70,216.95,1187.84,5.01,1563.95,572.21,260.41
GSM7783814,1.0,44.0,1.0,8.98,16.33,14.50,9.39,1338.32,5751.24,19.11,...,908.03,15.34,1414.47,33.45,222.65,1141.74,12.16,1511.85,550.36,167.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM7784042,0.0,24.0,1.0,21.23,19.71,16.68,30.33,1194.94,2932.75,15.16,...,1244.98,63.69,1248.01,54.59,191.07,928.05,36.39,1498.22,903.79,163.77
GSM7784043,0.0,28.0,1.0,9.07,14.60,21.36,7.59,1088.64,2654.54,23.13,...,1088.40,26.29,1177.08,41.46,195.21,1186.09,10.70,1111.90,764.04,185.47
GSM7784044,0.0,36.0,1.0,10.96,10.77,22.42,15.43,1232.43,3737.59,27.47,...,1011.74,25.37,1482.39,48.94,154.93,1288.32,17.73,1436.42,899.23,237.42
GSM7784045,0.0,27.0,0.0,8.77,8.03,28.28,10.61,1094.51,2009.81,26.62,...,944.60,28.19,1221.10,40.14,222.81,1105.08,14.89,1406.97,714.05,233.02


Check whether the merged data is biased:

In [38]:
# The trait is encoded as 0/1, so it is checked as a binary feature.
trait_type = 'binary'
print(f"The merged dataset contains {len(merged_data)} samples.")
# Helper from utils: returns the trait's bias flag together with a (possibly
# modified) merged_data; it prints a per-feature distribution report.
is_trait_biased, merged_data = judge_and_remove_biased_features(merged_data, TRAIT, trait_type=trait_type)
is_trait_biased

The merged dataset contains 237 samples.
For the feature 'COVID-19', the least common label is '0.0' with 37 occurrences. This represents 15.61% of the dataset.
The distribution of the feature 'COVID-19' in this dataset is fine.

Quartiles for 'Age':
  25%: 30.0
  50% (Median): 36.0
  75%: 43.0
Min: 24.0
Max: 65.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0.0' with 77 occurrences. This represents 32.49% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.



False

Save the data as a csv file:

In [39]:
# Record the cohort in the JSON summary. `is_available` is taken from the merge
# cell rather than being overridden with a hard-coded False, which made the
# first branch unreachable and contradicted the successful merge and bias check.
if is_available:
    save_cohort_info(cohort, JSON_PATH, is_available, is_trait_biased, merged_data, note='')
else:
    save_cohort_info(cohort, JSON_PATH, is_available)

# Write the CSV only for a cohort that is both available and unbiased, so the
# files on disk agree with what the JSON summary records.
if is_available and not is_trait_biased:
    merged_data.to_csv(os.path.join(OUTPUT_DIR, cohort + '.csv'), index=False)

# Last expression of the cell, so the preview is actually displayed.
merged_data.head()