In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Type_1_Diabetes_Risk/GSE30208'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Genome-wide expression kinetics of children with T1D-associated autoantibodies compared to healthy matched controls I"
!Series_summary	"To unravel genes and molecular pathways involved in the pathogenesis of type 1 diabetes (T1D), we performed genome-wide gene expression profiling of prospective venous blood samples from children developing T1D-associated autoantibodies or progressing towards clinical diagnosis."
!Series_overall_design	"63 peripheral blood RNA samples from 6 autoantibody-positive children (Case) and their matched controls (Control) were analyzed with Illumina Sentrix WG-6 v2 genome-wide arrays, in order to study the gene expression changes occuring during the pathogenesis of Type 1 diabetes (T1D). Each case child (positive for T1D-specific autoantibodies) was matched with a persistently autoantibody-negative control child, with the same HLA-DQB1 risk category, gender, and place and date of birth. Seroconversion is determined as th

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = True
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Your code here

# Check for gene expression data
is_gene_available = True  # Based on the dataset design, it is likely to contain gene expression data.

# Data availability for 'Type_1_Diabetes_Risk' (Trait), 'age', and 'gender'
trait_row = 0  # T1D case/control appears to be represented in key 0
age_row = 1  # age at sample (months) appears in key 1
gender_row = 4  # gender appears in key 4

# Data type conversion functions
def convert_trait(value):
    try:
        return 1 if 'case' in value.lower() else 0
    except:
        return None

def convert_age(value):
    try:
        return float(value.split(":")[1].strip())
    except:
        return None

def convert_gender(value):
    try:
        gender = value.split(":")[1].strip().lower()
        return 1 if gender == 'male' else 0
    except:
        return None

# Save Metadata
save_cohort_info('GSE30208', './preprocessed/Type_1_Diabetes_Risk/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction if Trait Data is Available
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Type_1_Diabetes_Risk', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Type_1_Diabetes_Risk/trait_data/GSE30208.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM747554': [1.0, 110.0, 1.0], 'GSM747555': [1.0, 113.0, 1.0], 'GSM747556': [1.0, 116.0, 1.0], 'GSM747557': [1.0, 119.0, 1.0], 'GSM747558': [1.0, 125.0, 1.0], 'GSM747559': [1.0, 128.0, 1.0], 'GSM747560': [1.0, 131.0, 1.0], 'GSM747561': [1.0, 134.0, 1.0], 'GSM747562': [1.0, 110.0, 1.0], 'GSM747563': [1.0, 115.0, 1.0], 'GSM747564': [1.0, 128.0, 1.0], 'GSM747565': [1.0, 134.0, 1.0], 'GSM747566': [1.0, 140.0, 1.0], 'GSM747567': [1.0, 21.0, 1.0], 'GSM747568': [1.0, 25.0, 1.0], 'GSM747569': [1.0, 31.0, 1.0], 'GSM747570': [1.0, 34.0, 1.0], 'GSM747571': [1.0, 41.0, 1.0], 'GSM747572': [1.0, 45.0, 1.0], 'GSM747573': [1.0, 50.0, 1.0], 'GSM747574': [1.0, 19.0, 1.0], 'GSM747575': [1.0, 22.0, 1.0], 'GSM747576': [1.0, 27.0, 1.0], 'GSM747577': [1.0, 39.0, 1.0], 'GSM747578': [1.0, 49.0, 0.0], 'GSM747579': [1.0, 55.0, 0.0], 'GSM747580': [1.0, 61.0, 0.0], 'GSM747581': [1.0, 66.0, 0.0], 'GSM747582': [1.0, 67.0, 0.0], 'GSM747583': [1.0, 70.0, 0.0], 'GSM747584': [1.0, 73.0, 0.0], 'GSM747585': [1.0, 55.0, 

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
print(gene_data.index[:20])


Index(['ILMN_1651199', 'ILMN_1651209', 'ILMN_1651210', 'ILMN_1651217',
       'ILMN_1651221', 'ILMN_1651228', 'ILMN_1651229', 'ILMN_1651232',
       'ILMN_1651234', 'ILMN_1651235', 'ILMN_1651236', 'ILMN_1651237',
       'ILMN_1651238', 'ILMN_1651249', 'ILMN_1651253', 'ILMN_1651254',
       'ILMN_1651259', 'ILMN_1651260', 'ILMN_1651261', 'ILMN_1651268'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['ILMN_1725881', 'ILMN_1910180', 'ILMN_1804174', 'ILMN_1810835', 'ILMN_1758197'], 'Species': ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens'], 'Source': ['RefSeq', 'Unigene', 'RefSeq', 'RefSeq', 'RefSeq'], 'Search_Key': ['ILMN_44919', 'ILMN_127219', 'ILMN_139282', 'ILMN_10478', 'ILMN_38756'], 'Transcript': ['ILMN_44919', 'ILMN_127219', 'ILMN_139282', 'ILMN_175835', 'ILMN_38756'], 'ILMN_Gene': ['LOC23117', 'HS.575038', 'FCGR2B', 'SPRR3', 'LOC653895'], 'Source_Reference_ID': ['XM_933824.1', 'Hs.575038', 'XM_938851.1', 'NM_005416.1', 'XM_936379.1'], 'RefSeq_ID': ['XM_933824.1', nan, 'XM_938851.1', 'NM_005416.1', 'XM_936379.1'], 'Unigene_ID': [nan, 'Hs.575038', nan, nan, nan], 'Entrez_Gene_ID': [23117.0, nan, 2213.0, 6707.0, 653895.0], 'GI': [89040007.0, 10437021.0, 88952550.0, 4885606.0, 89033487.0], 'Accession': ['XM_933824.1', 'AK024680', 'XM_938851.1', 'NM_005416.1', 'XM_936379.1'], 'Symbol': ['LOC23117', nan, 'FCGR2B', 'S

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Determine the keys for identifiers and gene symbols
identifier_key = 'ID'
gene_symbol_key = 'Symbol'

# 2. Get the dataframe storing the mapping between probe IDs and genes
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping with the 'apply_gene_mapping' function from the library
gene_data = apply_gene_mapping(gene_data, mapping_df)

# 4. Print a preview of the mapped gene data
print("Mapped Gene Data preview:")
print(preview_df(gene_data, n=5))


Mapped Gene Data preview:
{'GSM747554': [6.69357457, 7.226126569, 7.927078762, 6.018310137, 6.168231254], 'GSM747555': [6.635795155, 7.304157238, 7.894304613, 6.039651871, 6.083833887], 'GSM747556': [6.735361807, 7.158465944, 7.726086536, 6.175817933, 5.992379609], 'GSM747557': [6.696948622, 7.203555969, 7.604356965, 5.987945441, 6.114067598], 'GSM747558': [6.753853527, 7.145141006, 7.472981447, 6.070139145, 6.19400874], 'GSM747559': [6.819744335, 6.31024271, 6.959359882, 6.100763812, 6.133148256], 'GSM747560': [6.725637715, 6.962801938, 7.557036829, 5.768729968, 6.042360253], 'GSM747561': [6.80488385, 6.601944801, 7.321189765, 5.960019726, 6.054707364], 'GSM747562': [6.728491728, 7.495027271, 7.42000402, 5.982260234, 6.12312224], 'GSM747563': [6.851886516, 7.462472769, 7.503592126, 6.090982389, 6.012080955], 'GSM747564': [6.688023684, 7.319679208, 7.442140693, 5.933318539, 6.176254483], 'GSM747565': [6.691406646, 7.116931387, 7.293869647, 5.967516322, 6.105452354], 'GSM747566': [6.865

### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Type_1_Diabetes_Risk/gene_data/GSE30208.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and some demographic attributes in the data is severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Type_1_Diabetes_Risk')

# 4. Save the cohort information.
trait_biased = trait_biased  # Assign the correct value to 'trait_biased'
save_cohort_info('GSE30208', './preprocessed/Type_1_Diabetes_Risk/cohort_info.json', True, True, trait_biased, unbiased_merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Type_1_Diabetes_Risk/GSE30208.csv'
    unbiased_merged_data.to_csv(csv_path)


Quartiles for 'Type_1_Diabetes_Risk':
  25%: 1.0
  50% (Median): 1.0
  75%: 1.0
Min: 1.0
Max: 1.0
The distribution of the feature 'Type_1_Diabetes_Risk' in this dataset is severely biased.

Quartiles for 'Age':
  25%: 26.5
  50% (Median): 62.0
  75%: 101.5
Min: 12.0
Max: 140.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '0.0' with 31 occurrences. This represents 49.21% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.

A new JSON file was created at: ./preprocessed/Type_1_Diabetes_Risk/cohort_info.json
