In [1]:

import sys
# Extend the module search path so project-local packages can be imported
# (presumably needed for `utils.preprocess`, imported by a later cell — confirm).
# NOTE(review): this is an absolute, machine-specific path; imports that rely on
# it will fail on machines where this directory does not exist.
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the series-matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Bipolar_disorder/GSE62191'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read background information and sample characteristics from the matrix file,
#    selecting lines by their leading prefix
background_prefixes = [
    '!Series_title',
    '!Series_summary',
    '!Series_overall_design',
]
clinical_prefixes = [
    '!Sample_geo_accession',
    '!Sample_characteristics_ch1',
]
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file, background_prefixes, clinical_prefixes
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print everything the next step needs in order to choose the trait/age/gender rows
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Gene expression profiles of patients with schizophrenia, bipolar disorder and healthy controls"
!Series_summary	"Schizophrenia (SZ) and bipolar disorder (BD) are severe psychiatric conditions, with a lifetime prevalence of about 1%. Both disorders have a neurodevelopment component, with onset of symptoms occurring most frequently during late adolescence or early adulthood. Genetic findings indicate the existence of an overlap in genetic susceptibility across the disorders. These gene expression profiles were used to identify the molecular mechanisms that differentiate SZ and BP from healthy controls but also that distinguish both from healthy individuals. They were also used to expand an analysis from an experiment that searched molecular alterations in human induced pluripotent stem cells derived from fibroblasts from control subject and individual with schizophrenia and further differentiated to neuron to identify genes relevant for the developm

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Defaults: nothing is considered available until set explicitly below.
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
# All three converters start as None; the original line assigned convert_age twice
# and never initialised convert_gender.
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data availability
is_gene_available = True  # This dataset contains gene expression data based on the given description.

# Variable Availability
# Row 1 holds three distinct values ('bipolar disorder', 'healthy control',
# 'schizophrenia'), so it varies across samples and can serve as the trait.
trait_row = 1
# Row 2 holds ages formatted like '29 yr'; 30 distinct values were observed.
age_row = 2
# Gender is treated as unavailable (original note: only a 'male' entry was seen,
# i.e. there is no usable variation to extract).
gender_row = None

# Data Type Conversion Functions
def convert_trait(value):
    """Convert a raw sample-characteristics cell into a binary trait label.

    The text after the last ':' is stripped and lower-cased before matching.

    Args:
        value: Raw cell content, expected to be a string such as
            'key: bipolar disorder'. Non-string cells (e.g. None or NaN for a
            missing entry) are treated as missing.

    Returns:
        1 for 'bipolar disorder', 0 for 'healthy control', and None for
        'schizophrenia', any unrecognized label, or a non-string input.
    """
    # Missing cells are not strings; without this guard .split() would raise
    # AttributeError instead of reporting the value as missing.
    if not isinstance(value, str):
        return None

    label = value.split(':')[-1].strip().lower()
    if label == 'bipolar disorder':
        return 1
    if label == 'healthy control':
        return 0
    # Schizophrenia samples are deliberately excluded from the analysis, and any
    # other label is unknown; both map to None.
    return None

def convert_age(value):
    """Convert a raw sample-characteristics cell into an integer age.

    Takes the text after the last ':', strips it, and parses the first
    space-separated token as an integer (e.g. 'age: 29 yr' -> 29).

    Args:
        value: Raw cell content, expected to be a string such as 'age: 29 yr'.

    Returns:
        The age as an int, or None when the cell is not a string or its leading
        token is not an integer.
    """
    try:
        return int(value.split(':')[-1].strip().split(' ')[0])
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a str (e.g. None, NaN, bytes).
        # ValueError: leading token is not an integer.
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        return None

def convert_gender(value):
    """Convert a raw sample-characteristics cell into a binary gender code.

    The text after the last ':' is stripped and lower-cased before matching.

    Args:
        value: Raw cell content, expected to be a string such as 'key: male'.
            Non-string cells (e.g. None or NaN) are treated as missing.

    Returns:
        1 for 'male', 0 for 'female', and None for anything else, including
        non-string input.
    """
    # Missing cells are not strings; without this guard .split() would raise
    # AttributeError instead of reporting the value as missing.
    if not isinstance(value, str):
        return None

    label = value.split(':')[-1].strip().lower()
    if label == 'male':
        return 1
    if label == 'female':
        return 0
    return None

# Record the initial availability of gene data and trait data for this cohort
save_cohort_info(
    'GSE62191',
    './preprocessed/Bipolar_disorder/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Extract clinical features only when a trait row was identified
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Bipolar_disorder',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    # Persist the selected features, then show a preview of them
    csv_path = './preprocessed/Bipolar_disorder/trait_data/GSE62191.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM1521625': [1, 29], 'GSM1521626': [1, 58], 'GSM1521627': [1, 54], 'GSM1521628': [1, 42], 'GSM1521629': [1, 63], 'GSM1521630': [1, 64], 'GSM1521631': [1, 59], 'GSM1521632': [1, 51], 'GSM1521633': [1, 49], 'GSM1521634': [1, 41], 'GSM1521635': [1, 48], 'GSM1521636': [1, 47], 'GSM1521637': [1, 45], 'GSM1521638': [1, 41], 'GSM1521639': [1, 29], 'GSM1521640': [1, 44], 'GSM1521641': [1, 48], 'GSM1521642': [1, 42], 'GSM1521643': [1, 35], 'GSM1521644': [1, 35], 'GSM1521645': [1, 38], 'GSM1521646': [1, 44], 'GSM1521647': [1, 43], 'GSM1521648': [1, 50], 'GSM1521649': [1, 56], 'GSM1521650': [1, 29], 'GSM1521651': [1, 59], 'GSM1521652': [1, 35], 'GSM1521653': [1, 33], 'GSM1521654': [0, 34], 'GSM1521655': [0, 44], 'GSM1521656': [0, 46], 'GSM1521657': [0, 51], 'GSM1521658': [0, 33], 'GSM1521659': [0, 48], 'GSM1521660': [0, 40], 'GSM1521661': [0, 31], 'GSM1521662': [0, 39], 'GSM1521663': [None, 59], 'GSM1521664': [0, 53], 'GSM1521665': [0, 53], 'GSM1521666': [0, 38], 'GSM1521667': [0, 60], 'GSM152

### Step 3: Gene Data Extraction

In [4]:
# 1. Get gene_data from the series-matrix file via the library's get_genetic_data.
#    `matrix_file` must already be defined by the Step 1 data-loading cell.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row IDs; the next step inspects them to decide whether
#    they are already gene symbols or still need to be mapped.
print(gene_data.index[:20])


Index(['13', '14', '17', '18', '22', '26', '27', '30', '31', '35', '36', '37',
       '38', '40', '42', '44', '46', '47', '49', '51'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
# The row IDs printed in Step 3 ('13', '14', '17', ...) are bare numeric strings
# rather than gene symbols, so they have to be mapped to symbols (Steps 5-6).
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Get the gene annotation data from the SOFT file via the library's
#    get_gene_annotation. `soft_file` must already be defined by Step 1.
gene_annotation = get_gene_annotation(soft_file)

# 2. Print a preview of the annotation so the identifier column and the gene
#    symbol column can be picked for the mapping step.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['1', '2', '3', '4', '5'], 'COL': ['266', '266', '266', '266', '266'], 'ROW': [170.0, 168.0, 166.0, 164.0, 162.0], 'NAME': ['GE_BrightCorner', 'DarkCorner', 'DarkCorner', 'DarkCorner', 'DarkCorner'], 'SPOT_ID': ['GE_BrightCorner', 'DarkCorner', 'DarkCorner', 'DarkCorner', 'DarkCorner'], 'CONTROL_TYPE': ['pos', 'pos', 'pos', 'pos', 'pos'], 'REFSEQ': [nan, nan, nan, nan, nan], 'GB_ACC': [nan, nan, nan, nan, nan], 'GENE': [nan, nan, nan, nan, nan], 'GENE_SYMBOL': [nan, nan, nan, nan, nan], 'GENE_NAME': [nan, nan, nan, nan, nan], 'UNIGENE_ID': [nan, nan, nan, nan, nan], 'ENSEMBL_ID': [nan, nan, nan, nan, nan], 'TIGR_ID': [nan, nan, nan, nan, nan], 'ACCESSION_STRING': [nan, nan, nan, nan, nan], 'CHROMOSOMAL_LOCATION': [nan, nan, nan, nan, nan], 'CYTOBAND': [nan, nan, nan, nan, nan], 'DESCRIPTION': [nan, nan, nan, nan, nan], 'GO_ID': [nan, nan, nan, nan, nan], 'SEQUENCE': [nan, nan, nan, nan, nan], 'SPOT_ID.1': [nan, nan, nan, nan, nan], 'ORDER': [1.0, 2.0, 3.

### Step 6: Gene Identifier Mapping

In [7]:
# 1. Determine the keys for identifier and gene symbol.
#    Both are column names shown in the Step 5 annotation preview: 'ID' holds
#    numeric strings like the gene_data row IDs, 'GENE_SYMBOL' holds the symbols.
identifier_key = 'ID'
gene_symbol_key = 'GENE_SYMBOL'

# 2. Get the dataframe storing the mapping between probe IDs and genes
gene_mapping = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping to convert probe data to gene expression data.
#    NOTE(review): this overwrites `gene_data` with the mapped result, so the cell
#    is not idempotent — re-running it without first re-running Step 3 would pass
#    already-mapped data back into apply_gene_mapping.
gene_data = apply_gene_mapping(gene_data, gene_mapping)

# Preview the mapped data to confirm that `gene_data` now holds the mapped result
print("Updated Gene Data Preview:")
print(preview_df(gene_data))


Updated Gene Data Preview:
{'GSM1521625': [2.859310701, 0.852036496, 3.244941558, 2.368642445, 2.462211593], 'GSM1521626': [2.817676211, 1.395841451, 3.081568111, 1.986140348, 2.436130409], 'GSM1521627': [2.949388806, 1.104210578, 3.303053495, 2.35147419, 2.760864531], 'GSM1521628': [2.724122358, 1.160678237, 3.315110516, 2.348331694, 2.793044383], 'GSM1521629': [2.714862584, 0.919712924, 3.281970505, 2.241533304, 2.446549469], 'GSM1521630': [2.843104702, 1.386967649, 3.263994065, 2.265350097, 2.571051368], 'GSM1521631': [3.168496187, 1.389756075, 3.151114475, 2.387200153, 2.389252797], 'GSM1521632': [2.74081778, 0.883977869, 3.146939614, 2.562757212, 2.380986127], 'GSM1521633': [2.982494717, 0.780527172, 3.201341993, 2.529296285, 2.443517437], 'GSM1521634': [2.961132128, 1.086456161, 3.334041085, 2.438567715, 2.716068643], 'GSM1521635': [2.978087182, 1.723935768, 3.117011336, 2.104928517, 2.295952073], 'GSM1521636': [2.417431722, 0.798592202, 3.221182261, 1.898782536, 2.293112144], 'G

### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the gene symbols in the index and save the resulting gene data
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Bipolar_disorder/gene_data/GSE62191.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical features with the normalized gene data
merged_data = geo_merge_clinical_genetic_data(
    selected_clinical_data,
    normalized_gene_data,
)

# 3. Judge whether the trait (and demographic attributes) are severely biased;
#    the helper also returns the data with biased attributes removed
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(
    merged_data,
    'Bipolar_disorder',
)

# 4. Save the final cohort information, including the bias verdict
save_cohort_info(
    'GSE62191',
    './preprocessed/Bipolar_disorder/cohort_info.json',
    True,
    True,
    trait_biased,
    merged_data,
)

# 5. Write the merged dataset only when the trait is not severely biased
if not trait_biased:
    csv_path = './preprocessed/Bipolar_disorder/GSE62191.csv'
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Bipolar_disorder', the least common label is '1.0' with 29 occurrences. This represents 49.15% of the dataset.
The distribution of the feature 'Bipolar_disorder' in this dataset is fine.

Quartiles for 'Age':
  25%: 38.0
  50% (Median): 45.0
  75%: 49.5
Min: 19.0
Max: 64.0
The distribution of the feature 'Age' in this dataset is fine.

