In [1]:

import os
import sys

sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Mitochondrial_Disorders/GSE42986'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Transcriptome profiling in human primary mitochondrial respiratory chain disease"
!Series_summary	"Primary mitochondrial respiratory chain (RC) diseases are heterogeneous in etiology and manifestations but collectively impair cellular energy metabolism.  To identify a common cellular response to RC disease, systems biology level transcriptome investigations were performed in human RC disease skeletal muscle and fibroblasts.  Global transcriptional and post-transcriptional dysregulation in a tissue-specific fashion was identified across diverse RC complex and genetic etiologies.  RC disease muscle was characterized by decreased transcription of cytosolic ribosomal proteins to reduce energy-intensive anabolic processes, increased transcription of mitochondrial ribosomal proteins, shortened 5'-UTRs to improve translational efficiency, and stabilization of 3'-UTRs containing AU-rich elements.  These same modifications in a reversed direction typified 

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Define sample_characteristics based on provided background information
sample_characteristics = {
    0: ['tissue: Skeletal muscle', 'tissue: fibroblast cell line'],
    1: [
        'respiratory chain complex deficiency: No Respiratory Chain Complex Deficiency',
        'respiratory chain complex deficiency: Complexes I and III',
        'respiratory chain complex deficiency: Complex IV',
        'respiratory chain complex deficiency: Complexes II and III',
        'respiratory chain complex deficiency: Not measured; 87% mtDNA depletion in muscle',
        'respiratory chain complex deficiency: Complex IV; 70% mtDNA depletion in liver',
        'respiratory chain complex deficiency: Complex IV; 93% mtDNA depletion in muscle',
        'respiratory chain complex deficiency: Complexes I and IV',
        'respiratory chain complex deficiency: Complex I',
        'respiratory chain complex deficiency: Complex I and IV',
        'respiratory chain complex deficiency in muscle: Not Determined',
        'respiratory chain complex deficiency in muscle: Complex I+III Deficiency',
        'respiratory chain complex deficiency in muscle: No Respiratory Chain Complex Deficiency',
        'respiratory chain complex deficiency in muscle: Complexes I and III',
        'respiratory chain complex deficiency in muscle: Complex IV',
        'respiratory chain complex deficiency in muscle: Complexes II and III',
        'respiratory chain complex deficiency in muscle: Complex IV; 93% mtDNA depletion in muscle',
        'respiratory chain complex deficiency in muscle: Complex I'
    ],
    2: ['gender: F', 'gender: M'],
    3: [
        'age (years): 0.76', 'age (years): 20', 'age (years): 16', 'age (years): 1', 
        'age (years): 0.75', 'age (years): 3', 'age (years): 0.2', 'age (years): 0.9', 
        'age (years): 2', 'age (years): 6', 'age (years): 10', 'age (years): 4', 
        'age (years): 0.3', 'age (years): 8', 'age (years): 72', 'age (years): 54', 
        'age (years): 23', 'age (years): 60', 'age (years): 67', 'age (years): 59', 
        'age (years): 11', 'age (years): 46', 'age (years): 42', 'age (years): not obtained', 
        'age (years): 5', 'age (years): 30', 'age (years): 36', 'age (years): 39', 
        'age (years): 0.1', 'age (years): 0.7'
    ],
    4: [
        'informatic analysis group: Control Group', 
        'informatic analysis group: Mito Disease Group', 
        'informatic analysis group: Excluded - poor quality', 
        'informatic analysis group: Excluded - sample outlier'
    ]
}

# Step 1: Check for gene expression data availability
is_gene_available = True  # Given that the dataset uses Affymetrix arrays appropriate for transcriptome profiling

# Step 2.1: Data availability
# Identify keys in the sample characteristics dictionary where variable is recorded

# 'Mitochondrial_Disorders' is inferred from 'informatic analysis group'
trait_row = 4 if any('informatic analysis group: Mito Disease Group' in val for val in sample_characteristics[4]) else None

# 'age' values are available and seem non-constant
age_row = 3 if len(set(val.split(':')[1].strip() for val in sample_characteristics[3])) > 1 else None

# 'gender' values are available and seem non-constant
gender_row = 2 if len(set(val.split(':')[1].strip() for val in sample_characteristics[2])) > 1 else None

# Step 2.3: Data type conversion

# Function to convert 'Mitochondrial_Disorders'
def convert_trait(value):
    val = value.split(':')[1].strip()
    if val == 'Control Group':
        return 0
    elif val == 'Mito Disease Group':
        return 1
    return None

# Function to convert 'age'
def convert_age(value):
    val = value.split(':')[1].strip()
    try:
        return float(val)
    except ValueError:
        return None  # If age is not obtained

# Function to convert 'gender'
def convert_gender(value):
    val = value.split(':')[1].strip()
    if val == 'F':
        return 0
    elif val == 'M':
        return 1
    return None

# Save cohort information
save_cohort_info('GSE42986', './preprocessed/Mitochondrial_Disorders/cohort_info.json', is_gene_available, trait_row is not None)

# Extract clinical features if trait_row is not None
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Mitochondrial_Disorders', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Mitochondrial_Disorders/trait_data/GSE42986.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM1054461': [0.0, 0.76, 0.0], 'GSM1054462': [1.0, 20.0, 1.0], 'GSM1054463': [1.0, 20.0, 1.0], 'GSM1054464': [1.0, 16.0, 0.0], 'GSM1054465': [1.0, 1.0, 0.0], 'GSM1054466': [0.0, 0.75, 0.0], 'GSM1054467': [0.0, 0.75, 0.0], 'GSM1054468': [0.0, 3.0, 1.0], 'GSM1054469': [0.0, 3.0, 1.0], 'GSM1054470': [1.0, 0.2, 0.0], 'GSM1054471': [1.0, 0.9, 0.0], 'GSM1054472': [0.0, 2.0, 0.0], 'GSM1054473': [1.0, 6.0, 1.0], 'GSM1054474': [0.0, 10.0, 0.0], 'GSM1054475': [1.0, 4.0, 0.0], 'GSM1054476': [1.0, 0.3, 0.0], 'GSM1054477': [1.0, 8.0, 1.0], 'GSM1054478': [nan, 72.0, 1.0], 'GSM1054479': [1.0, 54.0, 0.0], 'GSM1054480': [1.0, 23.0, 0.0], 'GSM1054481': [nan, 0.75, 1.0], 'GSM1054482': [nan, 60.0, 1.0], 'GSM1054483': [0.0, 67.0, 0.0], 'GSM1054484': [0.0, 59.0, 0.0], 'GSM1054485': [0.0, 59.0, 1.0], 'GSM1054486': [nan, 11.0, 0.0], 'GSM1054487': [1.0, 46.0, 0.0], 'GSM1054488': [nan, 42.0, 1.0], 'GSM1054489': [nan, 2.0, 0.0], 'GSM1054490': [0, None, 0], 'GSM1054491': [0, None, 1], 'GSM1054492': [0, None, 1]

### Step 3: Gene Data Extraction

In [4]:
# 1. Use the get_genetic_data function from the library to get the gene_data from the matrix_file previously defined.
gene_data = get_genetic_data(matrix_file)

# 2. Print the first 20 row ids for the following step.
print(gene_data.index[:20])


Index(['100009676_at', '10000_at', '10001_at', '10002_at', '100033416_at',
       '100033422_at', '100033423_at', '100033424_at', '100033425_at',
       '100033426_at', '100033428_at', '100033431_at', '100033434_at',
       '100033436_at', '100033438_at', '100033439_at', '100033444_at',
       '100033800_at', '100033806_at', '100033819_at'],
      dtype='object', name='ID')


### Step 4: Gene Identifier Review

In [5]:
requires_gene_mapping = True


### Step 5: Gene Annotation (Conditional)

In [6]:
# 1. Use the 'get_gene_annotation' function from the library to get gene annotation data from the soft file.
gene_annotation = get_gene_annotation(soft_file)

# 2. Use the 'preview_df' function from the library to preview the data and print out the results.
print("Gene annotation preview:")
print(preview_df(gene_annotation))


Gene annotation preview:
{'ID': ['1_at', '2_at', '9_at', '10_at', '12_at'], 'Gene_ID': ['1', '2', '9', '10', '12'], 'ORF': ['A1BG', 'A2M', 'NAT1', 'NAT2', 'SERPINA3'], 'Symbol': ['A1BG', 'A2M', 'NAT1', 'NAT2', 'SERPINA3'], 'Chromosome': ['19', '12', '8', '8', '14'], 'RefSeq_ID': ['NM_130786;NP_570602', 'NM_000014;NP_000005', 'NM_000662;NM_001160170;NM_001160171;NM_001160172;NM_001160173;NM_001160174;NM_001160175;NM_001160176;NM_001160179;NP_000653;NP_001153642;NP_001153643;NP_001153644;NP_001153645;NP_001153646;NP_001153647;NP_001153648;NP_001153651', 'NM_000015;NP_000006', 'NM_001085;NP_001076'], 'Num_Probes': [47.0, 167.0, 74.0, 20.0, 56.0], 'Full_Name': ['alpha-1-B glycoprotein', 'alpha-2-macroglobulin', 'N-acetyltransferase 1 (arylamine N-acetyltransferase)', 'N-acetyltransferase 2 (arylamine N-acetyltransferase)', 'serpin peptidase inhibitor, clade A (alpha-1 antiproteinase, antitrypsin), member 3']}


### Step 6: Gene Identifier Mapping

In [7]:
# 1. Determine the identifier key and gene_symbol key
identifier_key = 'ID'
gene_symbol_key = 'Symbol'

# 2. Get the dataframe storing the mapping between probe IDs and genes
mapping_df = get_gene_mapping(gene_annotation, identifier_key, gene_symbol_key)

# 3. Apply the mapping and obtain the gene expression dataframe
gene_data = apply_gene_mapping(gene_data, mapping_df)


### Step 7: Data Normalization and Merging

In [8]:
# 1. Normalize the obtained gene data with the 'normalize_gene_symbols_in_index' function from the library.
normalized_gene_data = normalize_gene_symbols_in_index(gene_data)
gene_csv_path = './preprocessed/Mitochondrial_Disorders/gene_data/GSE42986.csv'
normalized_gene_data.to_csv(gene_csv_path)

# 2. Merge the clinical and genetic data with the 'geo_merge_clinical_genetic_data' function from the library.
merged_data = geo_merge_clinical_genetic_data(selected_clinical_data, normalized_gene_data)

# 3. Determine whether the trait and some demographic attributes in the data is severely biased, and remove biased attributes.
trait_biased, unbiased_merged_data = judge_and_remove_biased_features(merged_data, 'Mitochondrial_Disorders')

# If the trait is not severely biased, save the cohort information and the merged data.

# 4. Save the cohort information.
save_cohort_info('GSE42986', './preprocessed/Mitochondrial_Disorders/cohort_info.json', True, True, trait_biased, merged_data)

if not trait_biased:
    # 5. If the trait is not severely biased, save the merged data to a csv file.
    csv_path = './preprocessed/Mitochondrial_Disorders/GSE42986.csv'
    unbiased_merged_data.to_csv(csv_path)


For the feature 'Mitochondrial_Disorders', the least common label is '0.0' with 15 occurrences. This represents 37.50% of the dataset.
The distribution of the feature 'Mitochondrial_Disorders' in this dataset is fine.

Quartiles for 'Age':
  25%: 0.975
  50% (Median): 5.0
  75%: 17.0
Min: 0.2
Max: 67.0
The distribution of the feature 'Age' in this dataset is fine.

For the feature 'Gender', the least common label is '1.0' with 11 occurrences. This represents 27.50% of the dataset.
The distribution of the feature 'Gender' in this dataset is fine.

A new JSON file was created at: ./preprocessed/Mitochondrial_Disorders/cohort_info.json
