In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# Locate the soft file and the matrix file that belong to this cohort.
cohort_dir = '/media/techt/DATA/GEO/Hypothyroidism/GSE75669'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# Extract the series-level background lines and the per-sample clinical lines
# from the matrix file; both are selected by the line prefixes listed here.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# Summarise the clinical dataframe as the unique values found in each row.
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# Display both results so they can be inspected in the cell output.
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"miRNAs Expression of Mexican Patients with Breast Cancer"
!Series_summary	"miRNAs expression of tumor sample of mexican patients with breast cancer."
!Series_summary	"Samples obtained from the Hospital San Jose Tec de Monterrey."
!Series_overall_design	"The experiments were with one color per patient, miRNAs expression profile is from a tumor sample of mexican patients with breast cancer."
Sample Characteristics Dictionary:
{0: ['tissue: Tumor Sample of Breast Cancer'], 1: ['gender: Female'], 2: ['rna ng/ul: 143.6', 'rna ng/ul: 343', 'rna ng/ul: 625.4', 'rna ng/ul: 146.7', 'rna ng/ul: 1365', 'rna ng/ul: 224.7', 'rna ng/ul: 60', 'rna ng/ul: 151.6', 'rna ng/ul: 1566.4', 'rna ng/ul: 670', 'rna ng/ul: 333.9', 'rna ng/ul: 728', 'rna ng/ul: 1083', 'rna ng/ul: 439', 'rna ng/ul: 1116.8', 'rna ng/ul: 682', 'rna ng/ul: 1011.5', 'rna ng/ul: 291.4', 'rna ng/ul: 419.5', 'rna ng/ul: 178.2', 'rna ng/ul: 401', 'rna ng/ul: 1545', 'rna ng/ul: 2055.6', 'rna ng/ul: 4

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
trait_row = age_row = gender_row = None  # set to different values when applicable
convert_trait = convert_age = convert_gender = None  # define the functions when applicable

# Check for gene expression data
series_title = "miRNAs Expression of Mexican Patients with Breast Cancer"
series_summary = "miRNAs expression of tumor sample of mexican patients with breast cancer."
is_gene_available = False  # since it is miRNA data.


def _field_label_words(entry):
    """Return the lowercase words of an entry's field label.

    The label is the text before the first ':' (the whole entry when there is
    no colon). Non-alphanumeric characters act as word separators, so
    'Age (years): 45' yields ['age', 'years'] and 'stage: II' yields ['stage'].
    """
    label = entry.partition(':')[0].lower()
    return ''.join(ch if ch.isalnum() else ' ' for ch in label).split()


def _first_matching_row(characteristics, matches):
    """Return the key of the first row with a string value accepted by `matches`.

    Rows are visited in the dictionary's iteration order and the search stops
    at the first hit. Non-string values are skipped. Returns None when no row
    matches.
    """
    for row_key, row_values in characteristics.items():
        if any(isinstance(v, str) and matches(v) for v in row_values):
            return row_key
    return None


# Determine the availability of variables 'Hypothyroidism', 'age', and 'gender'

# The trait may be named in the label or in the value, so search the whole entry.
trait_row = _first_matching_row(
    sample_characteristics_dict,
    lambda v: 'hypothyroidism' in v.lower(),
)

# 'age' and 'gender' must appear as whole words in the field label; a plain
# substring test would also accept unrelated fields such as 'stage' or
# 'percentage'.
age_row = _first_matching_row(
    sample_characteristics_dict,
    lambda v: 'age' in _field_label_words(v),
)
gender_row = _first_matching_row(
    sample_characteristics_dict,
    lambda v: 'gender' in _field_label_words(v),
)

# Data type conversion functions

# Function to convert trait data
def convert_trait(value):
    """Convert one sample-characteristics entry to a binary Hypothyroidism label.

    Args:
        value: An entry such as 'comorbidity: Hypothyroidism' or
            'hypothyroidism: no'.

    Returns:
        1 if the entry indicates hypothyroidism, 0 if it does not, and None
        if `value` is not a string.
    """
    if not isinstance(value, str):
        return None

    text = value.lower()
    if 'hypothyroidism' not in text:
        return 0

    # When the field itself is named after the trait, the part after the colon
    # is the status; an explicit negative must not be counted as a case just
    # because the label contains the trait name.
    negative_statuses = ('no', 'none', 'absent', 'negative', 'false', '0')
    label, colon, status = text.partition(':')
    if colon and 'hypothyroidism' in label and status.strip() in negative_statuses:
        return 0
    return 1

# Function to convert age data
def convert_age(value):
    """Convert one sample-characteristics entry to an integer age.

    Args:
        value: An entry such as 'age: 45'. The text after the last ':' is
            stripped and parsed as an integer.

    Returns:
        The age as an int, or None if `value` is not a string or the text
        after the last ':' is not an integer literal.
    """
    try:
        return int(value.split(':')[-1].strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a str (e.g. None, NaN, bytes).
        # ValueError: the text is not an integer. Anything else should surface
        # rather than be silently swallowed.
        return None

# Function to convert gender data
def convert_gender(value):
    """Convert one sample-characteristics entry to a binary gender code.

    Args:
        value: An entry such as 'gender: Female'. The text after the last ':'
            is stripped and compared case-insensitively.

    Returns:
        1 for 'male'/'m', 0 for 'female'/'f', and None if `value` is not a
        string or the text is not one of those values.
    """
    try:
        gender = value.split(':')[-1].strip().lower()
    except (AttributeError, TypeError):
        # value is not a str (e.g. None, NaN, bytes).
        return None

    if gender in ('male', 'm'):
        return 1
    if gender in ('female', 'f'):
        return 0
    # Unknown or missing values must not be coded as female.
    return None

import json

def save_cohort_info(gs_id, filepath, is_gene_available, is_clinical_data_available):
    """Write the cohort's availability flags to a JSON file.

    The file is overwritten with a single JSON object holding the keys
    'gs_id', 'is_gene_available' and 'is_clinical_data_available'.

    Args:
        gs_id: Identifier of the cohort, e.g. 'GSE75669'.
        filepath: Destination path of the JSON file. Missing parent
            directories are created.
        is_gene_available: Whether gene expression data is available.
        is_clinical_data_available: Whether trait data is available.
    """
    import os  # local import so the function does not rely on another cell

    cohort_info = {
        "gs_id": gs_id,
        "is_gene_available": is_gene_available,
        "is_clinical_data_available": is_clinical_data_available
    }

    # open(..., 'w') does not create directories; a bare filename has an empty
    # dirname, which os.makedirs would reject, so it is skipped.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'w') as f:
        json.dump(cohort_info, f)

import os  # used below to create the output directories before writing

# Record the availability flags for this cohort. The target directory is
# created first because writing the file does not create it.
cohort_info_path = './preprocessed/Hypothyroidism/cohort_info.json'
os.makedirs(os.path.dirname(cohort_info_path), exist_ok=True)
save_cohort_info('GSE75669', cohort_info_path, is_gene_available, trait_row is not None)

# If clinical data is available, select clinical features and save to CSV
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Hypothyroidism',
        trait_row, convert_trait,
        age_row, convert_age,
        gender_row, convert_gender,
    )
    csv_path = './preprocessed/Hypothyroidism/trait_data/GSE75669.csv'
    # to_csv does not create missing directories either.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM1963127': [0, 20, 0], 'GSM1963128': [0, 7, 0], 'GSM1963129': [0, 40, 0], 'GSM1963130': [0, 30, 0], 'GSM1963131': [0, 67, 0], 'GSM1963132': [0, 35, 0], 'GSM1963133': [0, 57, 0], 'GSM1963134': [0, 39, 0], 'GSM1963135': [1, 16, 0], 'GSM1963136': [0, 35, 0], 'GSM1963137': [0, 22, 0], 'GSM1963138': [0, 25, 0], 'GSM1963139': [0, 51, 0], 'GSM1963140': [0, 30, 0], 'GSM1963141': [0, 41, 0], 'GSM1963142': [0, 33, 0], 'GSM1963143': [0, 30, 0], 'GSM1963144': [0, 50, 0], 'GSM1963145': [0, 86, 0], 'GSM1963146': [0, 90, 0], 'GSM1963147': [0, 7, 0], 'GSM1963148': [0, 37, 0], 'GSM1963149': [0, 30, 0], 'GSM1963150': [0, 60, 0], 'GSM1963151': [0, 40, 0], 'GSM1963152': [0, 22, 0], 'GSM1963153': [0, 80, 0], 'GSM1963154': [0, 70, 0], 'GSM1963155': [0, 35, 0], 'GSM1963156': [0, 30, 0], 'GSM1963157': [0, 40, 0], 'GSM1963158': [0, 4, 0], 'GSM1963159': [0, 60, 0], 'GSM1963160': [0, 70, 0], 'GSM1963161': [0, 70, 0], 'GSM1963162': [0, 60, 0], 'GSM1963163': [0, 20, 0], 'GSM1963164': [0, 40, 0], 'GSM1963165': 