In [1]:

import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Identify the paths to the soft file and the matrix file
cohort_dir = '/media/techt/DATA/GEO/Heart_rate/GSE122279'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file to obtain background information and sample characteristics data
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Obtain the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Explicitly print out all the background information and the sample characteristics dictionary
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"High-throughput mediation analysis of human proteome and metabolome identifies mediators of post-bariatric surgical diabetes control"
!Series_summary	"To improve the power of mediation in high-throughput studies, here we introduce High-throughput mediation analysis (Hitman), which accounts for direction of mediation and applies empirical Bayesian linear modeling. We apply Hitman in a retrospective, exploratory analysis of the SLIMM-T2D clinical trial in which participants with type 2 diabetes were randomized to Roux-en-Y gastric bypass (RYGB) or nonsurgical diabetes/weight management, and fasting plasma proteome and metabolome were assayed up to 3 years. RYGB caused greater improvement in HbA1c, which was mediated by growth hormone receptor (GHR). GHR’s mediation is more significant than clinical mediators, including BMI. GHR decreases at 3 months postoperatively alongside increased insulin-like growth factor binding proteins IGFBP1/BP2; plasma GH

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
is_gene_available = False
trait_row = age_row = gender_row = None  # set to different values when applicable

def parse_value(cell):
    try:
        return cell.split(": ")[1].strip()
    except IndexError:
        return None

# Check if the dataset is likely to contain gene expression data
# From the background information, only proteome and metabolome were assayed, not gene expression data
is_gene_available = False

# Identification of rows for the variables
sample_dict = {
    0: ['tissue: plasma'], 
    # ... rest of your sample characteristics dictionary 
    1: ['acrostic: FENCE', 'acrostic: HUMOR', 'acrostic: VIDEO', 'acrostic: RADIO', 'acrostic: ELBOW', 'acrostic: HUMAN', 'acrostic: MAGIC', 'acrostic: DAISY', 'acrostic: VISTA', 
        #... other sample values 
        'acrostic: INTRO', 'acrostic: KAYAK'], 
    2: ['group: RYGB', 'group: DWM'], 
    3: ['height (cm): 182', 'height (cm): 164', 'height (cm): 172', 'height (cm): 172.72', 'height (cm): 172.5', 'height (cm): 154.5', 
        #... other sample values 
        'height (cm): 190.5', 'height (cm): 173'], 
    4: ['mo: 12', 'mo: 36', 'mo: 3', 'mo: 0', 'mo: 24'], 
    5: ['albumin/creatinine ratio: 0', 'albumin/creatinine ratio: 4', 
        #... other sample values including 'dbp (mmhg):...'
        ], 
    6: ['dbp (mmhg): 70', 'dbp (mmhg): 87.5', 'sbp (mmhg): 122.5', 
        #... other sample values 
        'sbp (mmhg): 148'], 
    7: ['sbp (mmhg): 111', 'sbp (mmhg): 118.5', 'waist (cm): 110.5', 
        #... other sample values 
        'sbp (mmhg): 148'], 
    8: ['serum creatinine (mg/dl): 0.64', 'serum creatinine (mg/dl): 1.22', 
        #... other sample values
        ], 
    9: ['waist (cm): 90.5', 'waist (cm): 96.85', 'bmi: 37.0747025419145', 
        #... other sample values
        ], 
    10: ['weight (kg): 92.84091', 'weight (kg): 67.86364', 
        #... other sample values
        ], 
    #... other rows
    15: ['fat free mass (lb): 162.4', 'fat free mass (lb): 93.01', '6 min walk test heart rate (beats/min): 99', 
         #... other sample values 
         'fat free mass (lb): 114'], 
    16: ['6 min walk test distance (m): 416.55', '6 min walk test distance (m): 444.28', 'hba1c: 7.7', 
         #... other sample values 
         '6 min walk test distance (m): 399.2'], 
    17: ['6 min walk test heart rate (beats/min): 113', '6 min walk test heart rate (beats/min): 102', 
         '6 min walk test heart rate (beats/min): 122', 
         #... other sample values
        ], 
    #... other rows
}

# Assign keys based on availability
trait_row = 17 if any('6 min walk test heart rate' in item for item in sample_dict.get(17, [])) else None

age_row = None  # No explicit "age" variable found in the provided data

gender_row = None  # No explicit "gender" variable found in the provided data

# Define convert functions
def convert_trait(value):
    value = parse_value(value)
    try:
        return float(value)
    except (ValueError, TypeError):
        return None

def convert_age(value):
    return None  # No specific conversion needed as age is not available

def convert_gender(value):
    gender_mapping = {"female": 0, "male": 1}
    value = parse_value(value).lower()
    return gender_mapping.get(value, None)

# Save cohort information
save_cohort_info('GSE122279', './preprocessed/Heart_rate/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Heart_rate', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Heart_rate/trait_data/GSE122279.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM3463033': [113.0], 'GSM3463034': [102.0], 'GSM3463035': [23.0], 'GSM3463036': [113.0], 'GSM3463037': [122.0], 'GSM3463038': [95.0], 'GSM3463039': [92.0], 'GSM3463040': [108.0], 'GSM3463041': [110.0], 'GSM3463042': [120.0], 'GSM3463043': [45.0], 'GSM3463044': [95.0], 'GSM3463045': [155.0], 'GSM3463046': [101.0], 'GSM3463047': [107.0], 'GSM3463048': [82.0], 'GSM3463049': [145.0], 'GSM3463050': [6.0], 'GSM3463051': [107.0], 'GSM3463052': [87.0], 'GSM3463053': [79.0], 'GSM3463054': [78.0], 'GSM3463055': [83.0], 'GSM3463056': [92.0], 'GSM3463057': [115.0], 'GSM3463058': [25.0], 'GSM3463059': [32.0], 'GSM3463060': [8.0], 'GSM3463061': [13.0], 'GSM3463062': [91.0], 'GSM3463063': [85.0], 'GSM3463064': [24.0], 'GSM3463065': [100.0], 'GSM3463066': [30.0], 'GSM3463067': [9.0], 'GSM3463068': [80.0], 'GSM3463069': [122.0], 'GSM3463070': [94.0], 'GSM3463071': [94.0], 'GSM3463072': [112.0], 'GSM3463073': [100.0], 'GSM3463074': [23.0], 'GSM3463075': [30.0], 'GSM3463076': [118.0], 'GSM3463077': [1