In [1]:

import sys

# Make the project checkout importable so that later cells can import from it
# (the next cell imports `utils.preprocess`; presumably that package lives
# under this directory -- confirm).
# NOTE(review): this is a machine-specific absolute path; consider moving it
# to a configurable setting.
_PROJECT_ROOT = '/home/techt/Desktop/a4s'

# Guard the append so re-running this cell does not pile up duplicate entries
# on sys.path.
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)


### Step 1: Initial Data Loading

In [2]:
# NOTE(review): the wildcard import is where the helper functions called in
# this notebook (geo_get_relevant_filepaths, get_background_and_clinical_data,
# get_unique_values_by_row, ...) presumably come from; explicit imports would
# make that provenance visible.
from utils.preprocess import *
# 1. Locate the SOFT file and the series-matrix file inside the cohort directory.
#    NOTE(review): machine-specific absolute path.
cohort_dir = '/media/techt/DATA/GEO/Red_Hair/GSE207744'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file. Lines starting with the "background" prefixes carry
#    the series title/summary/design; lines starting with the "clinical"
#    prefixes carry the sample accessions and per-sample characteristics.
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(matrix_file, background_prefixes, clinical_prefixes)

# 3. Summarise the clinical data as the unique values found in each row.
#    Judging by the printed output, the result is a dict keyed by row index
#    (e.g. {0: ['patient number: 001', ...]}).
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print the background text and the characteristics dictionary so they can
#    be inspected by hand when choosing trait/age/gender rows in the next step.
print("Background Information:")
print(background_info)
print("Sample Characteristics Dictionary:")
print(sample_characteristics_dict)


Background Information:
!Series_title	"Transcriptomic study on human skin samples: identification of actinic keratoses two risk classes."
!Series_summary	"Gene expression profile analysis allowed to identify 2 classes of AK."
!Series_overall_design	"A total of 72 tissue samples (24 NL, 23 L, 4 PL and 21 AK) were isolated from 24 patients. For each patient, samples were acquired on the lesion (L or AK), on the perilesional (PL) i.e. safety surgical margin area (often containing AK) and/or on the non-lesional (NL) parts of the elliptical surgical excision."
Sample Characteristics Dictionary:
{0: ['patient number: 001', 'patient number: 006', 'patient number: 016', 'patient number: 017', 'patient number: 018=026=045', 'patient number: 028', 'patient number: 029', 'patient number: 035=041', 'patient number: 048', 'patient number: 056', 'patient number: 057', 'patient number: 074', 'patient number: 075', 'patient number: 077', 'patient number: 082', 'patient number: 090', 'patient number: 0

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Gene expression availability: judged to be present from the series title
# and summary (a transcriptomic study of skin samples).
is_gene_available = True

# Row indices into the sample characteristics dictionary for each variable.
# None means the variable is not recorded for this cohort:
#   - trait ('Red_Hair'): not provided explicitly in the sample characteristics
#   - age:                not provided
#   - gender:             not provided
trait_row, age_row, gender_row = None, None, None

# Converter placeholders; they are rebound by the `def` statements that follow.
convert_trait, convert_age, convert_gender = None, None, None

# Define conversion functions
def convert_trait(value):
    """Map a sample-characteristic value to a binary trait label.

    Returns 1 when 'Red_Hair' occurs in ``value``, 0 when 'Other_Hair'
    occurs in it (and 'Red_Hair' does not), and None when neither marker
    is present.
    """
    if 'Red_Hair' in value:
        return 1
    if 'Other_Hair' in value:
        return 0
    return None

def convert_age(value):
    """Parse a ``'<label>: <number>'`` characteristic string into a float age.

    The text between the first and second ':' is stripped and converted
    with ``float``.

    Returns:
        The age as a float, or None when ``value`` is not a string, has no
        ':' separator, or the field after it is not numeric.
    """
    try:
        return float(value.split(':')[1].strip())
    # Only swallow the failures that parsing can actually produce:
    #   AttributeError / TypeError -> value is not a str (e.g. None, NaN, bytes)
    #   IndexError                 -> no ':' separator in the string
    #   ValueError                 -> the field is not a number
    # A bare `except:` would also hide KeyboardInterrupt and SystemExit.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def convert_gender(value):
    """Map a ``'<label>: <gender>'`` characteristic string to a binary code.

    The text between the first and second ':' is stripped and lower-cased
    before comparison.

    Returns:
        0 for 'female', 1 for 'male', and None for any other label or for
        input that cannot be parsed (not a string, or no ':' separator).
    """
    try:
        gender = value.split(':')[1].strip().lower()
    # Malformed or missing values yield None rather than raising, in line with
    # how convert_age treats unparseable input.
    except (AttributeError, IndexError, TypeError):
        return None

    if gender == 'female':
        return 0
    if gender == 'male':
        return 1
    return None

# Record this cohort's status: the cohort id, the JSON file to write to, the
# gene-availability flag, and whether a trait row was identified.
save_cohort_info(
    'GSE207744',
    './preprocessed/Red_Hair/cohort_info.json',
    is_gene_available,
    trait_row is not None,
)

# Clinical features can only be extracted when a trait row was identified.
if trait_row is None:
    print("Clinical data extraction skipped as trait_row is not available.")
else:
    selected_clinical_data = geo_select_clinical_features(
        clinical_data,
        'Red_Hair',
        trait_row,
        convert_trait,
        age_row,
        convert_age,
        gender_row,
        convert_gender,
    )
    csv_path = './preprocessed/Red_Hair/trait_data/GSE207744.csv'
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


A new JSON file was created at: ./preprocessed/Red_Hair/cohort_info.json
Clinical data extraction skipped as trait_row is not available.
