In [1]:

import os
import sys
sys.path.append('/home/techt/Desktop/a4s')


### Step 1: Initial Data Loading

In [2]:
from utils.preprocess import *
# 1. Locate the SOFT file and the series-matrix file inside the cohort directory
cohort_dir = '/media/techt/DATA/GEO/Alopecia/GSE80342'
soft_file, matrix_file = geo_get_relevant_filepaths(cohort_dir)

# 2. Read the matrix file: series-level background lines plus per-sample
#    characteristics, each selected through its list of line prefixes
background_prefixes = ['!Series_title', '!Series_summary', '!Series_overall_design']
clinical_prefixes = ['!Sample_geo_accession', '!Sample_characteristics_ch1']
background_info, clinical_data = get_background_and_clinical_data(
    matrix_file,
    background_prefixes,
    clinical_prefixes,
)

# 3. Build the sample characteristics dictionary from the clinical dataframe
sample_characteristics_dict = get_unique_values_by_row(clinical_data)

# 4. Print the background information and the characteristics dictionary,
#    each under its own heading line
print("Background Information:", background_info, sep="\n")
print("Sample Characteristics Dictionary:", sample_characteristics_dict, sep="\n")


Background Information:
!Series_title	"Pilot open label clinical trial of oral ruxolitinib in patients with alopecia areata"
!Series_summary	"This goal of these studies were to examine gene expression profiles of skin from patients with alopecia areata undergoing treatment with oral ruxoltinib."
!Series_summary	"Microarray analysis was performed to assess changes in gene expression in affected scalp skin."
!Series_overall_design	"Twelve patients were recruited for this study.  Scalp skin biopsies were performed at baseline and at twelve weeks following the initiation of 20 mg BID ruxolitinib PO.  In addition, biopsies were taken prior to twelve weeks of treatment in some cases.  Biopsies from three healthy controls were also included in the dataset."
Sample Characteristics Dictionary:
{0: ['patientid: NC084', 'patientid: NC098', 'patientid: NC108', 'patientid: R01', 'patientid: R02', 'patientid: R03', 'patientid: R04', 'patientid: R05', 'patientid: R06', 'patientid: R07', 'patientid: R

### Step 2: Dataset Analysis and Clinical Feature Extraction

In [3]:
# Row indices default to None and are overwritten below when a variable is available
trait_row = None
age_row = None
gender_row = None

# Converter placeholders; the actual functions are defined further down when applicable
convert_trait = None
convert_age = None
convert_gender = None

# Gene expression data is treated as available, based on the series summary
# indicating gene expression profiles
is_gene_available = True

# Alopecia (trait) availability: the labels observed for this cohort
unique_alopecia_values = [
    'healthy_control',
    'persistent_patchy',
    'severe_patchy',
    'totalis',
    'universalis',
]
# NOTE(review): row index 7 cannot be checked against the saved output, which is
# truncated after row 0 of the characteristics dictionary - confirm
trait_row = 7 if unique_alopecia_values else trait_row

def convert_trait(value):
    """Convert a raw 'field: label' characteristics string to a binary trait.

    Args:
        value: The raw cell content, expected to look like 'field: label'.

    Returns:
        0 for 'healthy_control'; 1 for 'persistent_patchy', 'severe_patchy',
        'totalis' or 'universalis'; None when the value is missing, is not a
        string, contains no ':' separator, or carries an unrecognised label.
    """
    # None, NaN and other non-string cells have nothing to parse
    if not isinstance(value, str):
        return None
    trait_map = {
        'healthy_control': 0,
        'persistent_patchy': 1,
        'severe_patchy': 1,
        'totalis': 1,
        'universalis': 1
    }
    parts = value.split(':')
    # Without a ':' there is no label segment; indexing parts[1] would raise IndexError
    if len(parts) < 2:
        return None
    trait_value = parts[1].strip()
    return trait_map.get(trait_value, None)

# Age availability: the ages observed for this cohort, kept as strings
age_values = [
    '43', '27', '40', '36', '45', '48', '34',
    '58', '35', '31', '63', '60', '62', '20',
]
# Any observed value means the age row can be used
if len(age_values) > 0:
    age_row = 4

def convert_age(value):
    """Convert a raw 'field: number' characteristics string to an integer age.

    Args:
        value: The raw cell content, expected to look like 'field: 43'.

    Returns:
        The age as an int, or None when the value is missing, is not a
        string, contains no ':' separator, or the segment after the first
        ':' is not an integer literal.
    """
    # None, NaN and other non-string cells have nothing to parse
    if not isinstance(value, str):
        return None
    parts = value.split(':')
    # Without a ':' there is no number segment; indexing parts[1] would raise IndexError
    if len(parts) < 2:
        return None
    try:
        return int(parts[1].strip())
    except ValueError:
        # Non-integer text such as 'n/a' or '43.5' is treated as missing
        return None

# Gender availability: the gender codes observed for this cohort
unique_gender_values = ['M', 'F']
# Any observed value means the gender row can be used
if len(unique_gender_values) > 0:
    gender_row = 3

def convert_gender(value):
    """Convert a raw 'field: code' characteristics string to a binary gender.

    Args:
        value: The raw cell content, expected to look like 'field: M'.

    Returns:
        1 for 'M', 0 for 'F', or None when the value is missing, is not a
        string, contains no ':' separator, or carries any other code.
    """
    # None, NaN and other non-string cells have nothing to parse
    if not isinstance(value, str):
        return None
    gender_map = {
        'M': 1,
        'F': 0
    }
    parts = value.split(':')
    # Without a ':' there is no code segment; indexing parts[1] would raise IndexError
    if len(parts) < 2:
        return None
    gender_value = parts[1].strip()
    return gender_map.get(gender_value, None)

# Save Metadata: record whether gene data and trait data are available for this cohort
save_cohort_info('GSE80342', './preprocessed/Alopecia/cohort_info.json', is_gene_available, trait_row is not None)

# Clinical Feature Extraction: only possible when a trait row was identified
if trait_row is not None:
    selected_clinical_data = geo_select_clinical_features(clinical_data, 'Alopecia', trait_row, convert_trait, age_row, convert_age, gender_row, convert_gender)
    csv_path = './preprocessed/Alopecia/trait_data/GSE80342.csv'
    # Make sure the output directory exists before writing; on a fresh checkout
    # the write would otherwise fail (selected_clinical_data is presumably a
    # pandas DataFrame, whose to_csv does not create parent directories)
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    selected_clinical_data.to_csv(csv_path)
    print(preview_df(selected_clinical_data))


{'GSM2124815': [0, 43, 1], 'GSM2124816': [0, 27, 0], 'GSM2124817': [0, 40, 0], 'GSM2124818': [1, 36, 0], 'GSM2124819': [1, 45, 0], 'GSM2124820': [1, 48, 1], 'GSM2124821': [1, 34, 1], 'GSM2124822': [1, 34, 1], 'GSM2124823': [1, 58, 0], 'GSM2124824': [1, 35, 0], 'GSM2124825': [1, 31, 0], 'GSM2124826': [1, 63, 1], 'GSM2124827': [1, 60, 0], 'GSM2124828': [1, 62, 0], 'GSM2124829': [1, 20, 1], 'GSM2124830': [1, 60, 0], 'GSM2124831': [1, 58, 0], 'GSM2124832': [1, 35, 0], 'GSM2124833': [1, 31, 0], 'GSM2124834': [1, 48, 1], 'GSM2124835': [1, 34, 1], 'GSM2124836': [1, 36, 0], 'GSM2124837': [1, 45, 0], 'GSM2124838': [1, 48, 1], 'GSM2124839': [1, 34, 1], 'GSM2124840': [1, 58, 0], 'GSM2124841': [1, 31, 0], 'GSM2124842': [1, 63, 1], 'GSM2124843': [1, 60, 0], 'GSM2124844': [1, 62, 0], 'GSM2124845': [1, 45, 0]}
